{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 371940, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 9.375e-06, "loss": 6.8593, "step": 1000 }, { "epoch": 0.11, "learning_rate": 1.875e-05, "loss": 5.3643, "step": 2000 }, { "epoch": 0.16, "learning_rate": 2.8125e-05, "loss": 5.0364, "step": 3000 }, { "epoch": 0.22, "learning_rate": 3.75e-05, "loss": 4.8218, "step": 4000 }, { "epoch": 0.27, "learning_rate": 4.6874999999999994e-05, "loss": 4.6436, "step": 5000 }, { "epoch": 0.32, "learning_rate": 5.625e-05, "loss": 4.5074, "step": 6000 }, { "epoch": 0.38, "learning_rate": 6.5625e-05, "loss": 4.3958, "step": 7000 }, { "epoch": 0.43, "learning_rate": 7.5e-05, "loss": 4.2967, "step": 8000 }, { "epoch": 0.48, "learning_rate": 8.437499999999999e-05, "loss": 4.2068, "step": 9000 }, { "epoch": 0.54, "learning_rate": 9.374999999999999e-05, "loss": 4.1332, "step": 10000 }, { "epoch": 0.59, "learning_rate": 0.000103115625, "loss": 4.0676, "step": 11000 }, { "epoch": 0.65, "learning_rate": 0.000112490625, "loss": 4.0093, "step": 12000 }, { "epoch": 0.7, "learning_rate": 0.000121865625, "loss": 3.9526, "step": 13000 }, { "epoch": 0.75, "learning_rate": 0.00013123125, "loss": 3.8934, "step": 14000 }, { "epoch": 0.81, "learning_rate": 0.00014060625, "loss": 3.8436, "step": 15000 }, { "epoch": 0.86, "learning_rate": 0.000149971875, "loss": 3.8065, "step": 16000 }, { "epoch": 0.91, "learning_rate": 0.000159346875, "loss": 3.7709, "step": 17000 }, { "epoch": 0.97, "learning_rate": 0.00016871249999999996, "loss": 3.7368, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.34518001634673895, "eval_loss": 3.9162497520446777, "eval_runtime": 152.9029, "eval_samples_per_second": 378.789, "eval_steps_per_second": 5.919, "step": 18597 }, { "epoch": 1.02, "learning_rate": 0.000178078125, "loss": 3.6974, "step": 19000 }, { "epoch": 1.08, "learning_rate": 0.00018745312499999998, "loss": 3.6666, "step": 20000 }, { "epoch": 1.13, "learning_rate": 0.00019682812499999998, "loss": 3.6461, "step": 21000 }, { "epoch": 1.18, "learning_rate": 0.00020619374999999998, "loss": 3.6223, "step": 22000 }, { "epoch": 1.24, "learning_rate": 0.00021556874999999998, "loss": 3.6097, "step": 23000 }, { "epoch": 1.29, "learning_rate": 0.00022494374999999998, "loss": 3.5886, "step": 24000 }, { "epoch": 1.34, "learning_rate": 0.00023430937499999997, "loss": 3.5771, "step": 25000 }, { "epoch": 1.4, "learning_rate": 0.00024368437499999997, "loss": 3.5614, "step": 26000 }, { "epoch": 1.45, "learning_rate": 0.00025305, "loss": 3.5469, "step": 27000 }, { "epoch": 1.51, "learning_rate": 0.000262425, "loss": 3.5355, "step": 28000 }, { "epoch": 1.56, "learning_rate": 0.0002718, "loss": 3.5275, "step": 29000 }, { "epoch": 1.61, "learning_rate": 0.000281165625, "loss": 3.5145, "step": 30000 }, { "epoch": 1.67, "learning_rate": 0.00029053124999999994, "loss": 3.5052, "step": 31000 }, { "epoch": 1.72, "learning_rate": 0.00029990624999999993, "loss": 3.4957, "step": 32000 }, { "epoch": 1.77, "learning_rate": 0.000299126316408778, "loss": 3.4811, "step": 33000 }, { "epoch": 1.83, "learning_rate": 0.000298243807730776, "loss": 3.4709, "step": 34000 }, { "epoch": 1.88, "learning_rate": 0.000297361299052774, "loss": 3.4514, "step": 35000 }, { "epoch": 1.94, "learning_rate": 0.00029647967288344996, "loss": 3.4411, "step": 36000 }, { "epoch": 1.99, "learning_rate": 0.000295597164205448, "loss": 3.4348, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.37482168201575206, "eval_loss": 3.6327266693115234, "eval_runtime": 153.9354, "eval_samples_per_second": 376.249, "eval_steps_per_second": 5.879, "step": 37194 }, { "epoch": 2.04, "learning_rate": 0.00029471553803612404, "loss": 3.3847, "step": 38000 }, { "epoch": 2.1, "learning_rate": 0.000293833029358122, "loss": 3.3766, "step": 39000 }, { "epoch": 2.15, "learning_rate": 0.000292951403188798, "loss": 3.3701, "step": 40000 }, { "epoch": 2.2, "learning_rate": 0.000292069777019474, "loss": 3.3642, "step": 41000 }, { "epoch": 2.26, "learning_rate": 0.000291187268341472, "loss": 3.3595, "step": 42000 }, { "epoch": 2.31, "learning_rate": 0.00029030475966347, "loss": 3.3568, "step": 43000 }, { "epoch": 2.37, "learning_rate": 0.000289422250985468, "loss": 3.3475, "step": 44000 }, { "epoch": 2.42, "learning_rate": 0.00028854062481614397, "loss": 3.3403, "step": 45000 }, { "epoch": 2.47, "learning_rate": 0.000287658116138142, "loss": 3.3384, "step": 46000 }, { "epoch": 2.53, "learning_rate": 0.00028677560746014, "loss": 3.3311, "step": 47000 }, { "epoch": 2.58, "learning_rate": 0.000285893981290816, "loss": 3.3236, "step": 48000 }, { "epoch": 2.63, "learning_rate": 0.00028501235512149196, "loss": 3.3185, "step": 49000 }, { "epoch": 2.69, "learning_rate": 0.00028412984644349, "loss": 3.3144, "step": 50000 }, { "epoch": 2.74, "learning_rate": 0.00028324822027416604, "loss": 3.3129, "step": 51000 }, { "epoch": 2.8, "learning_rate": 0.000282365711596164, "loss": 3.3052, "step": 52000 }, { "epoch": 2.85, "learning_rate": 0.000281483202918162, "loss": 3.2975, "step": 53000 }, { "epoch": 2.9, "learning_rate": 0.000280601576748838, "loss": 3.2951, "step": 54000 }, { "epoch": 2.96, "learning_rate": 0.000279719068070836, "loss": 3.2919, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.3899740583191344, "eval_loss": 3.508404493331909, "eval_runtime": 154.0998, "eval_samples_per_second": 375.847, "eval_steps_per_second": 5.873, "step": 55791 }, { "epoch": 3.01, "learning_rate": 0.000278836559392834, "loss": 3.2736, "step": 56000 }, { "epoch": 3.07, "learning_rate": 0.000277954050714832, "loss": 3.2226, "step": 57000 }, { "epoch": 3.12, "learning_rate": 0.00027707154203683, "loss": 3.2278, "step": 58000 }, { "epoch": 3.17, "learning_rate": 0.000276189915867506, "loss": 3.2315, "step": 59000 }, { "epoch": 3.23, "learning_rate": 0.000275307407189504, "loss": 3.2244, "step": 60000 }, { "epoch": 3.28, "learning_rate": 0.00027442578102017997, "loss": 3.229, "step": 61000 }, { "epoch": 3.33, "learning_rate": 0.000273543272342178, "loss": 3.231, "step": 62000 }, { "epoch": 3.39, "learning_rate": 0.00027266076366417604, "loss": 3.224, "step": 63000 }, { "epoch": 3.44, "learning_rate": 0.000271779137494852, "loss": 3.2256, "step": 64000 }, { "epoch": 3.5, "learning_rate": 0.00027089662881685, "loss": 3.2241, "step": 65000 }, { "epoch": 3.55, "learning_rate": 0.000270015002647526, "loss": 3.2223, "step": 66000 }, { "epoch": 3.6, "learning_rate": 0.00026913249396952403, "loss": 3.2191, "step": 67000 }, { "epoch": 3.66, "learning_rate": 0.000268249985291522, "loss": 3.2179, "step": 68000 }, { "epoch": 3.71, "learning_rate": 0.000267368359122198, "loss": 3.2133, "step": 69000 }, { "epoch": 3.76, "learning_rate": 0.00026648585044419603, "loss": 3.2172, "step": 70000 }, { "epoch": 3.82, "learning_rate": 0.000265603341766194, "loss": 3.2109, "step": 71000 }, { "epoch": 3.87, "learning_rate": 0.000264720833088192, "loss": 3.2077, "step": 72000 }, { "epoch": 3.93, "learning_rate": 0.00026383832441018997, "loss": 3.2076, "step": 73000 }, { "epoch": 3.98, "learning_rate": 0.000262956698240866, "loss": 3.2086, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.3956238680662571, "eval_loss": 3.4501850605010986, "eval_runtime": 154.25, "eval_samples_per_second": 375.481, "eval_steps_per_second": 5.867, "step": 74388 }, { "epoch": 4.03, "learning_rate": 0.000262075072071542, "loss": 3.1628, "step": 75000 }, { "epoch": 4.09, "learning_rate": 0.00026119256339354003, "loss": 3.1437, "step": 76000 }, { "epoch": 4.14, "learning_rate": 0.000260310054715538, "loss": 3.1446, "step": 77000 }, { "epoch": 4.19, "learning_rate": 0.000259428428546214, "loss": 3.1476, "step": 78000 }, { "epoch": 4.25, "learning_rate": 0.00025854591986821204, "loss": 3.1475, "step": 79000 }, { "epoch": 4.3, "learning_rate": 0.00025766341119021, "loss": 3.1509, "step": 80000 }, { "epoch": 4.36, "learning_rate": 0.000256780902512208, "loss": 3.1536, "step": 81000 }, { "epoch": 4.41, "learning_rate": 0.00025589839383420603, "loss": 3.1514, "step": 82000 }, { "epoch": 4.46, "learning_rate": 0.000255015885156204, "loss": 3.149, "step": 83000 }, { "epoch": 4.52, "learning_rate": 0.00025413425898688, "loss": 3.1531, "step": 84000 }, { "epoch": 4.57, "learning_rate": 0.000253251750308878, "loss": 3.1487, "step": 85000 }, { "epoch": 4.62, "learning_rate": 0.000252370124139554, "loss": 3.1467, "step": 86000 }, { "epoch": 4.68, "learning_rate": 0.000251487615461552, "loss": 3.1496, "step": 87000 }, { "epoch": 4.73, "learning_rate": 0.00025060510678355004, "loss": 3.1494, "step": 88000 }, { "epoch": 4.79, "learning_rate": 0.000249723480614226, "loss": 3.148, "step": 89000 }, { "epoch": 4.84, "learning_rate": 0.000248840971936224, "loss": 3.1496, "step": 90000 }, { "epoch": 4.89, "learning_rate": 0.00024795846325822204, "loss": 3.1477, "step": 91000 }, { "epoch": 4.95, "learning_rate": 0.00024707595458022, "loss": 3.1474, "step": 92000 }, { "epoch": 5.0, "eval_accuracy": 0.3994938234828981, "eval_loss": 3.423523426055908, "eval_runtime": 154.6648, "eval_samples_per_second": 374.474, "eval_steps_per_second": 5.851, "step": 92985 }, { "epoch": 5.0, "learning_rate": 0.000246194328410896, "loss": 3.1438, "step": 93000 }, { "epoch": 5.05, "learning_rate": 0.000245311819732894, "loss": 3.0764, "step": 94000 }, { "epoch": 5.11, "learning_rate": 0.000244429311054892, "loss": 3.0802, "step": 95000 }, { "epoch": 5.16, "learning_rate": 0.000243547684885568, "loss": 3.0875, "step": 96000 }, { "epoch": 5.22, "learning_rate": 0.00024266605871624405, "loss": 3.0881, "step": 97000 }, { "epoch": 5.27, "learning_rate": 0.00024178355003824203, "loss": 3.0951, "step": 98000 }, { "epoch": 5.32, "learning_rate": 0.00024090192386891804, "loss": 3.0977, "step": 99000 }, { "epoch": 5.38, "learning_rate": 0.00024001941519091602, "loss": 3.093, "step": 100000 }, { "epoch": 5.43, "learning_rate": 0.00023913778902159204, "loss": 3.097, "step": 101000 }, { "epoch": 5.48, "learning_rate": 0.00023825528034359002, "loss": 3.0942, "step": 102000 }, { "epoch": 5.54, "learning_rate": 0.00023737277166558803, "loss": 3.0994, "step": 103000 }, { "epoch": 5.59, "learning_rate": 0.000236491145496264, "loss": 3.1007, "step": 104000 }, { "epoch": 5.65, "learning_rate": 0.00023560863681826202, "loss": 3.1006, "step": 105000 }, { "epoch": 5.7, "learning_rate": 0.00023472612814026003, "loss": 3.1013, "step": 106000 }, { "epoch": 5.75, "learning_rate": 0.00023384450197093602, "loss": 3.103, "step": 107000 }, { "epoch": 5.81, "learning_rate": 0.00023296199329293402, "loss": 3.1001, "step": 108000 }, { "epoch": 5.86, "learning_rate": 0.00023208036712361004, "loss": 3.0997, "step": 109000 }, { "epoch": 5.91, "learning_rate": 0.00023119785844560804, "loss": 3.0997, "step": 110000 }, { "epoch": 5.97, "learning_rate": 0.00023031623227628403, "loss": 3.1012, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.4019542992689383, "eval_loss": 3.4030864238739014, "eval_runtime": 154.526, "eval_samples_per_second": 374.811, "eval_steps_per_second": 5.857, "step": 111582 }, { "epoch": 6.02, "learning_rate": 0.00022943372359828204, "loss": 3.0671, "step": 112000 }, { "epoch": 6.08, "learning_rate": 0.00022855121492028005, "loss": 3.0316, "step": 113000 }, { "epoch": 6.13, "learning_rate": 0.00022766870624227803, "loss": 3.0381, "step": 114000 }, { "epoch": 6.18, "learning_rate": 0.00022678619756427604, "loss": 3.0465, "step": 115000 }, { "epoch": 6.24, "learning_rate": 0.00022590368888627402, "loss": 3.0474, "step": 116000 }, { "epoch": 6.29, "learning_rate": 0.00022502294522562804, "loss": 3.0522, "step": 117000 }, { "epoch": 6.35, "learning_rate": 0.00022414043654762602, "loss": 3.0489, "step": 118000 }, { "epoch": 6.4, "learning_rate": 0.00022325881037830203, "loss": 3.0538, "step": 119000 }, { "epoch": 6.45, "learning_rate": 0.0002223763017003, "loss": 3.0525, "step": 120000 }, { "epoch": 6.51, "learning_rate": 0.00022149379302229805, "loss": 3.0535, "step": 121000 }, { "epoch": 6.56, "learning_rate": 0.00022061128434429605, "loss": 3.0572, "step": 122000 }, { "epoch": 6.61, "learning_rate": 0.00021972877566629403, "loss": 3.06, "step": 123000 }, { "epoch": 6.67, "learning_rate": 0.00021884714949697005, "loss": 3.0585, "step": 124000 }, { "epoch": 6.72, "learning_rate": 0.00021796552332764603, "loss": 3.0623, "step": 125000 }, { "epoch": 6.78, "learning_rate": 0.00021708301464964404, "loss": 3.0628, "step": 126000 }, { "epoch": 6.83, "learning_rate": 0.00021620138848032003, "loss": 3.0624, "step": 127000 }, { "epoch": 6.88, "learning_rate": 0.00021531887980231804, "loss": 3.0592, "step": 128000 }, { "epoch": 6.94, "learning_rate": 0.00021443637112431604, "loss": 3.0606, "step": 129000 }, { "epoch": 6.99, "learning_rate": 0.00021355386244631402, "loss": 3.0638, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.4030296153303013, "eval_loss": 3.4128024578094482, "eval_runtime": 154.0025, "eval_samples_per_second": 376.085, "eval_steps_per_second": 5.877, "step": 130179 }, { "epoch": 7.04, "learning_rate": 0.00021267311878566802, "loss": 3.0018, "step": 131000 }, { "epoch": 7.1, "learning_rate": 0.00021179061010766602, "loss": 2.9983, "step": 132000 }, { "epoch": 7.15, "learning_rate": 0.00021090810142966406, "loss": 3.0046, "step": 133000 }, { "epoch": 7.21, "learning_rate": 0.00021002559275166204, "loss": 3.0095, "step": 134000 }, { "epoch": 7.26, "learning_rate": 0.00020914396658233805, "loss": 3.0113, "step": 135000 }, { "epoch": 7.31, "learning_rate": 0.00020826145790433606, "loss": 3.0139, "step": 136000 }, { "epoch": 7.37, "learning_rate": 0.00020737894922633404, "loss": 3.0141, "step": 137000 }, { "epoch": 7.42, "learning_rate": 0.00020649732305701006, "loss": 3.0165, "step": 138000 }, { "epoch": 7.47, "learning_rate": 0.00020561481437900804, "loss": 3.0232, "step": 139000 }, { "epoch": 7.53, "learning_rate": 0.00020473230570100604, "loss": 3.0236, "step": 140000 }, { "epoch": 7.58, "learning_rate": 0.00020384979702300403, "loss": 3.0217, "step": 141000 }, { "epoch": 7.64, "learning_rate": 0.00020296817085368004, "loss": 3.0193, "step": 142000 }, { "epoch": 7.69, "learning_rate": 0.00020208654468435603, "loss": 3.0279, "step": 143000 }, { "epoch": 7.74, "learning_rate": 0.00020120403600635403, "loss": 3.0243, "step": 144000 }, { "epoch": 7.8, "learning_rate": 0.00020032152732835207, "loss": 3.0241, "step": 145000 }, { "epoch": 7.85, "learning_rate": 0.00019943901865035005, "loss": 3.0299, "step": 146000 }, { "epoch": 7.9, "learning_rate": 0.00019855650997234806, "loss": 3.0313, "step": 147000 }, { "epoch": 7.96, "learning_rate": 0.00019767400129434606, "loss": 3.0262, "step": 148000 }, { "epoch": 8.0, "eval_accuracy": 0.4046377523139853, "eval_loss": 3.3997907638549805, "eval_runtime": 154.087, "eval_samples_per_second": 375.879, "eval_steps_per_second": 5.873, "step": 148776 }, { "epoch": 8.01, "learning_rate": 0.00019679237512502205, "loss": 3.0108, "step": 149000 }, { "epoch": 8.07, "learning_rate": 0.00019590986644702006, "loss": 2.9607, "step": 150000 }, { "epoch": 8.12, "learning_rate": 0.00019502735776901804, "loss": 2.9699, "step": 151000 }, { "epoch": 8.17, "learning_rate": 0.00019414484909101605, "loss": 2.9736, "step": 152000 }, { "epoch": 8.23, "learning_rate": 0.00019326322292169203, "loss": 2.9743, "step": 153000 }, { "epoch": 8.28, "learning_rate": 0.00019238159675236805, "loss": 2.9832, "step": 154000 }, { "epoch": 8.33, "learning_rate": 0.00019149908807436603, "loss": 2.982, "step": 155000 }, { "epoch": 8.39, "learning_rate": 0.00019061657939636404, "loss": 2.9869, "step": 156000 }, { "epoch": 8.44, "learning_rate": 0.00018973495322704008, "loss": 2.9853, "step": 157000 }, { "epoch": 8.5, "learning_rate": 0.00018885244454903806, "loss": 2.9898, "step": 158000 }, { "epoch": 8.55, "learning_rate": 0.00018796993587103607, "loss": 2.9966, "step": 159000 }, { "epoch": 8.6, "learning_rate": 0.00018708742719303405, "loss": 2.9916, "step": 160000 }, { "epoch": 8.66, "learning_rate": 0.00018620580102371006, "loss": 2.9973, "step": 161000 }, { "epoch": 8.71, "learning_rate": 0.00018532329234570804, "loss": 2.9947, "step": 162000 }, { "epoch": 8.76, "learning_rate": 0.00018444166617638405, "loss": 2.9926, "step": 163000 }, { "epoch": 8.82, "learning_rate": 0.00018356004000706004, "loss": 2.9954, "step": 164000 }, { "epoch": 8.87, "learning_rate": 0.00018267753132905805, "loss": 2.9968, "step": 165000 }, { "epoch": 8.93, "learning_rate": 0.00018179502265105606, "loss": 2.9997, "step": 166000 }, { "epoch": 8.98, "learning_rate": 0.00018091251397305404, "loss": 3.0016, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.40699604418590996, "eval_loss": 3.373080015182495, "eval_runtime": 153.8917, "eval_samples_per_second": 376.355, "eval_steps_per_second": 5.881, "step": 167373 }, { "epoch": 9.03, "learning_rate": 0.00018003088780373005, "loss": 2.9567, "step": 168000 }, { "epoch": 9.09, "learning_rate": 0.00017914926163440606, "loss": 2.9362, "step": 169000 }, { "epoch": 9.14, "learning_rate": 0.00017826675295640407, "loss": 2.9438, "step": 170000 }, { "epoch": 9.2, "learning_rate": 0.00017738424427840205, "loss": 2.9495, "step": 171000 }, { "epoch": 9.25, "learning_rate": 0.00017650261810907807, "loss": 2.9516, "step": 172000 }, { "epoch": 9.3, "learning_rate": 0.00017562010943107607, "loss": 2.9533, "step": 173000 }, { "epoch": 9.36, "learning_rate": 0.00017473760075307405, "loss": 2.9566, "step": 174000 }, { "epoch": 9.41, "learning_rate": 0.00017385509207507206, "loss": 2.9633, "step": 175000 }, { "epoch": 9.46, "learning_rate": 0.00017297258339707004, "loss": 2.9596, "step": 176000 }, { "epoch": 9.52, "learning_rate": 0.00017209095722774606, "loss": 2.9628, "step": 177000 }, { "epoch": 9.57, "learning_rate": 0.00017120844854974404, "loss": 2.9617, "step": 178000 }, { "epoch": 9.63, "learning_rate": 0.00017032770488909806, "loss": 2.9647, "step": 179000 }, { "epoch": 9.68, "learning_rate": 0.00016944519621109604, "loss": 2.964, "step": 180000 }, { "epoch": 9.73, "learning_rate": 0.00016856268753309407, "loss": 2.9676, "step": 181000 }, { "epoch": 9.79, "learning_rate": 0.00016768017885509208, "loss": 2.9731, "step": 182000 }, { "epoch": 9.84, "learning_rate": 0.00016679767017709006, "loss": 2.9734, "step": 183000 }, { "epoch": 9.89, "learning_rate": 0.00016591604400776607, "loss": 2.9745, "step": 184000 }, { "epoch": 9.95, "learning_rate": 0.00016503353532976406, "loss": 2.9715, "step": 185000 }, { "epoch": 10.0, "eval_accuracy": 0.40619785412436715, "eval_loss": 3.4057776927948, "eval_runtime": 153.2918, "eval_samples_per_second": 377.828, "eval_steps_per_second": 5.904, "step": 185970 }, { "epoch": 10.0, "learning_rate": 0.00016415190916044007, "loss": 2.97, "step": 186000 }, { "epoch": 10.06, "learning_rate": 0.00016326940048243805, "loss": 2.9101, "step": 187000 }, { "epoch": 10.11, "learning_rate": 0.00016238777431311406, "loss": 2.9121, "step": 188000 }, { "epoch": 10.16, "learning_rate": 0.00016150703065246806, "loss": 2.9209, "step": 189000 }, { "epoch": 10.22, "learning_rate": 0.00016062452197446606, "loss": 2.922, "step": 190000 }, { "epoch": 10.27, "learning_rate": 0.00015974201329646404, "loss": 2.9243, "step": 191000 }, { "epoch": 10.32, "learning_rate": 0.00015885950461846205, "loss": 2.9304, "step": 192000 }, { "epoch": 10.38, "learning_rate": 0.0001579769959404601, "loss": 2.9305, "step": 193000 }, { "epoch": 10.43, "learning_rate": 0.00015709448726245807, "loss": 2.9363, "step": 194000 }, { "epoch": 10.49, "learning_rate": 0.00015621286109313408, "loss": 2.9379, "step": 195000 }, { "epoch": 10.54, "learning_rate": 0.0001553303524151321, "loss": 2.941, "step": 196000 }, { "epoch": 10.59, "learning_rate": 0.00015444872624580808, "loss": 2.9366, "step": 197000 }, { "epoch": 10.65, "learning_rate": 0.00015356621756780608, "loss": 2.9413, "step": 198000 }, { "epoch": 10.7, "learning_rate": 0.00015268370888980406, "loss": 2.9473, "step": 199000 }, { "epoch": 10.75, "learning_rate": 0.00015180120021180207, "loss": 2.9472, "step": 200000 }, { "epoch": 10.81, "learning_rate": 0.00015091957404247806, "loss": 2.9504, "step": 201000 }, { "epoch": 10.86, "learning_rate": 0.00015003706536447607, "loss": 2.9458, "step": 202000 }, { "epoch": 10.92, "learning_rate": 0.00014915455668647407, "loss": 2.9471, "step": 203000 }, { "epoch": 10.97, "learning_rate": 0.00014827293051715006, "loss": 2.9481, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.4068837157806495, "eval_loss": 3.38750958442688, "eval_runtime": 153.5658, "eval_samples_per_second": 377.154, "eval_steps_per_second": 5.893, "step": 204567 }, { "epoch": 11.02, "learning_rate": 0.00014739042183914807, "loss": 2.9156, "step": 205000 }, { "epoch": 11.08, "learning_rate": 0.00014650879566982408, "loss": 2.8891, "step": 206000 }, { "epoch": 11.13, "learning_rate": 0.00014562628699182206, "loss": 2.8942, "step": 207000 }, { "epoch": 11.18, "learning_rate": 0.00014474554333117608, "loss": 2.9003, "step": 208000 }, { "epoch": 11.24, "learning_rate": 0.0001438630346531741, "loss": 2.8988, "step": 209000 }, { "epoch": 11.29, "learning_rate": 0.00014298052597517207, "loss": 2.9105, "step": 210000 }, { "epoch": 11.35, "learning_rate": 0.00014209889980584808, "loss": 2.9077, "step": 211000 }, { "epoch": 11.4, "learning_rate": 0.00014121639112784606, "loss": 2.9168, "step": 212000 }, { "epoch": 11.45, "learning_rate": 0.00014033388244984407, "loss": 2.9148, "step": 213000 }, { "epoch": 11.51, "learning_rate": 0.00013945137377184208, "loss": 2.9142, "step": 214000 }, { "epoch": 11.56, "learning_rate": 0.0001385697476025181, "loss": 2.9159, "step": 215000 }, { "epoch": 11.61, "learning_rate": 0.00013768723892451607, "loss": 2.9195, "step": 216000 }, { "epoch": 11.67, "learning_rate": 0.00013680473024651408, "loss": 2.9189, "step": 217000 }, { "epoch": 11.72, "learning_rate": 0.00013592398658586807, "loss": 2.9207, "step": 218000 }, { "epoch": 11.78, "learning_rate": 0.00013504147790786608, "loss": 2.9228, "step": 219000 }, { "epoch": 11.83, "learning_rate": 0.0001341589692298641, "loss": 2.9244, "step": 220000 }, { "epoch": 11.88, "learning_rate": 0.00013327734306054008, "loss": 2.9194, "step": 221000 }, { "epoch": 11.94, "learning_rate": 0.00013239483438253808, "loss": 2.9257, "step": 222000 }, { "epoch": 11.99, "learning_rate": 0.0001315123257045361, "loss": 2.9243, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.407012167880445, "eval_loss": 3.4070396423339844, "eval_runtime": 153.2437, "eval_samples_per_second": 377.947, "eval_steps_per_second": 5.906, "step": 223164 }, { "epoch": 12.04, "learning_rate": 0.00013063069953521208, "loss": 2.874, "step": 224000 }, { "epoch": 12.1, "learning_rate": 0.0001297481908572101, "loss": 2.8707, "step": 225000 }, { "epoch": 12.15, "learning_rate": 0.0001288656821792081, "loss": 2.8776, "step": 226000 }, { "epoch": 12.21, "learning_rate": 0.00012798317350120608, "loss": 2.876, "step": 227000 }, { "epoch": 12.26, "learning_rate": 0.0001271015473318821, "loss": 2.8805, "step": 228000 }, { "epoch": 12.31, "learning_rate": 0.0001262190386538801, "loss": 2.8861, "step": 229000 }, { "epoch": 12.37, "learning_rate": 0.00012533741248455608, "loss": 2.8871, "step": 230000 }, { "epoch": 12.42, "learning_rate": 0.0001244549038065541, "loss": 2.8902, "step": 231000 }, { "epoch": 12.48, "learning_rate": 0.0001235723951285521, "loss": 2.8965, "step": 232000 }, { "epoch": 12.53, "learning_rate": 0.00012269076895922809, "loss": 2.8889, "step": 233000 }, { "epoch": 12.58, "learning_rate": 0.0001218082602812261, "loss": 2.9003, "step": 234000 }, { "epoch": 12.64, "learning_rate": 0.00012092575160322409, "loss": 2.896, "step": 235000 }, { "epoch": 12.69, "learning_rate": 0.00012004324292522208, "loss": 2.8995, "step": 236000 }, { "epoch": 12.74, "learning_rate": 0.00011916161675589808, "loss": 2.8987, "step": 237000 }, { "epoch": 12.8, "learning_rate": 0.00011827910807789609, "loss": 2.8998, "step": 238000 }, { "epoch": 12.85, "learning_rate": 0.0001173965993998941, "loss": 2.9073, "step": 239000 }, { "epoch": 12.91, "learning_rate": 0.0001165149732305701, "loss": 2.9013, "step": 240000 }, { "epoch": 12.96, "learning_rate": 0.00011563246455256809, "loss": 2.9047, "step": 241000 }, { "epoch": 13.0, "eval_accuracy": 0.4079228207113693, "eval_loss": 3.4015119075775146, "eval_runtime": 153.5554, "eval_samples_per_second": 377.18, "eval_steps_per_second": 5.894, "step": 241761 }, { "epoch": 13.01, "learning_rate": 0.00011475083838324409, "loss": 2.8895, "step": 242000 }, { "epoch": 13.07, "learning_rate": 0.00011386832970524209, "loss": 2.8458, "step": 243000 }, { "epoch": 13.12, "learning_rate": 0.0001129867035359181, "loss": 2.8516, "step": 244000 }, { "epoch": 13.17, "learning_rate": 0.0001121041948579161, "loss": 2.8585, "step": 245000 }, { "epoch": 13.23, "learning_rate": 0.0001112225686885921, "loss": 2.8576, "step": 246000 }, { "epoch": 13.28, "learning_rate": 0.00011034006001059009, "loss": 2.8645, "step": 247000 }, { "epoch": 13.34, "learning_rate": 0.00010945843384126609, "loss": 2.87, "step": 248000 }, { "epoch": 13.39, "learning_rate": 0.00010857592516326408, "loss": 2.8666, "step": 249000 }, { "epoch": 13.44, "learning_rate": 0.0001076934164852621, "loss": 2.872, "step": 250000 }, { "epoch": 13.5, "learning_rate": 0.0001068109078072601, "loss": 2.8725, "step": 251000 }, { "epoch": 13.55, "learning_rate": 0.0001059292816379361, "loss": 2.8739, "step": 252000 }, { "epoch": 13.6, "learning_rate": 0.00010504677295993409, "loss": 2.878, "step": 253000 }, { "epoch": 13.66, "learning_rate": 0.00010416514679061009, "loss": 2.8767, "step": 254000 }, { "epoch": 13.71, "learning_rate": 0.00010328263811260809, "loss": 2.877, "step": 255000 }, { "epoch": 13.77, "learning_rate": 0.00010240012943460611, "loss": 2.882, "step": 256000 }, { "epoch": 13.82, "learning_rate": 0.00010151850326528211, "loss": 2.8804, "step": 257000 }, { "epoch": 13.87, "learning_rate": 0.0001006368770959581, "loss": 2.8858, "step": 258000 }, { "epoch": 13.93, "learning_rate": 9.97543684179561e-05, "loss": 2.882, "step": 259000 }, { "epoch": 13.98, "learning_rate": 9.88718597399541e-05, "loss": 2.8797, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.40766477441674887, "eval_loss": 3.411367177963257, "eval_runtime": 153.8972, "eval_samples_per_second": 376.342, "eval_steps_per_second": 5.881, "step": 260358 }, { "epoch": 14.03, "learning_rate": 9.79902335706301e-05, "loss": 2.8484, "step": 261000 }, { "epoch": 14.09, "learning_rate": 9.71077248926281e-05, "loss": 2.8351, "step": 262000 }, { "epoch": 14.14, "learning_rate": 9.62252162146261e-05, "loss": 2.8377, "step": 263000 }, { "epoch": 14.2, "learning_rate": 9.534270753662411e-05, "loss": 2.8434, "step": 264000 }, { "epoch": 14.25, "learning_rate": 9.446108136730011e-05, "loss": 2.841, "step": 265000 }, { "epoch": 14.3, "learning_rate": 9.35785726892981e-05, "loss": 2.844, "step": 266000 }, { "epoch": 14.36, "learning_rate": 9.26960640112961e-05, "loss": 2.8465, "step": 267000 }, { "epoch": 14.41, "learning_rate": 9.18135553332941e-05, "loss": 2.8546, "step": 268000 }, { "epoch": 14.46, "learning_rate": 9.09319291639701e-05, "loss": 2.8524, "step": 269000 }, { "epoch": 14.52, "learning_rate": 9.00503029946461e-05, "loss": 2.8515, "step": 270000 }, { "epoch": 14.57, "learning_rate": 8.91677943166441e-05, "loss": 2.8541, "step": 271000 }, { "epoch": 14.63, "learning_rate": 8.82861681473201e-05, "loss": 2.8572, "step": 272000 }, { "epoch": 14.68, "learning_rate": 8.74036594693181e-05, "loss": 2.8568, "step": 273000 }, { "epoch": 14.73, "learning_rate": 8.652115079131611e-05, "loss": 2.8618, "step": 274000 }, { "epoch": 14.79, "learning_rate": 8.563952462199211e-05, "loss": 2.8609, "step": 275000 }, { "epoch": 14.84, "learning_rate": 8.475701594399011e-05, "loss": 2.8582, "step": 276000 }, { "epoch": 14.89, "learning_rate": 8.38745072659881e-05, "loss": 2.8602, "step": 277000 }, { "epoch": 14.95, "learning_rate": 8.29919985879861e-05, "loss": 2.8651, "step": 278000 }, { "epoch": 15.0, "eval_accuracy": 0.40826464303551124, "eval_loss": 3.4071929454803467, "eval_runtime": 153.9494, "eval_samples_per_second": 376.215, "eval_steps_per_second": 5.879, "step": 278955 }, { "epoch": 15.0, "learning_rate": 8.21103724186621e-05, "loss": 2.8635, "step": 279000 }, { "epoch": 15.06, "learning_rate": 8.122786374066012e-05, "loss": 2.8126, "step": 280000 }, { "epoch": 15.11, "learning_rate": 8.034535506265811e-05, "loss": 2.8181, "step": 281000 }, { "epoch": 15.16, "learning_rate": 7.946372889333411e-05, "loss": 2.82, "step": 282000 }, { "epoch": 15.22, "learning_rate": 7.85812202153321e-05, "loss": 2.826, "step": 283000 }, { "epoch": 15.27, "learning_rate": 7.76995940460081e-05, "loss": 2.8282, "step": 284000 }, { "epoch": 15.33, "learning_rate": 7.681796787668412e-05, "loss": 2.8297, "step": 285000 }, { "epoch": 15.38, "learning_rate": 7.593545919868211e-05, "loss": 2.8314, "step": 286000 }, { "epoch": 15.43, "learning_rate": 7.505383302935811e-05, "loss": 2.8329, "step": 287000 }, { "epoch": 15.49, "learning_rate": 7.417132435135611e-05, "loss": 2.8348, "step": 288000 }, { "epoch": 15.54, "learning_rate": 7.328881567335412e-05, "loss": 2.8374, "step": 289000 }, { "epoch": 15.59, "learning_rate": 7.240630699535211e-05, "loss": 2.8359, "step": 290000 }, { "epoch": 15.65, "learning_rate": 7.152468082602811e-05, "loss": 2.8397, "step": 291000 }, { "epoch": 15.7, "learning_rate": 7.064217214802612e-05, "loss": 2.8381, "step": 292000 }, { "epoch": 15.76, "learning_rate": 6.975966347002411e-05, "loss": 2.8392, "step": 293000 }, { "epoch": 15.81, "learning_rate": 6.887803730070011e-05, "loss": 2.8425, "step": 294000 }, { "epoch": 15.86, "learning_rate": 6.799552862269812e-05, "loss": 2.842, "step": 295000 }, { "epoch": 15.92, "learning_rate": 6.711301994469612e-05, "loss": 2.8431, "step": 296000 }, { "epoch": 15.97, "learning_rate": 6.623051126669412e-05, "loss": 2.8434, "step": 297000 }, { "epoch": 16.0, "eval_accuracy": 0.40750031273249193, "eval_loss": 3.424048900604248, "eval_runtime": 153.7747, "eval_samples_per_second": 376.642, "eval_steps_per_second": 5.885, "step": 297552 }, { "epoch": 16.02, "learning_rate": 6.534888509737011e-05, "loss": 2.8248, "step": 298000 }, { "epoch": 16.08, "learning_rate": 6.446637641936812e-05, "loss": 2.7992, "step": 299000 }, { "epoch": 16.13, "learning_rate": 6.358386774136611e-05, "loss": 2.808, "step": 300000 }, { "epoch": 16.19, "learning_rate": 6.270135906336412e-05, "loss": 2.807, "step": 301000 }, { "epoch": 16.24, "learning_rate": 6.181973289404012e-05, "loss": 2.8109, "step": 302000 }, { "epoch": 16.29, "learning_rate": 6.093722421603812e-05, "loss": 2.8098, "step": 303000 }, { "epoch": 16.35, "learning_rate": 6.005648055539212e-05, "loss": 2.811, "step": 304000 }, { "epoch": 16.4, "learning_rate": 5.917397187739013e-05, "loss": 2.8134, "step": 305000 }, { "epoch": 16.45, "learning_rate": 5.829146319938812e-05, "loss": 2.8165, "step": 306000 }, { "epoch": 16.51, "learning_rate": 5.740983703006412e-05, "loss": 2.8168, "step": 307000 }, { "epoch": 16.56, "learning_rate": 5.652732835206213e-05, "loss": 2.8195, "step": 308000 }, { "epoch": 16.62, "learning_rate": 5.5644819674060124e-05, "loss": 2.8171, "step": 309000 }, { "epoch": 16.67, "learning_rate": 5.4763193504736124e-05, "loss": 2.8199, "step": 310000 }, { "epoch": 16.72, "learning_rate": 5.3880684826734125e-05, "loss": 2.8202, "step": 311000 }, { "epoch": 16.78, "learning_rate": 5.2998176148732126e-05, "loss": 2.8227, "step": 312000 }, { "epoch": 16.83, "learning_rate": 5.211566747073012e-05, "loss": 2.826, "step": 313000 }, { "epoch": 16.88, "learning_rate": 5.123404130140613e-05, "loss": 2.8246, "step": 314000 }, { "epoch": 16.94, "learning_rate": 5.035153262340413e-05, "loss": 2.824, "step": 315000 }, { "epoch": 16.99, "learning_rate": 4.946902394540212e-05, "loss": 2.8255, "step": 316000 }, { "epoch": 17.0, "eval_accuracy": 0.40828197600713634, "eval_loss": 3.417881727218628, "eval_runtime": 153.4343, "eval_samples_per_second": 377.478, "eval_steps_per_second": 5.898, "step": 316149 }, { "epoch": 17.05, "learning_rate": 4.858739777607813e-05, "loss": 2.7937, "step": 317000 }, { "epoch": 17.1, "learning_rate": 4.770488909807612e-05, "loss": 2.7869, "step": 318000 }, { "epoch": 17.15, "learning_rate": 4.682326292875212e-05, "loss": 2.792, "step": 319000 }, { "epoch": 17.21, "learning_rate": 4.594163675942813e-05, "loss": 2.7937, "step": 320000 }, { "epoch": 17.26, "learning_rate": 4.505912808142613e-05, "loss": 2.7897, "step": 321000 }, { "epoch": 17.31, "learning_rate": 4.4176619403424126e-05, "loss": 2.7967, "step": 322000 }, { "epoch": 17.37, "learning_rate": 4.329411072542213e-05, "loss": 2.7942, "step": 323000 }, { "epoch": 17.42, "learning_rate": 4.241160204742013e-05, "loss": 2.8025, "step": 324000 }, { "epoch": 17.48, "learning_rate": 4.152909336941813e-05, "loss": 2.8014, "step": 325000 }, { "epoch": 17.53, "learning_rate": 4.0648349708772134e-05, "loss": 2.8022, "step": 326000 }, { "epoch": 17.58, "learning_rate": 3.976584103077013e-05, "loss": 2.8024, "step": 327000 }, { "epoch": 17.64, "learning_rate": 3.888333235276813e-05, "loss": 2.8028, "step": 328000 }, { "epoch": 17.69, "learning_rate": 3.800082367476614e-05, "loss": 2.8021, "step": 329000 }, { "epoch": 17.74, "learning_rate": 3.711831499676413e-05, "loss": 2.8048, "step": 330000 }, { "epoch": 17.8, "learning_rate": 3.623668882744013e-05, "loss": 2.8058, "step": 331000 }, { "epoch": 17.85, "learning_rate": 3.535506265811613e-05, "loss": 2.8078, "step": 332000 }, { "epoch": 17.91, "learning_rate": 3.447255398011413e-05, "loss": 2.8043, "step": 333000 }, { "epoch": 17.96, "learning_rate": 3.3590045302112134e-05, "loss": 2.8036, "step": 334000 }, { "epoch": 18.0, "eval_accuracy": 0.4081968563364037, "eval_loss": 3.4256491661071777, "eval_runtime": 153.5909, "eval_samples_per_second": 377.093, "eval_steps_per_second": 5.892, "step": 334746 }, { "epoch": 18.01, "learning_rate": 3.2708419132788134e-05, "loss": 2.7978, "step": 335000 }, { "epoch": 18.07, "learning_rate": 3.1825910454786135e-05, "loss": 2.7744, "step": 336000 }, { "epoch": 18.12, "learning_rate": 3.0943401776784136e-05, "loss": 2.7766, "step": 337000 }, { "epoch": 18.17, "learning_rate": 3.0060893098782134e-05, "loss": 2.7851, "step": 338000 }, { "epoch": 18.23, "learning_rate": 2.9179266929458137e-05, "loss": 2.7799, "step": 339000 }, { "epoch": 18.28, "learning_rate": 2.829675825145614e-05, "loss": 2.7785, "step": 340000 }, { "epoch": 18.34, "learning_rate": 2.7415132082132135e-05, "loss": 2.7824, "step": 341000 }, { "epoch": 18.39, "learning_rate": 2.6532623404130136e-05, "loss": 2.7789, "step": 342000 }, { "epoch": 18.44, "learning_rate": 2.565011472612814e-05, "loss": 2.7847, "step": 343000 }, { "epoch": 18.5, "learning_rate": 2.4767606048126135e-05, "loss": 2.7849, "step": 344000 }, { "epoch": 18.55, "learning_rate": 2.3885979878802138e-05, "loss": 2.7844, "step": 345000 }, { "epoch": 18.61, "learning_rate": 2.300435370947814e-05, "loss": 2.7845, "step": 346000 }, { "epoch": 18.66, "learning_rate": 2.212184503147614e-05, "loss": 2.7858, "step": 347000 }, { "epoch": 18.71, "learning_rate": 2.123933635347414e-05, "loss": 2.788, "step": 348000 }, { "epoch": 18.77, "learning_rate": 2.035682767547214e-05, "loss": 2.7841, "step": 349000 }, { "epoch": 18.82, "learning_rate": 1.947431899747014e-05, "loss": 2.7931, "step": 350000 }, { "epoch": 18.87, "learning_rate": 1.859357533682414e-05, "loss": 2.7847, "step": 351000 }, { "epoch": 18.93, "learning_rate": 1.7711066658822143e-05, "loss": 2.7911, "step": 352000 }, { "epoch": 18.98, "learning_rate": 1.6828557980820144e-05, "loss": 2.7888, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.4083161044939021, "eval_loss": 3.4363324642181396, "eval_runtime": 153.9158, "eval_samples_per_second": 376.297, "eval_steps_per_second": 5.88, "step": 353343 }, { "epoch": 19.04, "learning_rate": 1.5946931811496144e-05, "loss": 2.7724, "step": 354000 }, { "epoch": 19.09, "learning_rate": 1.5064423133494145e-05, "loss": 2.7653, "step": 355000 }, { "epoch": 19.14, "learning_rate": 1.4181914455492144e-05, "loss": 2.7712, "step": 356000 }, { "epoch": 19.2, "learning_rate": 1.3299405777490143e-05, "loss": 2.7696, "step": 357000 }, { "epoch": 19.25, "learning_rate": 1.2416897099488144e-05, "loss": 2.7658, "step": 358000 }, { "epoch": 19.3, "learning_rate": 1.1535270930164146e-05, "loss": 2.7637, "step": 359000 }, { "epoch": 19.36, "learning_rate": 1.0653644760840146e-05, "loss": 2.769, "step": 360000 }, { "epoch": 19.41, "learning_rate": 9.771136082838147e-06, "loss": 2.7734, "step": 361000 }, { "epoch": 19.47, "learning_rate": 8.888627404836147e-06, "loss": 2.7685, "step": 362000 }, { "epoch": 19.52, "learning_rate": 8.006118726834148e-06, "loss": 2.7733, "step": 363000 }, { "epoch": 19.57, "learning_rate": 7.125375066188151e-06, "loss": 2.7755, "step": 364000 }, { "epoch": 19.63, "learning_rate": 6.2428663881861495e-06, "loss": 2.7693, "step": 365000 }, { "epoch": 19.68, "learning_rate": 5.36035771018415e-06, "loss": 2.7692, "step": 366000 }, { "epoch": 19.73, "learning_rate": 4.47784903218215e-06, "loss": 2.7709, "step": 367000 }, { "epoch": 19.79, "learning_rate": 3.5953403541801493e-06, "loss": 2.7716, "step": 368000 }, { "epoch": 19.84, "learning_rate": 2.7128316761781486e-06, "loss": 2.7701, "step": 369000 }, { "epoch": 19.9, "learning_rate": 1.8312055068541506e-06, "loss": 2.7714, "step": 370000 }, { "epoch": 19.95, "learning_rate": 9.486968288521502e-07, "loss": 2.7701, "step": 371000 }, { "epoch": 20.0, "eval_accuracy": 0.40813531756892846, "eval_loss": 3.4419291019439697, "eval_runtime": 153.8158, "eval_samples_per_second": 376.541, "eval_steps_per_second": 5.884, "step": 371940 }, { "epoch": 20.0, "step": 371940, "total_flos": 1.56702845389824e+18, "train_loss": 3.060115090571354, "train_runtime": 80942.7407, "train_samples_per_second": 147.036, "train_steps_per_second": 4.595 } ], "logging_steps": 1000, "max_steps": 371940, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.56702845389824e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }