{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 371760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 3.125e-05, "loss": 6.2183, "step": 1000 }, { "epoch": 0.11, "learning_rate": 6.25e-05, "loss": 5.0127, "step": 2000 }, { "epoch": 0.16, "learning_rate": 9.375e-05, "loss": 4.6841, "step": 3000 }, { "epoch": 0.22, "learning_rate": 0.000125, "loss": 4.452, "step": 4000 }, { "epoch": 0.27, "learning_rate": 0.00015625, "loss": 4.2915, "step": 5000 }, { "epoch": 0.32, "learning_rate": 0.0001875, "loss": 4.181, "step": 6000 }, { "epoch": 0.38, "learning_rate": 0.00021875, "loss": 4.0744, "step": 7000 }, { "epoch": 0.43, "learning_rate": 0.00025, "loss": 3.9815, "step": 8000 }, { "epoch": 0.48, "learning_rate": 0.00028121875, "loss": 3.9114, "step": 9000 }, { "epoch": 0.54, "learning_rate": 0.0003124375, "loss": 3.8546, "step": 10000 }, { "epoch": 0.59, "learning_rate": 0.00034368749999999997, "loss": 3.8053, "step": 11000 }, { "epoch": 0.65, "learning_rate": 0.00037490625, "loss": 3.7693, "step": 12000 }, { "epoch": 0.7, "learning_rate": 0.00040615625, "loss": 3.7387, "step": 13000 }, { "epoch": 0.75, "learning_rate": 0.00043740625, "loss": 3.6952, "step": 14000 }, { "epoch": 0.81, "learning_rate": 0.000468625, "loss": 3.6708, "step": 15000 }, { "epoch": 0.86, "learning_rate": 0.000499875, "loss": 3.6445, "step": 16000 }, { "epoch": 0.91, "learning_rate": 0.000531125, "loss": 3.6248, "step": 17000 }, { "epoch": 0.97, "learning_rate": 0.00056234375, "loss": 3.6071, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.3590285050576805, "eval_loss": 3.7804887294769287, "eval_runtime": 151.8602, "eval_samples_per_second": 381.397, "eval_steps_per_second": 5.959, "step": 18588 }, { "epoch": 1.02, "learning_rate": 0.00059359375, "loss": 3.5742, "step": 19000 }, { "epoch": 1.08, "learning_rate": 0.0006248125, "loss": 3.5528, "step": 20000 }, { "epoch": 1.13, "learning_rate": 0.0006560625, "loss": 3.5368, "step": 21000 }, { "epoch": 1.18, "learning_rate": 0.0006873125, "loss": 3.5273, "step": 22000 }, { "epoch": 1.24, "learning_rate": 0.0007185000000000001, "loss": 3.5178, "step": 23000 }, { "epoch": 1.29, "learning_rate": 0.0007497500000000001, "loss": 3.5118, "step": 24000 }, { "epoch": 1.34, "learning_rate": 0.000781, "loss": 3.5063, "step": 25000 }, { "epoch": 1.4, "learning_rate": 0.00081225, "loss": 3.4865, "step": 26000 }, { "epoch": 1.45, "learning_rate": 0.00084346875, "loss": 3.4768, "step": 27000 }, { "epoch": 1.51, "learning_rate": 0.00087471875, "loss": 3.4753, "step": 28000 }, { "epoch": 1.56, "learning_rate": 0.00090596875, "loss": 3.46, "step": 29000 }, { "epoch": 1.61, "learning_rate": 0.00093721875, "loss": 3.4586, "step": 30000 }, { "epoch": 1.67, "learning_rate": 0.0009684375, "loss": 3.4509, "step": 31000 }, { "epoch": 1.72, "learning_rate": 0.0009996875, "loss": 3.4365, "step": 32000 }, { "epoch": 1.78, "learning_rate": 0.000997089121732988, "loss": 3.431, "step": 33000 }, { "epoch": 1.83, "learning_rate": 0.0009941458676712975, "loss": 3.4214, "step": 34000 }, { "epoch": 1.88, "learning_rate": 0.0009912026136096068, "loss": 3.408, "step": 35000 }, { "epoch": 1.94, "learning_rate": 0.0009882652460560396, "loss": 3.3984, "step": 36000 }, { "epoch": 1.99, "learning_rate": 0.000985321991994349, "loss": 3.3943, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.3805695233978648, "eval_loss": 3.579639434814453, "eval_runtime": 153.8621, "eval_samples_per_second": 376.434, "eval_steps_per_second": 5.882, "step": 37176 }, { "epoch": 2.04, "learning_rate": 0.0009823816811867201, "loss": 3.3388, "step": 38000 }, { "epoch": 2.1, "learning_rate": 0.0009794384271250296, "loss": 3.3306, "step": 39000 }, { "epoch": 2.15, "learning_rate": 0.0009764951730633388, "loss": 3.3275, "step": 40000 }, { "epoch": 2.21, "learning_rate": 0.0009735519190016482, "loss": 3.3276, "step": 41000 }, { "epoch": 2.26, "learning_rate": 0.0009706086649399576, "loss": 3.3153, "step": 42000 }, { "epoch": 2.31, "learning_rate": 0.0009676712973863904, "loss": 3.3111, "step": 43000 }, { "epoch": 2.37, "learning_rate": 0.0009647280433246997, "loss": 3.3085, "step": 44000 }, { "epoch": 2.42, "learning_rate": 0.0009617847892630091, "loss": 3.3064, "step": 45000 }, { "epoch": 2.47, "learning_rate": 0.0009588444784553803, "loss": 3.3053, "step": 46000 }, { "epoch": 2.53, "learning_rate": 0.0009559012243936896, "loss": 3.2948, "step": 47000 }, { "epoch": 2.58, "learning_rate": 0.000952957970331999, "loss": 3.2911, "step": 48000 }, { "epoch": 2.64, "learning_rate": 0.0009500176595243702, "loss": 3.2898, "step": 49000 }, { "epoch": 2.69, "learning_rate": 0.0009470744054626795, "loss": 3.2839, "step": 50000 }, { "epoch": 2.74, "learning_rate": 0.0009441311514009889, "loss": 3.2787, "step": 51000 }, { "epoch": 2.8, "learning_rate": 0.0009411908405933601, "loss": 3.2716, "step": 52000 }, { "epoch": 2.85, "learning_rate": 0.0009382475865316694, "loss": 3.2632, "step": 53000 }, { "epoch": 2.91, "learning_rate": 0.0009353043324699788, "loss": 3.2715, "step": 54000 }, { "epoch": 2.96, "learning_rate": 0.00093236402166235, "loss": 3.2625, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.3915330432954704, "eval_loss": 3.4678006172180176, "eval_runtime": 153.7826, "eval_samples_per_second": 376.629, "eval_steps_per_second": 5.885, "step": 55764 }, { "epoch": 3.01, "learning_rate": 0.0009294207676006593, "loss": 3.2388, "step": 56000 }, { "epoch": 3.07, "learning_rate": 0.0009264775135389687, "loss": 3.1981, "step": 57000 }, { "epoch": 3.12, "learning_rate": 0.000923534259477278, "loss": 3.1952, "step": 58000 }, { "epoch": 3.17, "learning_rate": 0.0009205939486696492, "loss": 3.1991, "step": 59000 }, { "epoch": 3.23, "learning_rate": 0.0009176506946079586, "loss": 3.2015, "step": 60000 }, { "epoch": 3.28, "learning_rate": 0.0009147074405462679, "loss": 3.1977, "step": 61000 }, { "epoch": 3.34, "learning_rate": 0.0009117641864845774, "loss": 3.1985, "step": 62000 }, { "epoch": 3.39, "learning_rate": 0.0009088238756769485, "loss": 3.195, "step": 63000 }, { "epoch": 3.44, "learning_rate": 0.0009058835648693195, "loss": 3.1988, "step": 64000 }, { "epoch": 3.5, "learning_rate": 0.000902940310807629, "loss": 3.2016, "step": 65000 }, { "epoch": 3.55, "learning_rate": 0.0008999970567459384, "loss": 3.1942, "step": 66000 }, { "epoch": 3.6, "learning_rate": 0.0008970538026842477, "loss": 3.1928, "step": 67000 }, { "epoch": 3.66, "learning_rate": 0.0008941134918766188, "loss": 3.1918, "step": 68000 }, { "epoch": 3.71, "learning_rate": 0.0008911702378149283, "loss": 3.1904, "step": 69000 }, { "epoch": 3.77, "learning_rate": 0.0008882269837532376, "loss": 3.1905, "step": 70000 }, { "epoch": 3.82, "learning_rate": 0.0008852866729456087, "loss": 3.191, "step": 71000 }, { "epoch": 3.87, "learning_rate": 0.0008823434188839181, "loss": 3.1849, "step": 72000 }, { "epoch": 3.93, "learning_rate": 0.0008794031080762892, "loss": 3.1891, "step": 73000 }, { "epoch": 3.98, "learning_rate": 0.0008764598540145986, "loss": 3.1838, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.3997996531181377, "eval_loss": 3.3962149620056152, "eval_runtime": 153.4429, "eval_samples_per_second": 377.463, "eval_steps_per_second": 5.898, "step": 74352 }, { "epoch": 4.03, "learning_rate": 0.0008735195432069696, "loss": 3.1335, "step": 75000 }, { "epoch": 4.09, "learning_rate": 0.0008705762891452791, "loss": 3.1196, "step": 76000 }, { "epoch": 4.14, "learning_rate": 0.0008676359783376502, "loss": 3.1214, "step": 77000 }, { "epoch": 4.2, "learning_rate": 0.0008646927242759594, "loss": 3.1243, "step": 78000 }, { "epoch": 4.25, "learning_rate": 0.0008617494702142689, "loss": 3.1284, "step": 79000 }, { "epoch": 4.3, "learning_rate": 0.0008588062161525783, "loss": 3.1291, "step": 80000 }, { "epoch": 4.36, "learning_rate": 0.0008558629620908876, "loss": 3.1335, "step": 81000 }, { "epoch": 4.41, "learning_rate": 0.0008529226512832588, "loss": 3.1305, "step": 82000 }, { "epoch": 4.47, "learning_rate": 0.0008499793972215682, "loss": 3.1299, "step": 83000 }, { "epoch": 4.52, "learning_rate": 0.0008470390864139392, "loss": 3.1297, "step": 84000 }, { "epoch": 4.57, "learning_rate": 0.0008440958323522487, "loss": 3.1277, "step": 85000 }, { "epoch": 4.63, "learning_rate": 0.0008411555215446197, "loss": 3.1316, "step": 86000 }, { "epoch": 4.68, "learning_rate": 0.0008382122674829291, "loss": 3.126, "step": 87000 }, { "epoch": 4.73, "learning_rate": 0.0008352690134212385, "loss": 3.1336, "step": 88000 }, { "epoch": 4.79, "learning_rate": 0.0008323257593595479, "loss": 3.1283, "step": 89000 }, { "epoch": 4.84, "learning_rate": 0.0008293825052978573, "loss": 3.1309, "step": 90000 }, { "epoch": 4.9, "learning_rate": 0.0008264421944902284, "loss": 3.1292, "step": 91000 }, { "epoch": 4.95, "learning_rate": 0.0008234989404285378, "loss": 3.1277, "step": 92000 }, { "epoch": 5.0, "eval_accuracy": 0.4017150477760334, "eval_loss": 3.3849174976348877, "eval_runtime": 153.5044, "eval_samples_per_second": 377.312, "eval_steps_per_second": 5.896, "step": 92940 }, { "epoch": 5.0, "learning_rate": 0.0008205556863668472, "loss": 3.1209, "step": 93000 }, { "epoch": 5.06, "learning_rate": 0.0008176153755592183, "loss": 3.0617, "step": 94000 }, { "epoch": 5.11, "learning_rate": 0.0008146721214975277, "loss": 3.0649, "step": 95000 }, { "epoch": 5.16, "learning_rate": 0.0008117288674358371, "loss": 3.0697, "step": 96000 }, { "epoch": 5.22, "learning_rate": 0.0008087885566282081, "loss": 3.0744, "step": 97000 }, { "epoch": 5.27, "learning_rate": 0.0008058482458205792, "loss": 3.0762, "step": 98000 }, { "epoch": 5.33, "learning_rate": 0.0008029049917588887, "loss": 3.0775, "step": 99000 }, { "epoch": 5.38, "learning_rate": 0.000799961737697198, "loss": 3.074, "step": 100000 }, { "epoch": 5.43, "learning_rate": 0.0007970214268895691, "loss": 3.0827, "step": 101000 }, { "epoch": 5.49, "learning_rate": 0.0007940781728278786, "loss": 3.0798, "step": 102000 }, { "epoch": 5.54, "learning_rate": 0.0007911349187661879, "loss": 3.0802, "step": 103000 }, { "epoch": 5.6, "learning_rate": 0.0007881916647044973, "loss": 3.0819, "step": 104000 }, { "epoch": 5.65, "learning_rate": 0.0007852484106428068, "loss": 3.0867, "step": 105000 }, { "epoch": 5.7, "learning_rate": 0.0007823051565811161, "loss": 3.081, "step": 106000 }, { "epoch": 5.76, "learning_rate": 0.0007793648457734872, "loss": 3.0817, "step": 107000 }, { "epoch": 5.81, "learning_rate": 0.00077642747821992, "loss": 3.0864, "step": 108000 }, { "epoch": 5.86, "learning_rate": 0.0007734842241582294, "loss": 3.0817, "step": 109000 }, { "epoch": 5.92, "learning_rate": 0.0007705409700965388, "loss": 3.0866, "step": 110000 }, { "epoch": 5.97, "learning_rate": 0.0007675977160348481, "loss": 3.0813, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.40404998715837087, "eval_loss": 3.387449026107788, "eval_runtime": 153.7852, "eval_samples_per_second": 376.623, "eval_steps_per_second": 5.885, "step": 111528 }, { "epoch": 6.03, "learning_rate": 0.0007646544619731576, "loss": 3.0506, "step": 112000 }, { "epoch": 6.08, "learning_rate": 0.000761711207911467, "loss": 3.0138, "step": 113000 }, { "epoch": 6.13, "learning_rate": 0.000758770897103838, "loss": 3.0205, "step": 114000 }, { "epoch": 6.19, "learning_rate": 0.0007558276430421475, "loss": 3.0302, "step": 115000 }, { "epoch": 6.24, "learning_rate": 0.0007528873322345186, "loss": 3.0285, "step": 116000 }, { "epoch": 6.29, "learning_rate": 0.0007499440781728279, "loss": 3.0317, "step": 117000 }, { "epoch": 6.35, "learning_rate": 0.0007470008241111372, "loss": 3.0377, "step": 118000 }, { "epoch": 6.4, "learning_rate": 0.0007440575700494467, "loss": 3.0375, "step": 119000 }, { "epoch": 6.46, "learning_rate": 0.0007411172592418177, "loss": 3.0414, "step": 120000 }, { "epoch": 6.51, "learning_rate": 0.0007381740051801271, "loss": 3.0435, "step": 121000 }, { "epoch": 6.56, "learning_rate": 0.0007352307511184366, "loss": 3.0441, "step": 122000 }, { "epoch": 6.62, "learning_rate": 0.0007322874970567459, "loss": 3.0481, "step": 123000 }, { "epoch": 6.67, "learning_rate": 0.000729347186249117, "loss": 3.0422, "step": 124000 }, { "epoch": 6.72, "learning_rate": 0.0007264039321874263, "loss": 3.0457, "step": 125000 }, { "epoch": 6.78, "learning_rate": 0.0007234606781257358, "loss": 3.0457, "step": 126000 }, { "epoch": 6.83, "learning_rate": 0.0007205203673181069, "loss": 3.0501, "step": 127000 }, { "epoch": 6.89, "learning_rate": 0.0007175771132564162, "loss": 3.052, "step": 128000 }, { "epoch": 6.94, "learning_rate": 0.0007146368024487874, "loss": 3.0474, "step": 129000 }, { "epoch": 6.99, "learning_rate": 0.0007116935483870968, "loss": 3.0519, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.40786908699745245, "eval_loss": 3.3393681049346924, "eval_runtime": 153.4305, "eval_samples_per_second": 377.493, "eval_steps_per_second": 5.898, "step": 130116 }, { "epoch": 7.05, "learning_rate": 0.0007087532375794678, "loss": 2.9879, "step": 131000 }, { "epoch": 7.1, "learning_rate": 0.000705812926771839, "loss": 2.984, "step": 132000 }, { "epoch": 7.16, "learning_rate": 0.0007028696727101484, "loss": 2.9883, "step": 133000 }, { "epoch": 7.21, "learning_rate": 0.0006999264186484577, "loss": 2.9895, "step": 134000 }, { "epoch": 7.26, "learning_rate": 0.0006969831645867672, "loss": 3.0, "step": 135000 }, { "epoch": 7.32, "learning_rate": 0.0006940399105250766, "loss": 3.0035, "step": 136000 }, { "epoch": 7.37, "learning_rate": 0.0006910966564633859, "loss": 3.0073, "step": 137000 }, { "epoch": 7.42, "learning_rate": 0.000688156345655757, "loss": 3.0078, "step": 138000 }, { "epoch": 7.48, "learning_rate": 0.0006852160348481281, "loss": 3.0042, "step": 139000 }, { "epoch": 7.53, "learning_rate": 0.0006822757240404992, "loss": 3.0086, "step": 140000 }, { "epoch": 7.59, "learning_rate": 0.0006793324699788086, "loss": 3.0068, "step": 141000 }, { "epoch": 7.64, "learning_rate": 0.000676389215917118, "loss": 3.0091, "step": 142000 }, { "epoch": 7.69, "learning_rate": 0.0006734459618554274, "loss": 3.0145, "step": 143000 }, { "epoch": 7.75, "learning_rate": 0.0006705027077937368, "loss": 3.015, "step": 144000 }, { "epoch": 7.8, "learning_rate": 0.0006675623969861078, "loss": 3.0151, "step": 145000 }, { "epoch": 7.85, "learning_rate": 0.0006646191429244173, "loss": 3.0152, "step": 146000 }, { "epoch": 7.91, "learning_rate": 0.0006616788321167884, "loss": 3.0181, "step": 147000 }, { "epoch": 7.96, "learning_rate": 0.0006587355780550977, "loss": 3.0181, "step": 148000 }, { "epoch": 8.0, "eval_accuracy": 0.40853820876773905, "eval_loss": 3.344135284423828, "eval_runtime": 153.5235, "eval_samples_per_second": 377.265, "eval_steps_per_second": 5.895, "step": 148704 }, { "epoch": 8.02, "learning_rate": 0.0006557952672474689, "loss": 2.9971, "step": 149000 }, { "epoch": 8.07, "learning_rate": 0.0006528520131857782, "loss": 2.9484, "step": 150000 }, { "epoch": 8.12, "learning_rate": 0.0006499117023781493, "loss": 2.9536, "step": 151000 }, { "epoch": 8.18, "learning_rate": 0.0006469684483164588, "loss": 2.9635, "step": 152000 }, { "epoch": 8.23, "learning_rate": 0.0006440251942547681, "loss": 2.967, "step": 153000 }, { "epoch": 8.28, "learning_rate": 0.0006410819401930775, "loss": 2.9704, "step": 154000 }, { "epoch": 8.34, "learning_rate": 0.000638138686131387, "loss": 2.9718, "step": 155000 }, { "epoch": 8.39, "learning_rate": 0.0006351954320696963, "loss": 2.9766, "step": 156000 }, { "epoch": 8.45, "learning_rate": 0.0006322551212620673, "loss": 2.9788, "step": 157000 }, { "epoch": 8.5, "learning_rate": 0.0006293118672003767, "loss": 2.9812, "step": 158000 }, { "epoch": 8.55, "learning_rate": 0.0006263715563927478, "loss": 2.9796, "step": 159000 }, { "epoch": 8.61, "learning_rate": 0.0006234283023310572, "loss": 2.9815, "step": 160000 }, { "epoch": 8.66, "learning_rate": 0.0006204879915234283, "loss": 2.9841, "step": 161000 }, { "epoch": 8.72, "learning_rate": 0.0006175447374617377, "loss": 2.9853, "step": 162000 }, { "epoch": 8.77, "learning_rate": 0.0006146044266541088, "loss": 2.9876, "step": 163000 }, { "epoch": 8.82, "learning_rate": 0.0006116611725924181, "loss": 2.9846, "step": 164000 }, { "epoch": 8.88, "learning_rate": 0.0006087179185307275, "loss": 2.9885, "step": 165000 }, { "epoch": 8.93, "learning_rate": 0.000605774664469037, "loss": 2.985, "step": 166000 }, { "epoch": 8.98, "learning_rate": 0.000602834353661408, "loss": 2.9888, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.40875567334308216, "eval_loss": 3.3545050621032715, "eval_runtime": 153.9524, "eval_samples_per_second": 376.214, "eval_steps_per_second": 5.878, "step": 167292 }, { "epoch": 9.04, "learning_rate": 0.0005998940428537791, "loss": 2.9412, "step": 168000 }, { "epoch": 9.09, "learning_rate": 0.0005969507887920886, "loss": 2.9225, "step": 169000 }, { "epoch": 9.15, "learning_rate": 0.0005940104779844596, "loss": 2.9315, "step": 170000 }, { "epoch": 9.2, "learning_rate": 0.000591067223922769, "loss": 2.9394, "step": 171000 }, { "epoch": 9.25, "learning_rate": 0.0005881239698610785, "loss": 2.9384, "step": 172000 }, { "epoch": 9.31, "learning_rate": 0.0005851807157993878, "loss": 2.9457, "step": 173000 }, { "epoch": 9.36, "learning_rate": 0.0005822404049917589, "loss": 2.9486, "step": 174000 }, { "epoch": 9.41, "learning_rate": 0.0005792971509300682, "loss": 2.9479, "step": 175000 }, { "epoch": 9.47, "learning_rate": 0.0005763538968683777, "loss": 2.9523, "step": 176000 }, { "epoch": 9.52, "learning_rate": 0.0005734106428066871, "loss": 2.9536, "step": 177000 }, { "epoch": 9.58, "learning_rate": 0.0005704703319990581, "loss": 2.9541, "step": 178000 }, { "epoch": 9.63, "learning_rate": 0.0005675300211914293, "loss": 2.9613, "step": 179000 }, { "epoch": 9.68, "learning_rate": 0.0005645867671297387, "loss": 2.9567, "step": 180000 }, { "epoch": 9.74, "learning_rate": 0.0005616464563221097, "loss": 2.9606, "step": 181000 }, { "epoch": 9.79, "learning_rate": 0.0005587032022604192, "loss": 2.9578, "step": 182000 }, { "epoch": 9.85, "learning_rate": 0.000555765834706852, "loss": 2.9623, "step": 183000 }, { "epoch": 9.9, "learning_rate": 0.0005528225806451613, "loss": 2.959, "step": 184000 }, { "epoch": 9.95, "learning_rate": 0.0005498793265834707, "loss": 2.9602, "step": 185000 }, { "epoch": 10.0, "eval_accuracy": 0.4088348124440257, "eval_loss": 3.3501293659210205, "eval_runtime": 153.6667, "eval_samples_per_second": 376.913, "eval_steps_per_second": 5.889, "step": 185880 }, { "epoch": 10.01, "learning_rate": 0.0005469360725217802, "loss": 2.9596, "step": 186000 }, { "epoch": 10.06, "learning_rate": 0.0005439928184600895, "loss": 2.9011, "step": 187000 }, { "epoch": 10.11, "learning_rate": 0.0005410495643983989, "loss": 2.9033, "step": 188000 }, { "epoch": 10.17, "learning_rate": 0.00053810925359077, "loss": 2.9104, "step": 189000 }, { "epoch": 10.22, "learning_rate": 0.0005351689427831411, "loss": 2.9175, "step": 190000 }, { "epoch": 10.28, "learning_rate": 0.0005322256887214505, "loss": 2.9182, "step": 191000 }, { "epoch": 10.33, "learning_rate": 0.0005292824346597599, "loss": 2.9185, "step": 192000 }, { "epoch": 10.38, "learning_rate": 0.0005263421238521309, "loss": 2.921, "step": 193000 }, { "epoch": 10.44, "learning_rate": 0.0005233988697904403, "loss": 2.9291, "step": 194000 }, { "epoch": 10.49, "learning_rate": 0.0005204585589828113, "loss": 2.9294, "step": 195000 }, { "epoch": 10.54, "learning_rate": 0.0005175153049211208, "loss": 2.9324, "step": 196000 }, { "epoch": 10.6, "learning_rate": 0.0005145749941134919, "loss": 2.9271, "step": 197000 }, { "epoch": 10.65, "learning_rate": 0.0005116317400518012, "loss": 2.9325, "step": 198000 }, { "epoch": 10.71, "learning_rate": 0.0005086884859901107, "loss": 2.9341, "step": 199000 }, { "epoch": 10.76, "learning_rate": 0.0005057481751824817, "loss": 2.9395, "step": 200000 }, { "epoch": 10.81, "learning_rate": 0.0005028049211207911, "loss": 2.9351, "step": 201000 }, { "epoch": 10.87, "learning_rate": 0.0004998616670591005, "loss": 2.9344, "step": 202000 }, { "epoch": 10.92, "learning_rate": 0.0004969213562514716, "loss": 2.9385, "step": 203000 }, { "epoch": 10.97, "learning_rate": 0.000493978102189781, "loss": 2.942, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.40947598695964976, "eval_loss": 3.35093092918396, "eval_runtime": 153.6805, "eval_samples_per_second": 376.879, "eval_steps_per_second": 5.889, "step": 204468 }, { "epoch": 11.03, "learning_rate": 0.0004910377913821521, "loss": 2.9079, "step": 205000 }, { "epoch": 11.08, "learning_rate": 0.0004880974805745232, "loss": 2.881, "step": 206000 }, { "epoch": 11.14, "learning_rate": 0.0004851542265128326, "loss": 2.8806, "step": 207000 }, { "epoch": 11.19, "learning_rate": 0.00048221391570520367, "loss": 2.8891, "step": 208000 }, { "epoch": 11.24, "learning_rate": 0.00047927066164351305, "loss": 2.8992, "step": 209000 }, { "epoch": 11.3, "learning_rate": 0.0004763303508358842, "loss": 2.8893, "step": 210000 }, { "epoch": 11.35, "learning_rate": 0.00047338709677419356, "loss": 2.9037, "step": 211000 }, { "epoch": 11.41, "learning_rate": 0.00047044384271250294, "loss": 2.9027, "step": 212000 }, { "epoch": 11.46, "learning_rate": 0.00046750058865081237, "loss": 2.9063, "step": 213000 }, { "epoch": 11.51, "learning_rate": 0.00046455733458912175, "loss": 2.905, "step": 214000 }, { "epoch": 11.57, "learning_rate": 0.00046161408052743113, "loss": 2.9073, "step": 215000 }, { "epoch": 11.62, "learning_rate": 0.0004586737697198022, "loss": 2.9115, "step": 216000 }, { "epoch": 11.67, "learning_rate": 0.00045573345891217334, "loss": 2.9121, "step": 217000 }, { "epoch": 11.73, "learning_rate": 0.0004527902048504827, "loss": 2.9119, "step": 218000 }, { "epoch": 11.78, "learning_rate": 0.0004498498940428538, "loss": 2.9123, "step": 219000 }, { "epoch": 11.84, "learning_rate": 0.00044690663998116323, "loss": 2.9137, "step": 220000 }, { "epoch": 11.89, "learning_rate": 0.00044396632917353425, "loss": 2.9182, "step": 221000 }, { "epoch": 11.94, "learning_rate": 0.00044102307511184363, "loss": 2.9199, "step": 222000 }, { "epoch": 12.0, "learning_rate": 0.000438079821050153, "loss": 2.9174, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.4092696072329107, "eval_loss": 3.370931625366211, "eval_runtime": 153.6062, "eval_samples_per_second": 377.061, "eval_steps_per_second": 5.892, "step": 223056 }, { "epoch": 12.05, "learning_rate": 0.00043513951024252414, "loss": 2.8618, "step": 224000 }, { "epoch": 12.1, "learning_rate": 0.0004321962561808335, "loss": 2.8657, "step": 225000 }, { "epoch": 12.16, "learning_rate": 0.0004292559453732046, "loss": 2.8649, "step": 226000 }, { "epoch": 12.21, "learning_rate": 0.00042631269131151404, "loss": 2.8739, "step": 227000 }, { "epoch": 12.27, "learning_rate": 0.0004233694372498234, "loss": 2.8766, "step": 228000 }, { "epoch": 12.32, "learning_rate": 0.0004204261831881328, "loss": 2.8788, "step": 229000 }, { "epoch": 12.37, "learning_rate": 0.00041748881563456557, "loss": 2.8791, "step": 230000 }, { "epoch": 12.43, "learning_rate": 0.000414545561572875, "loss": 2.8812, "step": 231000 }, { "epoch": 12.48, "learning_rate": 0.0004116023075111844, "loss": 2.8795, "step": 232000 }, { "epoch": 12.53, "learning_rate": 0.00040866493995761716, "loss": 2.8892, "step": 233000 }, { "epoch": 12.59, "learning_rate": 0.00040572168589592654, "loss": 2.8891, "step": 234000 }, { "epoch": 12.64, "learning_rate": 0.00040277843183423597, "loss": 2.8872, "step": 235000 }, { "epoch": 12.7, "learning_rate": 0.00039983517777254535, "loss": 2.8884, "step": 236000 }, { "epoch": 12.75, "learning_rate": 0.00039689192371085473, "loss": 2.8897, "step": 237000 }, { "epoch": 12.8, "learning_rate": 0.00039395161290322586, "loss": 2.8915, "step": 238000 }, { "epoch": 12.86, "learning_rate": 0.0003910083588415352, "loss": 2.8951, "step": 239000 }, { "epoch": 12.91, "learning_rate": 0.00038806804803390626, "loss": 2.8917, "step": 240000 }, { "epoch": 12.97, "learning_rate": 0.00038512479397221565, "loss": 2.8989, "step": 241000 }, { "epoch": 13.0, "eval_accuracy": 0.4106622001220946, "eval_loss": 3.3607850074768066, "eval_runtime": 153.7023, "eval_samples_per_second": 376.826, "eval_steps_per_second": 5.888, "step": 241644 }, { "epoch": 13.02, "learning_rate": 0.0003821844831645868, "loss": 2.8762, "step": 242000 }, { "epoch": 13.07, "learning_rate": 0.00037924122910289616, "loss": 2.8394, "step": 243000 }, { "epoch": 13.13, "learning_rate": 0.00037629797504120554, "loss": 2.8468, "step": 244000 }, { "epoch": 13.18, "learning_rate": 0.00037335766423357667, "loss": 2.8509, "step": 245000 }, { "epoch": 13.23, "learning_rate": 0.00037041441017188605, "loss": 2.8549, "step": 246000 }, { "epoch": 13.29, "learning_rate": 0.00036747115611019543, "loss": 2.8499, "step": 247000 }, { "epoch": 13.34, "learning_rate": 0.0003645337885566282, "loss": 2.8571, "step": 248000 }, { "epoch": 13.4, "learning_rate": 0.00036159053449493763, "loss": 2.8601, "step": 249000 }, { "epoch": 13.45, "learning_rate": 0.000358647280433247, "loss": 2.8638, "step": 250000 }, { "epoch": 13.5, "learning_rate": 0.0003557040263715564, "loss": 2.8653, "step": 251000 }, { "epoch": 13.56, "learning_rate": 0.0003527637155639275, "loss": 2.8664, "step": 252000 }, { "epoch": 13.61, "learning_rate": 0.0003498204615022369, "loss": 2.8661, "step": 253000 }, { "epoch": 13.66, "learning_rate": 0.000346880150694608, "loss": 2.8729, "step": 254000 }, { "epoch": 13.72, "learning_rate": 0.00034393689663291736, "loss": 2.8698, "step": 255000 }, { "epoch": 13.77, "learning_rate": 0.0003409936425712268, "loss": 2.875, "step": 256000 }, { "epoch": 13.83, "learning_rate": 0.0003380503885095362, "loss": 2.873, "step": 257000 }, { "epoch": 13.88, "learning_rate": 0.0003351071344478455, "loss": 2.8736, "step": 258000 }, { "epoch": 13.93, "learning_rate": 0.00033216388038615494, "loss": 2.8746, "step": 259000 }, { "epoch": 13.99, "learning_rate": 0.000329223569578526, "loss": 2.8757, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.41005945308163155, "eval_loss": 3.3651070594787598, "eval_runtime": 153.628, "eval_samples_per_second": 377.008, "eval_steps_per_second": 5.891, "step": 260232 }, { "epoch": 14.04, "learning_rate": 0.0003262803155168354, "loss": 2.8343, "step": 261000 }, { "epoch": 14.1, "learning_rate": 0.00032334000470920647, "loss": 2.8301, "step": 262000 }, { "epoch": 14.15, "learning_rate": 0.0003203967506475159, "loss": 2.8292, "step": 263000 }, { "epoch": 14.2, "learning_rate": 0.0003174534965858253, "loss": 2.8317, "step": 264000 }, { "epoch": 14.26, "learning_rate": 0.00031451318577819636, "loss": 2.8366, "step": 265000 }, { "epoch": 14.31, "learning_rate": 0.00031157287497056744, "loss": 2.8386, "step": 266000 }, { "epoch": 14.36, "learning_rate": 0.0003086296209088769, "loss": 2.8376, "step": 267000 }, { "epoch": 14.42, "learning_rate": 0.00030568636684718625, "loss": 2.8456, "step": 268000 }, { "epoch": 14.47, "learning_rate": 0.00030274605603955733, "loss": 2.8454, "step": 269000 }, { "epoch": 14.53, "learning_rate": 0.00029980280197786677, "loss": 2.8477, "step": 270000 }, { "epoch": 14.58, "learning_rate": 0.00029685954791617615, "loss": 2.8442, "step": 271000 }, { "epoch": 14.63, "learning_rate": 0.0002939162938544855, "loss": 2.8486, "step": 272000 }, { "epoch": 14.69, "learning_rate": 0.00029097303979279496, "loss": 2.8506, "step": 273000 }, { "epoch": 14.74, "learning_rate": 0.00028802978573110434, "loss": 2.8504, "step": 274000 }, { "epoch": 14.79, "learning_rate": 0.0002850865316694137, "loss": 2.8578, "step": 275000 }, { "epoch": 14.85, "learning_rate": 0.0002821462208617848, "loss": 2.8522, "step": 276000 }, { "epoch": 14.9, "learning_rate": 0.0002792059100541559, "loss": 2.8574, "step": 277000 }, { "epoch": 14.96, "learning_rate": 0.00027626265599246526, "loss": 2.8506, "step": 278000 }, { "epoch": 15.0, "eval_accuracy": 0.41093038627741424, "eval_loss": 3.363818407058716, "eval_runtime": 153.8072, "eval_samples_per_second": 376.569, "eval_steps_per_second": 5.884, "step": 278820 }, { "epoch": 15.01, "learning_rate": 0.00027332234518483633, "loss": 2.8477, "step": 279000 }, { "epoch": 15.06, "learning_rate": 0.0002703790911231457, "loss": 2.8029, "step": 280000 }, { "epoch": 15.12, "learning_rate": 0.00026743878031551684, "loss": 2.8113, "step": 281000 }, { "epoch": 15.17, "learning_rate": 0.0002644955262538262, "loss": 2.8192, "step": 282000 }, { "epoch": 15.22, "learning_rate": 0.0002615522721921356, "loss": 2.8172, "step": 283000 }, { "epoch": 15.28, "learning_rate": 0.00025860901813044504, "loss": 2.8204, "step": 284000 }, { "epoch": 15.33, "learning_rate": 0.0002556687073228161, "loss": 2.8208, "step": 285000 }, { "epoch": 15.39, "learning_rate": 0.0002527254532611255, "loss": 2.8246, "step": 286000 }, { "epoch": 15.44, "learning_rate": 0.0002497821991994349, "loss": 2.8226, "step": 287000 }, { "epoch": 15.49, "learning_rate": 0.000246841888391806, "loss": 2.8279, "step": 288000 }, { "epoch": 15.55, "learning_rate": 0.00024389863433011539, "loss": 2.8303, "step": 289000 }, { "epoch": 15.6, "learning_rate": 0.0002409553802684248, "loss": 2.828, "step": 290000 }, { "epoch": 15.66, "learning_rate": 0.00023801506946079587, "loss": 2.829, "step": 291000 }, { "epoch": 15.71, "learning_rate": 0.00023507181539910525, "loss": 2.8336, "step": 292000 }, { "epoch": 15.76, "learning_rate": 0.00023212856133741463, "loss": 2.8294, "step": 293000 }, { "epoch": 15.82, "learning_rate": 0.00022918825052978573, "loss": 2.8337, "step": 294000 }, { "epoch": 15.87, "learning_rate": 0.00022624499646809512, "loss": 2.837, "step": 295000 }, { "epoch": 15.92, "learning_rate": 0.00022330174240640452, "loss": 2.8345, "step": 296000 }, { "epoch": 15.98, "learning_rate": 0.00022035848834471393, "loss": 2.8373, "step": 297000 }, { "epoch": 16.0, "eval_accuracy": 0.41065266043420495, "eval_loss": 3.372394561767578, "eval_runtime": 153.6132, "eval_samples_per_second": 377.044, "eval_steps_per_second": 5.891, "step": 297408 }, { "epoch": 16.03, "learning_rate": 0.000217418177537085, "loss": 2.8089, "step": 298000 }, { "epoch": 16.09, "learning_rate": 0.0002144778667294561, "loss": 2.798, "step": 299000 }, { "epoch": 16.14, "learning_rate": 0.0002115346126677655, "loss": 2.793, "step": 300000 }, { "epoch": 16.19, "learning_rate": 0.0002085913586060749, "loss": 2.7981, "step": 301000 }, { "epoch": 16.25, "learning_rate": 0.00020565104779844595, "loss": 2.8002, "step": 302000 }, { "epoch": 16.3, "learning_rate": 0.00020270779373675535, "loss": 2.8011, "step": 303000 }, { "epoch": 16.35, "learning_rate": 0.00019976748292912643, "loss": 2.8051, "step": 304000 }, { "epoch": 16.41, "learning_rate": 0.00019682422886743584, "loss": 2.8105, "step": 305000 }, { "epoch": 16.46, "learning_rate": 0.00019388097480574525, "loss": 2.8114, "step": 306000 }, { "epoch": 16.52, "learning_rate": 0.00019093772074405463, "loss": 2.8076, "step": 307000 }, { "epoch": 16.57, "learning_rate": 0.00018799740993642573, "loss": 2.8102, "step": 308000 }, { "epoch": 16.62, "learning_rate": 0.0001850541558747351, "loss": 2.8088, "step": 309000 }, { "epoch": 16.68, "learning_rate": 0.00018211090181304452, "loss": 2.8144, "step": 310000 }, { "epoch": 16.73, "learning_rate": 0.00017917059100541557, "loss": 2.8158, "step": 311000 }, { "epoch": 16.79, "learning_rate": 0.00017622733694372497, "loss": 2.8143, "step": 312000 }, { "epoch": 16.84, "learning_rate": 0.00017328408288203438, "loss": 2.8121, "step": 313000 }, { "epoch": 16.89, "learning_rate": 0.00017034082882034376, "loss": 2.8186, "step": 314000 }, { "epoch": 16.95, "learning_rate": 0.00016740051801271487, "loss": 2.8195, "step": 315000 }, { "epoch": 17.0, "eval_accuracy": 0.4107544394986612, "eval_loss": 3.3818860054016113, "eval_runtime": 153.5625, "eval_samples_per_second": 377.169, "eval_steps_per_second": 5.893, "step": 315996 }, { "epoch": 17.0, "learning_rate": 0.00016446020720508594, "loss": 2.8161, "step": 316000 }, { "epoch": 17.05, "learning_rate": 0.00016151695314339535, "loss": 2.775, "step": 317000 }, { "epoch": 17.11, "learning_rate": 0.00015857664233576643, "loss": 2.7802, "step": 318000 }, { "epoch": 17.16, "learning_rate": 0.00015563338827407583, "loss": 2.7816, "step": 319000 }, { "epoch": 17.22, "learning_rate": 0.00015269013421238524, "loss": 2.7826, "step": 320000 }, { "epoch": 17.27, "learning_rate": 0.0001497468801506946, "loss": 2.7866, "step": 321000 }, { "epoch": 17.32, "learning_rate": 0.000146803626089004, "loss": 2.7882, "step": 322000 }, { "epoch": 17.38, "learning_rate": 0.00014386331528137508, "loss": 2.7918, "step": 323000 }, { "epoch": 17.43, "learning_rate": 0.00014092006121968449, "loss": 2.7885, "step": 324000 }, { "epoch": 17.48, "learning_rate": 0.00013797680715799387, "loss": 2.7938, "step": 325000 }, { "epoch": 17.54, "learning_rate": 0.00013503355309630327, "loss": 2.7926, "step": 326000 }, { "epoch": 17.59, "learning_rate": 0.00013209324228867438, "loss": 2.7947, "step": 327000 }, { "epoch": 17.65, "learning_rate": 0.00012915293148104545, "loss": 2.7974, "step": 328000 }, { "epoch": 17.7, "learning_rate": 0.00012620967741935486, "loss": 2.7926, "step": 329000 }, { "epoch": 17.75, "learning_rate": 0.00012326642335766424, "loss": 2.7969, "step": 330000 }, { "epoch": 17.81, "learning_rate": 0.00012032611255003532, "loss": 2.797, "step": 331000 }, { "epoch": 17.86, "learning_rate": 0.00011738580174240641, "loss": 2.8026, "step": 332000 }, { "epoch": 17.91, "learning_rate": 0.0001144425476807158, "loss": 2.7987, "step": 333000 }, { "epoch": 17.97, "learning_rate": 0.0001114992936190252, "loss": 2.7983, "step": 334000 }, { "epoch": 18.0, "eval_accuracy": 0.41104412354218284, "eval_loss": 3.3819210529327393, "eval_runtime": 153.6749, "eval_samples_per_second": 376.893, "eval_steps_per_second": 5.889, "step": 334584 }, { "epoch": 18.02, "learning_rate": 0.00010855603955733459, "loss": 2.7861, "step": 335000 }, { "epoch": 18.08, "learning_rate": 0.00010561572874970568, "loss": 2.7654, "step": 336000 }, { "epoch": 18.13, "learning_rate": 0.00010267247468801507, "loss": 2.7679, "step": 337000 }, { "epoch": 18.18, "learning_rate": 9.972922062632445e-05, "loss": 2.772, "step": 338000 }, { "epoch": 18.24, "learning_rate": 9.678596656463386e-05, "loss": 2.7715, "step": 339000 }, { "epoch": 18.29, "learning_rate": 9.384565575700495e-05, "loss": 2.7719, "step": 340000 }, { "epoch": 18.35, "learning_rate": 9.090240169531435e-05, "loss": 2.7753, "step": 341000 }, { "epoch": 18.4, "learning_rate": 8.795914763362374e-05, "loss": 2.7716, "step": 342000 }, { "epoch": 18.45, "learning_rate": 8.501883682599482e-05, "loss": 2.7763, "step": 343000 }, { "epoch": 18.51, "learning_rate": 8.20785260183659e-05, "loss": 2.7768, "step": 344000 }, { "epoch": 18.56, "learning_rate": 7.91352719566753e-05, "loss": 2.7772, "step": 345000 }, { "epoch": 18.61, "learning_rate": 7.61920178949847e-05, "loss": 2.7777, "step": 346000 }, { "epoch": 18.67, "learning_rate": 7.32487638332941e-05, "loss": 2.7804, "step": 347000 }, { "epoch": 18.72, "learning_rate": 7.03055097716035e-05, "loss": 2.7769, "step": 348000 }, { "epoch": 18.78, "learning_rate": 6.736519896397457e-05, "loss": 2.777, "step": 349000 }, { "epoch": 18.83, "learning_rate": 6.442194490228397e-05, "loss": 2.7815, "step": 350000 }, { "epoch": 18.88, "learning_rate": 6.147869084059336e-05, "loss": 2.7791, "step": 351000 }, { "epoch": 18.94, "learning_rate": 5.853543677890276e-05, "loss": 2.7793, "step": 352000 }, { "epoch": 18.99, "learning_rate": 5.559512597127384e-05, "loss": 2.7786, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.4102747006872539, "eval_loss": 3.3970141410827637, "eval_runtime": 153.4893, "eval_samples_per_second": 377.349, "eval_steps_per_second": 5.896, "step": 353172 }, { "epoch": 19.04, "learning_rate": 5.2654815163644926e-05, "loss": 2.7643, "step": 354000 }, { "epoch": 19.1, "learning_rate": 4.971450435601601e-05, "loss": 2.7618, "step": 355000 }, { "epoch": 19.15, "learning_rate": 4.67712502943254e-05, "loss": 2.7583, "step": 356000 }, { "epoch": 19.21, "learning_rate": 4.3827996232634804e-05, "loss": 2.7595, "step": 357000 }, { "epoch": 19.26, "learning_rate": 4.088768542500589e-05, "loss": 2.7565, "step": 358000 }, { "epoch": 19.31, "learning_rate": 3.794443136331528e-05, "loss": 2.7626, "step": 359000 }, { "epoch": 19.37, "learning_rate": 3.500117730162468e-05, "loss": 2.7608, "step": 360000 }, { "epoch": 19.42, "learning_rate": 3.205792323993407e-05, "loss": 2.7597, "step": 361000 }, { "epoch": 19.47, "learning_rate": 2.9114669178243463e-05, "loss": 2.7618, "step": 362000 }, { "epoch": 19.53, "learning_rate": 2.617141511655286e-05, "loss": 2.7633, "step": 363000 }, { "epoch": 19.58, "learning_rate": 2.323404756298564e-05, "loss": 2.7611, "step": 364000 }, { "epoch": 19.64, "learning_rate": 2.0290793501295033e-05, "loss": 2.7599, "step": 365000 }, { "epoch": 19.69, "learning_rate": 1.7347539439604427e-05, "loss": 2.7633, "step": 366000 }, { "epoch": 19.74, "learning_rate": 1.4404285377913822e-05, "loss": 2.7611, "step": 367000 }, { "epoch": 19.8, "learning_rate": 1.1463974570284906e-05, "loss": 2.7643, "step": 368000 }, { "epoch": 19.85, "learning_rate": 8.520720508594302e-06, "loss": 2.7627, "step": 369000 }, { "epoch": 19.91, "learning_rate": 5.580409700965387e-06, "loss": 2.7614, "step": 370000 }, { "epoch": 19.96, "learning_rate": 2.6371556392747826e-06, "loss": 2.7635, "step": 371000 }, { "epoch": 20.0, "eval_accuracy": 0.4103301921111753, "eval_loss": 3.405571460723877, "eval_runtime": 153.5852, "eval_samples_per_second": 377.113, "eval_steps_per_second": 5.892, "step": 371760 }, { "epoch": 20.0, "step": 371760, "total_flos": 1.5663200871168e+18, "train_loss": 3.0311791784665343, "train_runtime": 80855.4689, "train_samples_per_second": 147.128, "train_steps_per_second": 4.598 } ], "logging_steps": 1000, "max_steps": 371760, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.5663200871168e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }