{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 372580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 9.375e-06, "loss": 6.8502, "step": 1000 }, { "epoch": 0.11, "learning_rate": 1.875e-05, "loss": 5.3458, "step": 2000 }, { "epoch": 0.16, "learning_rate": 2.8125e-05, "loss": 5.0282, "step": 3000 }, { "epoch": 0.21, "learning_rate": 3.75e-05, "loss": 4.8074, "step": 4000 }, { "epoch": 0.27, "learning_rate": 4.6874999999999994e-05, "loss": 4.6301, "step": 5000 }, { "epoch": 0.32, "learning_rate": 5.625e-05, "loss": 4.488, "step": 6000 }, { "epoch": 0.38, "learning_rate": 6.5625e-05, "loss": 4.3738, "step": 7000 }, { "epoch": 0.43, "learning_rate": 7.5e-05, "loss": 4.2832, "step": 8000 }, { "epoch": 0.48, "learning_rate": 8.437499999999999e-05, "loss": 4.194, "step": 9000 }, { "epoch": 0.54, "learning_rate": 9.374999999999999e-05, "loss": 4.1227, "step": 10000 }, { "epoch": 0.59, "learning_rate": 0.00010312499999999999, "loss": 4.0613, "step": 11000 }, { "epoch": 0.64, "learning_rate": 0.0001125, "loss": 3.996, "step": 12000 }, { "epoch": 0.7, "learning_rate": 0.00012185624999999998, "loss": 3.9415, "step": 13000 }, { "epoch": 0.75, "learning_rate": 0.000131221875, "loss": 3.8838, "step": 14000 }, { "epoch": 0.81, "learning_rate": 0.000140596875, "loss": 3.8372, "step": 15000 }, { "epoch": 0.86, "learning_rate": 0.0001499625, "loss": 3.7999, "step": 16000 }, { "epoch": 0.91, "learning_rate": 0.00015933749999999996, "loss": 3.7624, "step": 17000 }, { "epoch": 0.97, "learning_rate": 0.00016871249999999996, "loss": 3.7291, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.34637140776693487, "eval_loss": 3.9229166507720947, "eval_runtime": 146.7032, "eval_samples_per_second": 394.886, "eval_steps_per_second": 6.176, "step": 18629 }, { "epoch": 1.02, "learning_rate": 0.000178078125, "loss": 3.6906, "step": 19000 }, { "epoch": 1.07, "learning_rate": 0.00018745312499999998, "loss": 3.6596, "step": 20000 }, { "epoch": 1.13, "learning_rate": 0.00019681874999999998, "loss": 3.6393, "step": 21000 }, { "epoch": 1.18, "learning_rate": 0.00020618437499999995, "loss": 3.618, "step": 22000 }, { "epoch": 1.23, "learning_rate": 0.00021555937499999998, "loss": 3.6044, "step": 23000 }, { "epoch": 1.29, "learning_rate": 0.00022493437499999998, "loss": 3.5839, "step": 24000 }, { "epoch": 1.34, "learning_rate": 0.00023429999999999998, "loss": 3.5764, "step": 25000 }, { "epoch": 1.4, "learning_rate": 0.00024367499999999997, "loss": 3.5571, "step": 26000 }, { "epoch": 1.45, "learning_rate": 0.00025305, "loss": 3.5493, "step": 27000 }, { "epoch": 1.5, "learning_rate": 0.000262415625, "loss": 3.5324, "step": 28000 }, { "epoch": 1.56, "learning_rate": 0.000271790625, "loss": 3.5234, "step": 29000 }, { "epoch": 1.61, "learning_rate": 0.00028115624999999994, "loss": 3.5032, "step": 30000 }, { "epoch": 1.66, "learning_rate": 0.00029053124999999994, "loss": 3.4949, "step": 31000 }, { "epoch": 1.72, "learning_rate": 0.00029990624999999993, "loss": 3.4876, "step": 32000 }, { "epoch": 1.77, "learning_rate": 0.00029912795818897174, "loss": 3.4725, "step": 33000 }, { "epoch": 1.83, "learning_rate": 0.00029824798872511596, "loss": 3.4631, "step": 34000 }, { "epoch": 1.88, "learning_rate": 0.000297367138410946, "loss": 3.4478, "step": 35000 }, { "epoch": 1.93, "learning_rate": 0.0002964871689470902, "loss": 3.4351, "step": 36000 }, { "epoch": 1.99, "learning_rate": 0.0002956063186329203, "loss": 3.4237, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.37469545649604485, "eval_loss": 3.654151439666748, "eval_runtime": 146.6726, "eval_samples_per_second": 394.968, "eval_steps_per_second": 6.177, "step": 37258 }, { "epoch": 2.04, "learning_rate": 0.00029472546831875034, "loss": 3.3817, "step": 38000 }, { "epoch": 2.09, "learning_rate": 0.00029384549885489456, "loss": 3.3638, "step": 39000 }, { "epoch": 2.15, "learning_rate": 0.00029296464854072465, "loss": 3.3632, "step": 40000 }, { "epoch": 2.2, "learning_rate": 0.00029208467907686886, "loss": 3.3531, "step": 41000 }, { "epoch": 2.25, "learning_rate": 0.00029120470961301303, "loss": 3.3486, "step": 42000 }, { "epoch": 2.31, "learning_rate": 0.0002903238592988431, "loss": 3.3489, "step": 43000 }, { "epoch": 2.36, "learning_rate": 0.00028944300898467315, "loss": 3.34, "step": 44000 }, { "epoch": 2.42, "learning_rate": 0.00028856215867050324, "loss": 3.3353, "step": 45000 }, { "epoch": 2.47, "learning_rate": 0.0002876813083563333, "loss": 3.328, "step": 46000 }, { "epoch": 2.52, "learning_rate": 0.0002868013388924775, "loss": 3.3191, "step": 47000 }, { "epoch": 2.58, "learning_rate": 0.0002859204885783076, "loss": 3.3164, "step": 48000 }, { "epoch": 2.63, "learning_rate": 0.0002850396382641376, "loss": 3.3104, "step": 49000 }, { "epoch": 2.68, "learning_rate": 0.00028415966880028184, "loss": 3.3026, "step": 50000 }, { "epoch": 2.74, "learning_rate": 0.0002832788184861119, "loss": 3.3037, "step": 51000 }, { "epoch": 2.79, "learning_rate": 0.00028239884902225615, "loss": 3.295, "step": 52000 }, { "epoch": 2.85, "learning_rate": 0.0002815179987080862, "loss": 3.2928, "step": 53000 }, { "epoch": 2.9, "learning_rate": 0.0002806371483939162, "loss": 3.2876, "step": 54000 }, { "epoch": 2.95, "learning_rate": 0.0002797562980797463, "loss": 3.2842, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.3880011689741996, "eval_loss": 3.5183119773864746, "eval_runtime": 146.4806, "eval_samples_per_second": 395.486, "eval_steps_per_second": 6.185, "step": 55887 }, { "epoch": 3.01, "learning_rate": 0.0002788772094662047, "loss": 3.2741, "step": 56000 }, { "epoch": 3.06, "learning_rate": 0.00027799635915203474, "loss": 3.213, "step": 57000 }, { "epoch": 3.11, "learning_rate": 0.00027711638968817896, "loss": 3.2153, "step": 58000 }, { "epoch": 3.17, "learning_rate": 0.000276235539374009, "loss": 3.2143, "step": 59000 }, { "epoch": 3.22, "learning_rate": 0.0002753546890598391, "loss": 3.2183, "step": 60000 }, { "epoch": 3.27, "learning_rate": 0.0002744747195959833, "loss": 3.2182, "step": 61000 }, { "epoch": 3.33, "learning_rate": 0.00027359386928181334, "loss": 3.21, "step": 62000 }, { "epoch": 3.38, "learning_rate": 0.00027271389981795756, "loss": 3.2111, "step": 63000 }, { "epoch": 3.44, "learning_rate": 0.0002718339303541018, "loss": 3.212, "step": 64000 }, { "epoch": 3.49, "learning_rate": 0.00027095308003993187, "loss": 3.2113, "step": 65000 }, { "epoch": 3.54, "learning_rate": 0.0002700722297257619, "loss": 3.2115, "step": 66000 }, { "epoch": 3.6, "learning_rate": 0.00026919137941159194, "loss": 3.2122, "step": 67000 }, { "epoch": 3.65, "learning_rate": 0.0002683114099477362, "loss": 3.2081, "step": 68000 }, { "epoch": 3.7, "learning_rate": 0.00026743055963356625, "loss": 3.2045, "step": 69000 }, { "epoch": 3.76, "learning_rate": 0.00026654970931939634, "loss": 3.2016, "step": 70000 }, { "epoch": 3.81, "learning_rate": 0.00026566885900522637, "loss": 3.2019, "step": 71000 }, { "epoch": 3.86, "learning_rate": 0.0002647888895413706, "loss": 3.1989, "step": 72000 }, { "epoch": 3.92, "learning_rate": 0.0002639080392272006, "loss": 3.1971, "step": 73000 }, { "epoch": 3.97, "learning_rate": 0.00026302718891303066, "loss": 3.1952, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.39376396191712576, "eval_loss": 3.474755048751831, "eval_runtime": 147.5232, "eval_samples_per_second": 392.691, "eval_steps_per_second": 6.141, "step": 74516 }, { "epoch": 4.03, "learning_rate": 0.00026214721944917493, "loss": 3.1576, "step": 75000 }, { "epoch": 4.08, "learning_rate": 0.00026126636913500497, "loss": 3.1329, "step": 76000 }, { "epoch": 4.13, "learning_rate": 0.00026038551882083506, "loss": 3.1287, "step": 77000 }, { "epoch": 4.19, "learning_rate": 0.0002595046685066651, "loss": 3.1305, "step": 78000 }, { "epoch": 4.24, "learning_rate": 0.00025862557989312344, "loss": 3.1394, "step": 79000 }, { "epoch": 4.29, "learning_rate": 0.00025774472957895353, "loss": 3.1349, "step": 80000 }, { "epoch": 4.35, "learning_rate": 0.00025686387926478356, "loss": 3.1386, "step": 81000 }, { "epoch": 4.4, "learning_rate": 0.00025598390980092784, "loss": 3.1376, "step": 82000 }, { "epoch": 4.46, "learning_rate": 0.00025510305948675787, "loss": 3.1401, "step": 83000 }, { "epoch": 4.51, "learning_rate": 0.0002542222091725879, "loss": 3.134, "step": 84000 }, { "epoch": 4.56, "learning_rate": 0.0002533422397087321, "loss": 3.1311, "step": 85000 }, { "epoch": 4.62, "learning_rate": 0.00025246138939456216, "loss": 3.1384, "step": 86000 }, { "epoch": 4.67, "learning_rate": 0.00025158141993070643, "loss": 3.1377, "step": 87000 }, { "epoch": 4.72, "learning_rate": 0.00025070056961653647, "loss": 3.1346, "step": 88000 }, { "epoch": 4.78, "learning_rate": 0.0002498206001526807, "loss": 3.1381, "step": 89000 }, { "epoch": 4.83, "learning_rate": 0.0002489397498385107, "loss": 3.1292, "step": 90000 }, { "epoch": 4.88, "learning_rate": 0.0002480588995243408, "loss": 3.1317, "step": 91000 }, { "epoch": 4.94, "learning_rate": 0.00024717804921017085, "loss": 3.1313, "step": 92000 }, { "epoch": 4.99, "learning_rate": 0.0002462980797463151, "loss": 3.1351, "step": 93000 }, { "epoch": 5.0, "eval_accuracy": 0.39715434979773, "eval_loss": 3.4493298530578613, "eval_runtime": 146.562, "eval_samples_per_second": 395.266, "eval_steps_per_second": 6.182, "step": 93145 }, { "epoch": 5.05, "learning_rate": 0.0002454181102824593, "loss": 3.0699, "step": 94000 }, { "epoch": 5.1, "learning_rate": 0.0002445372599682894, "loss": 3.0649, "step": 95000 }, { "epoch": 5.15, "learning_rate": 0.0002436564096541194, "loss": 3.0682, "step": 96000 }, { "epoch": 5.21, "learning_rate": 0.00024277555933994947, "loss": 3.0778, "step": 97000 }, { "epoch": 5.26, "learning_rate": 0.0002418955898760937, "loss": 3.0749, "step": 98000 }, { "epoch": 5.31, "learning_rate": 0.00024101473956192375, "loss": 3.0774, "step": 99000 }, { "epoch": 5.37, "learning_rate": 0.0002401338892477538, "loss": 3.079, "step": 100000 }, { "epoch": 5.42, "learning_rate": 0.00023925391978389803, "loss": 3.078, "step": 101000 }, { "epoch": 5.48, "learning_rate": 0.0002383730694697281, "loss": 3.0779, "step": 102000 }, { "epoch": 5.53, "learning_rate": 0.00023749310000587231, "loss": 3.0822, "step": 103000 }, { "epoch": 5.58, "learning_rate": 0.00023661224969170235, "loss": 3.0886, "step": 104000 }, { "epoch": 5.64, "learning_rate": 0.0002357322802278466, "loss": 3.0826, "step": 105000 }, { "epoch": 5.69, "learning_rate": 0.00023485142991367666, "loss": 3.083, "step": 106000 }, { "epoch": 5.74, "learning_rate": 0.00023397057959950672, "loss": 3.0808, "step": 107000 }, { "epoch": 5.8, "learning_rate": 0.00023308972928533675, "loss": 3.0832, "step": 108000 }, { "epoch": 5.85, "learning_rate": 0.00023220887897116682, "loss": 3.0856, "step": 109000 }, { "epoch": 5.9, "learning_rate": 0.00023132890950731104, "loss": 3.0828, "step": 110000 }, { "epoch": 5.96, "learning_rate": 0.0002304480591931411, "loss": 3.0844, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.4004582266021962, "eval_loss": 3.415574312210083, "eval_runtime": 147.2002, "eval_samples_per_second": 393.552, "eval_steps_per_second": 6.155, "step": 111774 }, { "epoch": 6.01, "learning_rate": 0.00022956808972928534, "loss": 3.068, "step": 112000 }, { "epoch": 6.07, "learning_rate": 0.00022868723941511538, "loss": 3.0137, "step": 113000 }, { "epoch": 6.12, "learning_rate": 0.0002278072699512596, "loss": 3.0237, "step": 114000 }, { "epoch": 6.17, "learning_rate": 0.00022692641963708966, "loss": 3.0212, "step": 115000 }, { "epoch": 6.23, "learning_rate": 0.00022604645017323385, "loss": 3.0283, "step": 116000 }, { "epoch": 6.28, "learning_rate": 0.00022516559985906391, "loss": 3.0323, "step": 117000 }, { "epoch": 6.33, "learning_rate": 0.00022428474954489398, "loss": 3.0307, "step": 118000 }, { "epoch": 6.39, "learning_rate": 0.00022340389923072404, "loss": 3.0335, "step": 119000 }, { "epoch": 6.44, "learning_rate": 0.00022252392976686826, "loss": 3.0377, "step": 120000 }, { "epoch": 6.5, "learning_rate": 0.00022164396030301248, "loss": 3.0392, "step": 121000 }, { "epoch": 6.55, "learning_rate": 0.00022076310998884254, "loss": 3.0342, "step": 122000 }, { "epoch": 6.6, "learning_rate": 0.0002198822596746726, "loss": 3.0388, "step": 123000 }, { "epoch": 6.66, "learning_rate": 0.00021900140936050263, "loss": 3.0405, "step": 124000 }, { "epoch": 6.71, "learning_rate": 0.0002181205590463327, "loss": 3.0445, "step": 125000 }, { "epoch": 6.76, "learning_rate": 0.00021724058958247694, "loss": 3.0388, "step": 126000 }, { "epoch": 6.82, "learning_rate": 0.000216359739268307, "loss": 3.0434, "step": 127000 }, { "epoch": 6.87, "learning_rate": 0.0002154797698044512, "loss": 3.0398, "step": 128000 }, { "epoch": 6.92, "learning_rate": 0.00021459891949028126, "loss": 3.0393, "step": 129000 }, { "epoch": 6.98, "learning_rate": 0.00021371806917611132, "loss": 3.0442, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.40321569998711065, "eval_loss": 3.3853769302368164, "eval_runtime": 146.6813, "eval_samples_per_second": 394.945, "eval_steps_per_second": 6.177, "step": 130403 }, { "epoch": 7.03, "learning_rate": 0.00021283809971225557, "loss": 2.9992, "step": 131000 }, { "epoch": 7.09, "learning_rate": 0.00021195724939808563, "loss": 2.9735, "step": 132000 }, { "epoch": 7.14, "learning_rate": 0.00021107727993422982, "loss": 2.9852, "step": 133000 }, { "epoch": 7.19, "learning_rate": 0.00021019642962005988, "loss": 2.987, "step": 134000 }, { "epoch": 7.25, "learning_rate": 0.0002093164601562041, "loss": 2.9856, "step": 135000 }, { "epoch": 7.3, "learning_rate": 0.00020843560984203414, "loss": 2.9901, "step": 136000 }, { "epoch": 7.35, "learning_rate": 0.0002075547595278642, "loss": 2.9944, "step": 137000 }, { "epoch": 7.41, "learning_rate": 0.00020667390921369426, "loss": 2.9964, "step": 138000 }, { "epoch": 7.46, "learning_rate": 0.0002057939397498385, "loss": 2.9989, "step": 139000 }, { "epoch": 7.52, "learning_rate": 0.00020491308943566854, "loss": 2.9983, "step": 140000 }, { "epoch": 7.57, "learning_rate": 0.00020403311997181276, "loss": 3.0016, "step": 141000 }, { "epoch": 7.62, "learning_rate": 0.00020315226965764282, "loss": 3.0024, "step": 142000 }, { "epoch": 7.68, "learning_rate": 0.00020227141934347288, "loss": 3.0029, "step": 143000 }, { "epoch": 7.73, "learning_rate": 0.0002013914498796171, "loss": 3.0057, "step": 144000 }, { "epoch": 7.78, "learning_rate": 0.00020051059956544717, "loss": 3.0074, "step": 145000 }, { "epoch": 7.84, "learning_rate": 0.00019962974925127723, "loss": 3.0036, "step": 146000 }, { "epoch": 7.89, "learning_rate": 0.0001987488989371073, "loss": 3.0127, "step": 147000 }, { "epoch": 7.94, "learning_rate": 0.00019786892947325148, "loss": 3.0076, "step": 148000 }, { "epoch": 8.0, "learning_rate": 0.00019698896000939573, "loss": 3.008, "step": 149000 }, { "epoch": 8.0, "eval_accuracy": 0.4029720181670573, "eval_loss": 3.406196117401123, "eval_runtime": 147.0273, "eval_samples_per_second": 394.015, "eval_steps_per_second": 6.162, "step": 149032 }, { "epoch": 8.05, "learning_rate": 0.0001961081096952258, "loss": 2.9374, "step": 150000 }, { "epoch": 8.11, "learning_rate": 0.00019522725938105585, "loss": 2.9476, "step": 151000 }, { "epoch": 8.16, "learning_rate": 0.0001943464090668859, "loss": 2.9477, "step": 152000 }, { "epoch": 8.21, "learning_rate": 0.0001934664396030301, "loss": 2.9527, "step": 153000 }, { "epoch": 8.27, "learning_rate": 0.00019258647013917432, "loss": 2.9556, "step": 154000 }, { "epoch": 8.32, "learning_rate": 0.0001917056198250044, "loss": 2.9656, "step": 155000 }, { "epoch": 8.37, "learning_rate": 0.0001908256503611486, "loss": 2.9603, "step": 156000 }, { "epoch": 8.43, "learning_rate": 0.00018994480004697867, "loss": 2.9675, "step": 157000 }, { "epoch": 8.48, "learning_rate": 0.0001890648305831229, "loss": 2.9637, "step": 158000 }, { "epoch": 8.54, "learning_rate": 0.00018818398026895295, "loss": 2.9655, "step": 159000 }, { "epoch": 8.59, "learning_rate": 0.0001873040108050972, "loss": 2.9693, "step": 160000 }, { "epoch": 8.64, "learning_rate": 0.00018642316049092723, "loss": 2.9702, "step": 161000 }, { "epoch": 8.7, "learning_rate": 0.0001855423101767573, "loss": 2.9723, "step": 162000 }, { "epoch": 8.75, "learning_rate": 0.00018466145986258735, "loss": 2.9766, "step": 163000 }, { "epoch": 8.8, "learning_rate": 0.00018378149039873155, "loss": 2.9716, "step": 164000 }, { "epoch": 8.86, "learning_rate": 0.0001829006400845616, "loss": 2.9744, "step": 165000 }, { "epoch": 8.91, "learning_rate": 0.00018202067062070585, "loss": 2.9746, "step": 166000 }, { "epoch": 8.96, "learning_rate": 0.00018113982030653592, "loss": 2.9768, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.40467940291506055, "eval_loss": 3.3969688415527344, "eval_runtime": 146.7783, "eval_samples_per_second": 394.684, "eval_steps_per_second": 6.173, "step": 167661 }, { "epoch": 9.02, "learning_rate": 0.0001802598508426801, "loss": 2.9532, "step": 168000 }, { "epoch": 9.07, "learning_rate": 0.00017937900052851017, "loss": 2.9113, "step": 169000 }, { "epoch": 9.13, "learning_rate": 0.00017849815021434023, "loss": 2.9146, "step": 170000 }, { "epoch": 9.18, "learning_rate": 0.00017761818075048445, "loss": 2.9201, "step": 171000 }, { "epoch": 9.23, "learning_rate": 0.00017673733043631449, "loss": 2.9252, "step": 172000 }, { "epoch": 9.29, "learning_rate": 0.00017585736097245873, "loss": 2.933, "step": 173000 }, { "epoch": 9.34, "learning_rate": 0.0001749765106582888, "loss": 2.9311, "step": 174000 }, { "epoch": 9.39, "learning_rate": 0.00017409566034411886, "loss": 2.932, "step": 175000 }, { "epoch": 9.45, "learning_rate": 0.0001732148100299489, "loss": 2.9385, "step": 176000 }, { "epoch": 9.5, "learning_rate": 0.00017233395971577895, "loss": 2.9352, "step": 177000 }, { "epoch": 9.55, "learning_rate": 0.00017145310940160901, "loss": 2.9421, "step": 178000 }, { "epoch": 9.61, "learning_rate": 0.00017057313993775323, "loss": 2.9382, "step": 179000 }, { "epoch": 9.66, "learning_rate": 0.00016969317047389745, "loss": 2.9397, "step": 180000 }, { "epoch": 9.72, "learning_rate": 0.00016881232015972751, "loss": 2.9424, "step": 181000 }, { "epoch": 9.77, "learning_rate": 0.00016793235069587173, "loss": 2.9413, "step": 182000 }, { "epoch": 9.82, "learning_rate": 0.0001670515003817018, "loss": 2.9477, "step": 183000 }, { "epoch": 9.88, "learning_rate": 0.00016617065006753183, "loss": 2.9463, "step": 184000 }, { "epoch": 9.93, "learning_rate": 0.00016529068060367605, "loss": 2.9485, "step": 185000 }, { "epoch": 9.98, "learning_rate": 0.0001644098302895061, "loss": 2.9498, "step": 186000 }, { "epoch": 10.0, "eval_accuracy": 0.40466066332636297, "eval_loss": 3.4024200439453125, "eval_runtime": 146.7679, "eval_samples_per_second": 394.712, "eval_steps_per_second": 6.173, "step": 186290 }, { "epoch": 10.04, "learning_rate": 0.00016352897997533617, "loss": 2.8971, "step": 187000 }, { "epoch": 10.09, "learning_rate": 0.00016264989136179455, "loss": 2.8859, "step": 188000 }, { "epoch": 10.15, "learning_rate": 0.0001617690410476246, "loss": 2.8923, "step": 189000 }, { "epoch": 10.2, "learning_rate": 0.00016088819073345467, "loss": 2.896, "step": 190000 }, { "epoch": 10.25, "learning_rate": 0.00016000734041928474, "loss": 2.8957, "step": 191000 }, { "epoch": 10.31, "learning_rate": 0.00015912737095542895, "loss": 2.9042, "step": 192000 }, { "epoch": 10.36, "learning_rate": 0.00015824652064125902, "loss": 2.9053, "step": 193000 }, { "epoch": 10.41, "learning_rate": 0.00015736655117740324, "loss": 2.9073, "step": 194000 }, { "epoch": 10.47, "learning_rate": 0.0001564857008632333, "loss": 2.909, "step": 195000 }, { "epoch": 10.52, "learning_rate": 0.00015560485054906333, "loss": 2.9105, "step": 196000 }, { "epoch": 10.57, "learning_rate": 0.0001547240002348934, "loss": 2.9139, "step": 197000 }, { "epoch": 10.63, "learning_rate": 0.00015384314992072346, "loss": 2.9178, "step": 198000 }, { "epoch": 10.68, "learning_rate": 0.00015296229960655352, "loss": 2.9157, "step": 199000 }, { "epoch": 10.74, "learning_rate": 0.00015208233014269774, "loss": 2.9183, "step": 200000 }, { "epoch": 10.79, "learning_rate": 0.00015120236067884196, "loss": 2.9167, "step": 201000 }, { "epoch": 10.84, "learning_rate": 0.00015032151036467202, "loss": 2.9226, "step": 202000 }, { "epoch": 10.9, "learning_rate": 0.00014944154090081624, "loss": 2.9192, "step": 203000 }, { "epoch": 10.95, "learning_rate": 0.0001485606905866463, "loss": 2.917, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.4038836756487508, "eval_loss": 3.424196243286133, "eval_runtime": 146.9603, "eval_samples_per_second": 394.195, "eval_steps_per_second": 6.165, "step": 204919 }, { "epoch": 11.0, "learning_rate": 0.00014767984027247636, "loss": 2.9203, "step": 205000 }, { "epoch": 11.06, "learning_rate": 0.00014679987080862058, "loss": 2.8546, "step": 206000 }, { "epoch": 11.11, "learning_rate": 0.00014591902049445064, "loss": 2.8642, "step": 207000 }, { "epoch": 11.17, "learning_rate": 0.00014503817018028068, "loss": 2.8687, "step": 208000 }, { "epoch": 11.22, "learning_rate": 0.00014415820071642492, "loss": 2.8739, "step": 209000 }, { "epoch": 11.27, "learning_rate": 0.00014327735040225496, "loss": 2.8749, "step": 210000 }, { "epoch": 11.33, "learning_rate": 0.0001423973809383992, "loss": 2.8796, "step": 211000 }, { "epoch": 11.38, "learning_rate": 0.00014151653062422924, "loss": 2.8808, "step": 212000 }, { "epoch": 11.43, "learning_rate": 0.00014063656116037346, "loss": 2.8825, "step": 213000 }, { "epoch": 11.49, "learning_rate": 0.00013975659169651768, "loss": 2.8886, "step": 214000 }, { "epoch": 11.54, "learning_rate": 0.00013887574138234774, "loss": 2.8905, "step": 215000 }, { "epoch": 11.59, "learning_rate": 0.0001379948910681778, "loss": 2.8877, "step": 216000 }, { "epoch": 11.65, "learning_rate": 0.00013711404075400786, "loss": 2.8924, "step": 217000 }, { "epoch": 11.7, "learning_rate": 0.00013623407129015208, "loss": 2.8915, "step": 218000 }, { "epoch": 11.76, "learning_rate": 0.00013535322097598214, "loss": 2.8901, "step": 219000 }, { "epoch": 11.81, "learning_rate": 0.00013447325151212636, "loss": 2.8894, "step": 220000 }, { "epoch": 11.86, "learning_rate": 0.00013359240119795643, "loss": 2.8924, "step": 221000 }, { "epoch": 11.92, "learning_rate": 0.00013271155088378646, "loss": 2.8998, "step": 222000 }, { "epoch": 11.97, "learning_rate": 0.0001318315814199307, "loss": 2.9005, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.40485423857592023, "eval_loss": 3.409266710281372, "eval_runtime": 147.6369, "eval_samples_per_second": 392.388, "eval_steps_per_second": 6.137, "step": 223548 }, { "epoch": 12.02, "learning_rate": 0.00013095073110576074, "loss": 2.8672, "step": 224000 }, { "epoch": 12.08, "learning_rate": 0.0001300698807915908, "loss": 2.8401, "step": 225000 }, { "epoch": 12.13, "learning_rate": 0.00012918991132773502, "loss": 2.838, "step": 226000 }, { "epoch": 12.19, "learning_rate": 0.00012830906101356508, "loss": 2.8478, "step": 227000 }, { "epoch": 12.24, "learning_rate": 0.00012742821069939515, "loss": 2.8567, "step": 228000 }, { "epoch": 12.29, "learning_rate": 0.00012654824123553937, "loss": 2.8554, "step": 229000 }, { "epoch": 12.35, "learning_rate": 0.0001256673909213694, "loss": 2.8574, "step": 230000 }, { "epoch": 12.4, "learning_rate": 0.0001247883023078278, "loss": 2.8588, "step": 231000 }, { "epoch": 12.45, "learning_rate": 0.00012390745199365787, "loss": 2.857, "step": 232000 }, { "epoch": 12.51, "learning_rate": 0.00012302660167948793, "loss": 2.8649, "step": 233000 }, { "epoch": 12.56, "learning_rate": 0.00012214575136531796, "loss": 2.862, "step": 234000 }, { "epoch": 12.61, "learning_rate": 0.00012126666275177637, "loss": 2.8702, "step": 235000 }, { "epoch": 12.67, "learning_rate": 0.00012038581243760643, "loss": 2.8654, "step": 236000 }, { "epoch": 12.72, "learning_rate": 0.00011950496212343648, "loss": 2.8664, "step": 237000 }, { "epoch": 12.78, "learning_rate": 0.00011862499265958071, "loss": 2.8714, "step": 238000 }, { "epoch": 12.83, "learning_rate": 0.00011774414234541076, "loss": 2.8719, "step": 239000 }, { "epoch": 12.88, "learning_rate": 0.00011686329203124082, "loss": 2.8718, "step": 240000 }, { "epoch": 12.94, "learning_rate": 0.00011598244171707087, "loss": 2.8766, "step": 241000 }, { "epoch": 12.99, "learning_rate": 0.00011510159140290093, "loss": 2.8747, "step": 242000 }, { "epoch": 13.0, "eval_accuracy": 0.40508253915650494, "eval_loss": 3.419220209121704, "eval_runtime": 147.1595, "eval_samples_per_second": 393.661, "eval_steps_per_second": 6.157, "step": 242177 }, { "epoch": 13.04, "learning_rate": 0.00011422162193904515, "loss": 2.8227, "step": 243000 }, { "epoch": 13.1, "learning_rate": 0.00011334165247518937, "loss": 2.8177, "step": 244000 }, { "epoch": 13.15, "learning_rate": 0.00011246080216101942, "loss": 2.8252, "step": 245000 }, { "epoch": 13.21, "learning_rate": 0.00011157995184684948, "loss": 2.8268, "step": 246000 }, { "epoch": 13.26, "learning_rate": 0.00011069910153267953, "loss": 2.8324, "step": 247000 }, { "epoch": 13.31, "learning_rate": 0.00010981825121850959, "loss": 2.8336, "step": 248000 }, { "epoch": 13.37, "learning_rate": 0.00010893828175465381, "loss": 2.8315, "step": 249000 }, { "epoch": 13.42, "learning_rate": 0.00010805743144048387, "loss": 2.8384, "step": 250000 }, { "epoch": 13.47, "learning_rate": 0.00010717746197662809, "loss": 2.8431, "step": 251000 }, { "epoch": 13.53, "learning_rate": 0.00010629661166245815, "loss": 2.8403, "step": 252000 }, { "epoch": 13.58, "learning_rate": 0.0001054157613482882, "loss": 2.8461, "step": 253000 }, { "epoch": 13.63, "learning_rate": 0.00010453579188443243, "loss": 2.8443, "step": 254000 }, { "epoch": 13.69, "learning_rate": 0.00010365494157026248, "loss": 2.8493, "step": 255000 }, { "epoch": 13.74, "learning_rate": 0.00010277497210640671, "loss": 2.8451, "step": 256000 }, { "epoch": 13.8, "learning_rate": 0.00010189412179223676, "loss": 2.8502, "step": 257000 }, { "epoch": 13.85, "learning_rate": 0.00010101327147806682, "loss": 2.8475, "step": 258000 }, { "epoch": 13.9, "learning_rate": 0.00010013242116389687, "loss": 2.8509, "step": 259000 }, { "epoch": 13.96, "learning_rate": 9.92524517000411e-05, "loss": 2.8542, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.40528363710833504, "eval_loss": 3.4232749938964844, "eval_runtime": 146.7575, "eval_samples_per_second": 394.74, "eval_steps_per_second": 6.173, "step": 260806 }, { "epoch": 14.01, "learning_rate": 9.837160138587115e-05, "loss": 2.8396, "step": 261000 }, { "epoch": 14.06, "learning_rate": 9.749075107170121e-05, "loss": 2.7946, "step": 262000 }, { "epoch": 14.12, "learning_rate": 9.660990075753126e-05, "loss": 2.7988, "step": 263000 }, { "epoch": 14.17, "learning_rate": 9.573081214398965e-05, "loss": 2.8067, "step": 264000 }, { "epoch": 14.23, "learning_rate": 9.48499618298197e-05, "loss": 2.8113, "step": 265000 }, { "epoch": 14.28, "learning_rate": 9.396911151564976e-05, "loss": 2.8143, "step": 266000 }, { "epoch": 14.33, "learning_rate": 9.308826120147981e-05, "loss": 2.8156, "step": 267000 }, { "epoch": 14.39, "learning_rate": 9.220829173762404e-05, "loss": 2.8157, "step": 268000 }, { "epoch": 14.44, "learning_rate": 9.132744142345409e-05, "loss": 2.8176, "step": 269000 }, { "epoch": 14.49, "learning_rate": 9.044659110928415e-05, "loss": 2.8195, "step": 270000 }, { "epoch": 14.55, "learning_rate": 8.956662164542837e-05, "loss": 2.823, "step": 271000 }, { "epoch": 14.6, "learning_rate": 8.868577133125844e-05, "loss": 2.8243, "step": 272000 }, { "epoch": 14.65, "learning_rate": 8.780492101708848e-05, "loss": 2.8264, "step": 273000 }, { "epoch": 14.71, "learning_rate": 8.692495155323272e-05, "loss": 2.8238, "step": 274000 }, { "epoch": 14.76, "learning_rate": 8.604410123906277e-05, "loss": 2.8249, "step": 275000 }, { "epoch": 14.82, "learning_rate": 8.5164131775207e-05, "loss": 2.8285, "step": 276000 }, { "epoch": 14.87, "learning_rate": 8.428328146103705e-05, "loss": 2.8252, "step": 277000 }, { "epoch": 14.92, "learning_rate": 8.340243114686711e-05, "loss": 2.8293, "step": 278000 }, { "epoch": 14.98, "learning_rate": 8.252246168301133e-05, "loss": 2.8326, "step": 279000 }, { "epoch": 15.0, "eval_accuracy": 0.40544732304975456, "eval_loss": 3.4313971996307373, "eval_runtime": 146.9557, "eval_samples_per_second": 394.207, "eval_steps_per_second": 6.165, "step": 279435 }, { "epoch": 15.03, "learning_rate": 8.164161136884139e-05, "loss": 2.8031, "step": 280000 }, { "epoch": 15.08, "learning_rate": 8.076164190498561e-05, "loss": 2.7829, "step": 281000 }, { "epoch": 15.14, "learning_rate": 7.988079159081567e-05, "loss": 2.7866, "step": 282000 }, { "epoch": 15.19, "learning_rate": 7.899994127664572e-05, "loss": 2.7878, "step": 283000 }, { "epoch": 15.25, "learning_rate": 7.811997181278994e-05, "loss": 2.7902, "step": 284000 }, { "epoch": 15.3, "learning_rate": 7.723912149861999e-05, "loss": 2.7923, "step": 285000 }, { "epoch": 15.35, "learning_rate": 7.635915203476422e-05, "loss": 2.7957, "step": 286000 }, { "epoch": 15.41, "learning_rate": 7.547830172059427e-05, "loss": 2.7959, "step": 287000 }, { "epoch": 15.46, "learning_rate": 7.459745140642433e-05, "loss": 2.7977, "step": 288000 }, { "epoch": 15.51, "learning_rate": 7.371748194256855e-05, "loss": 2.8023, "step": 289000 }, { "epoch": 15.57, "learning_rate": 7.283663162839861e-05, "loss": 2.8038, "step": 290000 }, { "epoch": 15.62, "learning_rate": 7.195666216454283e-05, "loss": 2.8004, "step": 291000 }, { "epoch": 15.67, "learning_rate": 7.107581185037289e-05, "loss": 2.8042, "step": 292000 }, { "epoch": 15.73, "learning_rate": 7.019496153620294e-05, "loss": 2.8065, "step": 293000 }, { "epoch": 15.78, "learning_rate": 6.9314111222033e-05, "loss": 2.8046, "step": 294000 }, { "epoch": 15.84, "learning_rate": 6.843414175817722e-05, "loss": 2.8082, "step": 295000 }, { "epoch": 15.89, "learning_rate": 6.755329144400728e-05, "loss": 2.8096, "step": 296000 }, { "epoch": 15.94, "learning_rate": 6.667244112983733e-05, "loss": 2.8063, "step": 297000 }, { "epoch": 16.0, "learning_rate": 6.579247166598155e-05, "loss": 2.8125, "step": 298000 }, { "epoch": 16.0, "eval_accuracy": 0.40515031064394535, "eval_loss": 3.4404408931732178, "eval_runtime": 146.6919, "eval_samples_per_second": 394.916, "eval_steps_per_second": 6.176, "step": 298064 }, { "epoch": 16.05, "learning_rate": 6.491338305243995e-05, "loss": 2.7653, "step": 299000 }, { "epoch": 16.1, "learning_rate": 6.403253273827e-05, "loss": 2.7653, "step": 300000 }, { "epoch": 16.16, "learning_rate": 6.315168242410006e-05, "loss": 2.7705, "step": 301000 }, { "epoch": 16.21, "learning_rate": 6.227083210993011e-05, "loss": 2.7698, "step": 302000 }, { "epoch": 16.26, "learning_rate": 6.138998179576017e-05, "loss": 2.7756, "step": 303000 }, { "epoch": 16.32, "learning_rate": 6.050913148159022e-05, "loss": 2.7763, "step": 304000 }, { "epoch": 16.37, "learning_rate": 5.962828116742028e-05, "loss": 2.7787, "step": 305000 }, { "epoch": 16.43, "learning_rate": 5.87483117035645e-05, "loss": 2.7797, "step": 306000 }, { "epoch": 16.48, "learning_rate": 5.786746138939455e-05, "loss": 2.7807, "step": 307000 }, { "epoch": 16.53, "learning_rate": 5.698749192553878e-05, "loss": 2.7803, "step": 308000 }, { "epoch": 16.59, "learning_rate": 5.610664161136883e-05, "loss": 2.7844, "step": 309000 }, { "epoch": 16.64, "learning_rate": 5.522579129719889e-05, "loss": 2.7835, "step": 310000 }, { "epoch": 16.69, "learning_rate": 5.4345821833343114e-05, "loss": 2.7858, "step": 311000 }, { "epoch": 16.75, "learning_rate": 5.346497151917317e-05, "loss": 2.787, "step": 312000 }, { "epoch": 16.8, "learning_rate": 5.25850020553174e-05, "loss": 2.7899, "step": 313000 }, { "epoch": 16.86, "learning_rate": 5.170415174114745e-05, "loss": 2.7873, "step": 314000 }, { "epoch": 16.91, "learning_rate": 5.0823301426977506e-05, "loss": 2.7879, "step": 315000 }, { "epoch": 16.96, "learning_rate": 4.994245111280756e-05, "loss": 2.7911, "step": 316000 }, { "epoch": 17.0, "eval_accuracy": 0.40542401610610557, "eval_loss": 3.4449925422668457, "eval_runtime": 146.7539, "eval_samples_per_second": 394.749, "eval_steps_per_second": 6.174, "step": 316693 }, { "epoch": 17.02, "learning_rate": 4.906248164895178e-05, "loss": 2.7767, "step": 317000 }, { "epoch": 17.07, "learning_rate": 4.8182512185096006e-05, "loss": 2.7487, "step": 318000 }, { "epoch": 17.12, "learning_rate": 4.730166187092606e-05, "loss": 2.7542, "step": 319000 }, { "epoch": 17.18, "learning_rate": 4.6420811556756116e-05, "loss": 2.759, "step": 320000 }, { "epoch": 17.23, "learning_rate": 4.554084209290035e-05, "loss": 2.7571, "step": 321000 }, { "epoch": 17.28, "learning_rate": 4.46599917787304e-05, "loss": 2.7582, "step": 322000 }, { "epoch": 17.34, "learning_rate": 4.3780022314874616e-05, "loss": 2.7629, "step": 323000 }, { "epoch": 17.39, "learning_rate": 4.289917200070467e-05, "loss": 2.76, "step": 324000 }, { "epoch": 17.45, "learning_rate": 4.2018321686534727e-05, "loss": 2.7596, "step": 325000 }, { "epoch": 17.5, "learning_rate": 4.113835222267895e-05, "loss": 2.762, "step": 326000 }, { "epoch": 17.55, "learning_rate": 4.0258382758823185e-05, "loss": 2.7645, "step": 327000 }, { "epoch": 17.61, "learning_rate": 3.937753244465324e-05, "loss": 2.7674, "step": 328000 }, { "epoch": 17.66, "learning_rate": 3.849668213048329e-05, "loss": 2.7634, "step": 329000 }, { "epoch": 17.71, "learning_rate": 3.7615831816313344e-05, "loss": 2.7657, "step": 330000 }, { "epoch": 17.77, "learning_rate": 3.67349815021434e-05, "loss": 2.7677, "step": 331000 }, { "epoch": 17.82, "learning_rate": 3.5854131187973454e-05, "loss": 2.7711, "step": 332000 }, { "epoch": 17.88, "learning_rate": 3.497416172411768e-05, "loss": 2.7684, "step": 333000 }, { "epoch": 17.93, "learning_rate": 3.40941922602619e-05, "loss": 2.7698, "step": 334000 }, { "epoch": 17.98, "learning_rate": 3.321422279640613e-05, "loss": 2.7682, "step": 335000 }, { "epoch": 18.0, "eval_accuracy": 0.4054381211728672, "eval_loss": 3.4487550258636475, "eval_runtime": 146.6725, "eval_samples_per_second": 394.968, "eval_steps_per_second": 6.177, "step": 335322 }, { "epoch": 18.04, "learning_rate": 3.233337248223618e-05, "loss": 2.7483, "step": 336000 }, { "epoch": 18.09, "learning_rate": 3.1452522168066235e-05, "loss": 2.7436, "step": 337000 }, { "epoch": 18.14, "learning_rate": 3.057167185389629e-05, "loss": 2.7403, "step": 338000 }, { "epoch": 18.2, "learning_rate": 2.9691702390040516e-05, "loss": 2.7426, "step": 339000 }, { "epoch": 18.25, "learning_rate": 2.881085207587057e-05, "loss": 2.7424, "step": 340000 }, { "epoch": 18.3, "learning_rate": 2.7930001761700627e-05, "loss": 2.7414, "step": 341000 }, { "epoch": 18.36, "learning_rate": 2.704915144753068e-05, "loss": 2.7417, "step": 342000 }, { "epoch": 18.41, "learning_rate": 2.6170062833989075e-05, "loss": 2.7445, "step": 343000 }, { "epoch": 18.47, "learning_rate": 2.528921251981913e-05, "loss": 2.7418, "step": 344000 }, { "epoch": 18.52, "learning_rate": 2.4408362205649185e-05, "loss": 2.7499, "step": 345000 }, { "epoch": 18.57, "learning_rate": 2.352751189147924e-05, "loss": 2.7496, "step": 346000 }, { "epoch": 18.63, "learning_rate": 2.2648423277937634e-05, "loss": 2.749, "step": 347000 }, { "epoch": 18.68, "learning_rate": 2.176757296376769e-05, "loss": 2.7483, "step": 348000 }, { "epoch": 18.73, "learning_rate": 2.0886722649597744e-05, "loss": 2.7452, "step": 349000 }, { "epoch": 18.79, "learning_rate": 2.00058723354278e-05, "loss": 2.7509, "step": 350000 }, { "epoch": 18.84, "learning_rate": 1.9125022021257855e-05, "loss": 2.7539, "step": 351000 }, { "epoch": 18.9, "learning_rate": 1.8245052557402077e-05, "loss": 2.7494, "step": 352000 }, { "epoch": 18.95, "learning_rate": 1.7364202243232132e-05, "loss": 2.7512, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.4053620881463235, "eval_loss": 3.4580540657043457, "eval_runtime": 146.8189, "eval_samples_per_second": 394.574, "eval_steps_per_second": 6.171, "step": 353951 }, { "epoch": 19.0, "learning_rate": 1.6484232779376355e-05, "loss": 2.7469, "step": 354000 }, { "epoch": 19.06, "learning_rate": 1.560338246520641e-05, "loss": 2.7291, "step": 355000 }, { "epoch": 19.11, "learning_rate": 1.4723413001350636e-05, "loss": 2.7264, "step": 356000 }, { "epoch": 19.16, "learning_rate": 1.384344353749486e-05, "loss": 2.7311, "step": 357000 }, { "epoch": 19.22, "learning_rate": 1.2962593223324915e-05, "loss": 2.7316, "step": 358000 }, { "epoch": 19.27, "learning_rate": 1.208174290915497e-05, "loss": 2.7296, "step": 359000 }, { "epoch": 19.32, "learning_rate": 1.1200892594985025e-05, "loss": 2.7333, "step": 360000 }, { "epoch": 19.38, "learning_rate": 1.032092313112925e-05, "loss": 2.7345, "step": 361000 }, { "epoch": 19.43, "learning_rate": 9.440953667273474e-06, "loss": 2.7319, "step": 362000 }, { "epoch": 19.49, "learning_rate": 8.560103353103529e-06, "loss": 2.7309, "step": 363000 }, { "epoch": 19.54, "learning_rate": 7.679253038933582e-06, "loss": 2.7327, "step": 364000 }, { "epoch": 19.59, "learning_rate": 6.798402724763638e-06, "loss": 2.7342, "step": 365000 }, { "epoch": 19.65, "learning_rate": 5.917552410593693e-06, "loss": 2.7296, "step": 366000 }, { "epoch": 19.7, "learning_rate": 5.037582946737917e-06, "loss": 2.7344, "step": 367000 }, { "epoch": 19.75, "learning_rate": 4.156732632567972e-06, "loss": 2.7325, "step": 368000 }, { "epoch": 19.81, "learning_rate": 3.275882318398027e-06, "loss": 2.7282, "step": 369000 }, { "epoch": 19.86, "learning_rate": 2.395912854542251e-06, "loss": 2.7303, "step": 370000 }, { "epoch": 19.92, "learning_rate": 1.5150625403723059e-06, "loss": 2.7292, "step": 371000 }, { "epoch": 19.97, "learning_rate": 6.342122262023606e-07, "loss": 2.7331, "step": 372000 }, { "epoch": 20.0, "eval_accuracy": 0.40517314741870225, "eval_loss": 3.466306209564209, "eval_runtime": 147.2197, "eval_samples_per_second": 393.5, "eval_steps_per_second": 6.154, "step": 372580 }, { "epoch": 20.0, "step": 372580, "total_flos": 1.56977765434368e+18, "train_loss": 3.0355079814286947, "train_runtime": 47458.051, "train_samples_per_second": 251.219, "train_steps_per_second": 7.851 } ], "logging_steps": 1000, "max_steps": 372580, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.56977765434368e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }