{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 371720, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 9.375e-06, "loss": 6.8593, "step": 1000 }, { "epoch": 0.11, "learning_rate": 1.875e-05, "loss": 5.3677, "step": 2000 }, { "epoch": 0.16, "learning_rate": 2.8125e-05, "loss": 5.0449, "step": 3000 }, { "epoch": 0.22, "learning_rate": 3.75e-05, "loss": 4.821, "step": 4000 }, { "epoch": 0.27, "learning_rate": 4.6874999999999994e-05, "loss": 4.6456, "step": 5000 }, { "epoch": 0.32, "learning_rate": 5.625e-05, "loss": 4.5012, "step": 6000 }, { "epoch": 0.38, "learning_rate": 6.5625e-05, "loss": 4.3871, "step": 7000 }, { "epoch": 0.43, "learning_rate": 7.5e-05, "loss": 4.2893, "step": 8000 }, { "epoch": 0.48, "learning_rate": 8.437499999999999e-05, "loss": 4.2089, "step": 9000 }, { "epoch": 0.54, "learning_rate": 9.374999999999999e-05, "loss": 4.1406, "step": 10000 }, { "epoch": 0.59, "learning_rate": 0.00010312499999999999, "loss": 4.0695, "step": 11000 }, { "epoch": 0.65, "learning_rate": 0.0001125, "loss": 4.0107, "step": 12000 }, { "epoch": 0.7, "learning_rate": 0.00012185624999999998, "loss": 3.9458, "step": 13000 }, { "epoch": 0.75, "learning_rate": 0.000131221875, "loss": 3.8982, "step": 14000 }, { "epoch": 0.81, "learning_rate": 0.000140596875, "loss": 3.8433, "step": 15000 }, { "epoch": 0.86, "learning_rate": 0.000149971875, "loss": 3.8086, "step": 16000 }, { "epoch": 0.91, "learning_rate": 0.00015933749999999996, "loss": 3.7712, "step": 17000 }, { "epoch": 0.97, "learning_rate": 0.00016871249999999996, "loss": 3.7431, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.34807019476750434, "eval_loss": 3.8859505653381348, "eval_runtime": 152.2662, "eval_samples_per_second": 380.386, "eval_steps_per_second": 5.944, "step": 18586 }, { "epoch": 1.02, "learning_rate": 0.000178078125, "loss": 3.698, "step": 19000 }, { "epoch": 1.08, "learning_rate": 0.00018745312499999998, "loss": 3.6713, "step": 20000 }, { "epoch": 1.13, "learning_rate": 0.00019681874999999998, "loss": 3.6498, "step": 21000 }, { "epoch": 1.18, "learning_rate": 0.00020618437499999995, "loss": 3.627, "step": 22000 }, { "epoch": 1.24, "learning_rate": 0.00021555937499999998, "loss": 3.6114, "step": 23000 }, { "epoch": 1.29, "learning_rate": 0.00022493437499999998, "loss": 3.592, "step": 24000 }, { "epoch": 1.35, "learning_rate": 0.00023430937499999997, "loss": 3.5844, "step": 25000 }, { "epoch": 1.4, "learning_rate": 0.00024367499999999997, "loss": 3.5658, "step": 26000 }, { "epoch": 1.45, "learning_rate": 0.00025305, "loss": 3.5562, "step": 27000 }, { "epoch": 1.51, "learning_rate": 0.000262425, "loss": 3.5434, "step": 28000 }, { "epoch": 1.56, "learning_rate": 0.0002718, "loss": 3.528, "step": 29000 }, { "epoch": 1.61, "learning_rate": 0.000281165625, "loss": 3.5166, "step": 30000 }, { "epoch": 1.67, "learning_rate": 0.000290540625, "loss": 3.5017, "step": 31000 }, { "epoch": 1.72, "learning_rate": 0.00029990624999999993, "loss": 3.4968, "step": 32000 }, { "epoch": 1.78, "learning_rate": 0.0002991257506181561, "loss": 3.4872, "step": 33000 }, { "epoch": 1.83, "learning_rate": 0.0002982435535146591, "loss": 3.4725, "step": 34000 }, { "epoch": 1.88, "learning_rate": 0.0002973604733309784, "loss": 3.4562, "step": 35000 }, { "epoch": 1.94, "learning_rate": 0.00029647739314729777, "loss": 3.4495, "step": 36000 }, { "epoch": 1.99, "learning_rate": 0.00029559519604380077, "loss": 3.4382, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.37586823096932304, "eval_loss": 3.62882661819458, "eval_runtime": 152.7372, "eval_samples_per_second": 379.213, "eval_steps_per_second": 5.925, "step": 37172 }, { "epoch": 2.04, "learning_rate": 0.0002947129989403037, "loss": 3.3875, "step": 38000 }, { "epoch": 2.1, "learning_rate": 0.00029382991875662304, "loss": 3.3815, "step": 39000 }, { "epoch": 2.15, "learning_rate": 0.0002929468385729424, "loss": 3.3706, "step": 40000 }, { "epoch": 2.21, "learning_rate": 0.0002920637583892617, "loss": 3.3661, "step": 41000 }, { "epoch": 2.26, "learning_rate": 0.00029118067820558104, "loss": 3.3599, "step": 42000 }, { "epoch": 2.31, "learning_rate": 0.00029029848110208404, "loss": 3.3582, "step": 43000 }, { "epoch": 2.37, "learning_rate": 0.00028941540091840335, "loss": 3.3502, "step": 44000 }, { "epoch": 2.42, "learning_rate": 0.00028853320381490636, "loss": 3.3462, "step": 45000 }, { "epoch": 2.47, "learning_rate": 0.00028765012363122567, "loss": 3.3367, "step": 46000 }, { "epoch": 2.53, "learning_rate": 0.00028676880960791236, "loss": 3.3362, "step": 47000 }, { "epoch": 2.58, "learning_rate": 0.0002858857294242317, "loss": 3.3322, "step": 48000 }, { "epoch": 2.64, "learning_rate": 0.00028500264924055104, "loss": 3.324, "step": 49000 }, { "epoch": 2.69, "learning_rate": 0.00028411956905687036, "loss": 3.3157, "step": 50000 }, { "epoch": 2.74, "learning_rate": 0.00028323737195337336, "loss": 3.3197, "step": 51000 }, { "epoch": 2.8, "learning_rate": 0.0002823542917696927, "loss": 3.3102, "step": 52000 }, { "epoch": 2.85, "learning_rate": 0.000281471211586012, "loss": 3.3061, "step": 53000 }, { "epoch": 2.91, "learning_rate": 0.0002805881314023313, "loss": 3.3007, "step": 54000 }, { "epoch": 2.96, "learning_rate": 0.0002797059342988343, "loss": 3.2958, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.38913569232753614, "eval_loss": 3.5019333362579346, "eval_runtime": 152.7028, "eval_samples_per_second": 379.299, "eval_steps_per_second": 5.927, "step": 55758 }, { "epoch": 3.01, "learning_rate": 0.0002788228541151536, "loss": 3.2785, "step": 56000 }, { "epoch": 3.07, "learning_rate": 0.0002779406570116566, "loss": 3.2343, "step": 57000 }, { "epoch": 3.12, "learning_rate": 0.00027705757682797594, "loss": 3.236, "step": 58000 }, { "epoch": 3.17, "learning_rate": 0.00027617626280466263, "loss": 3.2362, "step": 59000 }, { "epoch": 3.23, "learning_rate": 0.00027529318262098194, "loss": 3.2336, "step": 60000 }, { "epoch": 3.28, "learning_rate": 0.00027441010243730126, "loss": 3.2288, "step": 61000 }, { "epoch": 3.34, "learning_rate": 0.00027352702225362063, "loss": 3.2309, "step": 62000 }, { "epoch": 3.39, "learning_rate": 0.00027264394206993995, "loss": 3.2278, "step": 63000 }, { "epoch": 3.44, "learning_rate": 0.00027176174496644295, "loss": 3.2271, "step": 64000 }, { "epoch": 3.5, "learning_rate": 0.00027087954786294595, "loss": 3.2284, "step": 65000 }, { "epoch": 3.55, "learning_rate": 0.00026999646767926526, "loss": 3.2305, "step": 66000 }, { "epoch": 3.6, "learning_rate": 0.0002691133874955846, "loss": 3.2264, "step": 67000 }, { "epoch": 3.66, "learning_rate": 0.0002682303073119039, "loss": 3.2214, "step": 68000 }, { "epoch": 3.71, "learning_rate": 0.0002673472271282232, "loss": 3.2197, "step": 69000 }, { "epoch": 3.77, "learning_rate": 0.0002664650300247262, "loss": 3.2127, "step": 70000 }, { "epoch": 3.82, "learning_rate": 0.0002655828329212292, "loss": 3.2139, "step": 71000 }, { "epoch": 3.87, "learning_rate": 0.00026469975273754853, "loss": 3.2105, "step": 72000 }, { "epoch": 3.93, "learning_rate": 0.00026381667255386785, "loss": 3.2163, "step": 73000 }, { "epoch": 3.98, "learning_rate": 0.00026293359237018716, "loss": 3.2055, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.3957415434142357, "eval_loss": 3.4540581703186035, "eval_runtime": 152.4456, "eval_samples_per_second": 379.939, "eval_steps_per_second": 5.937, "step": 74344 }, { "epoch": 4.04, "learning_rate": 0.0002620505121865065, "loss": 3.1626, "step": 75000 }, { "epoch": 4.09, "learning_rate": 0.00026116831508300953, "loss": 3.144, "step": 76000 }, { "epoch": 4.14, "learning_rate": 0.00026028523489932885, "loss": 3.1517, "step": 77000 }, { "epoch": 4.2, "learning_rate": 0.00025940215471564816, "loss": 3.15, "step": 78000 }, { "epoch": 4.25, "learning_rate": 0.0002585190745319675, "loss": 3.1589, "step": 79000 }, { "epoch": 4.3, "learning_rate": 0.0002576368774284705, "loss": 3.1527, "step": 80000 }, { "epoch": 4.36, "learning_rate": 0.0002567546803249735, "loss": 3.1576, "step": 81000 }, { "epoch": 4.41, "learning_rate": 0.0002558716001412928, "loss": 3.152, "step": 82000 }, { "epoch": 4.47, "learning_rate": 0.0002549885199576121, "loss": 3.1581, "step": 83000 }, { "epoch": 4.52, "learning_rate": 0.0002541063228541151, "loss": 3.1541, "step": 84000 }, { "epoch": 4.57, "learning_rate": 0.0002532232426704345, "loss": 3.1546, "step": 85000 }, { "epoch": 4.63, "learning_rate": 0.0002523401624867538, "loss": 3.1533, "step": 86000 }, { "epoch": 4.68, "learning_rate": 0.0002514570823030731, "loss": 3.1549, "step": 87000 }, { "epoch": 4.73, "learning_rate": 0.00025057576827975975, "loss": 3.152, "step": 88000 }, { "epoch": 4.79, "learning_rate": 0.0002496926880960791, "loss": 3.1516, "step": 89000 }, { "epoch": 4.84, "learning_rate": 0.00024880960791239844, "loss": 3.1563, "step": 90000 }, { "epoch": 4.9, "learning_rate": 0.00024792652772871775, "loss": 3.1519, "step": 91000 }, { "epoch": 4.95, "learning_rate": 0.00024704433062522075, "loss": 3.1533, "step": 92000 }, { "epoch": 5.0, "eval_accuracy": 0.40060643151965947, "eval_loss": 3.402815103530884, "eval_runtime": 152.6189, "eval_samples_per_second": 379.507, "eval_steps_per_second": 5.93, "step": 92930 }, { "epoch": 5.0, "learning_rate": 0.00024616125044154007, "loss": 3.1452, "step": 93000 }, { "epoch": 5.06, "learning_rate": 0.0002452781702578594, "loss": 3.0832, "step": 94000 }, { "epoch": 5.11, "learning_rate": 0.0002443959731543624, "loss": 3.0868, "step": 95000 }, { "epoch": 5.17, "learning_rate": 0.00024351289297068173, "loss": 3.092, "step": 96000 }, { "epoch": 5.22, "learning_rate": 0.00024262981278700104, "loss": 3.0954, "step": 97000 }, { "epoch": 5.27, "learning_rate": 0.00024174761568350402, "loss": 3.098, "step": 98000 }, { "epoch": 5.33, "learning_rate": 0.00024086541858000705, "loss": 3.1, "step": 99000 }, { "epoch": 5.38, "learning_rate": 0.00023998233839632636, "loss": 3.1025, "step": 100000 }, { "epoch": 5.43, "learning_rate": 0.00023909925821264568, "loss": 3.0996, "step": 101000 }, { "epoch": 5.49, "learning_rate": 0.00023821706110914868, "loss": 3.1028, "step": 102000 }, { "epoch": 5.54, "learning_rate": 0.00023733398092546802, "loss": 3.1036, "step": 103000 }, { "epoch": 5.6, "learning_rate": 0.00023645090074178734, "loss": 3.1031, "step": 104000 }, { "epoch": 5.65, "learning_rate": 0.00023556782055810665, "loss": 3.1023, "step": 105000 }, { "epoch": 5.7, "learning_rate": 0.00023468474037442597, "loss": 3.1053, "step": 106000 }, { "epoch": 5.76, "learning_rate": 0.00023380254327092897, "loss": 3.1094, "step": 107000 }, { "epoch": 5.81, "learning_rate": 0.00023291946308724831, "loss": 3.1036, "step": 108000 }, { "epoch": 5.86, "learning_rate": 0.00023203638290356763, "loss": 3.1038, "step": 109000 }, { "epoch": 5.92, "learning_rate": 0.00023115418580007063, "loss": 3.1039, "step": 110000 }, { "epoch": 5.97, "learning_rate": 0.00023027110561638995, "loss": 3.1056, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.40350113936840293, "eval_loss": 3.380485773086548, "eval_runtime": 152.853, "eval_samples_per_second": 378.926, "eval_steps_per_second": 5.921, "step": 111516 }, { "epoch": 6.03, "learning_rate": 0.00022938802543270926, "loss": 3.0752, "step": 112000 }, { "epoch": 6.08, "learning_rate": 0.0002285058283292123, "loss": 3.034, "step": 113000 }, { "epoch": 6.13, "learning_rate": 0.00022762363122571526, "loss": 3.0444, "step": 114000 }, { "epoch": 6.19, "learning_rate": 0.00022674055104203458, "loss": 3.0518, "step": 115000 }, { "epoch": 6.24, "learning_rate": 0.0002258574708583539, "loss": 3.0495, "step": 116000 }, { "epoch": 6.3, "learning_rate": 0.00022497527375485692, "loss": 3.0551, "step": 117000 }, { "epoch": 6.35, "learning_rate": 0.00022409219357117624, "loss": 3.0553, "step": 118000 }, { "epoch": 6.4, "learning_rate": 0.00022320911338749556, "loss": 3.054, "step": 119000 }, { "epoch": 6.46, "learning_rate": 0.00022232603320381487, "loss": 3.0601, "step": 120000 }, { "epoch": 6.51, "learning_rate": 0.00022144383610031787, "loss": 3.0595, "step": 121000 }, { "epoch": 6.56, "learning_rate": 0.00022056075591663722, "loss": 3.0641, "step": 122000 }, { "epoch": 6.62, "learning_rate": 0.00021967767573295653, "loss": 3.0614, "step": 123000 }, { "epoch": 6.67, "learning_rate": 0.00021879547862945953, "loss": 3.0655, "step": 124000 }, { "epoch": 6.73, "learning_rate": 0.00021791239844577885, "loss": 3.0667, "step": 125000 }, { "epoch": 6.78, "learning_rate": 0.00021703020134228188, "loss": 3.0629, "step": 126000 }, { "epoch": 6.83, "learning_rate": 0.0002161471211586012, "loss": 3.0646, "step": 127000 }, { "epoch": 6.89, "learning_rate": 0.0002152640409749205, "loss": 3.0681, "step": 128000 }, { "epoch": 6.94, "learning_rate": 0.00021438096079123983, "loss": 3.0668, "step": 129000 }, { "epoch": 6.99, "learning_rate": 0.00021349876368774283, "loss": 3.0671, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.40417266805683943, "eval_loss": 3.38333797454834, "eval_runtime": 152.8266, "eval_samples_per_second": 378.992, "eval_steps_per_second": 5.922, "step": 130102 }, { "epoch": 7.05, "learning_rate": 0.00021261656658424583, "loss": 3.0056, "step": 131000 }, { "epoch": 7.1, "learning_rate": 0.00021173348640056514, "loss": 3.004, "step": 132000 }, { "epoch": 7.16, "learning_rate": 0.00021085128929706814, "loss": 3.0074, "step": 133000 }, { "epoch": 7.21, "learning_rate": 0.00020996820911338746, "loss": 3.0123, "step": 134000 }, { "epoch": 7.26, "learning_rate": 0.0002090851289297068, "loss": 3.0156, "step": 135000 }, { "epoch": 7.32, "learning_rate": 0.0002082029318262098, "loss": 3.0192, "step": 136000 }, { "epoch": 7.37, "learning_rate": 0.00020731985164252912, "loss": 3.0201, "step": 137000 }, { "epoch": 7.42, "learning_rate": 0.00020643765453903212, "loss": 3.0221, "step": 138000 }, { "epoch": 7.48, "learning_rate": 0.00020555457435535144, "loss": 3.0277, "step": 139000 }, { "epoch": 7.53, "learning_rate": 0.00020467149417167078, "loss": 3.0262, "step": 140000 }, { "epoch": 7.59, "learning_rate": 0.00020378929706817378, "loss": 3.0283, "step": 141000 }, { "epoch": 7.64, "learning_rate": 0.0002029062168844931, "loss": 3.028, "step": 142000 }, { "epoch": 7.69, "learning_rate": 0.00020202401978099607, "loss": 3.0256, "step": 143000 }, { "epoch": 7.75, "learning_rate": 0.00020114093959731544, "loss": 3.0298, "step": 144000 }, { "epoch": 7.8, "learning_rate": 0.00020025874249381841, "loss": 3.0327, "step": 145000 }, { "epoch": 7.86, "learning_rate": 0.00019937566231013773, "loss": 3.0332, "step": 146000 }, { "epoch": 7.91, "learning_rate": 0.00019849258212645705, "loss": 3.0385, "step": 147000 }, { "epoch": 7.96, "learning_rate": 0.00019761038502296007, "loss": 3.0331, "step": 148000 }, { "epoch": 8.0, "eval_accuracy": 0.40691104864888106, "eval_loss": 3.366875410079956, "eval_runtime": 152.6213, "eval_samples_per_second": 379.501, "eval_steps_per_second": 5.93, "step": 148688 }, { "epoch": 8.02, "learning_rate": 0.00019672818791946308, "loss": 3.0125, "step": 149000 }, { "epoch": 8.07, "learning_rate": 0.0001958451077357824, "loss": 2.9641, "step": 150000 }, { "epoch": 8.12, "learning_rate": 0.0001949620275521017, "loss": 2.9757, "step": 151000 }, { "epoch": 8.18, "learning_rate": 0.00019407894736842102, "loss": 2.9761, "step": 152000 }, { "epoch": 8.23, "learning_rate": 0.00019319586718474037, "loss": 2.978, "step": 153000 }, { "epoch": 8.29, "learning_rate": 0.00019231367008124337, "loss": 2.9905, "step": 154000 }, { "epoch": 8.34, "learning_rate": 0.00019143058989756268, "loss": 2.9886, "step": 155000 }, { "epoch": 8.39, "learning_rate": 0.00019054839279406568, "loss": 2.9935, "step": 156000 }, { "epoch": 8.45, "learning_rate": 0.00018966531261038503, "loss": 2.9913, "step": 157000 }, { "epoch": 8.5, "learning_rate": 0.00018878223242670434, "loss": 2.9943, "step": 158000 }, { "epoch": 8.55, "learning_rate": 0.00018790003532320734, "loss": 2.9973, "step": 159000 }, { "epoch": 8.61, "learning_rate": 0.00018701695513952666, "loss": 2.9986, "step": 160000 }, { "epoch": 8.66, "learning_rate": 0.00018613475803602963, "loss": 3.0023, "step": 161000 }, { "epoch": 8.72, "learning_rate": 0.00018525167785234898, "loss": 2.9989, "step": 162000 }, { "epoch": 8.77, "learning_rate": 0.00018436948074885198, "loss": 3.0, "step": 163000 }, { "epoch": 8.82, "learning_rate": 0.0001834864005651713, "loss": 3.0023, "step": 164000 }, { "epoch": 8.88, "learning_rate": 0.0001826033203814906, "loss": 3.0023, "step": 165000 }, { "epoch": 8.93, "learning_rate": 0.00018172024019780993, "loss": 3.0039, "step": 166000 }, { "epoch": 8.99, "learning_rate": 0.00018083804309431295, "loss": 3.0072, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.408285882043124, "eval_loss": 3.3615922927856445, "eval_runtime": 152.6155, "eval_samples_per_second": 379.516, "eval_steps_per_second": 5.93, "step": 167274 }, { "epoch": 9.04, "learning_rate": 0.00017995496291063227, "loss": 2.9528, "step": 168000 }, { "epoch": 9.09, "learning_rate": 0.00017907276580713527, "loss": 2.9442, "step": 169000 }, { "epoch": 9.15, "learning_rate": 0.0001781896856234546, "loss": 2.9515, "step": 170000 }, { "epoch": 9.2, "learning_rate": 0.00017730748851995762, "loss": 2.9502, "step": 171000 }, { "epoch": 9.25, "learning_rate": 0.00017642440833627693, "loss": 2.9558, "step": 172000 }, { "epoch": 9.31, "learning_rate": 0.00017554132815259625, "loss": 2.9566, "step": 173000 }, { "epoch": 9.36, "learning_rate": 0.00017465913104909922, "loss": 2.9635, "step": 174000 }, { "epoch": 9.42, "learning_rate": 0.0001737760508654186, "loss": 2.9669, "step": 175000 }, { "epoch": 9.47, "learning_rate": 0.0001728929706817379, "loss": 2.967, "step": 176000 }, { "epoch": 9.52, "learning_rate": 0.00017201165665842457, "loss": 2.9745, "step": 177000 }, { "epoch": 9.58, "learning_rate": 0.00017112857647474388, "loss": 2.9685, "step": 178000 }, { "epoch": 9.63, "learning_rate": 0.0001702463793712469, "loss": 2.9699, "step": 179000 }, { "epoch": 9.68, "learning_rate": 0.00016936329918756623, "loss": 2.972, "step": 180000 }, { "epoch": 9.74, "learning_rate": 0.00016848021900388554, "loss": 2.9706, "step": 181000 }, { "epoch": 9.79, "learning_rate": 0.00016759713882020486, "loss": 2.9777, "step": 182000 }, { "epoch": 9.85, "learning_rate": 0.00016671494171670786, "loss": 2.9724, "step": 183000 }, { "epoch": 9.9, "learning_rate": 0.0001658318615330272, "loss": 2.9779, "step": 184000 }, { "epoch": 9.95, "learning_rate": 0.0001649496644295302, "loss": 2.9771, "step": 185000 }, { "epoch": 10.0, "eval_accuracy": 0.4078330234107961, "eval_loss": 3.3777239322662354, "eval_runtime": 152.9968, "eval_samples_per_second": 378.57, "eval_steps_per_second": 5.915, "step": 185860 }, { "epoch": 10.01, "learning_rate": 0.00016406658424584952, "loss": 2.9716, "step": 186000 }, { "epoch": 10.06, "learning_rate": 0.0001631843871423525, "loss": 2.9114, "step": 187000 }, { "epoch": 10.12, "learning_rate": 0.0001623013069586718, "loss": 2.9208, "step": 188000 }, { "epoch": 10.17, "learning_rate": 0.00016141822677499118, "loss": 2.9265, "step": 189000 }, { "epoch": 10.22, "learning_rate": 0.0001605351465913105, "loss": 2.9286, "step": 190000 }, { "epoch": 10.28, "learning_rate": 0.00015965294948781347, "loss": 2.9316, "step": 191000 }, { "epoch": 10.33, "learning_rate": 0.00015876986930413278, "loss": 2.9347, "step": 192000 }, { "epoch": 10.38, "learning_rate": 0.0001578876722006358, "loss": 2.9384, "step": 193000 }, { "epoch": 10.44, "learning_rate": 0.00015700459201695513, "loss": 2.9406, "step": 194000 }, { "epoch": 10.49, "learning_rate": 0.00015612239491345813, "loss": 2.9389, "step": 195000 }, { "epoch": 10.55, "learning_rate": 0.00015523931472977745, "loss": 2.944, "step": 196000 }, { "epoch": 10.6, "learning_rate": 0.00015435623454609676, "loss": 2.9508, "step": 197000 }, { "epoch": 10.65, "learning_rate": 0.0001534731543624161, "loss": 2.9459, "step": 198000 }, { "epoch": 10.71, "learning_rate": 0.00015259007417873542, "loss": 2.9463, "step": 199000 }, { "epoch": 10.76, "learning_rate": 0.00015170699399505474, "loss": 2.9486, "step": 200000 }, { "epoch": 10.81, "learning_rate": 0.00015082479689155774, "loss": 2.9517, "step": 201000 }, { "epoch": 10.87, "learning_rate": 0.00014994171670787705, "loss": 2.9533, "step": 202000 }, { "epoch": 10.92, "learning_rate": 0.00014905951960438005, "loss": 2.9536, "step": 203000 }, { "epoch": 10.98, "learning_rate": 0.0001481764394206994, "loss": 2.9534, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.4088908356084872, "eval_loss": 3.374073028564453, "eval_runtime": 152.6907, "eval_samples_per_second": 379.329, "eval_steps_per_second": 5.927, "step": 204446 }, { "epoch": 11.03, "learning_rate": 0.00014729335923701871, "loss": 2.9188, "step": 205000 }, { "epoch": 11.08, "learning_rate": 0.00014641116213352171, "loss": 2.8961, "step": 206000 }, { "epoch": 11.14, "learning_rate": 0.00014552808194984103, "loss": 2.9004, "step": 207000 }, { "epoch": 11.19, "learning_rate": 0.00014464588484634403, "loss": 2.9005, "step": 208000 }, { "epoch": 11.25, "learning_rate": 0.00014376280466266335, "loss": 2.9037, "step": 209000 }, { "epoch": 11.3, "learning_rate": 0.00014287972447898266, "loss": 2.9129, "step": 210000 }, { "epoch": 11.35, "learning_rate": 0.000141996644295302, "loss": 2.9132, "step": 211000 }, { "epoch": 11.41, "learning_rate": 0.000141114447191805, "loss": 2.9156, "step": 212000 }, { "epoch": 11.46, "learning_rate": 0.000140232250088308, "loss": 2.9192, "step": 213000 }, { "epoch": 11.51, "learning_rate": 0.00013934916990462732, "loss": 2.9185, "step": 214000 }, { "epoch": 11.57, "learning_rate": 0.00013846608972094667, "loss": 2.9243, "step": 215000 }, { "epoch": 11.62, "learning_rate": 0.00013758300953726598, "loss": 2.9236, "step": 216000 }, { "epoch": 11.68, "learning_rate": 0.00013670081243376898, "loss": 2.9244, "step": 217000 }, { "epoch": 11.73, "learning_rate": 0.0001358177322500883, "loss": 2.9289, "step": 218000 }, { "epoch": 11.78, "learning_rate": 0.00013493465206640762, "loss": 2.928, "step": 219000 }, { "epoch": 11.84, "learning_rate": 0.0001340533380430943, "loss": 2.9268, "step": 220000 }, { "epoch": 11.89, "learning_rate": 0.00013317025785941362, "loss": 2.9329, "step": 221000 }, { "epoch": 11.94, "learning_rate": 0.00013228717767573293, "loss": 2.9315, "step": 222000 }, { "epoch": 12.0, "learning_rate": 0.00013140498057223594, "loss": 2.9279, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.40917695412429866, "eval_loss": 3.384521484375, "eval_runtime": 152.8405, "eval_samples_per_second": 378.957, "eval_steps_per_second": 5.921, "step": 223032 }, { "epoch": 12.05, "learning_rate": 0.00013052190038855528, "loss": 2.8706, "step": 224000 }, { "epoch": 12.11, "learning_rate": 0.0001296388202048746, "loss": 2.8764, "step": 225000 }, { "epoch": 12.16, "learning_rate": 0.0001287557400211939, "loss": 2.8801, "step": 226000 }, { "epoch": 12.21, "learning_rate": 0.00012787265983751323, "loss": 2.8853, "step": 227000 }, { "epoch": 12.27, "learning_rate": 0.00012699046273401623, "loss": 2.8877, "step": 228000 }, { "epoch": 12.32, "learning_rate": 0.00012610738255033557, "loss": 2.8905, "step": 229000 }, { "epoch": 12.37, "learning_rate": 0.00012522430236665489, "loss": 2.8928, "step": 230000 }, { "epoch": 12.43, "learning_rate": 0.0001243421052631579, "loss": 2.898, "step": 231000 }, { "epoch": 12.48, "learning_rate": 0.0001234590250794772, "loss": 2.8982, "step": 232000 }, { "epoch": 12.54, "learning_rate": 0.0001225768279759802, "loss": 2.8923, "step": 233000 }, { "epoch": 12.59, "learning_rate": 0.00012169374779229953, "loss": 2.9047, "step": 234000 }, { "epoch": 12.64, "learning_rate": 0.00012081066760861885, "loss": 2.9063, "step": 235000 }, { "epoch": 12.7, "learning_rate": 0.00011992758742493818, "loss": 2.9029, "step": 236000 }, { "epoch": 12.75, "learning_rate": 0.00011904539032144117, "loss": 2.9057, "step": 237000 }, { "epoch": 12.81, "learning_rate": 0.0001181623101377605, "loss": 2.9073, "step": 238000 }, { "epoch": 12.86, "learning_rate": 0.00011727922995407981, "loss": 2.9081, "step": 239000 }, { "epoch": 12.91, "learning_rate": 0.00011639614977039916, "loss": 2.9086, "step": 240000 }, { "epoch": 12.97, "learning_rate": 0.00011551483574708583, "loss": 2.9063, "step": 241000 }, { "epoch": 13.0, "eval_accuracy": 0.4105062396543199, "eval_loss": 3.36887264251709, "eval_runtime": 152.9632, "eval_samples_per_second": 378.653, "eval_steps_per_second": 5.916, "step": 241618 }, { "epoch": 13.02, "learning_rate": 0.00011463175556340516, "loss": 2.8865, "step": 242000 }, { "epoch": 13.07, "learning_rate": 0.00011374867537972447, "loss": 2.8543, "step": 243000 }, { "epoch": 13.13, "learning_rate": 0.00011286647827622747, "loss": 2.8563, "step": 244000 }, { "epoch": 13.18, "learning_rate": 0.00011198339809254679, "loss": 2.8637, "step": 245000 }, { "epoch": 13.24, "learning_rate": 0.0001111003179088661, "loss": 2.866, "step": 246000 }, { "epoch": 13.29, "learning_rate": 0.00011021812080536912, "loss": 2.8713, "step": 247000 }, { "epoch": 13.34, "learning_rate": 0.00010933504062168844, "loss": 2.8737, "step": 248000 }, { "epoch": 13.4, "learning_rate": 0.00010845196043800777, "loss": 2.8684, "step": 249000 }, { "epoch": 13.45, "learning_rate": 0.00010756888025432708, "loss": 2.8772, "step": 250000 }, { "epoch": 13.5, "learning_rate": 0.00010668756623101377, "loss": 2.8758, "step": 251000 }, { "epoch": 13.56, "learning_rate": 0.00010580448604733308, "loss": 2.8799, "step": 252000 }, { "epoch": 13.61, "learning_rate": 0.0001049214058636524, "loss": 2.8829, "step": 253000 }, { "epoch": 13.67, "learning_rate": 0.00010403920876015541, "loss": 2.8794, "step": 254000 }, { "epoch": 13.72, "learning_rate": 0.00010315612857647473, "loss": 2.8887, "step": 255000 }, { "epoch": 13.77, "learning_rate": 0.00010227393147297774, "loss": 2.8846, "step": 256000 }, { "epoch": 13.83, "learning_rate": 0.00010139085128929706, "loss": 2.8851, "step": 257000 }, { "epoch": 13.88, "learning_rate": 0.00010050777110561639, "loss": 2.8858, "step": 258000 }, { "epoch": 13.94, "learning_rate": 9.962557400211938e-05, "loss": 2.8897, "step": 259000 }, { "epoch": 13.99, "learning_rate": 9.874249381843871e-05, "loss": 2.8913, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.4104661333490982, "eval_loss": 3.371145486831665, "eval_runtime": 152.7732, "eval_samples_per_second": 379.124, "eval_steps_per_second": 5.924, "step": 260204 }, { "epoch": 14.04, "learning_rate": 9.786029671494171e-05, "loss": 2.839, "step": 261000 }, { "epoch": 14.1, "learning_rate": 9.697721653126104e-05, "loss": 2.8365, "step": 262000 }, { "epoch": 14.15, "learning_rate": 9.609501942776402e-05, "loss": 2.8441, "step": 263000 }, { "epoch": 14.2, "learning_rate": 9.521282232426704e-05, "loss": 2.8514, "step": 264000 }, { "epoch": 14.26, "learning_rate": 9.432974214058636e-05, "loss": 2.8488, "step": 265000 }, { "epoch": 14.31, "learning_rate": 9.344666195690567e-05, "loss": 2.8537, "step": 266000 }, { "epoch": 14.37, "learning_rate": 9.2563581773225e-05, "loss": 2.8525, "step": 267000 }, { "epoch": 14.42, "learning_rate": 9.168050158954432e-05, "loss": 2.8573, "step": 268000 }, { "epoch": 14.47, "learning_rate": 9.079830448604733e-05, "loss": 2.8588, "step": 269000 }, { "epoch": 14.53, "learning_rate": 8.991522430236665e-05, "loss": 2.8604, "step": 270000 }, { "epoch": 14.58, "learning_rate": 8.903302719886966e-05, "loss": 2.8599, "step": 271000 }, { "epoch": 14.63, "learning_rate": 8.814994701518898e-05, "loss": 2.8603, "step": 272000 }, { "epoch": 14.69, "learning_rate": 8.726686683150828e-05, "loss": 2.8633, "step": 273000 }, { "epoch": 14.74, "learning_rate": 8.638378664782762e-05, "loss": 2.8652, "step": 274000 }, { "epoch": 14.8, "learning_rate": 8.550247262451431e-05, "loss": 2.865, "step": 275000 }, { "epoch": 14.85, "learning_rate": 8.461939244083362e-05, "loss": 2.865, "step": 276000 }, { "epoch": 14.9, "learning_rate": 8.373719533733661e-05, "loss": 2.8662, "step": 277000 }, { "epoch": 14.96, "learning_rate": 8.285411515365594e-05, "loss": 2.8704, "step": 278000 }, { "epoch": 15.0, "eval_accuracy": 0.4098972553045123, "eval_loss": 3.3779125213623047, "eval_runtime": 152.7549, "eval_samples_per_second": 379.169, "eval_steps_per_second": 5.925, "step": 278790 }, { "epoch": 15.01, "learning_rate": 8.197103496997526e-05, "loss": 2.8583, "step": 279000 }, { "epoch": 15.07, "learning_rate": 8.10879547862946e-05, "loss": 2.8214, "step": 280000 }, { "epoch": 15.12, "learning_rate": 8.020575768279759e-05, "loss": 2.8237, "step": 281000 }, { "epoch": 15.17, "learning_rate": 7.932267749911692e-05, "loss": 2.8222, "step": 282000 }, { "epoch": 15.23, "learning_rate": 7.844048039561992e-05, "loss": 2.8321, "step": 283000 }, { "epoch": 15.28, "learning_rate": 7.755740021193923e-05, "loss": 2.8329, "step": 284000 }, { "epoch": 15.33, "learning_rate": 7.667432002825856e-05, "loss": 2.8337, "step": 285000 }, { "epoch": 15.39, "learning_rate": 7.579212292476155e-05, "loss": 2.8375, "step": 286000 }, { "epoch": 15.44, "learning_rate": 7.490992582126457e-05, "loss": 2.8415, "step": 287000 }, { "epoch": 15.5, "learning_rate": 7.40268456375839e-05, "loss": 2.8366, "step": 288000 }, { "epoch": 15.55, "learning_rate": 7.314376545390321e-05, "loss": 2.8427, "step": 289000 }, { "epoch": 15.6, "learning_rate": 7.226068527022253e-05, "loss": 2.8424, "step": 290000 }, { "epoch": 15.66, "learning_rate": 7.137848816672553e-05, "loss": 2.8453, "step": 291000 }, { "epoch": 15.71, "learning_rate": 7.049540798304486e-05, "loss": 2.8451, "step": 292000 }, { "epoch": 15.76, "learning_rate": 6.961321087954786e-05, "loss": 2.8489, "step": 293000 }, { "epoch": 15.82, "learning_rate": 6.873101377605086e-05, "loss": 2.8442, "step": 294000 }, { "epoch": 15.87, "learning_rate": 6.784793359237018e-05, "loss": 2.844, "step": 295000 }, { "epoch": 15.93, "learning_rate": 6.69648534086895e-05, "loss": 2.8475, "step": 296000 }, { "epoch": 15.98, "learning_rate": 6.608177322500882e-05, "loss": 2.8491, "step": 297000 }, { "epoch": 16.0, "eval_accuracy": 0.4111809257905712, "eval_loss": 3.376009464263916, "eval_runtime": 153.0344, "eval_samples_per_second": 378.477, "eval_steps_per_second": 5.914, "step": 297376 }, { "epoch": 16.03, "learning_rate": 6.519957612151182e-05, "loss": 2.8234, "step": 298000 }, { "epoch": 16.09, "learning_rate": 6.431649593783115e-05, "loss": 2.804, "step": 299000 }, { "epoch": 16.14, "learning_rate": 6.343429883433415e-05, "loss": 2.8093, "step": 300000 }, { "epoch": 16.19, "learning_rate": 6.255121865065347e-05, "loss": 2.8146, "step": 301000 }, { "epoch": 16.25, "learning_rate": 6.16681384669728e-05, "loss": 2.8127, "step": 302000 }, { "epoch": 16.3, "learning_rate": 6.07859413634758e-05, "loss": 2.8193, "step": 303000 }, { "epoch": 16.36, "learning_rate": 5.990286117979512e-05, "loss": 2.8189, "step": 304000 }, { "epoch": 16.41, "learning_rate": 5.901978099611444e-05, "loss": 2.8211, "step": 305000 }, { "epoch": 16.46, "learning_rate": 5.813670081243376e-05, "loss": 2.817, "step": 306000 }, { "epoch": 16.52, "learning_rate": 5.725450370893676e-05, "loss": 2.8202, "step": 307000 }, { "epoch": 16.57, "learning_rate": 5.6371423525256084e-05, "loss": 2.8212, "step": 308000 }, { "epoch": 16.63, "learning_rate": 5.5488343341575414e-05, "loss": 2.8274, "step": 309000 }, { "epoch": 16.68, "learning_rate": 5.4606146238078415e-05, "loss": 2.8263, "step": 310000 }, { "epoch": 16.73, "learning_rate": 5.372306605439774e-05, "loss": 2.8244, "step": 311000 }, { "epoch": 16.79, "learning_rate": 5.2839985870717054e-05, "loss": 2.8305, "step": 312000 }, { "epoch": 16.84, "learning_rate": 5.195778876722006e-05, "loss": 2.8253, "step": 313000 }, { "epoch": 16.89, "learning_rate": 5.107470858353938e-05, "loss": 2.831, "step": 314000 }, { "epoch": 16.95, "learning_rate": 5.019251148004238e-05, "loss": 2.83, "step": 315000 }, { "epoch": 17.0, "eval_accuracy": 0.4115711057247888, "eval_loss": 3.375187397003174, "eval_runtime": 152.7823, "eval_samples_per_second": 379.101, "eval_steps_per_second": 5.923, "step": 315962 }, { "epoch": 17.0, "learning_rate": 4.93094312963617e-05, "loss": 2.8325, "step": 316000 }, { "epoch": 17.06, "learning_rate": 4.842635111268103e-05, "loss": 2.7924, "step": 317000 }, { "epoch": 17.11, "learning_rate": 4.754415400918403e-05, "loss": 2.793, "step": 318000 }, { "epoch": 17.16, "learning_rate": 4.6661073825503354e-05, "loss": 2.7973, "step": 319000 }, { "epoch": 17.22, "learning_rate": 4.5778876722006355e-05, "loss": 2.7998, "step": 320000 }, { "epoch": 17.27, "learning_rate": 4.489579653832568e-05, "loss": 2.803, "step": 321000 }, { "epoch": 17.32, "learning_rate": 4.4012716354644994e-05, "loss": 2.8033, "step": 322000 }, { "epoch": 17.38, "learning_rate": 4.3130519251147995e-05, "loss": 2.8002, "step": 323000 }, { "epoch": 17.43, "learning_rate": 4.224743906746732e-05, "loss": 2.8042, "step": 324000 }, { "epoch": 17.49, "learning_rate": 4.1365241963970325e-05, "loss": 2.8048, "step": 325000 }, { "epoch": 17.54, "learning_rate": 4.048216178028965e-05, "loss": 2.806, "step": 326000 }, { "epoch": 17.59, "learning_rate": 3.959908159660897e-05, "loss": 2.8035, "step": 327000 }, { "epoch": 17.65, "learning_rate": 3.871688449311197e-05, "loss": 2.8083, "step": 328000 }, { "epoch": 17.7, "learning_rate": 3.7833804309431294e-05, "loss": 2.8089, "step": 329000 }, { "epoch": 17.76, "learning_rate": 3.6951607205934295e-05, "loss": 2.8063, "step": 330000 }, { "epoch": 17.81, "learning_rate": 3.6069410102437296e-05, "loss": 2.8133, "step": 331000 }, { "epoch": 17.86, "learning_rate": 3.518632991875662e-05, "loss": 2.807, "step": 332000 }, { "epoch": 17.92, "learning_rate": 3.430324973507594e-05, "loss": 2.8097, "step": 333000 }, { "epoch": 17.97, "learning_rate": 3.3420169551395265e-05, "loss": 2.8136, "step": 334000 }, { "epoch": 18.0, "eval_accuracy": 0.41081936442590883, "eval_loss": 3.3912463188171387, "eval_runtime": 153.0224, "eval_samples_per_second": 378.507, "eval_steps_per_second": 5.914, "step": 334548 }, { "epoch": 18.02, "learning_rate": 3.2537972447898265e-05, "loss": 2.7965, "step": 335000 }, { "epoch": 18.08, "learning_rate": 3.165489226421759e-05, "loss": 2.7795, "step": 336000 }, { "epoch": 18.13, "learning_rate": 3.077181208053691e-05, "loss": 2.7846, "step": 337000 }, { "epoch": 18.19, "learning_rate": 2.9889614977039912e-05, "loss": 2.7852, "step": 338000 }, { "epoch": 18.24, "learning_rate": 2.9006534793359235e-05, "loss": 2.7818, "step": 339000 }, { "epoch": 18.29, "learning_rate": 2.8123454609678557e-05, "loss": 2.7855, "step": 340000 }, { "epoch": 18.35, "learning_rate": 2.724125750618156e-05, "loss": 2.7881, "step": 341000 }, { "epoch": 18.4, "learning_rate": 2.635817732250088e-05, "loss": 2.7901, "step": 342000 }, { "epoch": 18.45, "learning_rate": 2.5475097138820204e-05, "loss": 2.7895, "step": 343000 }, { "epoch": 18.51, "learning_rate": 2.4592016955139527e-05, "loss": 2.787, "step": 344000 }, { "epoch": 18.56, "learning_rate": 2.3709819851642528e-05, "loss": 2.7907, "step": 345000 }, { "epoch": 18.62, "learning_rate": 2.282762274814553e-05, "loss": 2.7957, "step": 346000 }, { "epoch": 18.67, "learning_rate": 2.194454256446485e-05, "loss": 2.7935, "step": 347000 }, { "epoch": 18.72, "learning_rate": 2.1061462380784174e-05, "loss": 2.794, "step": 348000 }, { "epoch": 18.78, "learning_rate": 2.0179265277287175e-05, "loss": 2.7885, "step": 349000 }, { "epoch": 18.83, "learning_rate": 1.9296185093606498e-05, "loss": 2.7946, "step": 350000 }, { "epoch": 18.89, "learning_rate": 1.8413987990109502e-05, "loss": 2.7905, "step": 351000 }, { "epoch": 18.94, "learning_rate": 1.7531790886612503e-05, "loss": 2.7904, "step": 352000 }, { "epoch": 18.99, "learning_rate": 1.6648710702931826e-05, "loss": 2.7924, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.41071033170668786, "eval_loss": 3.3984034061431885, "eval_runtime": 153.8265, "eval_samples_per_second": 376.528, "eval_steps_per_second": 5.883, "step": 353134 }, { "epoch": 19.05, "learning_rate": 1.5765630519251145e-05, "loss": 2.7766, "step": 354000 }, { "epoch": 19.1, "learning_rate": 1.4882550335570468e-05, "loss": 2.7713, "step": 355000 }, { "epoch": 19.15, "learning_rate": 1.4000353232073472e-05, "loss": 2.7724, "step": 356000 }, { "epoch": 19.21, "learning_rate": 1.3117273048392793e-05, "loss": 2.7711, "step": 357000 }, { "epoch": 19.26, "learning_rate": 1.2234192864712116e-05, "loss": 2.7774, "step": 358000 }, { "epoch": 19.32, "learning_rate": 1.1351995761215117e-05, "loss": 2.777, "step": 359000 }, { "epoch": 19.37, "learning_rate": 1.046891557753444e-05, "loss": 2.778, "step": 360000 }, { "epoch": 19.42, "learning_rate": 9.58583539385376e-06, "loss": 2.7765, "step": 361000 }, { "epoch": 19.48, "learning_rate": 8.703638290356763e-06, "loss": 2.7765, "step": 362000 }, { "epoch": 19.53, "learning_rate": 7.820558106676086e-06, "loss": 2.7713, "step": 363000 }, { "epoch": 19.58, "learning_rate": 6.938361003179088e-06, "loss": 2.7701, "step": 364000 }, { "epoch": 19.64, "learning_rate": 6.05528081949841e-06, "loss": 2.7758, "step": 365000 }, { "epoch": 19.69, "learning_rate": 5.172200635817732e-06, "loss": 2.7754, "step": 366000 }, { "epoch": 19.75, "learning_rate": 4.290003532320734e-06, "loss": 2.7755, "step": 367000 }, { "epoch": 19.8, "learning_rate": 3.406923348640056e-06, "loss": 2.7752, "step": 368000 }, { "epoch": 19.85, "learning_rate": 2.523843164959378e-06, "loss": 2.7719, "step": 369000 }, { "epoch": 19.91, "learning_rate": 1.6407629812787e-06, "loss": 2.7787, "step": 370000 }, { "epoch": 19.96, "learning_rate": 7.576827975980219e-07, "loss": 2.7792, "step": 371000 }, { "epoch": 20.0, "eval_accuracy": 0.4106218559881334, "eval_loss": 3.404461622238159, "eval_runtime": 153.6422, "eval_samples_per_second": 376.98, "eval_steps_per_second": 5.89, "step": 371720 }, { "epoch": 20.0, "step": 371720, "total_flos": 1.56614628708864e+18, "train_loss": 3.0647617085172687, "train_runtime": 80733.64, "train_samples_per_second": 147.334, "train_steps_per_second": 4.604 } ], "logging_steps": 1000, "max_steps": 371720, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.56614628708864e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }