{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 372000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 3.125e-05, "loss": 6.2307, "step": 1000 }, { "epoch": 0.11, "learning_rate": 6.25e-05, "loss": 5.0106, "step": 2000 }, { "epoch": 0.16, "learning_rate": 9.375e-05, "loss": 4.68, "step": 3000 }, { "epoch": 0.22, "learning_rate": 0.000125, "loss": 4.4601, "step": 4000 }, { "epoch": 0.27, "learning_rate": 0.00015625, "loss": 4.2904, "step": 5000 }, { "epoch": 0.32, "learning_rate": 0.0001875, "loss": 4.1762, "step": 6000 }, { "epoch": 0.38, "learning_rate": 0.00021875, "loss": 4.0703, "step": 7000 }, { "epoch": 0.43, "learning_rate": 0.00025, "loss": 3.977, "step": 8000 }, { "epoch": 0.48, "learning_rate": 0.00028125000000000003, "loss": 3.9045, "step": 9000 }, { "epoch": 0.54, "learning_rate": 0.00031246875000000003, "loss": 3.848, "step": 10000 }, { "epoch": 0.59, "learning_rate": 0.00034371875, "loss": 3.8034, "step": 11000 }, { "epoch": 0.65, "learning_rate": 0.0003749375, "loss": 3.7535, "step": 12000 }, { "epoch": 0.7, "learning_rate": 0.0004061875, "loss": 3.7203, "step": 13000 }, { "epoch": 0.75, "learning_rate": 0.0004374375, "loss": 3.6886, "step": 14000 }, { "epoch": 0.81, "learning_rate": 0.00046865625, "loss": 3.6652, "step": 15000 }, { "epoch": 0.86, "learning_rate": 0.00049990625, "loss": 3.6413, "step": 16000 }, { "epoch": 0.91, "learning_rate": 0.000531125, "loss": 3.6178, "step": 17000 }, { "epoch": 0.97, "learning_rate": 0.0005623749999999999, "loss": 3.6017, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.35927561873656305, "eval_loss": 3.768275737762451, "eval_runtime": 151.987, "eval_samples_per_second": 381.092, "eval_steps_per_second": 5.961, "step": 18600 }, { "epoch": 1.02, "learning_rate": 0.00059359375, "loss": 3.5665, "step": 19000 }, { "epoch": 1.08, "learning_rate": 0.0006248437500000001, "loss": 3.5454, "step": 20000 }, { "epoch": 1.13, "learning_rate": 0.0006560625, "loss": 3.5334, "step": 21000 }, { "epoch": 1.18, "learning_rate": 0.00068728125, "loss": 3.5201, "step": 22000 }, { "epoch": 1.24, "learning_rate": 0.00071853125, "loss": 3.5093, "step": 23000 }, { "epoch": 1.29, "learning_rate": 0.00074978125, "loss": 3.5036, "step": 24000 }, { "epoch": 1.34, "learning_rate": 0.0007810312499999999, "loss": 3.491, "step": 25000 }, { "epoch": 1.4, "learning_rate": 0.00081228125, "loss": 3.4802, "step": 26000 }, { "epoch": 1.45, "learning_rate": 0.0008435000000000001, "loss": 3.4787, "step": 27000 }, { "epoch": 1.51, "learning_rate": 0.0008747500000000001, "loss": 3.463, "step": 28000 }, { "epoch": 1.56, "learning_rate": 0.0009059375, "loss": 3.4564, "step": 29000 }, { "epoch": 1.61, "learning_rate": 0.0009371875, "loss": 3.4457, "step": 30000 }, { "epoch": 1.67, "learning_rate": 0.0009684375, "loss": 3.4381, "step": 31000 }, { "epoch": 1.72, "learning_rate": 0.0009996875, "loss": 3.4322, "step": 32000 }, { "epoch": 1.77, "learning_rate": 0.0009970911764705882, "loss": 3.4237, "step": 33000 }, { "epoch": 1.83, "learning_rate": 0.00099415, "loss": 3.4082, "step": 34000 }, { "epoch": 1.88, "learning_rate": 0.0009912117647058825, "loss": 3.4007, "step": 35000 }, { "epoch": 1.94, "learning_rate": 0.000988270588235294, "loss": 3.3875, "step": 36000 }, { "epoch": 1.99, "learning_rate": 0.000985329411764706, "loss": 3.3799, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.37902175491009926, "eval_loss": 3.593546152114868, "eval_runtime": 153.1239, "eval_samples_per_second": 378.262, "eval_steps_per_second": 5.917, "step": 37200 }, { "epoch": 2.04, "learning_rate": 0.0009823911764705884, "loss": 3.3337, "step": 38000 }, { "epoch": 2.1, "learning_rate": 0.00097945, "loss": 3.3265, "step": 39000 }, { "epoch": 2.15, "learning_rate": 0.0009765088235294118, "loss": 3.3165, "step": 40000 }, { "epoch": 2.2, "learning_rate": 0.0009735676470588235, "loss": 3.3161, "step": 41000 }, { "epoch": 2.26, "learning_rate": 0.0009706294117647059, "loss": 3.312, "step": 42000 }, { "epoch": 2.31, "learning_rate": 0.0009676911764705883, "loss": 3.3036, "step": 43000 }, { "epoch": 2.37, "learning_rate": 0.00096475, "loss": 3.3006, "step": 44000 }, { "epoch": 2.42, "learning_rate": 0.0009618088235294117, "loss": 3.2951, "step": 45000 }, { "epoch": 2.47, "learning_rate": 0.0009588705882352941, "loss": 3.2917, "step": 46000 }, { "epoch": 2.53, "learning_rate": 0.0009559294117647059, "loss": 3.282, "step": 47000 }, { "epoch": 2.58, "learning_rate": 0.0009529882352941177, "loss": 3.2847, "step": 48000 }, { "epoch": 2.63, "learning_rate": 0.0009500470588235295, "loss": 3.2786, "step": 49000 }, { "epoch": 2.69, "learning_rate": 0.0009471088235294118, "loss": 3.271, "step": 50000 }, { "epoch": 2.74, "learning_rate": 0.0009441676470588235, "loss": 3.2696, "step": 51000 }, { "epoch": 2.8, "learning_rate": 0.0009412264705882353, "loss": 3.2677, "step": 52000 }, { "epoch": 2.85, "learning_rate": 0.0009382882352941176, "loss": 3.2575, "step": 53000 }, { "epoch": 2.9, "learning_rate": 0.0009353470588235295, "loss": 3.2542, "step": 54000 }, { "epoch": 2.96, "learning_rate": 0.0009324058823529412, "loss": 3.2546, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.39154149113743214, "eval_loss": 3.482309103012085, "eval_runtime": 153.2699, "eval_samples_per_second": 377.902, "eval_steps_per_second": 5.911, "step": 55800 }, { "epoch": 3.01, "learning_rate": 0.000929464705882353, "loss": 3.234, "step": 56000 }, { "epoch": 3.06, "learning_rate": 0.0009265264705882354, "loss": 3.1844, "step": 57000 }, { "epoch": 3.12, "learning_rate": 0.000923585294117647, "loss": 3.1895, "step": 58000 }, { "epoch": 3.17, "learning_rate": 0.0009206441176470588, "loss": 3.1899, "step": 59000 }, { "epoch": 3.23, "learning_rate": 0.0009177058823529411, "loss": 3.1897, "step": 60000 }, { "epoch": 3.28, "learning_rate": 0.000914764705882353, "loss": 3.1888, "step": 61000 }, { "epoch": 3.33, "learning_rate": 0.0009118264705882353, "loss": 3.19, "step": 62000 }, { "epoch": 3.39, "learning_rate": 0.0009088852941176471, "loss": 3.1929, "step": 63000 }, { "epoch": 3.44, "learning_rate": 0.0009059441176470589, "loss": 3.1901, "step": 64000 }, { "epoch": 3.49, "learning_rate": 0.0009030058823529413, "loss": 3.1868, "step": 65000 }, { "epoch": 3.55, "learning_rate": 0.000900064705882353, "loss": 3.1808, "step": 66000 }, { "epoch": 3.6, "learning_rate": 0.0008971264705882353, "loss": 3.1813, "step": 67000 }, { "epoch": 3.66, "learning_rate": 0.000894185294117647, "loss": 3.1815, "step": 68000 }, { "epoch": 3.71, "learning_rate": 0.0008912470588235294, "loss": 3.1778, "step": 69000 }, { "epoch": 3.76, "learning_rate": 0.0008883058823529412, "loss": 3.1842, "step": 70000 }, { "epoch": 3.82, "learning_rate": 0.0008853676470588236, "loss": 3.1788, "step": 71000 }, { "epoch": 3.87, "learning_rate": 0.0008824264705882353, "loss": 3.1787, "step": 72000 }, { "epoch": 3.92, "learning_rate": 0.0008794852941176471, "loss": 3.1714, "step": 73000 }, { "epoch": 3.98, "learning_rate": 0.0008765441176470589, "loss": 3.1737, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.39783632570245114, "eval_loss": 3.4548425674438477, "eval_runtime": 154.0942, "eval_samples_per_second": 375.88, "eval_steps_per_second": 5.88, "step": 74400 }, { "epoch": 4.03, "learning_rate": 0.0008736058823529411, "loss": 3.1312, "step": 75000 }, { "epoch": 4.09, "learning_rate": 0.000870664705882353, "loss": 3.107, "step": 76000 }, { "epoch": 4.14, "learning_rate": 0.0008677264705882352, "loss": 3.1138, "step": 77000 }, { "epoch": 4.19, "learning_rate": 0.0008647852941176471, "loss": 3.1109, "step": 78000 }, { "epoch": 4.25, "learning_rate": 0.0008618441176470588, "loss": 3.1156, "step": 79000 }, { "epoch": 4.3, "learning_rate": 0.0008589029411764706, "loss": 3.1169, "step": 80000 }, { "epoch": 4.35, "learning_rate": 0.0008559676470588235, "loss": 3.1188, "step": 81000 }, { "epoch": 4.41, "learning_rate": 0.0008530264705882354, "loss": 3.1217, "step": 82000 }, { "epoch": 4.46, "learning_rate": 0.0008500882352941177, "loss": 3.1207, "step": 83000 }, { "epoch": 4.52, "learning_rate": 0.0008471470588235294, "loss": 3.1183, "step": 84000 }, { "epoch": 4.57, "learning_rate": 0.0008442058823529411, "loss": 3.1259, "step": 85000 }, { "epoch": 4.62, "learning_rate": 0.0008412676470588235, "loss": 3.1255, "step": 86000 }, { "epoch": 4.68, "learning_rate": 0.0008383264705882353, "loss": 3.12, "step": 87000 }, { "epoch": 4.73, "learning_rate": 0.000835385294117647, "loss": 3.1135, "step": 88000 }, { "epoch": 4.78, "learning_rate": 0.0008324470588235294, "loss": 3.1175, "step": 89000 }, { "epoch": 4.84, "learning_rate": 0.0008295058823529413, "loss": 3.1197, "step": 90000 }, { "epoch": 4.89, "learning_rate": 0.0008265676470588236, "loss": 3.1176, "step": 91000 }, { "epoch": 4.95, "learning_rate": 0.0008236264705882354, "loss": 3.1175, "step": 92000 }, { "epoch": 5.0, "learning_rate": 0.0008206882352941176, "loss": 3.1178, "step": 93000 }, { "epoch": 5.0, "eval_accuracy": 0.40139941045420985, "eval_loss": 3.4163460731506348, "eval_runtime": 152.6757, "eval_samples_per_second": 379.373, "eval_steps_per_second": 5.934, "step": 93000 }, { "epoch": 5.05, "learning_rate": 0.0008177470588235295, "loss": 3.0456, "step": 94000 }, { "epoch": 5.11, "learning_rate": 0.0008148058823529412, "loss": 3.0573, "step": 95000 }, { "epoch": 5.16, "learning_rate": 0.0008118676470588236, "loss": 3.0613, "step": 96000 }, { "epoch": 5.22, "learning_rate": 0.0008089264705882353, "loss": 3.0623, "step": 97000 }, { "epoch": 5.27, "learning_rate": 0.0008059852941176471, "loss": 3.0653, "step": 98000 }, { "epoch": 5.32, "learning_rate": 0.0008030441176470589, "loss": 3.0626, "step": 99000 }, { "epoch": 5.38, "learning_rate": 0.0008001058823529412, "loss": 3.0658, "step": 100000 }, { "epoch": 5.43, "learning_rate": 0.000797164705882353, "loss": 3.0673, "step": 101000 }, { "epoch": 5.48, "learning_rate": 0.0007942264705882352, "loss": 3.0705, "step": 102000 }, { "epoch": 5.54, "learning_rate": 0.0007912852941176471, "loss": 3.067, "step": 103000 }, { "epoch": 5.59, "learning_rate": 0.0007883441176470588, "loss": 3.0735, "step": 104000 }, { "epoch": 5.65, "learning_rate": 0.0007854029411764706, "loss": 3.0763, "step": 105000 }, { "epoch": 5.7, "learning_rate": 0.0007824617647058824, "loss": 3.0734, "step": 106000 }, { "epoch": 5.75, "learning_rate": 0.0007795235294117647, "loss": 3.0735, "step": 107000 }, { "epoch": 5.81, "learning_rate": 0.0007765823529411766, "loss": 3.0735, "step": 108000 }, { "epoch": 5.86, "learning_rate": 0.0007736441176470589, "loss": 3.0709, "step": 109000 }, { "epoch": 5.91, "learning_rate": 0.0007707029411764706, "loss": 3.0737, "step": 110000 }, { "epoch": 5.97, "learning_rate": 0.0007677647058823529, "loss": 3.0736, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.4037599986080598, "eval_loss": 3.401655673980713, "eval_runtime": 153.1003, "eval_samples_per_second": 378.321, "eval_steps_per_second": 5.918, "step": 111600 }, { "epoch": 6.02, "learning_rate": 0.0007648235294117647, "loss": 3.0433, "step": 112000 }, { "epoch": 6.08, "learning_rate": 0.0007618823529411765, "loss": 3.0037, "step": 113000 }, { "epoch": 6.13, "learning_rate": 0.0007589441176470589, "loss": 3.0114, "step": 114000 }, { "epoch": 6.18, "learning_rate": 0.0007560029411764706, "loss": 3.0155, "step": 115000 }, { "epoch": 6.24, "learning_rate": 0.000753064705882353, "loss": 3.0163, "step": 116000 }, { "epoch": 6.29, "learning_rate": 0.0007501235294117648, "loss": 3.0242, "step": 117000 }, { "epoch": 6.34, "learning_rate": 0.0007471823529411765, "loss": 3.0261, "step": 118000 }, { "epoch": 6.4, "learning_rate": 0.0007442411764705882, "loss": 3.0279, "step": 119000 }, { "epoch": 6.45, "learning_rate": 0.0007413029411764705, "loss": 3.0314, "step": 120000 }, { "epoch": 6.51, "learning_rate": 0.0007383617647058824, "loss": 3.0319, "step": 121000 }, { "epoch": 6.56, "learning_rate": 0.0007354205882352941, "loss": 3.03, "step": 122000 }, { "epoch": 6.61, "learning_rate": 0.0007324823529411765, "loss": 3.035, "step": 123000 }, { "epoch": 6.67, "learning_rate": 0.0007295411764705883, "loss": 3.0351, "step": 124000 }, { "epoch": 6.72, "learning_rate": 0.0007266029411764707, "loss": 3.0333, "step": 125000 }, { "epoch": 6.77, "learning_rate": 0.0007236617647058824, "loss": 3.0353, "step": 126000 }, { "epoch": 6.83, "learning_rate": 0.0007207235294117647, "loss": 3.0375, "step": 127000 }, { "epoch": 6.88, "learning_rate": 0.0007177823529411764, "loss": 3.0362, "step": 128000 }, { "epoch": 6.94, "learning_rate": 0.0007148441176470588, "loss": 3.0355, "step": 129000 }, { "epoch": 6.99, "learning_rate": 0.0007119029411764706, "loss": 3.0385, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.4057333022430861, "eval_loss": 3.3797543048858643, "eval_runtime": 152.5811, "eval_samples_per_second": 379.608, "eval_steps_per_second": 5.938, "step": 130200 }, { "epoch": 7.04, "learning_rate": 0.0007089617647058824, "loss": 2.9799, "step": 131000 }, { "epoch": 7.1, "learning_rate": 0.0007060205882352942, "loss": 2.9734, "step": 132000 }, { "epoch": 7.15, "learning_rate": 0.0007030794117647059, "loss": 2.9787, "step": 133000 }, { "epoch": 7.2, "learning_rate": 0.0007001411764705883, "loss": 2.9844, "step": 134000 }, { "epoch": 7.26, "learning_rate": 0.0006972, "loss": 2.9858, "step": 135000 }, { "epoch": 7.31, "learning_rate": 0.0006942617647058823, "loss": 2.9909, "step": 136000 }, { "epoch": 7.37, "learning_rate": 0.0006913205882352941, "loss": 2.993, "step": 137000 }, { "epoch": 7.42, "learning_rate": 0.0006883794117647059, "loss": 2.9939, "step": 138000 }, { "epoch": 7.47, "learning_rate": 0.0006854411764705882, "loss": 2.9916, "step": 139000 }, { "epoch": 7.53, "learning_rate": 0.0006825000000000001, "loss": 2.9992, "step": 140000 }, { "epoch": 7.58, "learning_rate": 0.0006795617647058824, "loss": 2.9996, "step": 141000 }, { "epoch": 7.63, "learning_rate": 0.0006766205882352942, "loss": 2.9953, "step": 142000 }, { "epoch": 7.69, "learning_rate": 0.0006736823529411765, "loss": 3.0019, "step": 143000 }, { "epoch": 7.74, "learning_rate": 0.0006707411764705883, "loss": 3.0053, "step": 144000 }, { "epoch": 7.8, "learning_rate": 0.0006678, "loss": 3.0019, "step": 145000 }, { "epoch": 7.85, "learning_rate": 0.0006648588235294117, "loss": 3.0029, "step": 146000 }, { "epoch": 7.9, "learning_rate": 0.0006619205882352941, "loss": 3.0039, "step": 147000 }, { "epoch": 7.96, "learning_rate": 0.0006589794117647058, "loss": 3.0068, "step": 148000 }, { "epoch": 8.0, "eval_accuracy": 0.4059920741366696, "eval_loss": 3.3987667560577393, "eval_runtime": 152.6066, "eval_samples_per_second": 379.545, "eval_steps_per_second": 5.937, "step": 148800 }, { "epoch": 8.01, "learning_rate": 0.0006560411764705883, "loss": 2.9921, "step": 149000 }, { "epoch": 8.06, "learning_rate": 0.0006531029411764707, "loss": 2.9365, "step": 150000 }, { "epoch": 8.12, "learning_rate": 0.0006501617647058824, "loss": 2.9421, "step": 151000 }, { "epoch": 8.17, "learning_rate": 0.0006472205882352941, "loss": 2.9522, "step": 152000 }, { "epoch": 8.23, "learning_rate": 0.000644279411764706, "loss": 2.9535, "step": 153000 }, { "epoch": 8.28, "learning_rate": 0.0006413382352941176, "loss": 2.9562, "step": 154000 }, { "epoch": 8.33, "learning_rate": 0.0006384, "loss": 2.9613, "step": 155000 }, { "epoch": 8.39, "learning_rate": 0.0006354588235294118, "loss": 2.9634, "step": 156000 }, { "epoch": 8.44, "learning_rate": 0.0006325176470588235, "loss": 2.9646, "step": 157000 }, { "epoch": 8.49, "learning_rate": 0.0006295794117647059, "loss": 2.9638, "step": 158000 }, { "epoch": 8.55, "learning_rate": 0.0006266382352941176, "loss": 2.9689, "step": 159000 }, { "epoch": 8.6, "learning_rate": 0.0006237, "loss": 2.9713, "step": 160000 }, { "epoch": 8.66, "learning_rate": 0.0006207588235294118, "loss": 2.9745, "step": 161000 }, { "epoch": 8.71, "learning_rate": 0.0006178176470588236, "loss": 2.9719, "step": 162000 }, { "epoch": 8.76, "learning_rate": 0.0006148794117647058, "loss": 2.9719, "step": 163000 }, { "epoch": 8.82, "learning_rate": 0.0006119411764705882, "loss": 2.9743, "step": 164000 }, { "epoch": 8.87, "learning_rate": 0.000609, "loss": 2.9776, "step": 165000 }, { "epoch": 8.92, "learning_rate": 0.0006060588235294118, "loss": 2.9772, "step": 166000 }, { "epoch": 8.98, "learning_rate": 0.0006031176470588235, "loss": 2.9774, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.4074036304783041, "eval_loss": 3.3728039264678955, "eval_runtime": 152.355, "eval_samples_per_second": 380.171, "eval_steps_per_second": 5.947, "step": 167400 }, { "epoch": 9.03, "learning_rate": 0.0006001764705882353, "loss": 2.9356, "step": 168000 }, { "epoch": 9.09, "learning_rate": 0.0005972411764705882, "loss": 2.9158, "step": 169000 }, { "epoch": 9.14, "learning_rate": 0.0005943000000000001, "loss": 2.92, "step": 170000 }, { "epoch": 9.19, "learning_rate": 0.0005913588235294117, "loss": 2.9247, "step": 171000 }, { "epoch": 9.25, "learning_rate": 0.0005884205882352941, "loss": 2.9248, "step": 172000 }, { "epoch": 9.3, "learning_rate": 0.0005854794117647059, "loss": 2.9304, "step": 173000 }, { "epoch": 9.35, "learning_rate": 0.0005825382352941177, "loss": 2.93, "step": 174000 }, { "epoch": 9.41, "learning_rate": 0.0005795970588235294, "loss": 2.9384, "step": 175000 }, { "epoch": 9.46, "learning_rate": 0.0005766588235294118, "loss": 2.9418, "step": 176000 }, { "epoch": 9.52, "learning_rate": 0.0005737176470588236, "loss": 2.9438, "step": 177000 }, { "epoch": 9.57, "learning_rate": 0.000570779411764706, "loss": 2.9424, "step": 178000 }, { "epoch": 9.62, "learning_rate": 0.0005678411764705882, "loss": 2.9404, "step": 179000 }, { "epoch": 9.68, "learning_rate": 0.0005649, "loss": 2.9428, "step": 180000 }, { "epoch": 9.73, "learning_rate": 0.0005619588235294118, "loss": 2.9464, "step": 181000 }, { "epoch": 9.78, "learning_rate": 0.0005590176470588235, "loss": 2.9472, "step": 182000 }, { "epoch": 9.84, "learning_rate": 0.0005560764705882353, "loss": 2.9478, "step": 183000 }, { "epoch": 9.89, "learning_rate": 0.0005531382352941176, "loss": 2.9515, "step": 184000 }, { "epoch": 9.95, "learning_rate": 0.0005501970588235295, "loss": 2.9535, "step": 185000 }, { "epoch": 10.0, "learning_rate": 0.0005472588235294118, "loss": 2.9558, "step": 186000 }, { "epoch": 10.0, "eval_accuracy": 0.4086515397968936, "eval_loss": 3.3694682121276855, "eval_runtime": 152.991, "eval_samples_per_second": 378.591, "eval_steps_per_second": 5.922, "step": 186000 }, { "epoch": 10.05, "learning_rate": 0.0005443176470588236, "loss": 2.8847, "step": 187000 }, { "epoch": 10.11, "learning_rate": 0.0005413764705882354, "loss": 2.8924, "step": 188000 }, { "epoch": 10.16, "learning_rate": 0.0005384382352941177, "loss": 2.8966, "step": 189000 }, { "epoch": 10.22, "learning_rate": 0.0005354970588235294, "loss": 2.903, "step": 190000 }, { "epoch": 10.27, "learning_rate": 0.0005325588235294118, "loss": 2.9012, "step": 191000 }, { "epoch": 10.32, "learning_rate": 0.0005296205882352941, "loss": 2.9059, "step": 192000 }, { "epoch": 10.38, "learning_rate": 0.0005266794117647059, "loss": 2.9088, "step": 193000 }, { "epoch": 10.43, "learning_rate": 0.0005237382352941177, "loss": 2.9133, "step": 194000 }, { "epoch": 10.48, "learning_rate": 0.0005207970588235294, "loss": 2.913, "step": 195000 }, { "epoch": 10.54, "learning_rate": 0.0005178558823529413, "loss": 2.919, "step": 196000 }, { "epoch": 10.59, "learning_rate": 0.0005149176470588234, "loss": 2.9168, "step": 197000 }, { "epoch": 10.65, "learning_rate": 0.0005119764705882353, "loss": 2.9204, "step": 198000 }, { "epoch": 10.7, "learning_rate": 0.0005090382352941176, "loss": 2.926, "step": 199000 }, { "epoch": 10.75, "learning_rate": 0.0005061, "loss": 2.9238, "step": 200000 }, { "epoch": 10.81, "learning_rate": 0.0005031588235294117, "loss": 2.9267, "step": 201000 }, { "epoch": 10.86, "learning_rate": 0.0005002176470588236, "loss": 2.9296, "step": 202000 }, { "epoch": 10.91, "learning_rate": 0.0004972764705882353, "loss": 2.9231, "step": 203000 }, { "epoch": 10.97, "learning_rate": 0.0004943382352941176, "loss": 2.9289, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.409351809324078, "eval_loss": 3.364856004714966, "eval_runtime": 153.2895, "eval_samples_per_second": 377.854, "eval_steps_per_second": 5.91, "step": 204600 }, { "epoch": 11.02, "learning_rate": 0.0004913970588235295, "loss": 2.9004, "step": 205000 }, { "epoch": 11.08, "learning_rate": 0.0004884588235294118, "loss": 2.8684, "step": 206000 }, { "epoch": 11.13, "learning_rate": 0.00048551764705882355, "loss": 2.8725, "step": 207000 }, { "epoch": 11.18, "learning_rate": 0.0004825794117647059, "loss": 2.8784, "step": 208000 }, { "epoch": 11.24, "learning_rate": 0.00047963823529411764, "loss": 2.8794, "step": 209000 }, { "epoch": 11.29, "learning_rate": 0.00047669705882352943, "loss": 2.8815, "step": 210000 }, { "epoch": 11.34, "learning_rate": 0.0004737558823529412, "loss": 2.8833, "step": 211000 }, { "epoch": 11.4, "learning_rate": 0.0004708176470588235, "loss": 2.8887, "step": 212000 }, { "epoch": 11.45, "learning_rate": 0.0004678764705882353, "loss": 2.8906, "step": 213000 }, { "epoch": 11.51, "learning_rate": 0.00046493529411764705, "loss": 2.8921, "step": 214000 }, { "epoch": 11.56, "learning_rate": 0.00046199411764705884, "loss": 2.8961, "step": 215000 }, { "epoch": 11.61, "learning_rate": 0.00045905882352941175, "loss": 2.8966, "step": 216000 }, { "epoch": 11.67, "learning_rate": 0.00045611764705882354, "loss": 2.899, "step": 217000 }, { "epoch": 11.72, "learning_rate": 0.0004531764705882353, "loss": 2.9001, "step": 218000 }, { "epoch": 11.77, "learning_rate": 0.0004502382352941177, "loss": 2.9036, "step": 219000 }, { "epoch": 11.83, "learning_rate": 0.00044729705882352947, "loss": 2.9021, "step": 220000 }, { "epoch": 11.88, "learning_rate": 0.00044435588235294116, "loss": 2.9045, "step": 221000 }, { "epoch": 11.94, "learning_rate": 0.00044141470588235295, "loss": 2.9051, "step": 222000 }, { "epoch": 11.99, "learning_rate": 0.0004384764705882353, "loss": 2.9058, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.4094906002721942, "eval_loss": 3.3603594303131104, "eval_runtime": 152.9212, "eval_samples_per_second": 378.764, "eval_steps_per_second": 5.925, "step": 223200 }, { "epoch": 12.04, "learning_rate": 0.0004355382352941177, "loss": 2.854, "step": 224000 }, { "epoch": 12.1, "learning_rate": 0.0004325970588235294, "loss": 2.8436, "step": 225000 }, { "epoch": 12.15, "learning_rate": 0.0004296588235294118, "loss": 2.8549, "step": 226000 }, { "epoch": 12.2, "learning_rate": 0.00042671764705882353, "loss": 2.8551, "step": 227000 }, { "epoch": 12.26, "learning_rate": 0.0004237764705882353, "loss": 2.8579, "step": 228000 }, { "epoch": 12.31, "learning_rate": 0.0004208352941176471, "loss": 2.8663, "step": 229000 }, { "epoch": 12.37, "learning_rate": 0.0004178970588235294, "loss": 2.8652, "step": 230000 }, { "epoch": 12.42, "learning_rate": 0.0004149558823529412, "loss": 2.871, "step": 231000 }, { "epoch": 12.47, "learning_rate": 0.00041201470588235294, "loss": 2.8696, "step": 232000 }, { "epoch": 12.53, "learning_rate": 0.00040907352941176473, "loss": 2.8723, "step": 233000 }, { "epoch": 12.58, "learning_rate": 0.00040613235294117647, "loss": 2.875, "step": 234000 }, { "epoch": 12.63, "learning_rate": 0.0004031911764705882, "loss": 2.8776, "step": 235000 }, { "epoch": 12.69, "learning_rate": 0.0004002529411764706, "loss": 2.8773, "step": 236000 }, { "epoch": 12.74, "learning_rate": 0.00039731470588235296, "loss": 2.8808, "step": 237000 }, { "epoch": 12.8, "learning_rate": 0.0003943735294117647, "loss": 2.8819, "step": 238000 }, { "epoch": 12.85, "learning_rate": 0.00039143235294117644, "loss": 2.8865, "step": 239000 }, { "epoch": 12.9, "learning_rate": 0.00038849411764705884, "loss": 2.8811, "step": 240000 }, { "epoch": 12.96, "learning_rate": 0.0003855558823529412, "loss": 2.8805, "step": 241000 }, { "epoch": 13.0, "eval_accuracy": 0.40976489041796293, "eval_loss": 3.380065679550171, "eval_runtime": 152.8294, "eval_samples_per_second": 378.991, "eval_steps_per_second": 5.928, "step": 241800 }, { "epoch": 13.01, "learning_rate": 0.000382614705882353, "loss": 2.8705, "step": 242000 }, { "epoch": 13.06, "learning_rate": 0.0003796735294117647, "loss": 2.8279, "step": 243000 }, { "epoch": 13.12, "learning_rate": 0.00037673235294117646, "loss": 2.8249, "step": 244000 }, { "epoch": 13.17, "learning_rate": 0.00037379117647058825, "loss": 2.8377, "step": 245000 }, { "epoch": 13.23, "learning_rate": 0.0003708529411764706, "loss": 2.8363, "step": 246000 }, { "epoch": 13.28, "learning_rate": 0.00036791176470588234, "loss": 2.8433, "step": 247000 }, { "epoch": 13.33, "learning_rate": 0.0003649735294117647, "loss": 2.8461, "step": 248000 }, { "epoch": 13.39, "learning_rate": 0.0003620323529411765, "loss": 2.8522, "step": 249000 }, { "epoch": 13.44, "learning_rate": 0.00035909411764705883, "loss": 2.8426, "step": 250000 }, { "epoch": 13.49, "learning_rate": 0.00035615294117647057, "loss": 2.854, "step": 251000 }, { "epoch": 13.55, "learning_rate": 0.00035321176470588236, "loss": 2.8533, "step": 252000 }, { "epoch": 13.6, "learning_rate": 0.0003502705882352941, "loss": 2.8515, "step": 253000 }, { "epoch": 13.66, "learning_rate": 0.0003473323529411765, "loss": 2.8585, "step": 254000 }, { "epoch": 13.71, "learning_rate": 0.00034439117647058824, "loss": 2.8567, "step": 255000 }, { "epoch": 13.76, "learning_rate": 0.00034145, "loss": 2.861, "step": 256000 }, { "epoch": 13.82, "learning_rate": 0.00033850882352941177, "loss": 2.8554, "step": 257000 }, { "epoch": 13.87, "learning_rate": 0.0003355705882352941, "loss": 2.8636, "step": 258000 }, { "epoch": 13.92, "learning_rate": 0.0003326294117647059, "loss": 2.8648, "step": 259000 }, { "epoch": 13.98, "learning_rate": 0.0003296882352941177, "loss": 2.8621, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.409478642484796, "eval_loss": 3.3870773315429688, "eval_runtime": 153.1211, "eval_samples_per_second": 378.269, "eval_steps_per_second": 5.917, "step": 260400 }, { "epoch": 14.03, "learning_rate": 0.0003267529411764706, "loss": 2.8345, "step": 261000 }, { "epoch": 14.09, "learning_rate": 0.00032381176470588235, "loss": 2.809, "step": 262000 }, { "epoch": 14.14, "learning_rate": 0.00032087058823529414, "loss": 2.8165, "step": 263000 }, { "epoch": 14.19, "learning_rate": 0.00031792941176470593, "loss": 2.8175, "step": 264000 }, { "epoch": 14.25, "learning_rate": 0.0003149882352941176, "loss": 2.8223, "step": 265000 }, { "epoch": 14.3, "learning_rate": 0.00031205, "loss": 2.8244, "step": 266000 }, { "epoch": 14.35, "learning_rate": 0.00030910882352941176, "loss": 2.8286, "step": 267000 }, { "epoch": 14.41, "learning_rate": 0.00030616764705882355, "loss": 2.8291, "step": 268000 }, { "epoch": 14.46, "learning_rate": 0.00030322941176470585, "loss": 2.8318, "step": 269000 }, { "epoch": 14.52, "learning_rate": 0.00030028823529411764, "loss": 2.8312, "step": 270000 }, { "epoch": 14.57, "learning_rate": 0.00029734705882352943, "loss": 2.8348, "step": 271000 }, { "epoch": 14.62, "learning_rate": 0.0002944088235294118, "loss": 2.8378, "step": 272000 }, { "epoch": 14.68, "learning_rate": 0.0002914676470588236, "loss": 2.8368, "step": 273000 }, { "epoch": 14.73, "learning_rate": 0.00028852941176470587, "loss": 2.8375, "step": 274000 }, { "epoch": 14.78, "learning_rate": 0.00028558823529411766, "loss": 2.8374, "step": 275000 }, { "epoch": 14.84, "learning_rate": 0.0002826470588235294, "loss": 2.8398, "step": 276000 }, { "epoch": 14.89, "learning_rate": 0.0002797088235294118, "loss": 2.8405, "step": 277000 }, { "epoch": 14.95, "learning_rate": 0.0002767676470588235, "loss": 2.8394, "step": 278000 }, { "epoch": 15.0, "learning_rate": 0.0002738294117647059, "loss": 2.8423, "step": 279000 }, { "epoch": 15.0, "eval_accuracy": 0.4095923758222406, "eval_loss": 3.3872194290161133, "eval_runtime": 152.5036, "eval_samples_per_second": 379.801, "eval_steps_per_second": 5.941, "step": 279000 }, { "epoch": 15.05, "learning_rate": 0.00027088823529411763, "loss": 2.7935, "step": 280000 }, { "epoch": 15.11, "learning_rate": 0.0002679470588235294, "loss": 2.8014, "step": 281000 }, { "epoch": 15.16, "learning_rate": 0.00026500882352941177, "loss": 2.7988, "step": 282000 }, { "epoch": 15.22, "learning_rate": 0.0002620676470588235, "loss": 2.8032, "step": 283000 }, { "epoch": 15.27, "learning_rate": 0.0002591294117647059, "loss": 2.8058, "step": 284000 }, { "epoch": 15.32, "learning_rate": 0.00025619117647058827, "loss": 2.8074, "step": 285000 }, { "epoch": 15.38, "learning_rate": 0.00025325, "loss": 2.8105, "step": 286000 }, { "epoch": 15.43, "learning_rate": 0.00025030882352941174, "loss": 2.8109, "step": 287000 }, { "epoch": 15.48, "learning_rate": 0.00024737058823529415, "loss": 2.8118, "step": 288000 }, { "epoch": 15.54, "learning_rate": 0.0002444294117647059, "loss": 2.814, "step": 289000 }, { "epoch": 15.59, "learning_rate": 0.00024148823529411765, "loss": 2.817, "step": 290000 }, { "epoch": 15.65, "learning_rate": 0.0002385470588235294, "loss": 2.8198, "step": 291000 }, { "epoch": 15.7, "learning_rate": 0.00023560588235294118, "loss": 2.8158, "step": 292000 }, { "epoch": 15.75, "learning_rate": 0.00023266764705882353, "loss": 2.8187, "step": 293000 }, { "epoch": 15.81, "learning_rate": 0.0002297264705882353, "loss": 2.8163, "step": 294000 }, { "epoch": 15.86, "learning_rate": 0.00022678529411764706, "loss": 2.8183, "step": 295000 }, { "epoch": 15.91, "learning_rate": 0.00022385, "loss": 2.8204, "step": 296000 }, { "epoch": 15.97, "learning_rate": 0.00022090882352941179, "loss": 2.8216, "step": 297000 }, { "epoch": 16.0, "eval_accuracy": 0.40969267344350757, "eval_loss": 3.3996341228485107, "eval_runtime": 152.6929, "eval_samples_per_second": 379.33, "eval_steps_per_second": 5.933, "step": 297600 }, { "epoch": 16.02, "learning_rate": 0.00021796764705882352, "loss": 2.8062, "step": 298000 }, { "epoch": 16.08, "learning_rate": 0.0002150294117647059, "loss": 2.7798, "step": 299000 }, { "epoch": 16.13, "learning_rate": 0.00021208823529411767, "loss": 2.7837, "step": 300000 }, { "epoch": 16.18, "learning_rate": 0.0002091470588235294, "loss": 2.7856, "step": 301000 }, { "epoch": 16.24, "learning_rate": 0.0002062058823529412, "loss": 2.7852, "step": 302000 }, { "epoch": 16.29, "learning_rate": 0.00020327058823529413, "loss": 2.7907, "step": 303000 }, { "epoch": 16.34, "learning_rate": 0.0002003294117647059, "loss": 2.7915, "step": 304000 }, { "epoch": 16.4, "learning_rate": 0.00019738823529411763, "loss": 2.7898, "step": 305000 }, { "epoch": 16.45, "learning_rate": 0.00019444705882352943, "loss": 2.7921, "step": 306000 }, { "epoch": 16.51, "learning_rate": 0.00019150882352941178, "loss": 2.7969, "step": 307000 }, { "epoch": 16.56, "learning_rate": 0.00018856764705882354, "loss": 2.7974, "step": 308000 }, { "epoch": 16.61, "learning_rate": 0.0001856294117647059, "loss": 2.7936, "step": 309000 }, { "epoch": 16.67, "learning_rate": 0.00018268823529411766, "loss": 2.7995, "step": 310000 }, { "epoch": 16.72, "learning_rate": 0.0001797470588235294, "loss": 2.7983, "step": 311000 }, { "epoch": 16.77, "learning_rate": 0.00017680882352941177, "loss": 2.7995, "step": 312000 }, { "epoch": 16.83, "learning_rate": 0.0001738676470588235, "loss": 2.7993, "step": 313000 }, { "epoch": 16.88, "learning_rate": 0.00017092941176470589, "loss": 2.8003, "step": 314000 }, { "epoch": 16.94, "learning_rate": 0.00016798823529411768, "loss": 2.8086, "step": 315000 }, { "epoch": 16.99, "learning_rate": 0.00016504705882352942, "loss": 2.8042, "step": 316000 }, { "epoch": 17.0, "eval_accuracy": 0.41008405585576546, "eval_loss": 3.398700714111328, "eval_runtime": 152.4781, "eval_samples_per_second": 379.864, "eval_steps_per_second": 5.942, "step": 316200 }, { "epoch": 17.04, "learning_rate": 0.00016210588235294118, "loss": 2.7702, "step": 317000 }, { "epoch": 17.1, "learning_rate": 0.00015916764705882353, "loss": 2.7625, "step": 318000 }, { "epoch": 17.15, "learning_rate": 0.0001562264705882353, "loss": 2.7737, "step": 319000 }, { "epoch": 17.2, "learning_rate": 0.00015328529411764706, "loss": 2.7706, "step": 320000 }, { "epoch": 17.26, "learning_rate": 0.0001503470588235294, "loss": 2.7714, "step": 321000 }, { "epoch": 17.31, "learning_rate": 0.00014740588235294118, "loss": 2.7705, "step": 322000 }, { "epoch": 17.37, "learning_rate": 0.00014446764705882353, "loss": 2.7761, "step": 323000 }, { "epoch": 17.42, "learning_rate": 0.0001415264705882353, "loss": 2.776, "step": 324000 }, { "epoch": 17.47, "learning_rate": 0.00013858529411764706, "loss": 2.7782, "step": 325000 }, { "epoch": 17.53, "learning_rate": 0.0001356470588235294, "loss": 2.7805, "step": 326000 }, { "epoch": 17.58, "learning_rate": 0.00013270588235294117, "loss": 2.7804, "step": 327000 }, { "epoch": 17.63, "learning_rate": 0.00012976470588235296, "loss": 2.7823, "step": 328000 }, { "epoch": 17.69, "learning_rate": 0.00012682647058823529, "loss": 2.7811, "step": 329000 }, { "epoch": 17.74, "learning_rate": 0.00012388529411764708, "loss": 2.7799, "step": 330000 }, { "epoch": 17.8, "learning_rate": 0.00012094411764705883, "loss": 2.7821, "step": 331000 }, { "epoch": 17.85, "learning_rate": 0.0001180029411764706, "loss": 2.7839, "step": 332000 }, { "epoch": 17.9, "learning_rate": 0.00011506176470588235, "loss": 2.7848, "step": 333000 }, { "epoch": 17.96, "learning_rate": 0.00011212058823529411, "loss": 2.7834, "step": 334000 }, { "epoch": 18.0, "eval_accuracy": 0.41014008279222663, "eval_loss": 3.4020495414733887, "eval_runtime": 152.7878, "eval_samples_per_second": 379.094, "eval_steps_per_second": 5.93, "step": 334800 }, { "epoch": 18.01, "learning_rate": 0.00010918235294117647, "loss": 2.7783, "step": 335000 }, { "epoch": 18.06, "learning_rate": 0.00010624117647058824, "loss": 2.7522, "step": 336000 }, { "epoch": 18.12, "learning_rate": 0.00010330294117647059, "loss": 2.7562, "step": 337000 }, { "epoch": 18.17, "learning_rate": 0.00010036176470588237, "loss": 2.7594, "step": 338000 }, { "epoch": 18.23, "learning_rate": 9.74235294117647e-05, "loss": 2.7523, "step": 339000 }, { "epoch": 18.28, "learning_rate": 9.448235294117648e-05, "loss": 2.7595, "step": 340000 }, { "epoch": 18.33, "learning_rate": 9.154411764705882e-05, "loss": 2.7656, "step": 341000 }, { "epoch": 18.39, "learning_rate": 8.86029411764706e-05, "loss": 2.7577, "step": 342000 }, { "epoch": 18.44, "learning_rate": 8.566176470588236e-05, "loss": 2.7624, "step": 343000 }, { "epoch": 18.49, "learning_rate": 8.272058823529411e-05, "loss": 2.7589, "step": 344000 }, { "epoch": 18.55, "learning_rate": 7.978235294117648e-05, "loss": 2.7612, "step": 345000 }, { "epoch": 18.6, "learning_rate": 7.684117647058823e-05, "loss": 2.7645, "step": 346000 }, { "epoch": 18.66, "learning_rate": 7.390294117647059e-05, "loss": 2.7634, "step": 347000 }, { "epoch": 18.71, "learning_rate": 7.096176470588236e-05, "loss": 2.761, "step": 348000 }, { "epoch": 18.76, "learning_rate": 6.802058823529412e-05, "loss": 2.7608, "step": 349000 }, { "epoch": 18.82, "learning_rate": 6.508235294117647e-05, "loss": 2.7627, "step": 350000 }, { "epoch": 18.87, "learning_rate": 6.214117647058824e-05, "loss": 2.767, "step": 351000 }, { "epoch": 18.92, "learning_rate": 5.9202941176470594e-05, "loss": 2.7678, "step": 352000 }, { "epoch": 18.98, "learning_rate": 5.626176470588235e-05, "loss": 2.7643, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.4096899191216911, "eval_loss": 3.419888973236084, "eval_runtime": 152.6752, "eval_samples_per_second": 379.374, "eval_steps_per_second": 5.934, "step": 353400 }, { "epoch": 19.03, "learning_rate": 5.332058823529412e-05, "loss": 2.7504, "step": 354000 }, { "epoch": 19.09, "learning_rate": 5.037941176470588e-05, "loss": 2.7458, "step": 355000 }, { "epoch": 19.14, "learning_rate": 4.743823529411765e-05, "loss": 2.7466, "step": 356000 }, { "epoch": 19.19, "learning_rate": 4.449705882352941e-05, "loss": 2.7444, "step": 357000 }, { "epoch": 19.25, "learning_rate": 4.155882352941177e-05, "loss": 2.7448, "step": 358000 }, { "epoch": 19.3, "learning_rate": 3.862058823529411e-05, "loss": 2.7459, "step": 359000 }, { "epoch": 19.35, "learning_rate": 3.5679411764705884e-05, "loss": 2.747, "step": 360000 }, { "epoch": 19.41, "learning_rate": 3.2741176470588234e-05, "loss": 2.7463, "step": 361000 }, { "epoch": 19.46, "learning_rate": 2.98e-05, "loss": 2.7457, "step": 362000 }, { "epoch": 19.52, "learning_rate": 2.6858823529411764e-05, "loss": 2.7479, "step": 363000 }, { "epoch": 19.57, "learning_rate": 2.391764705882353e-05, "loss": 2.7478, "step": 364000 }, { "epoch": 19.62, "learning_rate": 2.0979411764705883e-05, "loss": 2.7469, "step": 365000 }, { "epoch": 19.68, "learning_rate": 1.8038235294117648e-05, "loss": 2.7461, "step": 366000 }, { "epoch": 19.73, "learning_rate": 1.5097058823529411e-05, "loss": 2.7484, "step": 367000 }, { "epoch": 19.78, "learning_rate": 1.2158823529411766e-05, "loss": 2.7524, "step": 368000 }, { "epoch": 19.84, "learning_rate": 9.217647058823531e-06, "loss": 2.7469, "step": 369000 }, { "epoch": 19.89, "learning_rate": 6.2823529411764705e-06, "loss": 2.7463, "step": 370000 }, { "epoch": 19.95, "learning_rate": 3.3411764705882354e-06, "loss": 2.746, "step": 371000 }, { "epoch": 20.0, "learning_rate": 4.0000000000000003e-07, "loss": 2.7463, "step": 372000 }, { "epoch": 20.0, "eval_accuracy": 0.4096600918317765, "eval_loss": 3.425863742828369, "eval_runtime": 152.7802, "eval_samples_per_second": 379.113, "eval_steps_per_second": 5.93, "step": 372000 }, { "epoch": 20.0, "step": 372000, "total_flos": 1.56735498728448e+18, "train_loss": 3.0193379687237485, "train_runtime": 80852.2733, "train_samples_per_second": 147.231, "train_steps_per_second": 4.601 } ], "logging_steps": 1000, "max_steps": 372000, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.56735498728448e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }