{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 371860, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 3.125e-05, "loss": 6.2324, "step": 1000 }, { "epoch": 0.11, "learning_rate": 6.25e-05, "loss": 5.004, "step": 2000 }, { "epoch": 0.16, "learning_rate": 9.375e-05, "loss": 4.6782, "step": 3000 }, { "epoch": 0.22, "learning_rate": 0.000125, "loss": 4.451, "step": 4000 }, { "epoch": 0.27, "learning_rate": 0.00015625, "loss": 4.2941, "step": 5000 }, { "epoch": 0.32, "learning_rate": 0.0001875, "loss": 4.1772, "step": 6000 }, { "epoch": 0.38, "learning_rate": 0.00021875, "loss": 4.0706, "step": 7000 }, { "epoch": 0.43, "learning_rate": 0.00025, "loss": 3.9803, "step": 8000 }, { "epoch": 0.48, "learning_rate": 0.00028121875, "loss": 3.9093, "step": 9000 }, { "epoch": 0.54, "learning_rate": 0.00031246875000000003, "loss": 3.8551, "step": 10000 }, { "epoch": 0.59, "learning_rate": 0.00034368749999999997, "loss": 3.8101, "step": 11000 }, { "epoch": 0.65, "learning_rate": 0.0003749375, "loss": 3.7743, "step": 12000 }, { "epoch": 0.7, "learning_rate": 0.00040615625, "loss": 3.7316, "step": 13000 }, { "epoch": 0.75, "learning_rate": 0.00043737500000000005, "loss": 3.7002, "step": 14000 }, { "epoch": 0.81, "learning_rate": 0.00046859375, "loss": 3.6744, "step": 15000 }, { "epoch": 0.86, "learning_rate": 0.00049984375, "loss": 3.6446, "step": 16000 }, { "epoch": 0.91, "learning_rate": 0.00053109375, "loss": 3.6324, "step": 17000 }, { "epoch": 0.97, "learning_rate": 0.00056234375, "loss": 3.6079, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.35758257053339554, "eval_loss": 3.814922332763672, "eval_runtime": 152.0015, "eval_samples_per_second": 381.042, "eval_steps_per_second": 5.954, "step": 18593 }, { "epoch": 1.02, "learning_rate": 0.00059359375, "loss": 3.5787, "step": 19000 }, { "epoch": 1.08, "learning_rate": 0.0006248437500000001, "loss": 3.5479, "step": 20000 }, { "epoch": 1.13, "learning_rate": 0.0006560625, "loss": 3.5427, "step": 21000 }, { "epoch": 1.18, "learning_rate": 0.0006873125, "loss": 3.5347, "step": 22000 }, { "epoch": 1.24, "learning_rate": 0.00071853125, "loss": 3.5129, "step": 23000 }, { "epoch": 1.29, "learning_rate": 0.00074978125, "loss": 3.5089, "step": 24000 }, { "epoch": 1.34, "learning_rate": 0.0007810312499999999, "loss": 3.496, "step": 25000 }, { "epoch": 1.4, "learning_rate": 0.00081225, "loss": 3.4853, "step": 26000 }, { "epoch": 1.45, "learning_rate": 0.00084346875, "loss": 3.4794, "step": 27000 }, { "epoch": 1.51, "learning_rate": 0.00087471875, "loss": 3.471, "step": 28000 }, { "epoch": 1.56, "learning_rate": 0.00090596875, "loss": 3.4613, "step": 29000 }, { "epoch": 1.61, "learning_rate": 0.0009371875, "loss": 3.4515, "step": 30000 }, { "epoch": 1.67, "learning_rate": 0.0009684375, "loss": 3.4459, "step": 31000 }, { "epoch": 1.72, "learning_rate": 0.0009996875, "loss": 3.4398, "step": 32000 }, { "epoch": 1.77, "learning_rate": 0.0009970899782263285, "loss": 3.4285, "step": 33000 }, { "epoch": 1.83, "learning_rate": 0.0009941475901841935, "loss": 3.4208, "step": 34000 }, { "epoch": 1.88, "learning_rate": 0.0009912052021420585, "loss": 3.406, "step": 35000 }, { "epoch": 1.94, "learning_rate": 0.0009882628140999235, "loss": 3.3979, "step": 36000 }, { "epoch": 1.99, "learning_rate": 0.0009853233684458307, "loss": 3.3841, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.37915455926877084, "eval_loss": 3.590174436569214, "eval_runtime": 153.5121, "eval_samples_per_second": 377.293, "eval_steps_per_second": 5.895, "step": 37186 }, { "epoch": 2.04, "learning_rate": 0.0009823809804036957, "loss": 3.3374, "step": 38000 }, { "epoch": 2.1, "learning_rate": 0.0009794415347496028, "loss": 3.3261, "step": 39000 }, { "epoch": 2.15, "learning_rate": 0.0009764991467074677, "loss": 3.3249, "step": 40000 }, { "epoch": 2.21, "learning_rate": 0.0009735567586653328, "loss": 3.3187, "step": 41000 }, { "epoch": 2.26, "learning_rate": 0.0009706143706231978, "loss": 3.3172, "step": 42000 }, { "epoch": 2.31, "learning_rate": 0.0009676719825810628, "loss": 3.3078, "step": 43000 }, { "epoch": 2.37, "learning_rate": 0.0009647325369269699, "loss": 3.31, "step": 44000 }, { "epoch": 2.42, "learning_rate": 0.000961793091272877, "loss": 3.3027, "step": 45000 }, { "epoch": 2.47, "learning_rate": 0.0009588507032307421, "loss": 3.2934, "step": 46000 }, { "epoch": 2.53, "learning_rate": 0.0009559083151886071, "loss": 3.2863, "step": 47000 }, { "epoch": 2.58, "learning_rate": 0.0009529659271464721, "loss": 3.291, "step": 48000 }, { "epoch": 2.64, "learning_rate": 0.0009500235391043372, "loss": 3.2837, "step": 49000 }, { "epoch": 2.69, "learning_rate": 0.0009470840934502442, "loss": 3.2791, "step": 50000 }, { "epoch": 2.74, "learning_rate": 0.0009441417054081093, "loss": 3.2756, "step": 51000 }, { "epoch": 2.8, "learning_rate": 0.0009412022597540164, "loss": 3.273, "step": 52000 }, { "epoch": 2.85, "learning_rate": 0.0009382598717118814, "loss": 3.2648, "step": 53000 }, { "epoch": 2.9, "learning_rate": 0.0009353204260577886, "loss": 3.2609, "step": 54000 }, { "epoch": 2.96, "learning_rate": 0.0009323780380156535, "loss": 3.2548, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.3917531951068388, "eval_loss": 3.4806971549987793, "eval_runtime": 153.7786, "eval_samples_per_second": 376.639, "eval_steps_per_second": 5.885, "step": 55779 }, { "epoch": 3.01, "learning_rate": 0.0009294356499735185, "loss": 3.2367, "step": 56000 }, { "epoch": 3.07, "learning_rate": 0.0009264962043194257, "loss": 3.1899, "step": 57000 }, { "epoch": 3.12, "learning_rate": 0.0009235538162772906, "loss": 3.1955, "step": 58000 }, { "epoch": 3.17, "learning_rate": 0.00092061731301124, "loss": 3.1993, "step": 59000 }, { "epoch": 3.23, "learning_rate": 0.0009176749249691049, "loss": 3.1937, "step": 60000 }, { "epoch": 3.28, "learning_rate": 0.0009147325369269699, "loss": 3.196, "step": 61000 }, { "epoch": 3.33, "learning_rate": 0.0009117930912728771, "loss": 3.1964, "step": 62000 }, { "epoch": 3.39, "learning_rate": 0.0009088507032307421, "loss": 3.2007, "step": 63000 }, { "epoch": 3.44, "learning_rate": 0.0009059083151886071, "loss": 3.1925, "step": 64000 }, { "epoch": 3.5, "learning_rate": 0.000902965927146472, "loss": 3.193, "step": 65000 }, { "epoch": 3.55, "learning_rate": 0.0009000264814923792, "loss": 3.1927, "step": 66000 }, { "epoch": 3.6, "learning_rate": 0.0008970840934502442, "loss": 3.1868, "step": 67000 }, { "epoch": 3.66, "learning_rate": 0.0008941446477961514, "loss": 3.1813, "step": 68000 }, { "epoch": 3.71, "learning_rate": 0.0008912022597540164, "loss": 3.1868, "step": 69000 }, { "epoch": 3.76, "learning_rate": 0.0008882628140999234, "loss": 3.1849, "step": 70000 }, { "epoch": 3.82, "learning_rate": 0.0008853204260577885, "loss": 3.1809, "step": 71000 }, { "epoch": 3.87, "learning_rate": 0.0008823809804036956, "loss": 3.1804, "step": 72000 }, { "epoch": 3.93, "learning_rate": 0.0008794385923615607, "loss": 3.1842, "step": 73000 }, { "epoch": 3.98, "learning_rate": 0.0008764991467074678, "loss": 3.1845, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.3967745643436161, "eval_loss": 3.4469034671783447, "eval_runtime": 153.5493, "eval_samples_per_second": 377.201, "eval_steps_per_second": 5.894, "step": 74372 }, { "epoch": 4.03, "learning_rate": 0.0008735567586653328, "loss": 3.1352, "step": 75000 }, { "epoch": 4.09, "learning_rate": 0.0008706143706231978, "loss": 3.1097, "step": 76000 }, { "epoch": 4.14, "learning_rate": 0.0008676719825810628, "loss": 3.1133, "step": 77000 }, { "epoch": 4.2, "learning_rate": 0.0008647354793150121, "loss": 3.1205, "step": 78000 }, { "epoch": 4.25, "learning_rate": 0.0008617930912728771, "loss": 3.1258, "step": 79000 }, { "epoch": 4.3, "learning_rate": 0.0008588536456187842, "loss": 3.1284, "step": 80000 }, { "epoch": 4.36, "learning_rate": 0.0008559112575766492, "loss": 3.1247, "step": 81000 }, { "epoch": 4.41, "learning_rate": 0.0008529688695345142, "loss": 3.1244, "step": 82000 }, { "epoch": 4.46, "learning_rate": 0.0008500264814923793, "loss": 3.1273, "step": 83000 }, { "epoch": 4.52, "learning_rate": 0.0008470870358382864, "loss": 3.1249, "step": 84000 }, { "epoch": 4.57, "learning_rate": 0.0008441446477961514, "loss": 3.1289, "step": 85000 }, { "epoch": 4.63, "learning_rate": 0.0008412022597540165, "loss": 3.1241, "step": 86000 }, { "epoch": 4.68, "learning_rate": 0.0008382657564879656, "loss": 3.1304, "step": 87000 }, { "epoch": 4.73, "learning_rate": 0.0008353233684458307, "loss": 3.1271, "step": 88000 }, { "epoch": 4.79, "learning_rate": 0.0008323839227917378, "loss": 3.1215, "step": 89000 }, { "epoch": 4.84, "learning_rate": 0.0008294415347496028, "loss": 3.1234, "step": 90000 }, { "epoch": 4.89, "learning_rate": 0.0008264991467074679, "loss": 3.1243, "step": 91000 }, { "epoch": 4.95, "learning_rate": 0.0008235567586653327, "loss": 3.1246, "step": 92000 }, { "epoch": 5.0, "eval_accuracy": 0.40143053666186035, "eval_loss": 3.416903018951416, "eval_runtime": 153.8466, "eval_samples_per_second": 376.472, "eval_steps_per_second": 5.882, "step": 92965 }, { "epoch": 5.0, "learning_rate": 0.00082061731301124, "loss": 3.1216, "step": 93000 }, { "epoch": 5.06, "learning_rate": 0.0008176749249691049, "loss": 3.0534, "step": 94000 }, { "epoch": 5.11, "learning_rate": 0.0008147325369269699, "loss": 3.0622, "step": 95000 }, { "epoch": 5.16, "learning_rate": 0.000811790148884835, "loss": 3.0629, "step": 96000 }, { "epoch": 5.22, "learning_rate": 0.000808850703230742, "loss": 3.0635, "step": 97000 }, { "epoch": 5.27, "learning_rate": 0.0008059112575766492, "loss": 3.0733, "step": 98000 }, { "epoch": 5.32, "learning_rate": 0.0008029688695345142, "loss": 3.0755, "step": 99000 }, { "epoch": 5.38, "learning_rate": 0.0008000264814923792, "loss": 3.0766, "step": 100000 }, { "epoch": 5.43, "learning_rate": 0.0007970840934502443, "loss": 3.0731, "step": 101000 }, { "epoch": 5.49, "learning_rate": 0.0007941417054081093, "loss": 3.078, "step": 102000 }, { "epoch": 5.54, "learning_rate": 0.0007911993173659742, "loss": 3.0757, "step": 103000 }, { "epoch": 5.59, "learning_rate": 0.0007882598717118814, "loss": 3.0787, "step": 104000 }, { "epoch": 5.65, "learning_rate": 0.0007853204260577885, "loss": 3.0817, "step": 105000 }, { "epoch": 5.7, "learning_rate": 0.0007823780380156536, "loss": 3.0789, "step": 106000 }, { "epoch": 5.75, "learning_rate": 0.0007794385923615607, "loss": 3.0801, "step": 107000 }, { "epoch": 5.81, "learning_rate": 0.0007764962043194257, "loss": 3.0792, "step": 108000 }, { "epoch": 5.86, "learning_rate": 0.0007735538162772907, "loss": 3.0791, "step": 109000 }, { "epoch": 5.92, "learning_rate": 0.0007706143706231978, "loss": 3.0793, "step": 110000 }, { "epoch": 5.97, "learning_rate": 0.0007676719825810629, "loss": 3.0823, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.40345429411247413, "eval_loss": 3.3872547149658203, "eval_runtime": 153.6203, "eval_samples_per_second": 377.027, "eval_steps_per_second": 5.891, "step": 111558 }, { "epoch": 6.02, "learning_rate": 0.0007647295945389278, "loss": 3.0473, "step": 112000 }, { "epoch": 6.08, "learning_rate": 0.0007617872064967927, "loss": 3.0106, "step": 113000 }, { "epoch": 6.13, "learning_rate": 0.0007588507032307421, "loss": 3.0174, "step": 114000 }, { "epoch": 6.19, "learning_rate": 0.0007559083151886071, "loss": 3.0203, "step": 115000 }, { "epoch": 6.24, "learning_rate": 0.0007529659271464721, "loss": 3.0278, "step": 116000 }, { "epoch": 6.29, "learning_rate": 0.0007500235391043371, "loss": 3.0312, "step": 117000 }, { "epoch": 6.35, "learning_rate": 0.000747081151062202, "loss": 3.0315, "step": 118000 }, { "epoch": 6.4, "learning_rate": 0.0007441417054081092, "loss": 3.0355, "step": 119000 }, { "epoch": 6.45, "learning_rate": 0.0007412022597540163, "loss": 3.0334, "step": 120000 }, { "epoch": 6.51, "learning_rate": 0.0007382598717118814, "loss": 3.0394, "step": 121000 }, { "epoch": 6.56, "learning_rate": 0.0007353174836697464, "loss": 3.039, "step": 122000 }, { "epoch": 6.62, "learning_rate": 0.0007323780380156535, "loss": 3.039, "step": 123000 }, { "epoch": 6.67, "learning_rate": 0.0007294356499735185, "loss": 3.0429, "step": 124000 }, { "epoch": 6.72, "learning_rate": 0.0007264932619313835, "loss": 3.0438, "step": 125000 }, { "epoch": 6.78, "learning_rate": 0.0007235538162772907, "loss": 3.0439, "step": 126000 }, { "epoch": 6.83, "learning_rate": 0.0007206114282351557, "loss": 3.0431, "step": 127000 }, { "epoch": 6.88, "learning_rate": 0.0007176690401930207, "loss": 3.0421, "step": 128000 }, { "epoch": 6.94, "learning_rate": 0.0007147266521508858, "loss": 3.044, "step": 129000 }, { "epoch": 6.99, "learning_rate": 0.000711793091272877, "loss": 3.0457, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.40529780520669445, "eval_loss": 3.3856544494628906, "eval_runtime": 153.5426, "eval_samples_per_second": 377.218, "eval_steps_per_second": 5.894, "step": 130151 }, { "epoch": 7.05, "learning_rate": 0.0007088507032307421, "loss": 2.9788, "step": 131000 }, { "epoch": 7.1, "learning_rate": 0.0007059083151886071, "loss": 2.9803, "step": 132000 }, { "epoch": 7.15, "learning_rate": 0.0007029659271464721, "loss": 2.9871, "step": 133000 }, { "epoch": 7.21, "learning_rate": 0.0007000235391043372, "loss": 2.9897, "step": 134000 }, { "epoch": 7.26, "learning_rate": 0.000697081151062202, "loss": 2.9932, "step": 135000 }, { "epoch": 7.31, "learning_rate": 0.0006941417054081093, "loss": 2.9984, "step": 136000 }, { "epoch": 7.37, "learning_rate": 0.0006911993173659742, "loss": 2.9945, "step": 137000 }, { "epoch": 7.42, "learning_rate": 0.0006882598717118814, "loss": 3.0014, "step": 138000 }, { "epoch": 7.48, "learning_rate": 0.0006853174836697464, "loss": 3.004, "step": 139000 }, { "epoch": 7.53, "learning_rate": 0.0006823750956276113, "loss": 3.0046, "step": 140000 }, { "epoch": 7.58, "learning_rate": 0.0006794356499735185, "loss": 3.0085, "step": 141000 }, { "epoch": 7.64, "learning_rate": 0.0006764932619313835, "loss": 3.0102, "step": 142000 }, { "epoch": 7.69, "learning_rate": 0.0006735508738892485, "loss": 3.0089, "step": 143000 }, { "epoch": 7.74, "learning_rate": 0.0006706084858471136, "loss": 3.0119, "step": 144000 }, { "epoch": 7.8, "learning_rate": 0.0006676690401930206, "loss": 3.0145, "step": 145000 }, { "epoch": 7.85, "learning_rate": 0.0006647266521508856, "loss": 3.0094, "step": 146000 }, { "epoch": 7.91, "learning_rate": 0.0006617872064967928, "loss": 3.013, "step": 147000 }, { "epoch": 7.96, "learning_rate": 0.0006588477608426999, "loss": 3.0112, "step": 148000 }, { "epoch": 8.0, "eval_accuracy": 0.40697128144141725, "eval_loss": 3.351954936981201, "eval_runtime": 153.9341, "eval_samples_per_second": 376.258, "eval_steps_per_second": 5.879, "step": 148744 }, { "epoch": 8.01, "learning_rate": 0.000655905372800565, "loss": 2.9938, "step": 149000 }, { "epoch": 8.07, "learning_rate": 0.00065296298475843, "loss": 2.9454, "step": 150000 }, { "epoch": 8.12, "learning_rate": 0.0006500205967162949, "loss": 2.9538, "step": 151000 }, { "epoch": 8.18, "learning_rate": 0.0006470811510622021, "loss": 2.957, "step": 152000 }, { "epoch": 8.23, "learning_rate": 0.0006441417054081092, "loss": 2.959, "step": 153000 }, { "epoch": 8.28, "learning_rate": 0.0006411993173659743, "loss": 2.9678, "step": 154000 }, { "epoch": 8.34, "learning_rate": 0.0006382569293238393, "loss": 2.9651, "step": 155000 }, { "epoch": 8.39, "learning_rate": 0.0006353145412817042, "loss": 2.9725, "step": 156000 }, { "epoch": 8.44, "learning_rate": 0.0006323750956276114, "loss": 2.9723, "step": 157000 }, { "epoch": 8.5, "learning_rate": 0.0006294327075854764, "loss": 2.9785, "step": 158000 }, { "epoch": 8.55, "learning_rate": 0.0006264903195433414, "loss": 2.9773, "step": 159000 }, { "epoch": 8.61, "learning_rate": 0.0006235508738892486, "loss": 2.9782, "step": 160000 }, { "epoch": 8.66, "learning_rate": 0.0006206084858471134, "loss": 2.9797, "step": 161000 }, { "epoch": 8.71, "learning_rate": 0.0006176690401930206, "loss": 2.9821, "step": 162000 }, { "epoch": 8.77, "learning_rate": 0.0006147266521508856, "loss": 2.9781, "step": 163000 }, { "epoch": 8.82, "learning_rate": 0.0006117872064967928, "loss": 2.9783, "step": 164000 }, { "epoch": 8.87, "learning_rate": 0.0006088448184546578, "loss": 2.9839, "step": 165000 }, { "epoch": 8.93, "learning_rate": 0.0006059024304125227, "loss": 2.9802, "step": 166000 }, { "epoch": 8.98, "learning_rate": 0.0006029629847584299, "loss": 2.9878, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.40724766366661397, "eval_loss": 3.373262882232666, "eval_runtime": 153.9138, "eval_samples_per_second": 376.308, "eval_steps_per_second": 5.88, "step": 167337 }, { "epoch": 9.04, "learning_rate": 0.000600023539104337, "loss": 2.9425, "step": 168000 }, { "epoch": 9.09, "learning_rate": 0.0005970811510622021, "loss": 2.9186, "step": 169000 }, { "epoch": 9.14, "learning_rate": 0.0005941387630200671, "loss": 2.9267, "step": 170000 }, { "epoch": 9.2, "learning_rate": 0.0005911963749779321, "loss": 2.9365, "step": 171000 }, { "epoch": 9.25, "learning_rate": 0.0005882539869357971, "loss": 2.936, "step": 172000 }, { "epoch": 9.3, "learning_rate": 0.0005853115988936621, "loss": 2.9392, "step": 173000 }, { "epoch": 9.36, "learning_rate": 0.0005823721532395693, "loss": 2.9433, "step": 174000 }, { "epoch": 9.41, "learning_rate": 0.0005794327075854764, "loss": 2.9456, "step": 175000 }, { "epoch": 9.47, "learning_rate": 0.0005764903195433414, "loss": 2.9453, "step": 176000 }, { "epoch": 9.52, "learning_rate": 0.0005735479315012065, "loss": 2.95, "step": 177000 }, { "epoch": 9.57, "learning_rate": 0.0005706055434590713, "loss": 2.9531, "step": 178000 }, { "epoch": 9.63, "learning_rate": 0.0005676660978049786, "loss": 2.9517, "step": 179000 }, { "epoch": 9.68, "learning_rate": 0.0005647237097628435, "loss": 2.9523, "step": 180000 }, { "epoch": 9.73, "learning_rate": 0.0005617813217207085, "loss": 2.9557, "step": 181000 }, { "epoch": 9.79, "learning_rate": 0.0005588418760666157, "loss": 2.9531, "step": 182000 }, { "epoch": 9.84, "learning_rate": 0.0005559024304125228, "loss": 2.9572, "step": 183000 }, { "epoch": 9.9, "learning_rate": 0.0005529600423703878, "loss": 2.9587, "step": 184000 }, { "epoch": 9.95, "learning_rate": 0.0005500176543282528, "loss": 2.96, "step": 185000 }, { "epoch": 10.0, "eval_accuracy": 0.4083363301613423, "eval_loss": 3.3502931594848633, "eval_runtime": 153.7858, "eval_samples_per_second": 376.621, "eval_steps_per_second": 5.885, "step": 185930 }, { "epoch": 10.0, "learning_rate": 0.0005470782086741599, "loss": 2.9556, "step": 186000 }, { "epoch": 10.06, "learning_rate": 0.000544135820632025, "loss": 2.8933, "step": 187000 }, { "epoch": 10.11, "learning_rate": 0.0005411934325898899, "loss": 2.9007, "step": 188000 }, { "epoch": 10.17, "learning_rate": 0.0005382510445477549, "loss": 2.9058, "step": 189000 }, { "epoch": 10.22, "learning_rate": 0.0005353145412817042, "loss": 2.9129, "step": 190000 }, { "epoch": 10.27, "learning_rate": 0.0005323721532395692, "loss": 2.9168, "step": 191000 }, { "epoch": 10.33, "learning_rate": 0.0005294297651974343, "loss": 2.9153, "step": 192000 }, { "epoch": 10.38, "learning_rate": 0.0005264873771552992, "loss": 2.9197, "step": 193000 }, { "epoch": 10.43, "learning_rate": 0.0005235479315012064, "loss": 2.9233, "step": 194000 }, { "epoch": 10.49, "learning_rate": 0.0005206084858471135, "loss": 2.9206, "step": 195000 }, { "epoch": 10.54, "learning_rate": 0.0005176690401930207, "loss": 2.923, "step": 196000 }, { "epoch": 10.6, "learning_rate": 0.0005147266521508857, "loss": 2.9268, "step": 197000 }, { "epoch": 10.65, "learning_rate": 0.0005117842641087506, "loss": 2.9284, "step": 198000 }, { "epoch": 10.7, "learning_rate": 0.0005088418760666157, "loss": 2.9327, "step": 199000 }, { "epoch": 10.76, "learning_rate": 0.0005058994880244807, "loss": 2.9307, "step": 200000 }, { "epoch": 10.81, "learning_rate": 0.0005029570999823457, "loss": 2.9309, "step": 201000 }, { "epoch": 10.86, "learning_rate": 0.0005000147119402107, "loss": 2.9368, "step": 202000 }, { "epoch": 10.92, "learning_rate": 0.0004970752662861179, "loss": 2.9346, "step": 203000 }, { "epoch": 10.97, "learning_rate": 0.0004941358206320249, "loss": 2.938, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.40840062228324636, "eval_loss": 3.3663504123687744, "eval_runtime": 153.6763, "eval_samples_per_second": 376.89, "eval_steps_per_second": 5.889, "step": 204523 }, { "epoch": 11.03, "learning_rate": 0.0004911934325898899, "loss": 2.9043, "step": 205000 }, { "epoch": 11.08, "learning_rate": 0.0004882539869357971, "loss": 2.8785, "step": 206000 }, { "epoch": 11.13, "learning_rate": 0.00048531159889366214, "loss": 2.881, "step": 207000 }, { "epoch": 11.19, "learning_rate": 0.0004823692108515271, "loss": 2.8844, "step": 208000 }, { "epoch": 11.24, "learning_rate": 0.00047942976519743425, "loss": 2.8866, "step": 209000 }, { "epoch": 11.29, "learning_rate": 0.0004764873771552993, "loss": 2.8919, "step": 210000 }, { "epoch": 11.35, "learning_rate": 0.00047354498911316426, "loss": 2.8931, "step": 211000 }, { "epoch": 11.4, "learning_rate": 0.00047060260107102924, "loss": 2.8974, "step": 212000 }, { "epoch": 11.46, "learning_rate": 0.00046766315541693637, "loss": 2.8975, "step": 213000 }, { "epoch": 11.51, "learning_rate": 0.0004647207673748014, "loss": 2.9048, "step": 214000 }, { "epoch": 11.56, "learning_rate": 0.00046178132172070853, "loss": 2.9042, "step": 215000 }, { "epoch": 11.62, "learning_rate": 0.00045883893367857357, "loss": 2.9049, "step": 216000 }, { "epoch": 11.67, "learning_rate": 0.00045589654563643855, "loss": 2.91, "step": 217000 }, { "epoch": 11.72, "learning_rate": 0.00045295415759430353, "loss": 2.9081, "step": 218000 }, { "epoch": 11.78, "learning_rate": 0.0004500176543282528, "loss": 2.9122, "step": 219000 }, { "epoch": 11.83, "learning_rate": 0.00044707526628611784, "loss": 2.9081, "step": 220000 }, { "epoch": 11.89, "learning_rate": 0.0004441328782439829, "loss": 2.9113, "step": 221000 }, { "epoch": 11.94, "learning_rate": 0.00044119343258988995, "loss": 2.9173, "step": 222000 }, { "epoch": 11.99, "learning_rate": 0.00043825104454775493, "loss": 2.9158, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.409260269087723, "eval_loss": 3.3659985065460205, "eval_runtime": 153.6366, "eval_samples_per_second": 376.987, "eval_steps_per_second": 5.891, "step": 223116 }, { "epoch": 12.05, "learning_rate": 0.0004353115988936621, "loss": 2.8563, "step": 224000 }, { "epoch": 12.1, "learning_rate": 0.0004323692108515271, "loss": 2.8596, "step": 225000 }, { "epoch": 12.16, "learning_rate": 0.0004294268228093921, "loss": 2.8624, "step": 226000 }, { "epoch": 12.21, "learning_rate": 0.0004264873771552992, "loss": 2.8647, "step": 227000 }, { "epoch": 12.26, "learning_rate": 0.00042354498911316424, "loss": 2.8693, "step": 228000 }, { "epoch": 12.32, "learning_rate": 0.0004206026010710293, "loss": 2.8721, "step": 229000 }, { "epoch": 12.37, "learning_rate": 0.0004176602130288943, "loss": 2.8762, "step": 230000 }, { "epoch": 12.42, "learning_rate": 0.0004147207673748014, "loss": 2.8763, "step": 231000 }, { "epoch": 12.48, "learning_rate": 0.00041177837933266637, "loss": 2.8808, "step": 232000 }, { "epoch": 12.53, "learning_rate": 0.0004088359912905314, "loss": 2.8784, "step": 233000 }, { "epoch": 12.59, "learning_rate": 0.00040589360324839644, "loss": 2.8828, "step": 234000 }, { "epoch": 12.64, "learning_rate": 0.00040295415759430357, "loss": 2.8865, "step": 235000 }, { "epoch": 12.69, "learning_rate": 0.00040001176955216855, "loss": 2.8932, "step": 236000 }, { "epoch": 12.75, "learning_rate": 0.0003970723238980757, "loss": 2.8908, "step": 237000 }, { "epoch": 12.8, "learning_rate": 0.0003941328782439828, "loss": 2.8887, "step": 238000 }, { "epoch": 12.85, "learning_rate": 0.00039119049020184784, "loss": 2.8927, "step": 239000 }, { "epoch": 12.91, "learning_rate": 0.00038824810215971287, "loss": 2.8926, "step": 240000 }, { "epoch": 12.96, "learning_rate": 0.00038530865650561995, "loss": 2.8919, "step": 241000 }, { "epoch": 13.0, "eval_accuracy": 0.41007524059327993, "eval_loss": 3.356402635574341, "eval_runtime": 153.9338, "eval_samples_per_second": 376.259, "eval_steps_per_second": 5.879, "step": 241709 }, { "epoch": 13.02, "learning_rate": 0.000382366268463485, "loss": 2.8744, "step": 242000 }, { "epoch": 13.07, "learning_rate": 0.00037942388042134996, "loss": 2.8308, "step": 243000 }, { "epoch": 13.12, "learning_rate": 0.00037648443476725714, "loss": 2.8409, "step": 244000 }, { "epoch": 13.18, "learning_rate": 0.00037354204672512207, "loss": 2.8463, "step": 245000 }, { "epoch": 13.23, "learning_rate": 0.0003705996586829871, "loss": 2.8508, "step": 246000 }, { "epoch": 13.28, "learning_rate": 0.00036765727064085214, "loss": 2.8527, "step": 247000 }, { "epoch": 13.34, "learning_rate": 0.0003647148825987171, "loss": 2.8531, "step": 248000 }, { "epoch": 13.39, "learning_rate": 0.0003617754369446243, "loss": 2.8581, "step": 249000 }, { "epoch": 13.45, "learning_rate": 0.0003588359912905314, "loss": 2.8592, "step": 250000 }, { "epoch": 13.5, "learning_rate": 0.0003558936032483964, "loss": 2.8585, "step": 251000 }, { "epoch": 13.55, "learning_rate": 0.0003529512152062614, "loss": 2.8675, "step": 252000 }, { "epoch": 13.61, "learning_rate": 0.00035000882716412643, "loss": 2.8613, "step": 253000 }, { "epoch": 13.66, "learning_rate": 0.00034706938151003356, "loss": 2.867, "step": 254000 }, { "epoch": 13.71, "learning_rate": 0.0003441299358559407, "loss": 2.8679, "step": 255000 }, { "epoch": 13.77, "learning_rate": 0.00034118754781380567, "loss": 2.8687, "step": 256000 }, { "epoch": 13.82, "learning_rate": 0.0003382451597716707, "loss": 2.8717, "step": 257000 }, { "epoch": 13.88, "learning_rate": 0.00033530277172953574, "loss": 2.8678, "step": 258000 }, { "epoch": 13.93, "learning_rate": 0.00033236038368740067, "loss": 2.87, "step": 259000 }, { "epoch": 13.98, "learning_rate": 0.0003294179956452657, "loss": 2.8735, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.4106870570553281, "eval_loss": 3.3567473888397217, "eval_runtime": 153.9192, "eval_samples_per_second": 376.295, "eval_steps_per_second": 5.88, "step": 260302 }, { "epoch": 14.04, "learning_rate": 0.00032647854999117283, "loss": 2.8363, "step": 261000 }, { "epoch": 14.09, "learning_rate": 0.00032353616194903787, "loss": 2.8187, "step": 262000 }, { "epoch": 14.15, "learning_rate": 0.00032059377390690285, "loss": 2.8209, "step": 263000 }, { "epoch": 14.2, "learning_rate": 0.00031765432825281, "loss": 2.8269, "step": 264000 }, { "epoch": 14.25, "learning_rate": 0.000314711940210675, "loss": 2.8333, "step": 265000 }, { "epoch": 14.31, "learning_rate": 0.00031177249455658214, "loss": 2.8345, "step": 266000 }, { "epoch": 14.36, "learning_rate": 0.0003088301065144472, "loss": 2.8346, "step": 267000 }, { "epoch": 14.41, "learning_rate": 0.0003058877184723121, "loss": 2.8415, "step": 268000 }, { "epoch": 14.47, "learning_rate": 0.00030294533043017714, "loss": 2.8395, "step": 269000 }, { "epoch": 14.52, "learning_rate": 0.00030000588477608426, "loss": 2.8446, "step": 270000 }, { "epoch": 14.58, "learning_rate": 0.00029706643912199145, "loss": 2.8475, "step": 271000 }, { "epoch": 14.63, "learning_rate": 0.00029412405107985643, "loss": 2.8464, "step": 272000 }, { "epoch": 14.68, "learning_rate": 0.0002911816630377214, "loss": 2.8431, "step": 273000 }, { "epoch": 14.74, "learning_rate": 0.00028823927499558645, "loss": 2.8475, "step": 274000 }, { "epoch": 14.79, "learning_rate": 0.00028529982934149357, "loss": 2.8491, "step": 275000 }, { "epoch": 14.84, "learning_rate": 0.0002823603836874007, "loss": 2.854, "step": 276000 }, { "epoch": 14.9, "learning_rate": 0.00027941799564526574, "loss": 2.8498, "step": 277000 }, { "epoch": 14.95, "learning_rate": 0.0002764756076031307, "loss": 2.8562, "step": 278000 }, { "epoch": 15.0, "eval_accuracy": 0.4100232425761914, "eval_loss": 3.3675010204315186, "eval_runtime": 153.998, "eval_samples_per_second": 376.102, "eval_steps_per_second": 5.877, "step": 278895 }, { "epoch": 15.01, "learning_rate": 0.0002735332195609957, "loss": 2.8471, "step": 279000 }, { "epoch": 15.06, "learning_rate": 0.0002705937739069029, "loss": 2.805, "step": 280000 }, { "epoch": 15.11, "learning_rate": 0.00026765138586476786, "loss": 2.8089, "step": 281000 }, { "epoch": 15.17, "learning_rate": 0.00026470899782263284, "loss": 2.8092, "step": 282000 }, { "epoch": 15.22, "learning_rate": 0.00026176955216853997, "loss": 2.8135, "step": 283000 }, { "epoch": 15.27, "learning_rate": 0.000258827164126405, "loss": 2.8146, "step": 284000 }, { "epoch": 15.33, "learning_rate": 0.00025588477608427004, "loss": 2.8197, "step": 285000 }, { "epoch": 15.38, "learning_rate": 0.00025294533043017717, "loss": 2.8212, "step": 286000 }, { "epoch": 15.44, "learning_rate": 0.00025000294238804215, "loss": 2.8213, "step": 287000 }, { "epoch": 15.49, "learning_rate": 0.00024706055434590713, "loss": 2.8222, "step": 288000 }, { "epoch": 15.54, "learning_rate": 0.0002441211086918143, "loss": 2.8197, "step": 289000 }, { "epoch": 15.6, "learning_rate": 0.00024117872064967927, "loss": 2.825, "step": 290000 }, { "epoch": 15.65, "learning_rate": 0.0002382363326075443, "loss": 2.8267, "step": 291000 }, { "epoch": 15.7, "learning_rate": 0.0002352968869534514, "loss": 2.826, "step": 292000 }, { "epoch": 15.76, "learning_rate": 0.00023235449891131644, "loss": 2.8306, "step": 293000 }, { "epoch": 15.81, "learning_rate": 0.00022941211086918142, "loss": 2.8308, "step": 294000 }, { "epoch": 15.87, "learning_rate": 0.00022646972282704646, "loss": 2.8306, "step": 295000 }, { "epoch": 15.92, "learning_rate": 0.00022353027717295358, "loss": 2.8337, "step": 296000 }, { "epoch": 15.97, "learning_rate": 0.00022058788913081857, "loss": 2.8344, "step": 297000 }, { "epoch": 16.0, "eval_accuracy": 0.4103203836996831, "eval_loss": 3.3702094554901123, "eval_runtime": 154.2186, "eval_samples_per_second": 375.564, "eval_steps_per_second": 5.868, "step": 297488 }, { "epoch": 16.03, "learning_rate": 0.00021764550108868357, "loss": 2.8099, "step": 298000 }, { "epoch": 16.08, "learning_rate": 0.0002147060554345907, "loss": 2.7897, "step": 299000 }, { "epoch": 16.14, "learning_rate": 0.00021176366739245574, "loss": 2.7917, "step": 300000 }, { "epoch": 16.19, "learning_rate": 0.00020882127935032072, "loss": 2.7953, "step": 301000 }, { "epoch": 16.24, "learning_rate": 0.00020588183369622787, "loss": 2.7966, "step": 302000 }, { "epoch": 16.3, "learning_rate": 0.00020293944565409286, "loss": 2.8025, "step": 303000 }, { "epoch": 16.35, "learning_rate": 0.0002, "loss": 2.8022, "step": 304000 }, { "epoch": 16.4, "learning_rate": 0.00019705761195786502, "loss": 2.8053, "step": 305000 }, { "epoch": 16.46, "learning_rate": 0.00019411522391573, "loss": 2.8051, "step": 306000 }, { "epoch": 16.51, "learning_rate": 0.000191172835873595, "loss": 2.8114, "step": 307000 }, { "epoch": 16.57, "learning_rate": 0.00018823044783146002, "loss": 2.8063, "step": 308000 }, { "epoch": 16.62, "learning_rate": 0.0001852939445654093, "loss": 2.8062, "step": 309000 }, { "epoch": 16.67, "learning_rate": 0.0001823515565232743, "loss": 2.8081, "step": 310000 }, { "epoch": 16.73, "learning_rate": 0.0001794091684811393, "loss": 2.8049, "step": 311000 }, { "epoch": 16.78, "learning_rate": 0.0001764667804390043, "loss": 2.8113, "step": 312000 }, { "epoch": 16.83, "learning_rate": 0.00017352733478491144, "loss": 2.8138, "step": 313000 }, { "epoch": 16.89, "learning_rate": 0.00017058494674277645, "loss": 2.8114, "step": 314000 }, { "epoch": 16.94, "learning_rate": 0.00016764550108868358, "loss": 2.8168, "step": 315000 }, { "epoch": 17.0, "learning_rate": 0.0001647031130465486, "loss": 2.814, "step": 316000 }, { "epoch": 17.0, "eval_accuracy": 0.4100904906577232, "eval_loss": 3.380819797515869, "eval_runtime": 153.6931, "eval_samples_per_second": 376.848, "eval_steps_per_second": 5.888, "step": 316081 }, { "epoch": 17.05, "learning_rate": 0.00016176072500441357, "loss": 2.7806, "step": 317000 }, { "epoch": 17.1, "learning_rate": 0.00015882422173836287, "loss": 2.7806, "step": 318000 }, { "epoch": 17.16, "learning_rate": 0.00015588183369622785, "loss": 2.7804, "step": 319000 }, { "epoch": 17.21, "learning_rate": 0.0001529394456540929, "loss": 2.7796, "step": 320000 }, { "epoch": 17.26, "learning_rate": 0.00014999705761195787, "loss": 2.7818, "step": 321000 }, { "epoch": 17.32, "learning_rate": 0.00014705761195786502, "loss": 2.7842, "step": 322000 }, { "epoch": 17.37, "learning_rate": 0.00014411816630377215, "loss": 2.7866, "step": 323000 }, { "epoch": 17.43, "learning_rate": 0.0001411787206496793, "loss": 2.7849, "step": 324000 }, { "epoch": 17.48, "learning_rate": 0.00013823633260754429, "loss": 2.7858, "step": 325000 }, { "epoch": 17.53, "learning_rate": 0.00013529394456540927, "loss": 2.7908, "step": 326000 }, { "epoch": 17.59, "learning_rate": 0.0001323515565232743, "loss": 2.7868, "step": 327000 }, { "epoch": 17.64, "learning_rate": 0.00012940916848113928, "loss": 2.7918, "step": 328000 }, { "epoch": 17.69, "learning_rate": 0.00012646972282704644, "loss": 2.7926, "step": 329000 }, { "epoch": 17.75, "learning_rate": 0.00012352733478491142, "loss": 2.7943, "step": 330000 }, { "epoch": 17.8, "learning_rate": 0.00012058494674277644, "loss": 2.7906, "step": 331000 }, { "epoch": 17.86, "learning_rate": 0.00011764255870064144, "loss": 2.796, "step": 332000 }, { "epoch": 17.91, "learning_rate": 0.00011470017065850645, "loss": 2.7958, "step": 333000 }, { "epoch": 17.96, "learning_rate": 0.00011175778261637146, "loss": 2.7973, "step": 334000 }, { "epoch": 18.0, "eval_accuracy": 0.4098012096996053, "eval_loss": 3.3934710025787354, "eval_runtime": 153.7409, "eval_samples_per_second": 376.731, "eval_steps_per_second": 5.887, "step": 334674 }, { "epoch": 18.02, "learning_rate": 0.0001088183369622786, "loss": 2.7829, "step": 335000 }, { "epoch": 18.07, "learning_rate": 0.00010587889130818574, "loss": 2.7617, "step": 336000 }, { "epoch": 18.13, "learning_rate": 0.00010293650326605072, "loss": 2.7652, "step": 337000 }, { "epoch": 18.18, "learning_rate": 9.999411522391573e-05, "loss": 2.7735, "step": 338000 }, { "epoch": 18.23, "learning_rate": 9.705172718178074e-05, "loss": 2.7722, "step": 339000 }, { "epoch": 18.29, "learning_rate": 9.411228152768788e-05, "loss": 2.7673, "step": 340000 }, { "epoch": 18.34, "learning_rate": 9.116989348555289e-05, "loss": 2.7696, "step": 341000 }, { "epoch": 18.39, "learning_rate": 8.822750544341788e-05, "loss": 2.7708, "step": 342000 }, { "epoch": 18.45, "learning_rate": 8.528805978932502e-05, "loss": 2.7714, "step": 343000 }, { "epoch": 18.5, "learning_rate": 8.234567174719002e-05, "loss": 2.7757, "step": 344000 }, { "epoch": 18.56, "learning_rate": 7.940622609309716e-05, "loss": 2.7743, "step": 345000 }, { "epoch": 18.61, "learning_rate": 7.646383805096215e-05, "loss": 2.7707, "step": 346000 }, { "epoch": 18.66, "learning_rate": 7.35243923968693e-05, "loss": 2.7734, "step": 347000 }, { "epoch": 18.72, "learning_rate": 7.05820043547343e-05, "loss": 2.7749, "step": 348000 }, { "epoch": 18.77, "learning_rate": 6.764255870064144e-05, "loss": 2.7769, "step": 349000 }, { "epoch": 18.82, "learning_rate": 6.470017065850645e-05, "loss": 2.7797, "step": 350000 }, { "epoch": 18.88, "learning_rate": 6.175778261637145e-05, "loss": 2.7753, "step": 351000 }, { "epoch": 18.93, "learning_rate": 5.881833696227858e-05, "loss": 2.7806, "step": 352000 }, { "epoch": 18.99, "learning_rate": 5.587889130818573e-05, "loss": 2.7732, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.4104149744077718, "eval_loss": 3.3887417316436768, "eval_runtime": 153.8157, "eval_samples_per_second": 376.548, "eval_steps_per_second": 5.884, "step": 353267 }, { "epoch": 19.04, "learning_rate": 5.293650326605073e-05, "loss": 2.7578, "step": 354000 }, { "epoch": 19.09, "learning_rate": 4.999411522391573e-05, "loss": 2.7527, "step": 355000 }, { "epoch": 19.15, "learning_rate": 4.7051727181780734e-05, "loss": 2.7525, "step": 356000 }, { "epoch": 19.2, "learning_rate": 4.411228152768787e-05, "loss": 2.7595, "step": 357000 }, { "epoch": 19.25, "learning_rate": 4.1169893485552876e-05, "loss": 2.7602, "step": 358000 }, { "epoch": 19.31, "learning_rate": 3.823044783146001e-05, "loss": 2.7569, "step": 359000 }, { "epoch": 19.36, "learning_rate": 3.528805978932502e-05, "loss": 2.7605, "step": 360000 }, { "epoch": 19.42, "learning_rate": 3.234567174719002e-05, "loss": 2.758, "step": 361000 }, { "epoch": 19.47, "learning_rate": 2.940622609309716e-05, "loss": 2.7595, "step": 362000 }, { "epoch": 19.52, "learning_rate": 2.6463838050962164e-05, "loss": 2.7584, "step": 363000 }, { "epoch": 19.58, "learning_rate": 2.35243923968693e-05, "loss": 2.7612, "step": 364000 }, { "epoch": 19.63, "learning_rate": 2.0584946742776438e-05, "loss": 2.7626, "step": 365000 }, { "epoch": 19.68, "learning_rate": 1.764255870064144e-05, "loss": 2.7573, "step": 366000 }, { "epoch": 19.74, "learning_rate": 1.4700170658506444e-05, "loss": 2.7586, "step": 367000 }, { "epoch": 19.79, "learning_rate": 1.1757782616371448e-05, "loss": 2.7569, "step": 368000 }, { "epoch": 19.85, "learning_rate": 8.815394574236451e-06, "loss": 2.761, "step": 369000 }, { "epoch": 19.9, "learning_rate": 5.8759489201435885e-06, "loss": 2.7599, "step": 370000 }, { "epoch": 19.95, "learning_rate": 2.933560878008592e-06, "loss": 2.7585, "step": 371000 }, { "epoch": 20.0, "eval_accuracy": 0.40999993080367236, "eval_loss": 3.400653839111328, "eval_runtime": 153.6941, "eval_samples_per_second": 376.846, "eval_steps_per_second": 5.888, "step": 371860 }, { "epoch": 20.0, "step": 371860, "total_flos": 1.56669402051072e+18, "train_loss": 3.027651528302192, "train_runtime": 81113.4919, "train_samples_per_second": 146.695, "train_steps_per_second": 4.584 } ], "logging_steps": 1000, "max_steps": 371860, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.56669402051072e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }