{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 744020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "learning_rate": 3.125e-05, "loss": 6.2858, "step": 1000 }, { "epoch": 0.05, "learning_rate": 6.25e-05, "loss": 5.118, "step": 2000 }, { "epoch": 0.08, "learning_rate": 9.375e-05, "loss": 4.8257, "step": 3000 }, { "epoch": 0.11, "learning_rate": 0.000125, "loss": 4.62, "step": 4000 }, { "epoch": 0.13, "learning_rate": 0.00015625, "loss": 4.4689, "step": 5000 }, { "epoch": 0.16, "learning_rate": 0.0001875, "loss": 4.3516, "step": 6000 }, { "epoch": 0.19, "learning_rate": 0.00021875, "loss": 4.2636, "step": 7000 }, { "epoch": 0.22, "learning_rate": 0.00025, "loss": 4.1906, "step": 8000 }, { "epoch": 0.24, "learning_rate": 0.00028125000000000003, "loss": 4.1354, "step": 9000 }, { "epoch": 0.27, "learning_rate": 0.0003125, "loss": 4.0582, "step": 10000 }, { "epoch": 0.3, "learning_rate": 0.00034365625, "loss": 3.9986, "step": 11000 }, { "epoch": 0.32, "learning_rate": 0.00037490625, "loss": 3.9535, "step": 12000 }, { "epoch": 0.35, "learning_rate": 0.00040615625, "loss": 3.9158, "step": 13000 }, { "epoch": 0.38, "learning_rate": 0.00043737500000000005, "loss": 3.8747, "step": 14000 }, { "epoch": 0.4, "learning_rate": 0.000468625, "loss": 3.8471, "step": 15000 }, { "epoch": 0.43, "learning_rate": 0.00049984375, "loss": 3.8178, "step": 16000 }, { "epoch": 0.46, "learning_rate": 0.00053109375, "loss": 3.794, "step": 17000 }, { "epoch": 0.48, "learning_rate": 0.0005623125, "loss": 3.7705, "step": 18000 }, { "epoch": 0.51, "learning_rate": 0.0005935625, "loss": 3.757, "step": 19000 }, { "epoch": 0.54, "learning_rate": 0.00062478125, "loss": 3.731, "step": 20000 }, { "epoch": 0.56, "learning_rate": 0.0006560312499999999, "loss": 3.7135, "step": 21000 }, { "epoch": 0.59, "learning_rate": 0.00068725, "loss": 3.6993, "step": 22000 }, { "epoch": 0.62, "learning_rate": 0.00071846875, "loss": 3.6745, "step": 23000 }, { "epoch": 0.65, "learning_rate": 0.00074971875, "loss": 3.668, "step": 24000 }, { "epoch": 0.67, "learning_rate": 0.0007809375, "loss": 3.6522, "step": 25000 }, { "epoch": 0.7, "learning_rate": 0.0008121875, "loss": 3.6422, "step": 26000 }, { "epoch": 0.73, "learning_rate": 0.0008434062500000001, "loss": 3.6284, "step": 27000 }, { "epoch": 0.75, "learning_rate": 0.00087465625, "loss": 3.6213, "step": 28000 }, { "epoch": 0.78, "learning_rate": 0.00090590625, "loss": 3.6071, "step": 29000 }, { "epoch": 0.81, "learning_rate": 0.000937125, "loss": 3.5975, "step": 30000 }, { "epoch": 0.83, "learning_rate": 0.000968375, "loss": 3.5921, "step": 31000 }, { "epoch": 0.86, "learning_rate": 0.00099959375, "loss": 3.5862, "step": 32000 }, { "epoch": 0.89, "learning_rate": 0.0009986138029830622, "loss": 3.5652, "step": 33000 }, { "epoch": 0.91, "learning_rate": 0.000997209348052021, "loss": 3.5558, "step": 34000 }, { "epoch": 0.94, "learning_rate": 0.0009958048931209798, "loss": 3.5398, "step": 35000 }, { "epoch": 0.97, "learning_rate": 0.0009944018426448695, "loss": 3.527, "step": 36000 }, { "epoch": 0.99, "learning_rate": 0.0009929973877138283, "loss": 3.5148, "step": 37000 }, { "epoch": 1.0, "eval_accuracy": 0.3670570705333534, "eval_loss": 3.7269980907440186, "eval_runtime": 147.4611, "eval_samples_per_second": 392.781, "eval_steps_per_second": 6.137, "step": 37201 }, { "epoch": 1.02, "learning_rate": 0.0009915929327827871, "loss": 3.4761, "step": 38000 }, { "epoch": 1.05, "learning_rate": 0.0009901898823066768, "loss": 3.4631, "step": 39000 }, { "epoch": 1.08, "learning_rate": 0.0009887854273756356, "loss": 3.4614, "step": 40000 }, { "epoch": 1.1, "learning_rate": 0.0009873809724445942, "loss": 3.4538, "step": 41000 }, { "epoch": 1.13, "learning_rate": 0.000985976517513553, "loss": 3.4587, "step": 42000 }, { "epoch": 1.16, "learning_rate": 0.0009845720625825118, "loss": 3.4417, "step": 43000 }, { "epoch": 1.18, "learning_rate": 0.0009831690121064015, "loss": 3.4351, "step": 44000 }, { "epoch": 1.21, "learning_rate": 0.0009817645571753603, "loss": 3.4312, "step": 45000 }, { "epoch": 1.24, "learning_rate": 0.0009803601022443191, "loss": 3.4235, "step": 46000 }, { "epoch": 1.26, "learning_rate": 0.0009789570517682088, "loss": 3.4191, "step": 47000 }, { "epoch": 1.29, "learning_rate": 0.0009775525968371674, "loss": 3.4077, "step": 48000 }, { "epoch": 1.32, "learning_rate": 0.0009761495463610572, "loss": 3.4119, "step": 49000 }, { "epoch": 1.34, "learning_rate": 0.000974745091430016, "loss": 3.392, "step": 50000 }, { "epoch": 1.37, "learning_rate": 0.0009733406364989748, "loss": 3.3985, "step": 51000 }, { "epoch": 1.4, "learning_rate": 0.0009719361815679335, "loss": 3.3884, "step": 52000 }, { "epoch": 1.42, "learning_rate": 0.0009705317266368922, "loss": 3.3792, "step": 53000 }, { "epoch": 1.45, "learning_rate": 0.000969127271705851, "loss": 3.3734, "step": 54000 }, { "epoch": 1.48, "learning_rate": 0.0009677242212297408, "loss": 3.3699, "step": 55000 }, { "epoch": 1.51, "learning_rate": 0.0009663197662986994, "loss": 3.3721, "step": 56000 }, { "epoch": 1.53, "learning_rate": 0.0009649167158225893, "loss": 3.3688, "step": 57000 }, { "epoch": 1.56, "learning_rate": 0.000963512260891548, "loss": 3.3598, "step": 58000 }, { "epoch": 1.59, "learning_rate": 0.0009621078059605067, "loss": 3.3608, "step": 59000 }, { "epoch": 1.61, "learning_rate": 0.0009607047554843966, "loss": 3.3529, "step": 60000 }, { "epoch": 1.64, "learning_rate": 0.0009593003005533553, "loss": 3.3448, "step": 61000 }, { "epoch": 1.67, "learning_rate": 0.000957895845622314, "loss": 3.3475, "step": 62000 }, { "epoch": 1.69, "learning_rate": 0.0009564927951462038, "loss": 3.3339, "step": 63000 }, { "epoch": 1.72, "learning_rate": 0.0009550883402151626, "loss": 3.3342, "step": 64000 }, { "epoch": 1.75, "learning_rate": 0.0009536852897390523, "loss": 3.3323, "step": 65000 }, { "epoch": 1.77, "learning_rate": 0.0009522808348080109, "loss": 3.3332, "step": 66000 }, { "epoch": 1.8, "learning_rate": 0.0009508763798769697, "loss": 3.32, "step": 67000 }, { "epoch": 1.83, "learning_rate": 0.0009494733294008595, "loss": 3.3214, "step": 68000 }, { "epoch": 1.85, "learning_rate": 0.0009480688744698182, "loss": 3.3226, "step": 69000 }, { "epoch": 1.88, "learning_rate": 0.000946664419538777, "loss": 3.3158, "step": 70000 }, { "epoch": 1.91, "learning_rate": 0.0009452613690626668, "loss": 3.3179, "step": 71000 }, { "epoch": 1.94, "learning_rate": 0.0009438569141316255, "loss": 3.3089, "step": 72000 }, { "epoch": 1.96, "learning_rate": 0.0009424524592005843, "loss": 3.3103, "step": 73000 }, { "epoch": 1.99, "learning_rate": 0.0009410480042695429, "loss": 3.3074, "step": 74000 }, { "epoch": 2.0, "eval_accuracy": 0.38968938775071477, "eval_loss": 3.484098196029663, "eval_runtime": 146.602, "eval_samples_per_second": 395.083, "eval_steps_per_second": 6.173, "step": 74402 }, { "epoch": 2.02, "learning_rate": 0.0009396435493385018, "loss": 3.2586, "step": 75000 }, { "epoch": 2.04, "learning_rate": 0.0009382419033173227, "loss": 3.2401, "step": 76000 }, { "epoch": 2.07, "learning_rate": 0.0009368374483862813, "loss": 3.2462, "step": 77000 }, { "epoch": 2.1, "learning_rate": 0.00093543299345524, "loss": 3.2422, "step": 78000 }, { "epoch": 2.12, "learning_rate": 0.0009340285385241988, "loss": 3.2446, "step": 79000 }, { "epoch": 2.15, "learning_rate": 0.0009326254880480886, "loss": 3.2477, "step": 80000 }, { "epoch": 2.18, "learning_rate": 0.0009312224375719783, "loss": 3.2454, "step": 81000 }, { "epoch": 2.2, "learning_rate": 0.000929817982640937, "loss": 3.2402, "step": 82000 }, { "epoch": 2.23, "learning_rate": 0.0009284135277098959, "loss": 3.2414, "step": 83000 }, { "epoch": 2.26, "learning_rate": 0.0009270104772337856, "loss": 3.2409, "step": 84000 }, { "epoch": 2.28, "learning_rate": 0.0009256060223027443, "loss": 3.2413, "step": 85000 }, { "epoch": 2.31, "learning_rate": 0.000924201567371703, "loss": 3.2392, "step": 86000 }, { "epoch": 2.34, "learning_rate": 0.0009227985168955928, "loss": 3.2399, "step": 87000 }, { "epoch": 2.37, "learning_rate": 0.0009213940619645515, "loss": 3.2404, "step": 88000 }, { "epoch": 2.39, "learning_rate": 0.0009199896070335103, "loss": 3.2342, "step": 89000 }, { "epoch": 2.42, "learning_rate": 0.000918585152102469, "loss": 3.2307, "step": 90000 }, { "epoch": 2.45, "learning_rate": 0.0009171821016263588, "loss": 3.2342, "step": 91000 }, { "epoch": 2.47, "learning_rate": 0.0009157790511502487, "loss": 3.2372, "step": 92000 }, { "epoch": 2.5, "learning_rate": 0.0009143745962192074, "loss": 3.2274, "step": 93000 }, { "epoch": 2.53, "learning_rate": 0.0009129715457430971, "loss": 3.2337, "step": 94000 }, { "epoch": 2.55, "learning_rate": 0.000911567090812056, "loss": 3.2228, "step": 95000 }, { "epoch": 2.58, "learning_rate": 0.0009101626358810146, "loss": 3.2285, "step": 96000 }, { "epoch": 2.61, "learning_rate": 0.0009087581809499733, "loss": 3.2247, "step": 97000 }, { "epoch": 2.63, "learning_rate": 0.000907355130473863, "loss": 3.2241, "step": 98000 }, { "epoch": 2.66, "learning_rate": 0.0009059520799977529, "loss": 3.2243, "step": 99000 }, { "epoch": 2.69, "learning_rate": 0.0009045476250667116, "loss": 3.221, "step": 100000 }, { "epoch": 2.71, "learning_rate": 0.0009031431701356703, "loss": 3.2195, "step": 101000 }, { "epoch": 2.74, "learning_rate": 0.0009017401196595602, "loss": 3.2168, "step": 102000 }, { "epoch": 2.77, "learning_rate": 0.0009003356647285189, "loss": 3.2185, "step": 103000 }, { "epoch": 2.8, "learning_rate": 0.0008989326142524087, "loss": 3.2177, "step": 104000 }, { "epoch": 2.82, "learning_rate": 0.0008975281593213675, "loss": 3.2188, "step": 105000 }, { "epoch": 2.85, "learning_rate": 0.0008961237043903261, "loss": 3.2152, "step": 106000 }, { "epoch": 2.88, "learning_rate": 0.0008947192494592848, "loss": 3.2146, "step": 107000 }, { "epoch": 2.9, "learning_rate": 0.0008933176034381056, "loss": 3.2083, "step": 108000 }, { "epoch": 2.93, "learning_rate": 0.0008919131485070644, "loss": 3.2108, "step": 109000 }, { "epoch": 2.96, "learning_rate": 0.0008905086935760231, "loss": 3.2138, "step": 110000 }, { "epoch": 2.98, "learning_rate": 0.0008891056430999129, "loss": 3.1988, "step": 111000 }, { "epoch": 3.0, "eval_accuracy": 0.3979050669647656, "eval_loss": 3.4299747943878174, "eval_runtime": 149.1667, "eval_samples_per_second": 388.29, "eval_steps_per_second": 6.067, "step": 111603 }, { "epoch": 3.01, "learning_rate": 0.0008877011881688717, "loss": 3.1786, "step": 112000 }, { "epoch": 3.04, "learning_rate": 0.0008862967332378304, "loss": 3.14, "step": 113000 }, { "epoch": 3.06, "learning_rate": 0.0008848936827617202, "loss": 3.1408, "step": 114000 }, { "epoch": 3.09, "learning_rate": 0.000883489227830679, "loss": 3.1491, "step": 115000 }, { "epoch": 3.12, "learning_rate": 0.0008820847728996376, "loss": 3.1428, "step": 116000 }, { "epoch": 3.15, "learning_rate": 0.0008806803179685963, "loss": 3.1515, "step": 117000 }, { "epoch": 3.17, "learning_rate": 0.0008792772674924862, "loss": 3.1396, "step": 118000 }, { "epoch": 3.2, "learning_rate": 0.0008778728125614449, "loss": 3.1518, "step": 119000 }, { "epoch": 3.23, "learning_rate": 0.0008764683576304036, "loss": 3.1464, "step": 120000 }, { "epoch": 3.25, "learning_rate": 0.0008750653071542935, "loss": 3.1562, "step": 121000 }, { "epoch": 3.28, "learning_rate": 0.0008736608522232522, "loss": 3.1454, "step": 122000 }, { "epoch": 3.31, "learning_rate": 0.0008722563972922109, "loss": 3.153, "step": 123000 }, { "epoch": 3.33, "learning_rate": 0.0008708547512710317, "loss": 3.1608, "step": 124000 }, { "epoch": 3.36, "learning_rate": 0.0008694502963399905, "loss": 3.1545, "step": 125000 }, { "epoch": 3.39, "learning_rate": 0.0008680458414089491, "loss": 3.1477, "step": 126000 }, { "epoch": 3.41, "learning_rate": 0.0008666427909328389, "loss": 3.1499, "step": 127000 }, { "epoch": 3.44, "learning_rate": 0.0008652383360017977, "loss": 3.1532, "step": 128000 }, { "epoch": 3.47, "learning_rate": 0.0008638338810707564, "loss": 3.1507, "step": 129000 }, { "epoch": 3.49, "learning_rate": 0.0008624308305946462, "loss": 3.1545, "step": 130000 }, { "epoch": 3.52, "learning_rate": 0.000861026375663605, "loss": 3.1478, "step": 131000 }, { "epoch": 3.55, "learning_rate": 0.0008596233251874948, "loss": 3.157, "step": 132000 }, { "epoch": 3.58, "learning_rate": 0.0008582188702564535, "loss": 3.1439, "step": 133000 }, { "epoch": 3.6, "learning_rate": 0.0008568144153254123, "loss": 3.1461, "step": 134000 }, { "epoch": 3.63, "learning_rate": 0.0008554099603943709, "loss": 3.1467, "step": 135000 }, { "epoch": 3.66, "learning_rate": 0.0008540069099182607, "loss": 3.1486, "step": 136000 }, { "epoch": 3.68, "learning_rate": 0.0008526024549872195, "loss": 3.1467, "step": 137000 }, { "epoch": 3.71, "learning_rate": 0.0008511994045111093, "loss": 3.1482, "step": 138000 }, { "epoch": 3.74, "learning_rate": 0.000849794949580068, "loss": 3.1508, "step": 139000 }, { "epoch": 3.76, "learning_rate": 0.0008483904946490267, "loss": 3.1574, "step": 140000 }, { "epoch": 3.79, "learning_rate": 0.0008469860397179855, "loss": 3.1437, "step": 141000 }, { "epoch": 3.82, "learning_rate": 0.0008455815847869442, "loss": 3.1427, "step": 142000 }, { "epoch": 3.84, "learning_rate": 0.000844178534310834, "loss": 3.15, "step": 143000 }, { "epoch": 3.87, "learning_rate": 0.0008427740793797927, "loss": 3.1489, "step": 144000 }, { "epoch": 3.9, "learning_rate": 0.0008413710289036824, "loss": 3.1449, "step": 145000 }, { "epoch": 3.92, "learning_rate": 0.0008399665739726412, "loss": 3.1465, "step": 146000 }, { "epoch": 3.95, "learning_rate": 0.0008385621190416, "loss": 3.1375, "step": 147000 }, { "epoch": 3.98, "learning_rate": 0.0008371576641105587, "loss": 3.152, "step": 148000 }, { "epoch": 4.0, "eval_accuracy": 0.4049755331384225, "eval_loss": 3.3773725032806396, "eval_runtime": 149.0965, "eval_samples_per_second": 388.473, "eval_steps_per_second": 6.07, "step": 148804 }, { "epoch": 4.01, "learning_rate": 0.0008357546136344485, "loss": 3.1283, "step": 149000 }, { "epoch": 4.03, "learning_rate": 0.0008343515631583383, "loss": 3.0733, "step": 150000 }, { "epoch": 4.06, "learning_rate": 0.000832947108227297, "loss": 3.0775, "step": 151000 }, { "epoch": 4.09, "learning_rate": 0.0008315426532962557, "loss": 3.0746, "step": 152000 }, { "epoch": 4.11, "learning_rate": 0.0008301381983652145, "loss": 3.0835, "step": 153000 }, { "epoch": 4.14, "learning_rate": 0.0008287351478891042, "loss": 3.0848, "step": 154000 }, { "epoch": 4.17, "learning_rate": 0.0008273306929580629, "loss": 3.0851, "step": 155000 }, { "epoch": 4.19, "learning_rate": 0.0008259276424819527, "loss": 3.0789, "step": 156000 }, { "epoch": 4.22, "learning_rate": 0.0008245231875509115, "loss": 3.0933, "step": 157000 }, { "epoch": 4.25, "learning_rate": 0.0008231201370748013, "loss": 3.0864, "step": 158000 }, { "epoch": 4.27, "learning_rate": 0.00082171568214376, "loss": 3.0948, "step": 159000 }, { "epoch": 4.3, "learning_rate": 0.0008203126316676498, "loss": 3.0914, "step": 160000 }, { "epoch": 4.33, "learning_rate": 0.0008189081767366086, "loss": 3.0954, "step": 161000 }, { "epoch": 4.35, "learning_rate": 0.0008175037218055673, "loss": 3.0923, "step": 162000 }, { "epoch": 4.38, "learning_rate": 0.000816099266874526, "loss": 3.0964, "step": 163000 }, { "epoch": 4.41, "learning_rate": 0.0008146962163984158, "loss": 3.0941, "step": 164000 }, { "epoch": 4.44, "learning_rate": 0.0008132917614673745, "loss": 3.0902, "step": 165000 }, { "epoch": 4.46, "learning_rate": 0.0008118887109912643, "loss": 3.0969, "step": 166000 }, { "epoch": 4.49, "learning_rate": 0.000810484256060223, "loss": 3.0948, "step": 167000 }, { "epoch": 4.52, "learning_rate": 0.0008090798011291817, "loss": 3.0874, "step": 168000 }, { "epoch": 4.54, "learning_rate": 0.0008076767506530715, "loss": 3.0981, "step": 169000 }, { "epoch": 4.57, "learning_rate": 0.0008062722957220303, "loss": 3.0934, "step": 170000 }, { "epoch": 4.6, "learning_rate": 0.000804867840790989, "loss": 3.0974, "step": 171000 }, { "epoch": 4.62, "learning_rate": 0.0008034633858599479, "loss": 3.0942, "step": 172000 }, { "epoch": 4.65, "learning_rate": 0.0008020603353838376, "loss": 3.0917, "step": 173000 }, { "epoch": 4.68, "learning_rate": 0.0008006558804527962, "loss": 3.0974, "step": 174000 }, { "epoch": 4.7, "learning_rate": 0.000799252829976686, "loss": 3.0948, "step": 175000 }, { "epoch": 4.73, "learning_rate": 0.0007978483750456448, "loss": 3.099, "step": 176000 }, { "epoch": 4.76, "learning_rate": 0.0007964439201146035, "loss": 3.1001, "step": 177000 }, { "epoch": 4.78, "learning_rate": 0.0007950394651835623, "loss": 3.0987, "step": 178000 }, { "epoch": 4.81, "learning_rate": 0.0007936364147074521, "loss": 3.0925, "step": 179000 }, { "epoch": 4.84, "learning_rate": 0.0007922319597764108, "loss": 3.0969, "step": 180000 }, { "epoch": 4.87, "learning_rate": 0.0007908289093003006, "loss": 3.0986, "step": 181000 }, { "epoch": 4.89, "learning_rate": 0.0007894244543692594, "loss": 3.095, "step": 182000 }, { "epoch": 4.92, "learning_rate": 0.000788019999438218, "loss": 3.0932, "step": 183000 }, { "epoch": 4.95, "learning_rate": 0.0007866169489621078, "loss": 3.0919, "step": 184000 }, { "epoch": 4.97, "learning_rate": 0.0007852124940310666, "loss": 3.0978, "step": 185000 }, { "epoch": 5.0, "learning_rate": 0.0007838080391000253, "loss": 3.0973, "step": 186000 }, { "epoch": 5.0, "eval_accuracy": 0.40901955199174495, "eval_loss": 3.346210479736328, "eval_runtime": 147.9489, "eval_samples_per_second": 391.487, "eval_steps_per_second": 6.117, "step": 186005 }, { "epoch": 5.03, "learning_rate": 0.000782403584168984, "loss": 3.021, "step": 187000 }, { "epoch": 5.05, "learning_rate": 0.0007810005336928739, "loss": 3.0218, "step": 188000 }, { "epoch": 5.08, "learning_rate": 0.0007795960787618326, "loss": 3.0321, "step": 189000 }, { "epoch": 5.11, "learning_rate": 0.0007781930282857223, "loss": 3.0359, "step": 190000 }, { "epoch": 5.13, "learning_rate": 0.0007767885733546812, "loss": 3.0365, "step": 191000 }, { "epoch": 5.16, "learning_rate": 0.0007753855228785709, "loss": 3.0411, "step": 192000 }, { "epoch": 5.19, "learning_rate": 0.0007739810679475295, "loss": 3.0414, "step": 193000 }, { "epoch": 5.21, "learning_rate": 0.0007725766130164883, "loss": 3.0395, "step": 194000 }, { "epoch": 5.24, "learning_rate": 0.0007711721580854471, "loss": 3.042, "step": 195000 }, { "epoch": 5.27, "learning_rate": 0.0007697691076093368, "loss": 3.0454, "step": 196000 }, { "epoch": 5.3, "learning_rate": 0.0007683646526782956, "loss": 3.0444, "step": 197000 }, { "epoch": 5.32, "learning_rate": 0.0007669601977472544, "loss": 3.0452, "step": 198000 }, { "epoch": 5.35, "learning_rate": 0.0007655557428162131, "loss": 3.0496, "step": 199000 }, { "epoch": 5.38, "learning_rate": 0.0007641526923401028, "loss": 3.0454, "step": 200000 }, { "epoch": 5.4, "learning_rate": 0.0007627482374090615, "loss": 3.048, "step": 201000 }, { "epoch": 5.43, "learning_rate": 0.0007613451869329513, "loss": 3.0478, "step": 202000 }, { "epoch": 5.46, "learning_rate": 0.00075994073200191, "loss": 3.0495, "step": 203000 }, { "epoch": 5.48, "learning_rate": 0.0007585376815257999, "loss": 3.0498, "step": 204000 }, { "epoch": 5.51, "learning_rate": 0.0007571332265947586, "loss": 3.0483, "step": 205000 }, { "epoch": 5.54, "learning_rate": 0.0007557301761186483, "loss": 3.0534, "step": 206000 }, { "epoch": 5.56, "learning_rate": 0.0007543257211876072, "loss": 3.0583, "step": 207000 }, { "epoch": 5.59, "learning_rate": 0.0007529212662565659, "loss": 3.0512, "step": 208000 }, { "epoch": 5.62, "learning_rate": 0.0007515182157804556, "loss": 3.0507, "step": 209000 }, { "epoch": 5.65, "learning_rate": 0.0007501137608494145, "loss": 3.0499, "step": 210000 }, { "epoch": 5.67, "learning_rate": 0.0007487107103733042, "loss": 3.0572, "step": 211000 }, { "epoch": 5.7, "learning_rate": 0.0007473062554422628, "loss": 3.0519, "step": 212000 }, { "epoch": 5.73, "learning_rate": 0.0007459018005112215, "loss": 3.0495, "step": 213000 }, { "epoch": 5.75, "learning_rate": 0.0007444987500351114, "loss": 3.0612, "step": 214000 }, { "epoch": 5.78, "learning_rate": 0.0007430956995590012, "loss": 3.0547, "step": 215000 }, { "epoch": 5.81, "learning_rate": 0.0007416912446279599, "loss": 3.054, "step": 216000 }, { "epoch": 5.83, "learning_rate": 0.0007402867896969187, "loss": 3.0547, "step": 217000 }, { "epoch": 5.86, "learning_rate": 0.0007388823347658774, "loss": 3.0588, "step": 218000 }, { "epoch": 5.89, "learning_rate": 0.0007374792842897672, "loss": 3.0531, "step": 219000 }, { "epoch": 5.91, "learning_rate": 0.0007360762338136569, "loss": 3.0585, "step": 220000 }, { "epoch": 5.94, "learning_rate": 0.0007346717788826157, "loss": 3.0522, "step": 221000 }, { "epoch": 5.97, "learning_rate": 0.0007332673239515743, "loss": 3.0604, "step": 222000 }, { "epoch": 5.99, "learning_rate": 0.0007318628690205332, "loss": 3.0543, "step": 223000 }, { "epoch": 6.0, "eval_accuracy": 0.4064327960745534, "eval_loss": 3.3686516284942627, "eval_runtime": 149.1251, "eval_samples_per_second": 388.399, "eval_steps_per_second": 6.069, "step": 223206 }, { "epoch": 6.02, "learning_rate": 0.0007304598185444229, "loss": 3.0023, "step": 224000 }, { "epoch": 6.05, "learning_rate": 0.0007290553636133816, "loss": 2.9878, "step": 225000 }, { "epoch": 6.08, "learning_rate": 0.0007276509086823405, "loss": 2.9862, "step": 226000 }, { "epoch": 6.1, "learning_rate": 0.0007262464537512992, "loss": 2.993, "step": 227000 }, { "epoch": 6.13, "learning_rate": 0.0007248434032751889, "loss": 2.9986, "step": 228000 }, { "epoch": 6.16, "learning_rate": 0.0007234389483441476, "loss": 2.9984, "step": 229000 }, { "epoch": 6.18, "learning_rate": 0.0007220344934131064, "loss": 2.9975, "step": 230000 }, { "epoch": 6.21, "learning_rate": 0.0007206314429369961, "loss": 3.0058, "step": 231000 }, { "epoch": 6.24, "learning_rate": 0.0007192269880059548, "loss": 3.003, "step": 232000 }, { "epoch": 6.26, "learning_rate": 0.0007178239375298447, "loss": 3.0059, "step": 233000 }, { "epoch": 6.29, "learning_rate": 0.0007164194825988034, "loss": 3.0085, "step": 234000 }, { "epoch": 6.32, "learning_rate": 0.0007150150276677621, "loss": 3.0067, "step": 235000 }, { "epoch": 6.34, "learning_rate": 0.000713611977191652, "loss": 3.0048, "step": 236000 }, { "epoch": 6.37, "learning_rate": 0.0007122075222606107, "loss": 3.007, "step": 237000 }, { "epoch": 6.4, "learning_rate": 0.0007108044717845005, "loss": 3.0163, "step": 238000 }, { "epoch": 6.42, "learning_rate": 0.0007094000168534593, "loss": 3.0104, "step": 239000 }, { "epoch": 6.45, "learning_rate": 0.0007079955619224179, "loss": 3.0145, "step": 240000 }, { "epoch": 6.48, "learning_rate": 0.0007065939159012388, "loss": 3.0145, "step": 241000 }, { "epoch": 6.51, "learning_rate": 0.0007051894609701974, "loss": 3.0127, "step": 242000 }, { "epoch": 6.53, "learning_rate": 0.0007037850060391562, "loss": 3.0159, "step": 243000 }, { "epoch": 6.56, "learning_rate": 0.0007023805511081149, "loss": 3.0151, "step": 244000 }, { "epoch": 6.59, "learning_rate": 0.0007009760961770736, "loss": 3.02, "step": 245000 }, { "epoch": 6.61, "learning_rate": 0.0006995730457009635, "loss": 3.019, "step": 246000 }, { "epoch": 6.64, "learning_rate": 0.0006981685907699222, "loss": 3.0139, "step": 247000 }, { "epoch": 6.67, "learning_rate": 0.000696765540293812, "loss": 3.0183, "step": 248000 }, { "epoch": 6.69, "learning_rate": 0.0006953610853627708, "loss": 3.0203, "step": 249000 }, { "epoch": 6.72, "learning_rate": 0.0006939566304317294, "loss": 3.024, "step": 250000 }, { "epoch": 6.75, "learning_rate": 0.0006925535799556192, "loss": 3.0248, "step": 251000 }, { "epoch": 6.77, "learning_rate": 0.000691149125024578, "loss": 3.0249, "step": 252000 }, { "epoch": 6.8, "learning_rate": 0.0006897460745484677, "loss": 3.0181, "step": 253000 }, { "epoch": 6.83, "learning_rate": 0.0006883416196174265, "loss": 3.0196, "step": 254000 }, { "epoch": 6.85, "learning_rate": 0.0006869371646863853, "loss": 3.0244, "step": 255000 }, { "epoch": 6.88, "learning_rate": 0.000685534114210275, "loss": 3.0251, "step": 256000 }, { "epoch": 6.91, "learning_rate": 0.0006841296592792338, "loss": 3.0219, "step": 257000 }, { "epoch": 6.94, "learning_rate": 0.0006827266088031235, "loss": 3.0203, "step": 258000 }, { "epoch": 6.96, "learning_rate": 0.0006813221538720823, "loss": 3.0225, "step": 259000 }, { "epoch": 6.99, "learning_rate": 0.0006799176989410409, "loss": 3.0161, "step": 260000 }, { "epoch": 7.0, "eval_accuracy": 0.4113539808027173, "eval_loss": 3.339113473892212, "eval_runtime": 148.8983, "eval_samples_per_second": 388.99, "eval_steps_per_second": 6.078, "step": 260407 }, { "epoch": 7.02, "learning_rate": 0.0006785132440099997, "loss": 2.9764, "step": 261000 }, { "epoch": 7.04, "learning_rate": 0.0006771101935338895, "loss": 2.9576, "step": 262000 }, { "epoch": 7.07, "learning_rate": 0.0006757057386028482, "loss": 2.9529, "step": 263000 }, { "epoch": 7.1, "learning_rate": 0.0006743012836718069, "loss": 2.9603, "step": 264000 }, { "epoch": 7.12, "learning_rate": 0.0006728968287407658, "loss": 2.9641, "step": 265000 }, { "epoch": 7.15, "learning_rate": 0.0006714937782646555, "loss": 2.9675, "step": 266000 }, { "epoch": 7.18, "learning_rate": 0.0006700907277885453, "loss": 2.9641, "step": 267000 }, { "epoch": 7.2, "learning_rate": 0.0006686862728575041, "loss": 2.9672, "step": 268000 }, { "epoch": 7.23, "learning_rate": 0.0006672818179264628, "loss": 2.9774, "step": 269000 }, { "epoch": 7.26, "learning_rate": 0.0006658773629954214, "loss": 2.9753, "step": 270000 }, { "epoch": 7.28, "learning_rate": 0.0006644743125193112, "loss": 2.9674, "step": 271000 }, { "epoch": 7.31, "learning_rate": 0.00066306985758827, "loss": 2.9716, "step": 272000 }, { "epoch": 7.34, "learning_rate": 0.0006616654026572287, "loss": 2.9788, "step": 273000 }, { "epoch": 7.37, "learning_rate": 0.0006602623521811185, "loss": 2.9791, "step": 274000 }, { "epoch": 7.39, "learning_rate": 0.0006588593017050083, "loss": 2.9821, "step": 275000 }, { "epoch": 7.42, "learning_rate": 0.000657454846773967, "loss": 2.9828, "step": 276000 }, { "epoch": 7.45, "learning_rate": 0.0006560503918429258, "loss": 2.9802, "step": 277000 }, { "epoch": 7.47, "learning_rate": 0.0006546459369118846, "loss": 2.9858, "step": 278000 }, { "epoch": 7.5, "learning_rate": 0.0006532428864357743, "loss": 2.9814, "step": 279000 }, { "epoch": 7.53, "learning_rate": 0.0006518398359596641, "loss": 2.9865, "step": 280000 }, { "epoch": 7.55, "learning_rate": 0.0006504353810286228, "loss": 2.9894, "step": 281000 }, { "epoch": 7.58, "learning_rate": 0.0006490309260975815, "loss": 2.9832, "step": 282000 }, { "epoch": 7.61, "learning_rate": 0.0006476264711665402, "loss": 2.986, "step": 283000 }, { "epoch": 7.63, "learning_rate": 0.0006462234206904301, "loss": 2.9924, "step": 284000 }, { "epoch": 7.66, "learning_rate": 0.0006448189657593888, "loss": 2.9838, "step": 285000 }, { "epoch": 7.69, "learning_rate": 0.0006434145108283475, "loss": 2.99, "step": 286000 }, { "epoch": 7.71, "learning_rate": 0.0006420100558973064, "loss": 2.9873, "step": 287000 }, { "epoch": 7.74, "learning_rate": 0.0006406070054211961, "loss": 2.9866, "step": 288000 }, { "epoch": 7.77, "learning_rate": 0.0006392025504901547, "loss": 2.9869, "step": 289000 }, { "epoch": 7.8, "learning_rate": 0.0006377995000140445, "loss": 2.9881, "step": 290000 }, { "epoch": 7.82, "learning_rate": 0.0006363964495379343, "loss": 2.9825, "step": 291000 }, { "epoch": 7.85, "learning_rate": 0.000634991994606893, "loss": 2.9951, "step": 292000 }, { "epoch": 7.88, "learning_rate": 0.0006335875396758518, "loss": 2.9958, "step": 293000 }, { "epoch": 7.9, "learning_rate": 0.0006321830847448106, "loss": 2.997, "step": 294000 }, { "epoch": 7.93, "learning_rate": 0.0006307800342687003, "loss": 2.9886, "step": 295000 }, { "epoch": 7.96, "learning_rate": 0.0006293755793376591, "loss": 3.0001, "step": 296000 }, { "epoch": 7.98, "learning_rate": 0.0006279725288615489, "loss": 2.9858, "step": 297000 }, { "epoch": 8.0, "eval_accuracy": 0.4104764790291721, "eval_loss": 3.347707748413086, "eval_runtime": 148.9691, "eval_samples_per_second": 388.806, "eval_steps_per_second": 6.075, "step": 297608 }, { "epoch": 8.01, "learning_rate": 0.0006265680739305076, "loss": 2.9621, "step": 298000 }, { "epoch": 8.04, "learning_rate": 0.0006251650234543974, "loss": 2.9243, "step": 299000 }, { "epoch": 8.06, "learning_rate": 0.000623760568523356, "loss": 2.9237, "step": 300000 }, { "epoch": 8.09, "learning_rate": 0.0006223575180472459, "loss": 2.9296, "step": 301000 }, { "epoch": 8.12, "learning_rate": 0.0006209530631162046, "loss": 2.9321, "step": 302000 }, { "epoch": 8.14, "learning_rate": 0.0006195486081851633, "loss": 2.9411, "step": 303000 }, { "epoch": 8.17, "learning_rate": 0.0006181441532541221, "loss": 2.9375, "step": 304000 }, { "epoch": 8.2, "learning_rate": 0.0006167396983230808, "loss": 2.9463, "step": 305000 }, { "epoch": 8.23, "learning_rate": 0.0006153366478469706, "loss": 2.9421, "step": 306000 }, { "epoch": 8.25, "learning_rate": 0.0006139321929159294, "loss": 2.9412, "step": 307000 }, { "epoch": 8.28, "learning_rate": 0.0006125291424398192, "loss": 2.9451, "step": 308000 }, { "epoch": 8.31, "learning_rate": 0.0006111246875087778, "loss": 2.9487, "step": 309000 }, { "epoch": 8.33, "learning_rate": 0.0006097216370326676, "loss": 2.9474, "step": 310000 }, { "epoch": 8.36, "learning_rate": 0.0006083171821016264, "loss": 2.9504, "step": 311000 }, { "epoch": 8.39, "learning_rate": 0.0006069127271705851, "loss": 2.9526, "step": 312000 }, { "epoch": 8.41, "learning_rate": 0.0006055096766944749, "loss": 2.948, "step": 313000 }, { "epoch": 8.44, "learning_rate": 0.0006041066262183647, "loss": 2.954, "step": 314000 }, { "epoch": 8.47, "learning_rate": 0.0006027021712873234, "loss": 2.9529, "step": 315000 }, { "epoch": 8.49, "learning_rate": 0.0006012977163562821, "loss": 2.9531, "step": 316000 }, { "epoch": 8.52, "learning_rate": 0.0005998932614252409, "loss": 2.9572, "step": 317000 }, { "epoch": 8.55, "learning_rate": 0.0005984902109491307, "loss": 2.9579, "step": 318000 }, { "epoch": 8.58, "learning_rate": 0.0005970857560180893, "loss": 2.9599, "step": 319000 }, { "epoch": 8.6, "learning_rate": 0.0005956827055419792, "loss": 2.9537, "step": 320000 }, { "epoch": 8.63, "learning_rate": 0.0005942782506109379, "loss": 2.9554, "step": 321000 }, { "epoch": 8.66, "learning_rate": 0.0005928737956798966, "loss": 2.9589, "step": 322000 }, { "epoch": 8.68, "learning_rate": 0.0005914707452037865, "loss": 2.9607, "step": 323000 }, { "epoch": 8.71, "learning_rate": 0.0005900662902727452, "loss": 2.958, "step": 324000 }, { "epoch": 8.74, "learning_rate": 0.0005886632397966349, "loss": 2.9597, "step": 325000 }, { "epoch": 8.76, "learning_rate": 0.0005872587848655937, "loss": 2.9666, "step": 326000 }, { "epoch": 8.79, "learning_rate": 0.0005858557343894835, "loss": 2.9572, "step": 327000 }, { "epoch": 8.82, "learning_rate": 0.0005844512794584422, "loss": 2.9654, "step": 328000 }, { "epoch": 8.84, "learning_rate": 0.000583048228982332, "loss": 2.9631, "step": 329000 }, { "epoch": 8.87, "learning_rate": 0.0005816437740512907, "loss": 2.9666, "step": 330000 }, { "epoch": 8.9, "learning_rate": 0.0005802393191202494, "loss": 2.9719, "step": 331000 }, { "epoch": 8.92, "learning_rate": 0.0005788348641892081, "loss": 2.9649, "step": 332000 }, { "epoch": 8.95, "learning_rate": 0.0005774332181680289, "loss": 2.9659, "step": 333000 }, { "epoch": 8.98, "learning_rate": 0.0005760287632369877, "loss": 2.9718, "step": 334000 }, { "epoch": 9.0, "eval_accuracy": 0.41122177107294106, "eval_loss": 3.343648672103882, "eval_runtime": 147.5276, "eval_samples_per_second": 392.604, "eval_steps_per_second": 6.134, "step": 334809 }, { "epoch": 9.01, "learning_rate": 0.0005746243083059465, "loss": 2.9542, "step": 335000 }, { "epoch": 9.03, "learning_rate": 0.0005732212578298362, "loss": 2.891, "step": 336000 }, { "epoch": 9.06, "learning_rate": 0.000571816802898795, "loss": 2.9009, "step": 337000 }, { "epoch": 9.09, "learning_rate": 0.0005704123479677537, "loss": 2.8991, "step": 338000 }, { "epoch": 9.11, "learning_rate": 0.0005690078930367125, "loss": 2.9084, "step": 339000 }, { "epoch": 9.14, "learning_rate": 0.0005676048425606023, "loss": 2.91, "step": 340000 }, { "epoch": 9.17, "learning_rate": 0.0005662003876295609, "loss": 2.9123, "step": 341000 }, { "epoch": 9.19, "learning_rate": 0.0005647973371534507, "loss": 2.9128, "step": 342000 }, { "epoch": 9.22, "learning_rate": 0.0005633942866773404, "loss": 2.9138, "step": 343000 }, { "epoch": 9.25, "learning_rate": 0.0005619898317462993, "loss": 2.9208, "step": 344000 }, { "epoch": 9.27, "learning_rate": 0.000560585376815258, "loss": 2.9197, "step": 345000 }, { "epoch": 9.3, "learning_rate": 0.0005591809218842168, "loss": 2.919, "step": 346000 }, { "epoch": 9.33, "learning_rate": 0.0005577778714081066, "loss": 2.9241, "step": 347000 }, { "epoch": 9.35, "learning_rate": 0.0005563734164770653, "loss": 2.9236, "step": 348000 }, { "epoch": 9.38, "learning_rate": 0.000554970366000955, "loss": 2.9224, "step": 349000 }, { "epoch": 9.41, "learning_rate": 0.0005535673155248449, "loss": 2.9247, "step": 350000 }, { "epoch": 9.44, "learning_rate": 0.0005521628605938036, "loss": 2.9262, "step": 351000 }, { "epoch": 9.46, "learning_rate": 0.0005507584056627622, "loss": 2.9309, "step": 352000 }, { "epoch": 9.49, "learning_rate": 0.000549353950731721, "loss": 2.9299, "step": 353000 }, { "epoch": 9.52, "learning_rate": 0.0005479509002556108, "loss": 2.9349, "step": 354000 }, { "epoch": 9.54, "learning_rate": 0.0005465464453245695, "loss": 2.9367, "step": 355000 }, { "epoch": 9.57, "learning_rate": 0.0005451433948484594, "loss": 2.933, "step": 356000 }, { "epoch": 9.6, "learning_rate": 0.0005437389399174181, "loss": 2.9336, "step": 357000 }, { "epoch": 9.62, "learning_rate": 0.0005423344849863768, "loss": 2.9349, "step": 358000 }, { "epoch": 9.65, "learning_rate": 0.0005409314345102666, "loss": 2.9415, "step": 359000 }, { "epoch": 9.68, "learning_rate": 0.0005395269795792254, "loss": 2.9328, "step": 360000 }, { "epoch": 9.7, "learning_rate": 0.0005381239291031151, "loss": 2.9346, "step": 361000 }, { "epoch": 9.73, "learning_rate": 0.0005367194741720737, "loss": 2.9391, "step": 362000 }, { "epoch": 9.76, "learning_rate": 0.0005353150192410326, "loss": 2.9393, "step": 363000 }, { "epoch": 9.78, "learning_rate": 0.0005339119687649223, "loss": 2.9419, "step": 364000 }, { "epoch": 9.81, "learning_rate": 0.000532507513833881, "loss": 2.9377, "step": 365000 }, { "epoch": 9.84, "learning_rate": 0.0005311058678127018, "loss": 2.9373, "step": 366000 }, { "epoch": 9.87, "learning_rate": 0.0005297014128816607, "loss": 2.9416, "step": 367000 }, { "epoch": 9.89, "learning_rate": 0.0005282969579506194, "loss": 2.9433, "step": 368000 }, { "epoch": 9.92, "learning_rate": 0.0005268925030195782, "loss": 2.9406, "step": 369000 }, { "epoch": 9.95, "learning_rate": 0.000525489452543468, "loss": 2.9419, "step": 370000 }, { "epoch": 9.97, "learning_rate": 0.0005240849976124267, "loss": 2.9411, "step": 371000 }, { "epoch": 10.0, "learning_rate": 0.0005226805426813853, "loss": 2.9399, "step": 372000 }, { "epoch": 10.0, "eval_accuracy": 0.41210269901326396, "eval_loss": 3.345149278640747, "eval_runtime": 148.3396, "eval_samples_per_second": 390.455, "eval_steps_per_second": 6.101, "step": 372010 }, { "epoch": 10.03, "learning_rate": 0.0005212774922052751, "loss": 2.8712, "step": 373000 }, { "epoch": 10.05, "learning_rate": 0.0005198730372742338, "loss": 2.8767, "step": 374000 }, { "epoch": 10.08, "learning_rate": 0.0005184699867981236, "loss": 2.8784, "step": 375000 }, { "epoch": 10.11, "learning_rate": 0.0005170655318670824, "loss": 2.8827, "step": 376000 }, { "epoch": 10.13, "learning_rate": 0.0005156624813909722, "loss": 2.8869, "step": 377000 }, { "epoch": 10.16, "learning_rate": 0.0005142594309148619, "loss": 2.8899, "step": 378000 }, { "epoch": 10.19, "learning_rate": 0.0005128549759838207, "loss": 2.8921, "step": 379000 }, { "epoch": 10.21, "learning_rate": 0.0005114505210527795, "loss": 2.8914, "step": 380000 }, { "epoch": 10.24, "learning_rate": 0.0005100460661217382, "loss": 2.8919, "step": 381000 }, { "epoch": 10.27, "learning_rate": 0.0005086430156456279, "loss": 2.9003, "step": 382000 }, { "epoch": 10.3, "learning_rate": 0.0005072385607145867, "loss": 2.8972, "step": 383000 }, { "epoch": 10.32, "learning_rate": 0.0005058355102384764, "loss": 2.8954, "step": 384000 }, { "epoch": 10.35, "learning_rate": 0.0005044310553074351, "loss": 2.9014, "step": 385000 }, { "epoch": 10.38, "learning_rate": 0.000503026600376394, "loss": 2.903, "step": 386000 }, { "epoch": 10.4, "learning_rate": 0.0005016235499002837, "loss": 2.9039, "step": 387000 }, { "epoch": 10.43, "learning_rate": 0.0005002190949692424, "loss": 2.8998, "step": 388000 }, { "epoch": 10.46, "learning_rate": 0.0004988160444931322, "loss": 2.9079, "step": 389000 }, { "epoch": 10.48, "learning_rate": 0.000497412994017022, "loss": 2.9058, "step": 390000 }, { "epoch": 10.51, "learning_rate": 0.0004960085390859808, "loss": 2.9025, "step": 391000 }, { "epoch": 10.54, "learning_rate": 0.0004946040841549395, "loss": 2.9084, "step": 392000 }, { "epoch": 10.56, "learning_rate": 0.0004931996292238982, "loss": 2.9051, "step": 393000 }, { "epoch": 10.59, "learning_rate": 0.0004917965787477879, "loss": 2.9119, "step": 394000 }, { "epoch": 10.62, "learning_rate": 0.0004903921238167468, "loss": 2.909, "step": 395000 }, { "epoch": 10.64, "learning_rate": 0.0004889876688857055, "loss": 2.9127, "step": 396000 }, { "epoch": 10.67, "learning_rate": 0.00048758461840959523, "loss": 2.9129, "step": 397000 }, { "epoch": 10.7, "learning_rate": 0.000486180163478554, "loss": 2.9112, "step": 398000 }, { "epoch": 10.73, "learning_rate": 0.00048477711300244376, "loss": 2.9202, "step": 399000 }, { "epoch": 10.75, "learning_rate": 0.0004833726580714025, "loss": 2.9161, "step": 400000 }, { "epoch": 10.78, "learning_rate": 0.00048196820314036124, "loss": 2.9211, "step": 401000 }, { "epoch": 10.81, "learning_rate": 0.00048056374820931995, "loss": 2.9192, "step": 402000 }, { "epoch": 10.83, "learning_rate": 0.00047916069773320976, "loss": 2.9145, "step": 403000 }, { "epoch": 10.86, "learning_rate": 0.00047775624280216853, "loss": 2.9171, "step": 404000 }, { "epoch": 10.89, "learning_rate": 0.0004763517878711272, "loss": 2.9132, "step": 405000 }, { "epoch": 10.91, "learning_rate": 0.00047495014184994805, "loss": 2.9178, "step": 406000 }, { "epoch": 10.94, "learning_rate": 0.00047354568691890676, "loss": 2.9177, "step": 407000 }, { "epoch": 10.97, "learning_rate": 0.00047214123198786553, "loss": 2.9154, "step": 408000 }, { "epoch": 10.99, "learning_rate": 0.0004707367770568243, "loss": 2.9207, "step": 409000 }, { "epoch": 11.0, "eval_accuracy": 0.4129671679171056, "eval_loss": 3.358556032180786, "eval_runtime": 148.219, "eval_samples_per_second": 390.773, "eval_steps_per_second": 6.106, "step": 409211 }, { "epoch": 11.02, "learning_rate": 0.00046933232212578296, "loss": 2.8685, "step": 410000 }, { "epoch": 11.05, "learning_rate": 0.00046792927164967277, "loss": 2.8573, "step": 411000 }, { "epoch": 11.07, "learning_rate": 0.0004665248167186315, "loss": 2.8612, "step": 412000 }, { "epoch": 11.1, "learning_rate": 0.0004651217662425213, "loss": 2.862, "step": 413000 }, { "epoch": 11.13, "learning_rate": 0.00046371871576641105, "loss": 2.8641, "step": 414000 }, { "epoch": 11.16, "learning_rate": 0.0004623142608353698, "loss": 2.8669, "step": 415000 }, { "epoch": 11.18, "learning_rate": 0.00046090980590432853, "loss": 2.8682, "step": 416000 }, { "epoch": 11.21, "learning_rate": 0.0004595053509732873, "loss": 2.8753, "step": 417000 }, { "epoch": 11.24, "learning_rate": 0.00045810230049717705, "loss": 2.8688, "step": 418000 }, { "epoch": 11.26, "learning_rate": 0.0004566978455661358, "loss": 2.8752, "step": 419000 }, { "epoch": 11.29, "learning_rate": 0.0004552947950900256, "loss": 2.8755, "step": 420000 }, { "epoch": 11.32, "learning_rate": 0.0004538903401589843, "loss": 2.8753, "step": 421000 }, { "epoch": 11.34, "learning_rate": 0.00045248728968287405, "loss": 2.8777, "step": 422000 }, { "epoch": 11.37, "learning_rate": 0.0004510828347518328, "loss": 2.8822, "step": 423000 }, { "epoch": 11.4, "learning_rate": 0.00044968118873065367, "loss": 2.8774, "step": 424000 }, { "epoch": 11.42, "learning_rate": 0.0004482767337996124, "loss": 2.8873, "step": 425000 }, { "epoch": 11.45, "learning_rate": 0.0004468722788685711, "loss": 2.8866, "step": 426000 }, { "epoch": 11.48, "learning_rate": 0.00044546782393752987, "loss": 2.8853, "step": 427000 }, { "epoch": 11.51, "learning_rate": 0.0004440647734614197, "loss": 2.8821, "step": 428000 }, { "epoch": 11.53, "learning_rate": 0.00044266031853037834, "loss": 2.8884, "step": 429000 }, { "epoch": 11.56, "learning_rate": 0.0004412558635993371, "loss": 2.8865, "step": 430000 }, { "epoch": 11.59, "learning_rate": 0.0004398528131232269, "loss": 2.8887, "step": 431000 }, { "epoch": 11.61, "learning_rate": 0.00043844835819218563, "loss": 2.8915, "step": 432000 }, { "epoch": 11.64, "learning_rate": 0.00043704530771607544, "loss": 2.8884, "step": 433000 }, { "epoch": 11.67, "learning_rate": 0.0004356408527850341, "loss": 2.8851, "step": 434000 }, { "epoch": 11.69, "learning_rate": 0.0004342378023089239, "loss": 2.8882, "step": 435000 }, { "epoch": 11.72, "learning_rate": 0.0004328333473778827, "loss": 2.8893, "step": 436000 }, { "epoch": 11.75, "learning_rate": 0.00043143029690177244, "loss": 2.8898, "step": 437000 }, { "epoch": 11.77, "learning_rate": 0.0004300258419707312, "loss": 2.8875, "step": 438000 }, { "epoch": 11.8, "learning_rate": 0.0004286213870396899, "loss": 2.8952, "step": 439000 }, { "epoch": 11.83, "learning_rate": 0.0004272183365635797, "loss": 2.8938, "step": 440000 }, { "epoch": 11.85, "learning_rate": 0.00042581388163253844, "loss": 2.8965, "step": 441000 }, { "epoch": 11.88, "learning_rate": 0.00042440942670149716, "loss": 2.8929, "step": 442000 }, { "epoch": 11.91, "learning_rate": 0.00042300637622538697, "loss": 2.9017, "step": 443000 }, { "epoch": 11.94, "learning_rate": 0.0004216019212943457, "loss": 2.895, "step": 444000 }, { "epoch": 11.96, "learning_rate": 0.00042019887081823544, "loss": 2.892, "step": 445000 }, { "epoch": 11.99, "learning_rate": 0.0004187944158871942, "loss": 2.8987, "step": 446000 }, { "epoch": 12.0, "eval_accuracy": 0.4122773663391878, "eval_loss": 3.355417013168335, "eval_runtime": 147.56, "eval_samples_per_second": 392.518, "eval_steps_per_second": 6.133, "step": 446412 }, { "epoch": 12.02, "learning_rate": 0.000417389960956153, "loss": 2.8605, "step": 447000 }, { "epoch": 12.04, "learning_rate": 0.0004159883149349737, "loss": 2.8334, "step": 448000 }, { "epoch": 12.07, "learning_rate": 0.0004145838600039325, "loss": 2.8413, "step": 449000 }, { "epoch": 12.1, "learning_rate": 0.0004131794050728912, "loss": 2.8457, "step": 450000 }, { "epoch": 12.12, "learning_rate": 0.00041177495014184997, "loss": 2.8426, "step": 451000 }, { "epoch": 12.15, "learning_rate": 0.0004103718996657397, "loss": 2.8498, "step": 452000 }, { "epoch": 12.18, "learning_rate": 0.0004089674447346985, "loss": 2.8513, "step": 453000 }, { "epoch": 12.2, "learning_rate": 0.0004075629898036572, "loss": 2.8469, "step": 454000 }, { "epoch": 12.23, "learning_rate": 0.0004061585348726159, "loss": 2.8472, "step": 455000 }, { "epoch": 12.26, "learning_rate": 0.00040475548439650573, "loss": 2.8486, "step": 456000 }, { "epoch": 12.28, "learning_rate": 0.0004033510294654645, "loss": 2.8561, "step": 457000 }, { "epoch": 12.31, "learning_rate": 0.00040194797898935426, "loss": 2.8519, "step": 458000 }, { "epoch": 12.34, "learning_rate": 0.00040054352405831297, "loss": 2.8522, "step": 459000 }, { "epoch": 12.37, "learning_rate": 0.00039913906912727174, "loss": 2.8585, "step": 460000 }, { "epoch": 12.39, "learning_rate": 0.00039773461419623045, "loss": 2.8596, "step": 461000 }, { "epoch": 12.42, "learning_rate": 0.00039633156372012026, "loss": 2.8608, "step": 462000 }, { "epoch": 12.45, "learning_rate": 0.00039492851324401, "loss": 2.8572, "step": 463000 }, { "epoch": 12.47, "learning_rate": 0.00039352405831296873, "loss": 2.8622, "step": 464000 }, { "epoch": 12.5, "learning_rate": 0.0003921196033819275, "loss": 2.8632, "step": 465000 }, { "epoch": 12.53, "learning_rate": 0.00039071655290581726, "loss": 2.8635, "step": 466000 }, { "epoch": 12.55, "learning_rate": 0.000389312097974776, "loss": 2.8693, "step": 467000 }, { "epoch": 12.58, "learning_rate": 0.00038790764304373474, "loss": 2.8685, "step": 468000 }, { "epoch": 12.61, "learning_rate": 0.00038650318811269345, "loss": 2.863, "step": 469000 }, { "epoch": 12.63, "learning_rate": 0.00038510013763658327, "loss": 2.8722, "step": 470000 }, { "epoch": 12.66, "learning_rate": 0.000383695682705542, "loss": 2.8671, "step": 471000 }, { "epoch": 12.69, "learning_rate": 0.0003822912277745007, "loss": 2.8703, "step": 472000 }, { "epoch": 12.71, "learning_rate": 0.0003808881772983905, "loss": 2.8754, "step": 473000 }, { "epoch": 12.74, "learning_rate": 0.0003794837223673492, "loss": 2.8687, "step": 474000 }, { "epoch": 12.77, "learning_rate": 0.00037808067189123903, "loss": 2.8734, "step": 475000 }, { "epoch": 12.8, "learning_rate": 0.0003766762169601978, "loss": 2.8748, "step": 476000 }, { "epoch": 12.82, "learning_rate": 0.00037527316648408755, "loss": 2.8773, "step": 477000 }, { "epoch": 12.85, "learning_rate": 0.00037386871155304627, "loss": 2.8739, "step": 478000 }, { "epoch": 12.88, "learning_rate": 0.000372465661076936, "loss": 2.8759, "step": 479000 }, { "epoch": 12.9, "learning_rate": 0.0003710612061458948, "loss": 2.8762, "step": 480000 }, { "epoch": 12.93, "learning_rate": 0.00036965675121485356, "loss": 2.8766, "step": 481000 }, { "epoch": 12.96, "learning_rate": 0.0003682522962838122, "loss": 2.8768, "step": 482000 }, { "epoch": 12.98, "learning_rate": 0.000366847841352771, "loss": 2.8779, "step": 483000 }, { "epoch": 13.0, "eval_accuracy": 0.41304986617795647, "eval_loss": 3.3615658283233643, "eval_runtime": 148.2115, "eval_samples_per_second": 390.793, "eval_steps_per_second": 6.106, "step": 483613 }, { "epoch": 13.01, "learning_rate": 0.00036544479087666074, "loss": 2.8498, "step": 484000 }, { "epoch": 13.04, "learning_rate": 0.00036404174040055055, "loss": 2.8192, "step": 485000 }, { "epoch": 13.06, "learning_rate": 0.0003626372854695093, "loss": 2.8181, "step": 486000 }, { "epoch": 13.09, "learning_rate": 0.000361232830538468, "loss": 2.8195, "step": 487000 }, { "epoch": 13.12, "learning_rate": 0.00035982837560742675, "loss": 2.8275, "step": 488000 }, { "epoch": 13.14, "learning_rate": 0.0003584239206763855, "loss": 2.8255, "step": 489000 }, { "epoch": 13.17, "learning_rate": 0.0003570208702002753, "loss": 2.8286, "step": 490000 }, { "epoch": 13.2, "learning_rate": 0.000355616415269234, "loss": 2.8343, "step": 491000 }, { "epoch": 13.23, "learning_rate": 0.00035421336479312375, "loss": 2.8334, "step": 492000 }, { "epoch": 13.25, "learning_rate": 0.0003528089098620825, "loss": 2.8299, "step": 493000 }, { "epoch": 13.28, "learning_rate": 0.0003514044549310413, "loss": 2.8365, "step": 494000 }, { "epoch": 13.31, "learning_rate": 0.00035, "loss": 2.8353, "step": 495000 }, { "epoch": 13.33, "learning_rate": 0.00034859694952388975, "loss": 2.8377, "step": 496000 }, { "epoch": 13.36, "learning_rate": 0.0003471924945928485, "loss": 2.8407, "step": 497000 }, { "epoch": 13.39, "learning_rate": 0.0003457880396618073, "loss": 2.8428, "step": 498000 }, { "epoch": 13.41, "learning_rate": 0.00034438498918569705, "loss": 2.8405, "step": 499000 }, { "epoch": 13.44, "learning_rate": 0.00034298053425465576, "loss": 2.8419, "step": 500000 }, { "epoch": 13.47, "learning_rate": 0.0003415774837785455, "loss": 2.8469, "step": 501000 }, { "epoch": 13.49, "learning_rate": 0.0003401730288475043, "loss": 2.8488, "step": 502000 }, { "epoch": 13.52, "learning_rate": 0.00033876997837139404, "loss": 2.8434, "step": 503000 }, { "epoch": 13.55, "learning_rate": 0.0003373655234403528, "loss": 2.8472, "step": 504000 }, { "epoch": 13.57, "learning_rate": 0.0003359610685093115, "loss": 2.8471, "step": 505000 }, { "epoch": 13.6, "learning_rate": 0.0003345566135782703, "loss": 2.8512, "step": 506000 }, { "epoch": 13.63, "learning_rate": 0.00033315496755709114, "loss": 2.8477, "step": 507000 }, { "epoch": 13.66, "learning_rate": 0.0003317505126260498, "loss": 2.8482, "step": 508000 }, { "epoch": 13.68, "learning_rate": 0.00033034605769500857, "loss": 2.8487, "step": 509000 }, { "epoch": 13.71, "learning_rate": 0.0003289416027639673, "loss": 2.8496, "step": 510000 }, { "epoch": 13.74, "learning_rate": 0.0003275385522878571, "loss": 2.8543, "step": 511000 }, { "epoch": 13.76, "learning_rate": 0.0003261355018117469, "loss": 2.851, "step": 512000 }, { "epoch": 13.79, "learning_rate": 0.00032473104688070557, "loss": 2.853, "step": 513000 }, { "epoch": 13.82, "learning_rate": 0.0003233279964045954, "loss": 2.8558, "step": 514000 }, { "epoch": 13.84, "learning_rate": 0.00032192354147355415, "loss": 2.8448, "step": 515000 }, { "epoch": 13.87, "learning_rate": 0.00032051908654251286, "loss": 2.8517, "step": 516000 }, { "epoch": 13.9, "learning_rate": 0.0003191146316114716, "loss": 2.8543, "step": 517000 }, { "epoch": 13.92, "learning_rate": 0.0003177115811353614, "loss": 2.8519, "step": 518000 }, { "epoch": 13.95, "learning_rate": 0.0003163071262043201, "loss": 2.8479, "step": 519000 }, { "epoch": 13.98, "learning_rate": 0.00031490267127327887, "loss": 2.8519, "step": 520000 }, { "epoch": 14.0, "eval_accuracy": 0.41285833673710687, "eval_loss": 3.369619846343994, "eval_runtime": 147.964, "eval_samples_per_second": 391.446, "eval_steps_per_second": 6.116, "step": 520814 }, { "epoch": 14.0, "learning_rate": 0.0003134982163422376, "loss": 2.8511, "step": 521000 }, { "epoch": 14.03, "learning_rate": 0.00031209376141119635, "loss": 2.802, "step": 522000 }, { "epoch": 14.06, "learning_rate": 0.0003106907109350861, "loss": 2.7993, "step": 523000 }, { "epoch": 14.09, "learning_rate": 0.0003092862560040449, "loss": 2.8106, "step": 524000 }, { "epoch": 14.11, "learning_rate": 0.0003078818010730036, "loss": 2.8026, "step": 525000 }, { "epoch": 14.14, "learning_rate": 0.00030647875059689334, "loss": 2.8074, "step": 526000 }, { "epoch": 14.17, "learning_rate": 0.0003050742956658521, "loss": 2.8076, "step": 527000 }, { "epoch": 14.19, "learning_rate": 0.00030367124518974187, "loss": 2.8154, "step": 528000 }, { "epoch": 14.22, "learning_rate": 0.00030226679025870064, "loss": 2.8121, "step": 529000 }, { "epoch": 14.25, "learning_rate": 0.0003008637397825904, "loss": 2.816, "step": 530000 }, { "epoch": 14.27, "learning_rate": 0.0002994592848515491, "loss": 2.8158, "step": 531000 }, { "epoch": 14.3, "learning_rate": 0.00029805623437543886, "loss": 2.8182, "step": 532000 }, { "epoch": 14.33, "learning_rate": 0.0002966531838993287, "loss": 2.8169, "step": 533000 }, { "epoch": 14.35, "learning_rate": 0.00029524872896828744, "loss": 2.8197, "step": 534000 }, { "epoch": 14.38, "learning_rate": 0.00029384427403724616, "loss": 2.818, "step": 535000 }, { "epoch": 14.41, "learning_rate": 0.00029243981910620487, "loss": 2.821, "step": 536000 }, { "epoch": 14.44, "learning_rate": 0.0002910367686300946, "loss": 2.8227, "step": 537000 }, { "epoch": 14.46, "learning_rate": 0.0002896323136990534, "loss": 2.8222, "step": 538000 }, { "epoch": 14.49, "learning_rate": 0.0002882292632229432, "loss": 2.8308, "step": 539000 }, { "epoch": 14.52, "learning_rate": 0.0002868248082919019, "loss": 2.8315, "step": 540000 }, { "epoch": 14.54, "learning_rate": 0.00028542035336086063, "loss": 2.8244, "step": 541000 }, { "epoch": 14.57, "learning_rate": 0.0002840173028847504, "loss": 2.8245, "step": 542000 }, { "epoch": 14.6, "learning_rate": 0.00028261284795370916, "loss": 2.8289, "step": 543000 }, { "epoch": 14.62, "learning_rate": 0.00028120979747759897, "loss": 2.8252, "step": 544000 }, { "epoch": 14.65, "learning_rate": 0.0002798053425465577, "loss": 2.8265, "step": 545000 }, { "epoch": 14.68, "learning_rate": 0.0002784022920704475, "loss": 2.8309, "step": 546000 }, { "epoch": 14.7, "learning_rate": 0.00027699924159433725, "loss": 2.8286, "step": 547000 }, { "epoch": 14.73, "learning_rate": 0.00027559478666329596, "loss": 2.8289, "step": 548000 }, { "epoch": 14.76, "learning_rate": 0.00027419033173225473, "loss": 2.8297, "step": 549000 }, { "epoch": 14.78, "learning_rate": 0.00027278587680121345, "loss": 2.8295, "step": 550000 }, { "epoch": 14.81, "learning_rate": 0.00027138282632510326, "loss": 2.8369, "step": 551000 }, { "epoch": 14.84, "learning_rate": 0.000269979775848993, "loss": 2.8354, "step": 552000 }, { "epoch": 14.87, "learning_rate": 0.00026857532091795173, "loss": 2.8305, "step": 553000 }, { "epoch": 14.89, "learning_rate": 0.0002671708659869105, "loss": 2.8355, "step": 554000 }, { "epoch": 14.92, "learning_rate": 0.00026576641105586926, "loss": 2.8353, "step": 555000 }, { "epoch": 14.95, "learning_rate": 0.000264363360579759, "loss": 2.8427, "step": 556000 }, { "epoch": 14.97, "learning_rate": 0.00026295890564871773, "loss": 2.8361, "step": 557000 }, { "epoch": 15.0, "learning_rate": 0.00026155445071767645, "loss": 2.8395, "step": 558000 }, { "epoch": 15.0, "eval_accuracy": 0.41281084066040374, "eval_loss": 3.3729231357574463, "eval_runtime": 147.9664, "eval_samples_per_second": 391.44, "eval_steps_per_second": 6.116, "step": 558015 }, { "epoch": 15.03, "learning_rate": 0.00026015140024156626, "loss": 2.7847, "step": 559000 }, { "epoch": 15.05, "learning_rate": 0.000258746945310525, "loss": 2.7891, "step": 560000 }, { "epoch": 15.08, "learning_rate": 0.0002573424903794837, "loss": 2.788, "step": 561000 }, { "epoch": 15.11, "learning_rate": 0.0002559394399033735, "loss": 2.7885, "step": 562000 }, { "epoch": 15.13, "learning_rate": 0.00025453498497233227, "loss": 2.7939, "step": 563000 }, { "epoch": 15.16, "learning_rate": 0.000253133338951153, "loss": 2.7933, "step": 564000 }, { "epoch": 15.19, "learning_rate": 0.0002517288840201118, "loss": 2.7946, "step": 565000 }, { "epoch": 15.21, "learning_rate": 0.00025032442908907055, "loss": 2.7977, "step": 566000 }, { "epoch": 15.24, "learning_rate": 0.0002489213786129603, "loss": 2.7946, "step": 567000 }, { "epoch": 15.27, "learning_rate": 0.00024751692368191907, "loss": 2.7985, "step": 568000 }, { "epoch": 15.3, "learning_rate": 0.0002461124687508778, "loss": 2.7984, "step": 569000 }, { "epoch": 15.32, "learning_rate": 0.00024470801381983655, "loss": 2.7972, "step": 570000 }, { "epoch": 15.35, "learning_rate": 0.0002433049633437263, "loss": 2.7978, "step": 571000 }, { "epoch": 15.38, "learning_rate": 0.00024190050841268505, "loss": 2.8039, "step": 572000 }, { "epoch": 15.4, "learning_rate": 0.00024049605348164377, "loss": 2.8002, "step": 573000 }, { "epoch": 15.43, "learning_rate": 0.00023909300300553355, "loss": 2.8051, "step": 574000 }, { "epoch": 15.46, "learning_rate": 0.00023768854807449232, "loss": 2.8069, "step": 575000 }, { "epoch": 15.48, "learning_rate": 0.00023628409314345103, "loss": 2.8039, "step": 576000 }, { "epoch": 15.51, "learning_rate": 0.00023488104266734081, "loss": 2.8068, "step": 577000 }, { "epoch": 15.54, "learning_rate": 0.00023347658773629953, "loss": 2.8093, "step": 578000 }, { "epoch": 15.56, "learning_rate": 0.0002320735372601893, "loss": 2.8067, "step": 579000 }, { "epoch": 15.59, "learning_rate": 0.00023066908232914808, "loss": 2.8073, "step": 580000 }, { "epoch": 15.62, "learning_rate": 0.0002292646273981068, "loss": 2.8129, "step": 581000 }, { "epoch": 15.64, "learning_rate": 0.00022786017246706554, "loss": 2.8102, "step": 582000 }, { "epoch": 15.67, "learning_rate": 0.00022645712199095532, "loss": 2.812, "step": 583000 }, { "epoch": 15.7, "learning_rate": 0.00022505266705991406, "loss": 2.8093, "step": 584000 }, { "epoch": 15.73, "learning_rate": 0.00022364961658380384, "loss": 2.8139, "step": 585000 }, { "epoch": 15.75, "learning_rate": 0.00022224516165276256, "loss": 2.8115, "step": 586000 }, { "epoch": 15.78, "learning_rate": 0.0002208407067217213, "loss": 2.8157, "step": 587000 }, { "epoch": 15.81, "learning_rate": 0.00021943765624561108, "loss": 2.8138, "step": 588000 }, { "epoch": 15.83, "learning_rate": 0.00021803320131456982, "loss": 2.8146, "step": 589000 }, { "epoch": 15.86, "learning_rate": 0.0002166301508384596, "loss": 2.8138, "step": 590000 }, { "epoch": 15.89, "learning_rate": 0.00021522569590741835, "loss": 2.8195, "step": 591000 }, { "epoch": 15.91, "learning_rate": 0.0002138226454313081, "loss": 2.8192, "step": 592000 }, { "epoch": 15.94, "learning_rate": 0.00021241819050026685, "loss": 2.8169, "step": 593000 }, { "epoch": 15.97, "learning_rate": 0.00021101373556922559, "loss": 2.8174, "step": 594000 }, { "epoch": 15.99, "learning_rate": 0.00020961068509311537, "loss": 2.8151, "step": 595000 }, { "epoch": 16.0, "eval_accuracy": 0.4140438576219447, "eval_loss": 3.3717539310455322, "eval_runtime": 148.2583, "eval_samples_per_second": 390.669, "eval_steps_per_second": 6.104, "step": 595216 }, { "epoch": 16.02, "learning_rate": 0.0002082062301620741, "loss": 2.7799, "step": 596000 }, { "epoch": 16.05, "learning_rate": 0.00020680177523103282, "loss": 2.7671, "step": 597000 }, { "epoch": 16.07, "learning_rate": 0.0002053987247549226, "loss": 2.772, "step": 598000 }, { "epoch": 16.1, "learning_rate": 0.00020399426982388135, "loss": 2.7732, "step": 599000 }, { "epoch": 16.13, "learning_rate": 0.00020259121934777113, "loss": 2.7791, "step": 600000 }, { "epoch": 16.16, "learning_rate": 0.00020118676441672987, "loss": 2.7742, "step": 601000 }, { "epoch": 16.18, "learning_rate": 0.0001997823094856886, "loss": 2.7786, "step": 602000 }, { "epoch": 16.21, "learning_rate": 0.0001983792590095784, "loss": 2.7834, "step": 603000 }, { "epoch": 16.24, "learning_rate": 0.00019697620853346818, "loss": 2.7824, "step": 604000 }, { "epoch": 16.26, "learning_rate": 0.0001955717536024269, "loss": 2.7857, "step": 605000 }, { "epoch": 16.29, "learning_rate": 0.00019416729867138564, "loss": 2.7824, "step": 606000 }, { "epoch": 16.32, "learning_rate": 0.00019276284374034438, "loss": 2.7849, "step": 607000 }, { "epoch": 16.34, "learning_rate": 0.00019135838880930312, "loss": 2.7853, "step": 608000 }, { "epoch": 16.37, "learning_rate": 0.0001899553383331929, "loss": 2.7886, "step": 609000 }, { "epoch": 16.4, "learning_rate": 0.00018855088340215162, "loss": 2.7843, "step": 610000 }, { "epoch": 16.42, "learning_rate": 0.0001871478329260414, "loss": 2.7929, "step": 611000 }, { "epoch": 16.45, "learning_rate": 0.00018574337799500017, "loss": 2.7879, "step": 612000 }, { "epoch": 16.48, "learning_rate": 0.00018434032751888993, "loss": 2.7893, "step": 613000 }, { "epoch": 16.5, "learning_rate": 0.0001829372770427797, "loss": 2.791, "step": 614000 }, { "epoch": 16.53, "learning_rate": 0.00018153282211173845, "loss": 2.7879, "step": 615000 }, { "epoch": 16.56, "learning_rate": 0.0001801297716356282, "loss": 2.7904, "step": 616000 }, { "epoch": 16.59, "learning_rate": 0.00017872531670458695, "loss": 2.7892, "step": 617000 }, { "epoch": 16.61, "learning_rate": 0.0001773208617735457, "loss": 2.7929, "step": 618000 }, { "epoch": 16.64, "learning_rate": 0.00017591640684250443, "loss": 2.7952, "step": 619000 }, { "epoch": 16.67, "learning_rate": 0.00017451195191146317, "loss": 2.7835, "step": 620000 }, { "epoch": 16.69, "learning_rate": 0.00017311030589028397, "loss": 2.793, "step": 621000 }, { "epoch": 16.72, "learning_rate": 0.0001717058509592427, "loss": 2.7948, "step": 622000 }, { "epoch": 16.75, "learning_rate": 0.00017030139602820148, "loss": 2.7917, "step": 623000 }, { "epoch": 16.77, "learning_rate": 0.0001688969410971602, "loss": 2.7938, "step": 624000 }, { "epoch": 16.8, "learning_rate": 0.00016749248616611893, "loss": 2.7978, "step": 625000 }, { "epoch": 16.83, "learning_rate": 0.0001660894356900087, "loss": 2.7945, "step": 626000 }, { "epoch": 16.85, "learning_rate": 0.00016468498075896746, "loss": 2.7943, "step": 627000 }, { "epoch": 16.88, "learning_rate": 0.00016328193028285724, "loss": 2.7918, "step": 628000 }, { "epoch": 16.91, "learning_rate": 0.00016187747535181596, "loss": 2.7988, "step": 629000 }, { "epoch": 16.94, "learning_rate": 0.00016047442487570574, "loss": 2.7968, "step": 630000 }, { "epoch": 16.96, "learning_rate": 0.00015906996994466445, "loss": 2.7929, "step": 631000 }, { "epoch": 16.99, "learning_rate": 0.00015766551501362322, "loss": 2.798, "step": 632000 }, { "epoch": 17.0, "eval_accuracy": 0.41277355590429304, "eval_loss": 3.385791063308716, "eval_runtime": 147.8055, "eval_samples_per_second": 391.866, "eval_steps_per_second": 6.123, "step": 632417 }, { "epoch": 17.02, "learning_rate": 0.000156262464537513, "loss": 2.7792, "step": 633000 }, { "epoch": 17.04, "learning_rate": 0.00015485800960647172, "loss": 2.7608, "step": 634000 }, { "epoch": 17.07, "learning_rate": 0.0001534549591303615, "loss": 2.7645, "step": 635000 }, { "epoch": 17.1, "learning_rate": 0.00015205050419932024, "loss": 2.7656, "step": 636000 }, { "epoch": 17.12, "learning_rate": 0.00015064604926827898, "loss": 2.7604, "step": 637000 }, { "epoch": 17.15, "learning_rate": 0.00014924299879216877, "loss": 2.7683, "step": 638000 }, { "epoch": 17.18, "learning_rate": 0.00014783854386112748, "loss": 2.7652, "step": 639000 }, { "epoch": 17.2, "learning_rate": 0.00014643408893008622, "loss": 2.7589, "step": 640000 }, { "epoch": 17.23, "learning_rate": 0.000145029633999045, "loss": 2.7709, "step": 641000 }, { "epoch": 17.26, "learning_rate": 0.00014362658352293475, "loss": 2.7638, "step": 642000 }, { "epoch": 17.28, "learning_rate": 0.00014222353304682453, "loss": 2.7667, "step": 643000 }, { "epoch": 17.31, "learning_rate": 0.00014081907811578327, "loss": 2.7637, "step": 644000 }, { "epoch": 17.34, "learning_rate": 0.00013941462318474201, "loss": 2.7752, "step": 645000 }, { "epoch": 17.37, "learning_rate": 0.0001380115727086318, "loss": 2.7716, "step": 646000 }, { "epoch": 17.39, "learning_rate": 0.0001366071177775905, "loss": 2.7665, "step": 647000 }, { "epoch": 17.42, "learning_rate": 0.00013520266284654925, "loss": 2.7669, "step": 648000 }, { "epoch": 17.45, "learning_rate": 0.00013379820791550802, "loss": 2.7743, "step": 649000 }, { "epoch": 17.47, "learning_rate": 0.00013239375298446673, "loss": 2.7733, "step": 650000 }, { "epoch": 17.5, "learning_rate": 0.00013099070250835652, "loss": 2.7713, "step": 651000 }, { "epoch": 17.53, "learning_rate": 0.0001295876520322463, "loss": 2.7694, "step": 652000 }, { "epoch": 17.55, "learning_rate": 0.00012818319710120502, "loss": 2.767, "step": 653000 }, { "epoch": 17.58, "learning_rate": 0.00012677874217016378, "loss": 2.7736, "step": 654000 }, { "epoch": 17.61, "learning_rate": 0.0001253742872391225, "loss": 2.7743, "step": 655000 }, { "epoch": 17.63, "learning_rate": 0.00012396983230808124, "loss": 2.7739, "step": 656000 }, { "epoch": 17.66, "learning_rate": 0.000122566781831971, "loss": 2.7722, "step": 657000 }, { "epoch": 17.69, "learning_rate": 0.00012116232690092975, "loss": 2.776, "step": 658000 }, { "epoch": 17.71, "learning_rate": 0.00011975787196988849, "loss": 2.7807, "step": 659000 }, { "epoch": 17.74, "learning_rate": 0.00011835482149377827, "loss": 2.7719, "step": 660000 }, { "epoch": 17.77, "learning_rate": 0.000116950366562737, "loss": 2.7747, "step": 661000 }, { "epoch": 17.8, "learning_rate": 0.00011554731608662679, "loss": 2.7782, "step": 662000 }, { "epoch": 17.82, "learning_rate": 0.00011414286115558551, "loss": 2.7738, "step": 663000 }, { "epoch": 17.85, "learning_rate": 0.0001127398106794753, "loss": 2.7756, "step": 664000 }, { "epoch": 17.88, "learning_rate": 0.00011133535574843404, "loss": 2.7715, "step": 665000 }, { "epoch": 17.9, "learning_rate": 0.00010993090081739278, "loss": 2.7809, "step": 666000 }, { "epoch": 17.93, "learning_rate": 0.00010852785034128255, "loss": 2.7813, "step": 667000 }, { "epoch": 17.96, "learning_rate": 0.00010712339541024129, "loss": 2.7748, "step": 668000 }, { "epoch": 17.98, "learning_rate": 0.00010571894047920003, "loss": 2.7738, "step": 669000 }, { "epoch": 18.0, "eval_accuracy": 0.41297422178988324, "eval_loss": 3.407961368560791, "eval_runtime": 148.1896, "eval_samples_per_second": 390.851, "eval_steps_per_second": 6.107, "step": 669618 }, { "epoch": 18.01, "learning_rate": 0.0001043158900030898, "loss": 2.7651, "step": 670000 }, { "epoch": 18.04, "learning_rate": 0.00010291143507204854, "loss": 2.7487, "step": 671000 }, { "epoch": 18.06, "learning_rate": 0.00010150838459593833, "loss": 2.7489, "step": 672000 }, { "epoch": 18.09, "learning_rate": 0.00010010392966489705, "loss": 2.7467, "step": 673000 }, { "epoch": 18.12, "learning_rate": 9.870087918878684e-05, "loss": 2.7491, "step": 674000 }, { "epoch": 18.14, "learning_rate": 9.729642425774556e-05, "loss": 2.7511, "step": 675000 }, { "epoch": 18.17, "learning_rate": 9.58919693267043e-05, "loss": 2.745, "step": 676000 }, { "epoch": 18.2, "learning_rate": 9.448891885059409e-05, "loss": 2.7478, "step": 677000 }, { "epoch": 18.23, "learning_rate": 9.308446391955282e-05, "loss": 2.755, "step": 678000 }, { "epoch": 18.25, "learning_rate": 9.168000898851156e-05, "loss": 2.7538, "step": 679000 }, { "epoch": 18.28, "learning_rate": 9.027695851240134e-05, "loss": 2.755, "step": 680000 }, { "epoch": 18.31, "learning_rate": 8.887250358136008e-05, "loss": 2.7577, "step": 681000 }, { "epoch": 18.33, "learning_rate": 8.746945310524985e-05, "loss": 2.7505, "step": 682000 }, { "epoch": 18.36, "learning_rate": 8.60649981742086e-05, "loss": 2.7591, "step": 683000 }, { "epoch": 18.39, "learning_rate": 8.466194769809838e-05, "loss": 2.7601, "step": 684000 }, { "epoch": 18.41, "learning_rate": 8.32574927670571e-05, "loss": 2.7567, "step": 685000 }, { "epoch": 18.44, "learning_rate": 8.185444229094687e-05, "loss": 2.7547, "step": 686000 }, { "epoch": 18.47, "learning_rate": 8.044998735990562e-05, "loss": 2.7584, "step": 687000 }, { "epoch": 18.49, "learning_rate": 7.904553242886437e-05, "loss": 2.7554, "step": 688000 }, { "epoch": 18.52, "learning_rate": 7.764388640768517e-05, "loss": 2.756, "step": 689000 }, { "epoch": 18.55, "learning_rate": 7.623943147664391e-05, "loss": 2.7581, "step": 690000 }, { "epoch": 18.57, "learning_rate": 7.483497654560266e-05, "loss": 2.7593, "step": 691000 }, { "epoch": 18.6, "learning_rate": 7.343192606949243e-05, "loss": 2.7549, "step": 692000 }, { "epoch": 18.63, "learning_rate": 7.202747113845116e-05, "loss": 2.761, "step": 693000 }, { "epoch": 18.66, "learning_rate": 7.06230162074099e-05, "loss": 2.7556, "step": 694000 }, { "epoch": 18.68, "learning_rate": 6.921856127636864e-05, "loss": 2.7513, "step": 695000 }, { "epoch": 18.71, "learning_rate": 6.781410634532737e-05, "loss": 2.7577, "step": 696000 }, { "epoch": 18.74, "learning_rate": 6.641105586921716e-05, "loss": 2.759, "step": 697000 }, { "epoch": 18.76, "learning_rate": 6.50066009381759e-05, "loss": 2.7603, "step": 698000 }, { "epoch": 18.79, "learning_rate": 6.360355046206568e-05, "loss": 2.7598, "step": 699000 }, { "epoch": 18.82, "learning_rate": 6.219909553102441e-05, "loss": 2.7545, "step": 700000 }, { "epoch": 18.84, "learning_rate": 6.079464059998314e-05, "loss": 2.7603, "step": 701000 }, { "epoch": 18.87, "learning_rate": 5.939018566894189e-05, "loss": 2.7558, "step": 702000 }, { "epoch": 18.9, "learning_rate": 5.798713519283167e-05, "loss": 2.7559, "step": 703000 }, { "epoch": 18.92, "learning_rate": 5.65826802617904e-05, "loss": 2.7584, "step": 704000 }, { "epoch": 18.95, "learning_rate": 5.5178225330749135e-05, "loss": 2.7614, "step": 705000 }, { "epoch": 18.98, "learning_rate": 5.377517485463892e-05, "loss": 2.7555, "step": 706000 }, { "epoch": 19.0, "eval_accuracy": 0.41307418524410433, "eval_loss": 3.4066617488861084, "eval_runtime": 148.4411, "eval_samples_per_second": 390.189, "eval_steps_per_second": 6.097, "step": 706819 }, { "epoch": 19.0, "learning_rate": 5.237071992359765e-05, "loss": 2.7536, "step": 707000 }, { "epoch": 19.03, "learning_rate": 5.096766944748743e-05, "loss": 2.7341, "step": 708000 }, { "epoch": 19.06, "learning_rate": 4.956461897137721e-05, "loss": 2.7413, "step": 709000 }, { "epoch": 19.09, "learning_rate": 4.816016404033595e-05, "loss": 2.7406, "step": 710000 }, { "epoch": 19.11, "learning_rate": 4.675570910929468e-05, "loss": 2.7417, "step": 711000 }, { "epoch": 19.14, "learning_rate": 4.535265863318446e-05, "loss": 2.7401, "step": 712000 }, { "epoch": 19.17, "learning_rate": 4.39482037021432e-05, "loss": 2.741, "step": 713000 }, { "epoch": 19.19, "learning_rate": 4.254374877110194e-05, "loss": 2.7379, "step": 714000 }, { "epoch": 19.22, "learning_rate": 4.1139293840060675e-05, "loss": 2.7369, "step": 715000 }, { "epoch": 19.25, "learning_rate": 3.973483890901941e-05, "loss": 2.7397, "step": 716000 }, { "epoch": 19.27, "learning_rate": 3.8330383977978144e-05, "loss": 2.7431, "step": 717000 }, { "epoch": 19.3, "learning_rate": 3.692873795679897e-05, "loss": 2.7404, "step": 718000 }, { "epoch": 19.33, "learning_rate": 3.5524283025757704e-05, "loss": 2.7413, "step": 719000 }, { "epoch": 19.35, "learning_rate": 3.411982809471644e-05, "loss": 2.7379, "step": 720000 }, { "epoch": 19.38, "learning_rate": 3.271537316367518e-05, "loss": 2.7436, "step": 721000 }, { "epoch": 19.41, "learning_rate": 3.1312322687564956e-05, "loss": 2.7363, "step": 722000 }, { "epoch": 19.43, "learning_rate": 2.9907867756523694e-05, "loss": 2.7387, "step": 723000 }, { "epoch": 19.46, "learning_rate": 2.850481728041347e-05, "loss": 2.7431, "step": 724000 }, { "epoch": 19.49, "learning_rate": 2.710036234937221e-05, "loss": 2.7458, "step": 725000 }, { "epoch": 19.52, "learning_rate": 2.569731187326199e-05, "loss": 2.7419, "step": 726000 }, { "epoch": 19.54, "learning_rate": 2.4292856942220723e-05, "loss": 2.7415, "step": 727000 }, { "epoch": 19.57, "learning_rate": 2.2888402011179464e-05, "loss": 2.7392, "step": 728000 }, { "epoch": 19.6, "learning_rate": 2.1485351535069238e-05, "loss": 2.7467, "step": 729000 }, { "epoch": 19.62, "learning_rate": 2.008089660402798e-05, "loss": 2.7393, "step": 730000 }, { "epoch": 19.65, "learning_rate": 1.8677846127917755e-05, "loss": 2.7378, "step": 731000 }, { "epoch": 19.68, "learning_rate": 1.7273391196876493e-05, "loss": 2.747, "step": 732000 }, { "epoch": 19.7, "learning_rate": 1.587034072076627e-05, "loss": 2.7419, "step": 733000 }, { "epoch": 19.73, "learning_rate": 1.4465885789725008e-05, "loss": 2.7466, "step": 734000 }, { "epoch": 19.76, "learning_rate": 1.3062835313614786e-05, "loss": 2.7354, "step": 735000 }, { "epoch": 19.78, "learning_rate": 1.1658380382573524e-05, "loss": 2.7457, "step": 736000 }, { "epoch": 19.81, "learning_rate": 1.025392545153226e-05, "loss": 2.7398, "step": 737000 }, { "epoch": 19.84, "learning_rate": 8.850874975422038e-06, "loss": 2.7415, "step": 738000 }, { "epoch": 19.87, "learning_rate": 7.446420044380776e-06, "loss": 2.7381, "step": 739000 }, { "epoch": 19.89, "learning_rate": 6.043369568270554e-06, "loss": 2.7407, "step": 740000 }, { "epoch": 19.92, "learning_rate": 4.638914637229291e-06, "loss": 2.7397, "step": 741000 }, { "epoch": 19.95, "learning_rate": 3.2344597061880285e-06, "loss": 2.7412, "step": 742000 }, { "epoch": 19.97, "learning_rate": 1.831409230077807e-06, "loss": 2.7409, "step": 743000 }, { "epoch": 20.0, "learning_rate": 4.2695429903654394e-07, "loss": 2.7434, "step": 744000 }, { "epoch": 20.0, "eval_accuracy": 0.41252109443859236, "eval_loss": 3.417576313018799, "eval_runtime": 148.878, "eval_samples_per_second": 389.043, "eval_steps_per_second": 6.079, "step": 744020 }, { "epoch": 20.0, "step": 744020, "total_flos": 1.56740238729216e+18, "train_loss": 2.994195082282441, "train_runtime": 55239.7487, "train_samples_per_second": 215.503, "train_steps_per_second": 13.469 } ], "logging_steps": 1000, "max_steps": 744020, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.56740238729216e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }