diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4674 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 500, + "global_step": 744020, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03, + "learning_rate": 3.125e-05, + "loss": 6.2858, + "step": 1000 + }, + { + "epoch": 0.05, + "learning_rate": 6.25e-05, + "loss": 5.118, + "step": 2000 + }, + { + "epoch": 0.08, + "learning_rate": 9.375e-05, + "loss": 4.8257, + "step": 3000 + }, + { + "epoch": 0.11, + "learning_rate": 0.000125, + "loss": 4.62, + "step": 4000 + }, + { + "epoch": 0.13, + "learning_rate": 0.00015625, + "loss": 4.4689, + "step": 5000 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001875, + "loss": 4.3516, + "step": 6000 + }, + { + "epoch": 0.19, + "learning_rate": 0.00021875, + "loss": 4.2636, + "step": 7000 + }, + { + "epoch": 0.22, + "learning_rate": 0.00025, + "loss": 4.1906, + "step": 8000 + }, + { + "epoch": 0.24, + "learning_rate": 0.00028125000000000003, + "loss": 4.1354, + "step": 9000 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003125, + "loss": 4.0582, + "step": 10000 + }, + { + "epoch": 0.3, + "learning_rate": 0.00034365625, + "loss": 3.9986, + "step": 11000 + }, + { + "epoch": 0.32, + "learning_rate": 0.00037490625, + "loss": 3.9535, + "step": 12000 + }, + { + "epoch": 0.35, + "learning_rate": 0.00040615625, + "loss": 3.9158, + "step": 13000 + }, + { + "epoch": 0.38, + "learning_rate": 0.00043737500000000005, + "loss": 3.8747, + "step": 14000 + }, + { + "epoch": 0.4, + "learning_rate": 0.000468625, + "loss": 3.8471, + "step": 15000 + }, + { + "epoch": 0.43, + "learning_rate": 0.00049984375, + "loss": 3.8178, + "step": 16000 + }, + { + "epoch": 0.46, + "learning_rate": 0.00053109375, + "loss": 3.794, + "step": 17000 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005623125, + "loss": 3.7705, + "step": 18000 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005935625, + "loss": 3.757, + "step": 19000 + }, + { + "epoch": 0.54, + "learning_rate": 0.00062478125, + "loss": 3.731, + "step": 20000 + }, + { + "epoch": 0.56, + "learning_rate": 0.0006560312499999999, + "loss": 3.7135, + "step": 21000 + }, + { + "epoch": 0.59, + "learning_rate": 0.00068725, + "loss": 3.6993, + "step": 22000 + }, + { + "epoch": 0.62, + "learning_rate": 0.00071846875, + "loss": 3.6745, + "step": 23000 + }, + { + "epoch": 0.65, + "learning_rate": 0.00074971875, + "loss": 3.668, + "step": 24000 + }, + { + "epoch": 0.67, + "learning_rate": 0.0007809375, + "loss": 3.6522, + "step": 25000 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008121875, + "loss": 3.6422, + "step": 26000 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008434062500000001, + "loss": 3.6284, + "step": 27000 + }, + { + "epoch": 0.75, + "learning_rate": 0.00087465625, + "loss": 3.6213, + "step": 28000 + }, + { + "epoch": 0.78, + "learning_rate": 0.00090590625, + "loss": 3.6071, + "step": 29000 + }, + { + "epoch": 0.81, + "learning_rate": 0.000937125, + "loss": 3.5975, + "step": 30000 + }, + { + "epoch": 0.83, + "learning_rate": 0.000968375, + "loss": 3.5921, + "step": 31000 + }, + { + "epoch": 0.86, + "learning_rate": 0.00099959375, + "loss": 3.5862, + "step": 32000 + }, + { + "epoch": 0.89, + "learning_rate": 0.0009986138029830622, + "loss": 3.5652, + "step": 33000 + }, + { + "epoch": 0.91, + "learning_rate": 0.000997209348052021, + "loss": 3.5558, + "step": 34000 + }, + { + "epoch": 0.94, + "learning_rate": 0.0009958048931209798, + "loss": 3.5398, + "step": 35000 + }, + { + "epoch": 0.97, + "learning_rate": 0.0009944018426448695, + "loss": 3.527, + "step": 36000 + }, + { + "epoch": 0.99, + "learning_rate": 0.0009929973877138283, + "loss": 3.5148, + "step": 37000 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.3670570705333534, + "eval_loss": 3.7269980907440186, + "eval_runtime": 147.4611, + "eval_samples_per_second": 392.781, + "eval_steps_per_second": 6.137, + "step": 37201 + }, + { + "epoch": 1.02, + "learning_rate": 0.0009915929327827871, + "loss": 3.4761, + "step": 38000 + }, + { + "epoch": 1.05, + "learning_rate": 0.0009901898823066768, + "loss": 3.4631, + "step": 39000 + }, + { + "epoch": 1.08, + "learning_rate": 0.0009887854273756356, + "loss": 3.4614, + "step": 40000 + }, + { + "epoch": 1.1, + "learning_rate": 0.0009873809724445942, + "loss": 3.4538, + "step": 41000 + }, + { + "epoch": 1.13, + "learning_rate": 0.000985976517513553, + "loss": 3.4587, + "step": 42000 + }, + { + "epoch": 1.16, + "learning_rate": 0.0009845720625825118, + "loss": 3.4417, + "step": 43000 + }, + { + "epoch": 1.18, + "learning_rate": 0.0009831690121064015, + "loss": 3.4351, + "step": 44000 + }, + { + "epoch": 1.21, + "learning_rate": 0.0009817645571753603, + "loss": 3.4312, + "step": 45000 + }, + { + "epoch": 1.24, + "learning_rate": 0.0009803601022443191, + "loss": 3.4235, + "step": 46000 + }, + { + "epoch": 1.26, + "learning_rate": 0.0009789570517682088, + "loss": 3.4191, + "step": 47000 + }, + { + "epoch": 1.29, + "learning_rate": 0.0009775525968371674, + "loss": 3.4077, + "step": 48000 + }, + { + "epoch": 1.32, + "learning_rate": 0.0009761495463610572, + "loss": 3.4119, + "step": 49000 + }, + { + "epoch": 1.34, + "learning_rate": 0.000974745091430016, + "loss": 3.392, + "step": 50000 + }, + { + "epoch": 1.37, + "learning_rate": 0.0009733406364989748, + "loss": 3.3985, + "step": 51000 + }, + { + "epoch": 1.4, + "learning_rate": 0.0009719361815679335, + "loss": 3.3884, + "step": 52000 + }, + { + "epoch": 1.42, + "learning_rate": 0.0009705317266368922, + "loss": 3.3792, + "step": 53000 + }, + { + "epoch": 1.45, + "learning_rate": 0.000969127271705851, + "loss": 3.3734, + "step": 54000 + }, + { + "epoch": 1.48, + "learning_rate": 0.0009677242212297408, + "loss": 3.3699, + "step": 55000 + }, + { + "epoch": 1.51, + "learning_rate": 0.0009663197662986994, + "loss": 3.3721, + "step": 56000 + }, + { + "epoch": 1.53, + "learning_rate": 0.0009649167158225893, + "loss": 3.3688, + "step": 57000 + }, + { + "epoch": 1.56, + "learning_rate": 0.000963512260891548, + "loss": 3.3598, + "step": 58000 + }, + { + "epoch": 1.59, + "learning_rate": 0.0009621078059605067, + "loss": 3.3608, + "step": 59000 + }, + { + "epoch": 1.61, + "learning_rate": 0.0009607047554843966, + "loss": 3.3529, + "step": 60000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0009593003005533553, + "loss": 3.3448, + "step": 61000 + }, + { + "epoch": 1.67, + "learning_rate": 0.000957895845622314, + "loss": 3.3475, + "step": 62000 + }, + { + "epoch": 1.69, + "learning_rate": 0.0009564927951462038, + "loss": 3.3339, + "step": 63000 + }, + { + "epoch": 1.72, + "learning_rate": 0.0009550883402151626, + "loss": 3.3342, + "step": 64000 + }, + { + "epoch": 1.75, + "learning_rate": 0.0009536852897390523, + "loss": 3.3323, + "step": 65000 + }, + { + "epoch": 1.77, + "learning_rate": 0.0009522808348080109, + "loss": 3.3332, + "step": 66000 + }, + { + "epoch": 1.8, + "learning_rate": 0.0009508763798769697, + "loss": 3.32, + "step": 67000 + }, + { + "epoch": 1.83, + "learning_rate": 0.0009494733294008595, + "loss": 3.3214, + "step": 68000 + }, + { + "epoch": 1.85, + "learning_rate": 0.0009480688744698182, + "loss": 3.3226, + "step": 69000 + }, + { + "epoch": 1.88, + "learning_rate": 0.000946664419538777, + "loss": 3.3158, + "step": 70000 + }, + { + "epoch": 1.91, + "learning_rate": 0.0009452613690626668, + "loss": 3.3179, + "step": 71000 + }, + { + "epoch": 1.94, + "learning_rate": 0.0009438569141316255, + "loss": 3.3089, + "step": 72000 + }, + { + "epoch": 1.96, + "learning_rate": 0.0009424524592005843, + "loss": 3.3103, + "step": 73000 + }, + { + "epoch": 1.99, + "learning_rate": 0.0009410480042695429, + "loss": 3.3074, + "step": 74000 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.38968938775071477, + "eval_loss": 3.484098196029663, + "eval_runtime": 146.602, + "eval_samples_per_second": 395.083, + "eval_steps_per_second": 6.173, + "step": 74402 + }, + { + "epoch": 2.02, + "learning_rate": 0.0009396435493385018, + "loss": 3.2586, + "step": 75000 + }, + { + "epoch": 2.04, + "learning_rate": 0.0009382419033173227, + "loss": 3.2401, + "step": 76000 + }, + { + "epoch": 2.07, + "learning_rate": 0.0009368374483862813, + "loss": 3.2462, + "step": 77000 + }, + { + "epoch": 2.1, + "learning_rate": 0.00093543299345524, + "loss": 3.2422, + "step": 78000 + }, + { + "epoch": 2.12, + "learning_rate": 0.0009340285385241988, + "loss": 3.2446, + "step": 79000 + }, + { + "epoch": 2.15, + "learning_rate": 0.0009326254880480886, + "loss": 3.2477, + "step": 80000 + }, + { + "epoch": 2.18, + "learning_rate": 0.0009312224375719783, + "loss": 3.2454, + "step": 81000 + }, + { + "epoch": 2.2, + "learning_rate": 0.000929817982640937, + "loss": 3.2402, + "step": 82000 + }, + { + "epoch": 2.23, + "learning_rate": 0.0009284135277098959, + "loss": 3.2414, + "step": 83000 + }, + { + "epoch": 2.26, + "learning_rate": 0.0009270104772337856, + "loss": 3.2409, + "step": 84000 + }, + { + "epoch": 2.28, + "learning_rate": 0.0009256060223027443, + "loss": 3.2413, + "step": 85000 + }, + { + "epoch": 2.31, + "learning_rate": 0.000924201567371703, + "loss": 3.2392, + "step": 86000 + }, + { + "epoch": 2.34, + "learning_rate": 0.0009227985168955928, + "loss": 3.2399, + "step": 87000 + }, + { + "epoch": 2.37, + "learning_rate": 0.0009213940619645515, + "loss": 3.2404, + "step": 88000 + }, + { + "epoch": 2.39, + "learning_rate": 0.0009199896070335103, + "loss": 3.2342, + "step": 89000 + }, + { + "epoch": 2.42, + "learning_rate": 0.000918585152102469, + "loss": 3.2307, + "step": 90000 + }, + { + "epoch": 2.45, + "learning_rate": 0.0009171821016263588, + "loss": 3.2342, + "step": 91000 + }, + { + "epoch": 2.47, + "learning_rate": 0.0009157790511502487, + "loss": 3.2372, + "step": 92000 + }, + { + "epoch": 2.5, + "learning_rate": 0.0009143745962192074, + "loss": 3.2274, + "step": 93000 + }, + { + "epoch": 2.53, + "learning_rate": 0.0009129715457430971, + "loss": 3.2337, + "step": 94000 + }, + { + "epoch": 2.55, + "learning_rate": 0.000911567090812056, + "loss": 3.2228, + "step": 95000 + }, + { + "epoch": 2.58, + "learning_rate": 0.0009101626358810146, + "loss": 3.2285, + "step": 96000 + }, + { + "epoch": 2.61, + "learning_rate": 0.0009087581809499733, + "loss": 3.2247, + "step": 97000 + }, + { + "epoch": 2.63, + "learning_rate": 0.000907355130473863, + "loss": 3.2241, + "step": 98000 + }, + { + "epoch": 2.66, + "learning_rate": 0.0009059520799977529, + "loss": 3.2243, + "step": 99000 + }, + { + "epoch": 2.69, + "learning_rate": 0.0009045476250667116, + "loss": 3.221, + "step": 100000 + }, + { + "epoch": 2.71, + "learning_rate": 0.0009031431701356703, + "loss": 3.2195, + "step": 101000 + }, + { + "epoch": 2.74, + "learning_rate": 0.0009017401196595602, + "loss": 3.2168, + "step": 102000 + }, + { + "epoch": 2.77, + "learning_rate": 0.0009003356647285189, + "loss": 3.2185, + "step": 103000 + }, + { + "epoch": 2.8, + "learning_rate": 0.0008989326142524087, + "loss": 3.2177, + "step": 104000 + }, + { + "epoch": 2.82, + "learning_rate": 0.0008975281593213675, + "loss": 3.2188, + "step": 105000 + }, + { + "epoch": 2.85, + "learning_rate": 0.0008961237043903261, + "loss": 3.2152, + "step": 106000 + }, + { + "epoch": 2.88, + "learning_rate": 0.0008947192494592848, + "loss": 3.2146, + "step": 107000 + }, + { + "epoch": 2.9, + "learning_rate": 0.0008933176034381056, + "loss": 3.2083, + "step": 108000 + }, + { + "epoch": 2.93, + "learning_rate": 0.0008919131485070644, + "loss": 3.2108, + "step": 109000 + }, + { + "epoch": 2.96, + "learning_rate": 0.0008905086935760231, + "loss": 3.2138, + "step": 110000 + }, + { + "epoch": 2.98, + "learning_rate": 0.0008891056430999129, + "loss": 3.1988, + "step": 111000 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.3979050669647656, + "eval_loss": 3.4299747943878174, + "eval_runtime": 149.1667, + "eval_samples_per_second": 388.29, + "eval_steps_per_second": 6.067, + "step": 111603 + }, + { + "epoch": 3.01, + "learning_rate": 0.0008877011881688717, + "loss": 3.1786, + "step": 112000 + }, + { + "epoch": 3.04, + "learning_rate": 0.0008862967332378304, + "loss": 3.14, + "step": 113000 + }, + { + "epoch": 3.06, + "learning_rate": 0.0008848936827617202, + "loss": 3.1408, + "step": 114000 + }, + { + "epoch": 3.09, + "learning_rate": 0.000883489227830679, + "loss": 3.1491, + "step": 115000 + }, + { + "epoch": 3.12, + "learning_rate": 0.0008820847728996376, + "loss": 3.1428, + "step": 116000 + }, + { + "epoch": 3.15, + "learning_rate": 0.0008806803179685963, + "loss": 3.1515, + "step": 117000 + }, + { + "epoch": 3.17, + "learning_rate": 0.0008792772674924862, + "loss": 3.1396, + "step": 118000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0008778728125614449, + "loss": 3.1518, + "step": 119000 + }, + { + "epoch": 3.23, + "learning_rate": 0.0008764683576304036, + "loss": 3.1464, + "step": 120000 + }, + { + "epoch": 3.25, + "learning_rate": 0.0008750653071542935, + "loss": 3.1562, + "step": 121000 + }, + { + "epoch": 3.28, + "learning_rate": 0.0008736608522232522, + "loss": 3.1454, + "step": 122000 + }, + { + "epoch": 3.31, + "learning_rate": 0.0008722563972922109, + "loss": 3.153, + "step": 123000 + }, + { + "epoch": 3.33, + "learning_rate": 0.0008708547512710317, + "loss": 3.1608, + "step": 124000 + }, + { + "epoch": 3.36, + "learning_rate": 0.0008694502963399905, + "loss": 3.1545, + "step": 125000 + }, + { + "epoch": 3.39, + "learning_rate": 0.0008680458414089491, + "loss": 3.1477, + "step": 126000 + }, + { + "epoch": 3.41, + "learning_rate": 0.0008666427909328389, + "loss": 3.1499, + "step": 127000 + }, + { + "epoch": 3.44, + "learning_rate": 0.0008652383360017977, + "loss": 3.1532, + "step": 128000 + }, + { + "epoch": 3.47, + "learning_rate": 0.0008638338810707564, + "loss": 3.1507, + "step": 129000 + }, + { + "epoch": 3.49, + "learning_rate": 0.0008624308305946462, + "loss": 3.1545, + "step": 130000 + }, + { + "epoch": 3.52, + "learning_rate": 0.000861026375663605, + "loss": 3.1478, + "step": 131000 + }, + { + "epoch": 3.55, + "learning_rate": 0.0008596233251874948, + "loss": 3.157, + "step": 132000 + }, + { + "epoch": 3.58, + "learning_rate": 0.0008582188702564535, + "loss": 3.1439, + "step": 133000 + }, + { + "epoch": 3.6, + "learning_rate": 0.0008568144153254123, + "loss": 3.1461, + "step": 134000 + }, + { + "epoch": 3.63, + "learning_rate": 0.0008554099603943709, + "loss": 3.1467, + "step": 135000 + }, + { + "epoch": 3.66, + "learning_rate": 0.0008540069099182607, + "loss": 3.1486, + "step": 136000 + }, + { + "epoch": 3.68, + "learning_rate": 0.0008526024549872195, + "loss": 3.1467, + "step": 137000 + }, + { + "epoch": 3.71, + "learning_rate": 0.0008511994045111093, + "loss": 3.1482, + "step": 138000 + }, + { + "epoch": 3.74, + "learning_rate": 0.000849794949580068, + "loss": 3.1508, + "step": 139000 + }, + { + "epoch": 3.76, + "learning_rate": 0.0008483904946490267, + "loss": 3.1574, + "step": 140000 + }, + { + "epoch": 3.79, + "learning_rate": 0.0008469860397179855, + "loss": 3.1437, + "step": 141000 + }, + { + "epoch": 3.82, + "learning_rate": 0.0008455815847869442, + "loss": 3.1427, + "step": 142000 + }, + { + "epoch": 3.84, + "learning_rate": 0.000844178534310834, + "loss": 3.15, + "step": 143000 + }, + { + "epoch": 3.87, + "learning_rate": 0.0008427740793797927, + "loss": 3.1489, + "step": 144000 + }, + { + "epoch": 3.9, + "learning_rate": 0.0008413710289036824, + "loss": 3.1449, + "step": 145000 + }, + { + "epoch": 3.92, + "learning_rate": 0.0008399665739726412, + "loss": 3.1465, + "step": 146000 + }, + { + "epoch": 3.95, + "learning_rate": 0.0008385621190416, + "loss": 3.1375, + "step": 147000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0008371576641105587, + "loss": 3.152, + "step": 148000 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.4049755331384225, + "eval_loss": 3.3773725032806396, + "eval_runtime": 149.0965, + "eval_samples_per_second": 388.473, + "eval_steps_per_second": 6.07, + "step": 148804 + }, + { + "epoch": 4.01, + "learning_rate": 0.0008357546136344485, + "loss": 3.1283, + "step": 149000 + }, + { + "epoch": 4.03, + "learning_rate": 0.0008343515631583383, + "loss": 3.0733, + "step": 150000 + }, + { + "epoch": 4.06, + "learning_rate": 0.000832947108227297, + "loss": 3.0775, + "step": 151000 + }, + { + "epoch": 4.09, + "learning_rate": 0.0008315426532962557, + "loss": 3.0746, + "step": 152000 + }, + { + "epoch": 4.11, + "learning_rate": 0.0008301381983652145, + "loss": 3.0835, + "step": 153000 + }, + { + "epoch": 4.14, + "learning_rate": 0.0008287351478891042, + "loss": 3.0848, + "step": 154000 + }, + { + "epoch": 4.17, + "learning_rate": 0.0008273306929580629, + "loss": 3.0851, + "step": 155000 + }, + { + "epoch": 4.19, + "learning_rate": 0.0008259276424819527, + "loss": 3.0789, + "step": 156000 + }, + { + "epoch": 4.22, + "learning_rate": 0.0008245231875509115, + "loss": 3.0933, + "step": 157000 + }, + { + "epoch": 4.25, + "learning_rate": 0.0008231201370748013, + "loss": 3.0864, + "step": 158000 + }, + { + "epoch": 4.27, + "learning_rate": 0.00082171568214376, + "loss": 3.0948, + "step": 159000 + }, + { + "epoch": 4.3, + "learning_rate": 0.0008203126316676498, + "loss": 3.0914, + "step": 160000 + }, + { + "epoch": 4.33, + "learning_rate": 0.0008189081767366086, + "loss": 3.0954, + "step": 161000 + }, + { + "epoch": 4.35, + "learning_rate": 0.0008175037218055673, + "loss": 3.0923, + "step": 162000 + }, + { + "epoch": 4.38, + "learning_rate": 0.000816099266874526, + "loss": 3.0964, + "step": 163000 + }, + { + "epoch": 4.41, + "learning_rate": 0.0008146962163984158, + "loss": 3.0941, + "step": 164000 + }, + { + "epoch": 4.44, + "learning_rate": 0.0008132917614673745, + "loss": 3.0902, + "step": 165000 + }, + { + "epoch": 4.46, + "learning_rate": 0.0008118887109912643, + "loss": 3.0969, + "step": 166000 + }, + { + "epoch": 4.49, + "learning_rate": 0.000810484256060223, + "loss": 3.0948, + "step": 167000 + }, + { + "epoch": 4.52, + "learning_rate": 0.0008090798011291817, + "loss": 3.0874, + "step": 168000 + }, + { + "epoch": 4.54, + "learning_rate": 0.0008076767506530715, + "loss": 3.0981, + "step": 169000 + }, + { + "epoch": 4.57, + "learning_rate": 0.0008062722957220303, + "loss": 3.0934, + "step": 170000 + }, + { + "epoch": 4.6, + "learning_rate": 0.000804867840790989, + "loss": 3.0974, + "step": 171000 + }, + { + "epoch": 4.62, + "learning_rate": 0.0008034633858599479, + "loss": 3.0942, + "step": 172000 + }, + { + "epoch": 4.65, + "learning_rate": 0.0008020603353838376, + "loss": 3.0917, + "step": 173000 + }, + { + "epoch": 4.68, + "learning_rate": 0.0008006558804527962, + "loss": 3.0974, + "step": 174000 + }, + { + "epoch": 4.7, + "learning_rate": 0.000799252829976686, + "loss": 3.0948, + "step": 175000 + }, + { + "epoch": 4.73, + "learning_rate": 0.0007978483750456448, + "loss": 3.099, + "step": 176000 + }, + { + "epoch": 4.76, + "learning_rate": 0.0007964439201146035, + "loss": 3.1001, + "step": 177000 + }, + { + "epoch": 4.78, + "learning_rate": 0.0007950394651835623, + "loss": 3.0987, + "step": 178000 + }, + { + "epoch": 4.81, + "learning_rate": 0.0007936364147074521, + "loss": 3.0925, + "step": 179000 + }, + { + "epoch": 4.84, + "learning_rate": 0.0007922319597764108, + "loss": 3.0969, + "step": 180000 + }, + { + "epoch": 4.87, + "learning_rate": 0.0007908289093003006, + "loss": 3.0986, + "step": 181000 + }, + { + "epoch": 4.89, + "learning_rate": 0.0007894244543692594, + "loss": 3.095, + "step": 182000 + }, + { + "epoch": 4.92, + "learning_rate": 0.000788019999438218, + "loss": 3.0932, + "step": 183000 + }, + { + "epoch": 4.95, + "learning_rate": 0.0007866169489621078, + "loss": 3.0919, + "step": 184000 + }, + { + "epoch": 4.97, + "learning_rate": 0.0007852124940310666, + "loss": 3.0978, + "step": 185000 + }, + { + "epoch": 5.0, + "learning_rate": 0.0007838080391000253, + "loss": 3.0973, + "step": 186000 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.40901955199174495, + "eval_loss": 3.346210479736328, + "eval_runtime": 147.9489, + "eval_samples_per_second": 391.487, + "eval_steps_per_second": 6.117, + "step": 186005 + }, + { + "epoch": 5.03, + "learning_rate": 0.000782403584168984, + "loss": 3.021, + "step": 187000 + }, + { + "epoch": 5.05, + "learning_rate": 0.0007810005336928739, + "loss": 3.0218, + "step": 188000 + }, + { + "epoch": 5.08, + "learning_rate": 0.0007795960787618326, + "loss": 3.0321, + "step": 189000 + }, + { + "epoch": 5.11, + "learning_rate": 0.0007781930282857223, + "loss": 3.0359, + "step": 190000 + }, + { + "epoch": 5.13, + "learning_rate": 0.0007767885733546812, + "loss": 3.0365, + "step": 191000 + }, + { + "epoch": 5.16, + "learning_rate": 0.0007753855228785709, + "loss": 3.0411, + "step": 192000 + }, + { + "epoch": 5.19, + "learning_rate": 0.0007739810679475295, + "loss": 3.0414, + "step": 193000 + }, + { + "epoch": 5.21, + "learning_rate": 0.0007725766130164883, + "loss": 3.0395, + "step": 194000 + }, + { + "epoch": 5.24, + "learning_rate": 0.0007711721580854471, + "loss": 3.042, + "step": 195000 + }, + { + "epoch": 5.27, + "learning_rate": 0.0007697691076093368, + "loss": 3.0454, + "step": 196000 + }, + { + "epoch": 5.3, + "learning_rate": 0.0007683646526782956, + "loss": 3.0444, + "step": 197000 + }, + { + "epoch": 5.32, + "learning_rate": 0.0007669601977472544, + "loss": 3.0452, + "step": 198000 + }, + { + "epoch": 5.35, + "learning_rate": 0.0007655557428162131, + "loss": 3.0496, + "step": 199000 + }, + { + "epoch": 5.38, + "learning_rate": 0.0007641526923401028, + "loss": 3.0454, + "step": 200000 + }, + { + "epoch": 5.4, + "learning_rate": 0.0007627482374090615, + "loss": 3.048, + "step": 201000 + }, + { + "epoch": 5.43, + "learning_rate": 0.0007613451869329513, + "loss": 3.0478, + "step": 202000 + }, + { + "epoch": 5.46, + "learning_rate": 0.00075994073200191, + "loss": 3.0495, + "step": 203000 + }, + { + "epoch": 5.48, + "learning_rate": 0.0007585376815257999, + "loss": 3.0498, + "step": 204000 + }, + { + "epoch": 5.51, + "learning_rate": 0.0007571332265947586, + "loss": 3.0483, + "step": 205000 + }, + { + "epoch": 5.54, + "learning_rate": 0.0007557301761186483, + "loss": 3.0534, + "step": 206000 + }, + { + "epoch": 5.56, + "learning_rate": 0.0007543257211876072, + "loss": 3.0583, + "step": 207000 + }, + { + "epoch": 5.59, + "learning_rate": 0.0007529212662565659, + "loss": 3.0512, + "step": 208000 + }, + { + "epoch": 5.62, + "learning_rate": 0.0007515182157804556, + "loss": 3.0507, + "step": 209000 + }, + { + "epoch": 5.65, + "learning_rate": 0.0007501137608494145, + "loss": 3.0499, + "step": 210000 + }, + { + "epoch": 5.67, + "learning_rate": 0.0007487107103733042, + "loss": 3.0572, + "step": 211000 + }, + { + "epoch": 5.7, + "learning_rate": 0.0007473062554422628, + "loss": 3.0519, + "step": 212000 + }, + { + "epoch": 5.73, + "learning_rate": 0.0007459018005112215, + "loss": 3.0495, + "step": 213000 + }, + { + "epoch": 5.75, + "learning_rate": 0.0007444987500351114, + "loss": 3.0612, + "step": 214000 + }, + { + "epoch": 5.78, + "learning_rate": 0.0007430956995590012, + "loss": 3.0547, + "step": 215000 + }, + { + "epoch": 5.81, + "learning_rate": 0.0007416912446279599, + "loss": 3.054, + "step": 216000 + }, + { + "epoch": 5.83, + "learning_rate": 0.0007402867896969187, + "loss": 3.0547, + "step": 217000 + }, + { + "epoch": 5.86, + "learning_rate": 0.0007388823347658774, + "loss": 3.0588, + "step": 218000 + }, + { + "epoch": 5.89, + "learning_rate": 0.0007374792842897672, + "loss": 3.0531, + "step": 219000 + }, + { + "epoch": 5.91, + "learning_rate": 0.0007360762338136569, + "loss": 3.0585, + "step": 220000 + }, + { + "epoch": 5.94, + "learning_rate": 0.0007346717788826157, + "loss": 3.0522, + "step": 221000 + }, + { + "epoch": 5.97, + "learning_rate": 0.0007332673239515743, + "loss": 3.0604, + "step": 222000 + }, + { + "epoch": 5.99, + "learning_rate": 0.0007318628690205332, + "loss": 3.0543, + "step": 223000 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.4064327960745534, + "eval_loss": 3.3686516284942627, + "eval_runtime": 149.1251, + "eval_samples_per_second": 388.399, + "eval_steps_per_second": 6.069, + "step": 223206 + }, + { + "epoch": 6.02, + "learning_rate": 0.0007304598185444229, + "loss": 3.0023, + "step": 224000 + }, + { + "epoch": 6.05, + "learning_rate": 0.0007290553636133816, + "loss": 2.9878, + "step": 225000 + }, + { + "epoch": 6.08, + "learning_rate": 0.0007276509086823405, + "loss": 2.9862, + "step": 226000 + }, + { + "epoch": 6.1, + "learning_rate": 0.0007262464537512992, + "loss": 2.993, + "step": 227000 + }, + { + "epoch": 6.13, + "learning_rate": 0.0007248434032751889, + "loss": 2.9986, + "step": 228000 + }, + { + "epoch": 6.16, + "learning_rate": 0.0007234389483441476, + "loss": 2.9984, + "step": 229000 + }, + { + "epoch": 6.18, + "learning_rate": 0.0007220344934131064, + "loss": 2.9975, + "step": 230000 + }, + { + "epoch": 6.21, + "learning_rate": 0.0007206314429369961, + "loss": 3.0058, + "step": 231000 + }, + { + "epoch": 6.24, + "learning_rate": 0.0007192269880059548, + "loss": 3.003, + "step": 232000 + }, + { + "epoch": 6.26, + "learning_rate": 0.0007178239375298447, + "loss": 3.0059, + "step": 233000 + }, + { + "epoch": 6.29, + "learning_rate": 0.0007164194825988034, + "loss": 3.0085, + "step": 234000 + }, + { + "epoch": 6.32, + "learning_rate": 0.0007150150276677621, + "loss": 3.0067, + "step": 235000 + }, + { + "epoch": 6.34, + "learning_rate": 0.000713611977191652, + "loss": 3.0048, + "step": 236000 + }, + { + "epoch": 6.37, + "learning_rate": 0.0007122075222606107, + "loss": 3.007, + "step": 237000 + }, + { + "epoch": 6.4, + "learning_rate": 0.0007108044717845005, + "loss": 3.0163, + "step": 238000 + }, + { + "epoch": 6.42, + "learning_rate": 0.0007094000168534593, + "loss": 3.0104, + "step": 239000 + }, + { + "epoch": 6.45, + "learning_rate": 0.0007079955619224179, + "loss": 3.0145, + "step": 240000 + }, + { + "epoch": 6.48, + "learning_rate": 0.0007065939159012388, + "loss": 3.0145, + "step": 241000 + }, + { + "epoch": 6.51, + "learning_rate": 0.0007051894609701974, + "loss": 3.0127, + "step": 242000 + }, + { + "epoch": 6.53, + "learning_rate": 0.0007037850060391562, + "loss": 3.0159, + "step": 243000 + }, + { + "epoch": 6.56, + "learning_rate": 0.0007023805511081149, + "loss": 3.0151, + "step": 244000 + }, + { + "epoch": 6.59, + "learning_rate": 0.0007009760961770736, + "loss": 3.02, + "step": 245000 + }, + { + "epoch": 6.61, + "learning_rate": 0.0006995730457009635, + "loss": 3.019, + "step": 246000 + }, + { + "epoch": 6.64, + "learning_rate": 0.0006981685907699222, + "loss": 3.0139, + "step": 247000 + }, + { + "epoch": 6.67, + "learning_rate": 0.000696765540293812, + "loss": 3.0183, + "step": 248000 + }, + { + "epoch": 6.69, + "learning_rate": 0.0006953610853627708, + "loss": 3.0203, + "step": 249000 + }, + { + "epoch": 6.72, + "learning_rate": 0.0006939566304317294, + "loss": 3.024, + "step": 250000 + }, + { + "epoch": 6.75, + "learning_rate": 0.0006925535799556192, + "loss": 3.0248, + "step": 251000 + }, + { + "epoch": 6.77, + "learning_rate": 0.000691149125024578, + "loss": 3.0249, + "step": 252000 + }, + { + "epoch": 6.8, + "learning_rate": 0.0006897460745484677, + "loss": 3.0181, + "step": 253000 + }, + { + "epoch": 6.83, + "learning_rate": 0.0006883416196174265, + "loss": 3.0196, + "step": 254000 + }, + { + "epoch": 6.85, + "learning_rate": 0.0006869371646863853, + "loss": 3.0244, + "step": 255000 + }, + { + "epoch": 6.88, + "learning_rate": 0.000685534114210275, + "loss": 3.0251, + "step": 256000 + }, + { + "epoch": 6.91, + "learning_rate": 0.0006841296592792338, + "loss": 3.0219, + "step": 257000 + }, + { + "epoch": 6.94, + "learning_rate": 0.0006827266088031235, + "loss": 3.0203, + "step": 258000 + }, + { + "epoch": 6.96, + "learning_rate": 0.0006813221538720823, + "loss": 3.0225, + "step": 259000 + }, + { + "epoch": 6.99, + "learning_rate": 0.0006799176989410409, + "loss": 3.0161, + "step": 260000 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.4113539808027173, + "eval_loss": 3.339113473892212, + "eval_runtime": 148.8983, + "eval_samples_per_second": 388.99, + "eval_steps_per_second": 6.078, + "step": 260407 + }, + { + "epoch": 7.02, + "learning_rate": 0.0006785132440099997, + "loss": 2.9764, + "step": 261000 + }, + { + "epoch": 7.04, + "learning_rate": 0.0006771101935338895, + "loss": 2.9576, + "step": 262000 + }, + { + "epoch": 7.07, + "learning_rate": 0.0006757057386028482, + "loss": 2.9529, + "step": 263000 + }, + { + "epoch": 7.1, + "learning_rate": 0.0006743012836718069, + "loss": 2.9603, + "step": 264000 + }, + { + "epoch": 7.12, + "learning_rate": 0.0006728968287407658, + "loss": 2.9641, + "step": 265000 + }, + { + "epoch": 7.15, + "learning_rate": 0.0006714937782646555, + "loss": 2.9675, + "step": 266000 + }, + { + "epoch": 7.18, + "learning_rate": 0.0006700907277885453, + "loss": 2.9641, + "step": 267000 + }, + { + "epoch": 7.2, + "learning_rate": 0.0006686862728575041, + "loss": 2.9672, + "step": 268000 + }, + { + "epoch": 7.23, + "learning_rate": 0.0006672818179264628, + "loss": 2.9774, + "step": 269000 + }, + { + "epoch": 7.26, + "learning_rate": 0.0006658773629954214, + "loss": 2.9753, + "step": 270000 + }, + { + "epoch": 7.28, + "learning_rate": 0.0006644743125193112, + "loss": 2.9674, + "step": 271000 + }, + { + "epoch": 7.31, + "learning_rate": 0.00066306985758827, + "loss": 2.9716, + "step": 272000 + }, + { + "epoch": 7.34, + "learning_rate": 0.0006616654026572287, + "loss": 2.9788, + "step": 273000 + }, + { + "epoch": 7.37, + "learning_rate": 0.0006602623521811185, + "loss": 2.9791, + "step": 274000 + }, + { + "epoch": 7.39, + "learning_rate": 0.0006588593017050083, + "loss": 2.9821, + "step": 275000 + }, + { + "epoch": 7.42, + "learning_rate": 0.000657454846773967, + "loss": 2.9828, + "step": 276000 + }, + { + "epoch": 7.45, + "learning_rate": 0.0006560503918429258, + "loss": 2.9802, + "step": 277000 + }, + { + "epoch": 7.47, + "learning_rate": 0.0006546459369118846, + "loss": 2.9858, + "step": 278000 + }, + { + "epoch": 7.5, + "learning_rate": 0.0006532428864357743, + "loss": 2.9814, + "step": 279000 + }, + { + "epoch": 7.53, + "learning_rate": 0.0006518398359596641, + "loss": 2.9865, + "step": 280000 + }, + { + "epoch": 7.55, + "learning_rate": 0.0006504353810286228, + "loss": 2.9894, + "step": 281000 + }, + { + "epoch": 7.58, + "learning_rate": 0.0006490309260975815, + "loss": 2.9832, + "step": 282000 + }, + { + "epoch": 7.61, + "learning_rate": 0.0006476264711665402, + "loss": 2.986, + "step": 283000 + }, + { + "epoch": 7.63, + "learning_rate": 0.0006462234206904301, + "loss": 2.9924, + "step": 284000 + }, + { + "epoch": 7.66, + "learning_rate": 0.0006448189657593888, + "loss": 2.9838, + "step": 285000 + }, + { + "epoch": 7.69, + "learning_rate": 0.0006434145108283475, + "loss": 2.99, + "step": 286000 + }, + { + "epoch": 7.71, + "learning_rate": 0.0006420100558973064, + "loss": 2.9873, + "step": 287000 + }, + { + "epoch": 7.74, + "learning_rate": 0.0006406070054211961, + "loss": 2.9866, + "step": 288000 + }, + { + "epoch": 7.77, + "learning_rate": 0.0006392025504901547, + "loss": 2.9869, + "step": 289000 + }, + { + "epoch": 7.8, + "learning_rate": 0.0006377995000140445, + "loss": 2.9881, + "step": 290000 + }, + { + "epoch": 7.82, + "learning_rate": 0.0006363964495379343, + "loss": 2.9825, + "step": 291000 + }, + { + "epoch": 7.85, + "learning_rate": 0.000634991994606893, + "loss": 2.9951, + "step": 292000 + }, + { + "epoch": 7.88, + "learning_rate": 0.0006335875396758518, + "loss": 2.9958, + "step": 293000 + }, + { + "epoch": 7.9, + "learning_rate": 0.0006321830847448106, + "loss": 2.997, + "step": 294000 + }, + { + "epoch": 7.93, + "learning_rate": 0.0006307800342687003, + "loss": 2.9886, + "step": 295000 + }, + { + "epoch": 7.96, + "learning_rate": 0.0006293755793376591, + "loss": 3.0001, + "step": 296000 + }, + { + "epoch": 7.98, + "learning_rate": 0.0006279725288615489, + "loss": 2.9858, + "step": 297000 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.4104764790291721, + "eval_loss": 3.347707748413086, + "eval_runtime": 148.9691, + "eval_samples_per_second": 388.806, + "eval_steps_per_second": 6.075, + "step": 297608 + }, + { + "epoch": 8.01, + "learning_rate": 0.0006265680739305076, + "loss": 2.9621, + "step": 298000 + }, + { + "epoch": 8.04, + "learning_rate": 0.0006251650234543974, + "loss": 2.9243, + "step": 299000 + }, + { + "epoch": 8.06, + "learning_rate": 0.000623760568523356, + "loss": 2.9237, + "step": 300000 + }, + { + "epoch": 8.09, + "learning_rate": 0.0006223575180472459, + "loss": 2.9296, + "step": 301000 + }, + { + "epoch": 8.12, + "learning_rate": 0.0006209530631162046, + "loss": 2.9321, + "step": 302000 + }, + { + "epoch": 8.14, + "learning_rate": 0.0006195486081851633, + "loss": 2.9411, + "step": 303000 + }, + { + "epoch": 8.17, + "learning_rate": 0.0006181441532541221, + "loss": 2.9375, + "step": 304000 + }, + { + "epoch": 8.2, + "learning_rate": 0.0006167396983230808, + "loss": 2.9463, + "step": 305000 + }, + { + "epoch": 8.23, + "learning_rate": 0.0006153366478469706, + "loss": 2.9421, + "step": 306000 + }, + { + "epoch": 8.25, + "learning_rate": 0.0006139321929159294, + "loss": 2.9412, + "step": 307000 + }, + { + "epoch": 8.28, + "learning_rate": 0.0006125291424398192, + "loss": 2.9451, + "step": 308000 + }, + { + "epoch": 8.31, + "learning_rate": 0.0006111246875087778, + "loss": 2.9487, + "step": 309000 + }, + { + "epoch": 8.33, + "learning_rate": 0.0006097216370326676, + "loss": 2.9474, + "step": 310000 + }, + { + "epoch": 8.36, + "learning_rate": 0.0006083171821016264, + "loss": 2.9504, + "step": 311000 + }, + { + "epoch": 8.39, + "learning_rate": 0.0006069127271705851, + "loss": 2.9526, + "step": 312000 + }, + { + "epoch": 8.41, + "learning_rate": 0.0006055096766944749, + "loss": 2.948, + "step": 313000 + }, + { + "epoch": 8.44, + "learning_rate": 0.0006041066262183647, + "loss": 2.954, + "step": 314000 + }, + { + "epoch": 8.47, + "learning_rate": 0.0006027021712873234, + "loss": 2.9529, + "step": 315000 + }, + { + "epoch": 8.49, + "learning_rate": 0.0006012977163562821, + "loss": 2.9531, + "step": 316000 + }, + { + "epoch": 8.52, + "learning_rate": 0.0005998932614252409, + "loss": 2.9572, + "step": 317000 + }, + { + "epoch": 8.55, + "learning_rate": 0.0005984902109491307, + "loss": 2.9579, + "step": 318000 + }, + { + "epoch": 8.58, + "learning_rate": 0.0005970857560180893, + "loss": 2.9599, + "step": 319000 + }, + { + "epoch": 8.6, + "learning_rate": 0.0005956827055419792, + "loss": 2.9537, + "step": 320000 + }, + { + "epoch": 8.63, + "learning_rate": 0.0005942782506109379, + "loss": 2.9554, + "step": 321000 + }, + { + "epoch": 8.66, + "learning_rate": 0.0005928737956798966, + "loss": 2.9589, + "step": 322000 + }, + { + "epoch": 8.68, + "learning_rate": 0.0005914707452037865, + "loss": 2.9607, + "step": 323000 + }, + { + "epoch": 8.71, + "learning_rate": 0.0005900662902727452, + "loss": 2.958, + "step": 324000 + }, + { + "epoch": 8.74, + "learning_rate": 0.0005886632397966349, + "loss": 2.9597, + "step": 325000 + }, + { + "epoch": 8.76, + "learning_rate": 0.0005872587848655937, + "loss": 2.9666, + "step": 326000 + }, + { + "epoch": 8.79, + "learning_rate": 0.0005858557343894835, + "loss": 2.9572, + "step": 327000 + }, + { + "epoch": 8.82, + "learning_rate": 0.0005844512794584422, + "loss": 2.9654, + "step": 328000 + }, + { + "epoch": 8.84, + "learning_rate": 0.000583048228982332, + "loss": 2.9631, + "step": 329000 + }, + { + "epoch": 8.87, + "learning_rate": 0.0005816437740512907, + "loss": 2.9666, + "step": 330000 + }, + { + "epoch": 8.9, + "learning_rate": 0.0005802393191202494, + "loss": 2.9719, + "step": 331000 + }, + { + "epoch": 8.92, + "learning_rate": 0.0005788348641892081, + "loss": 2.9649, + "step": 332000 + }, + { + "epoch": 8.95, + "learning_rate": 0.0005774332181680289, + "loss": 2.9659, + "step": 333000 + }, + { + "epoch": 8.98, + "learning_rate": 0.0005760287632369877, + "loss": 2.9718, + "step": 334000 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.41122177107294106, + "eval_loss": 3.343648672103882, + "eval_runtime": 147.5276, + "eval_samples_per_second": 392.604, + "eval_steps_per_second": 6.134, + "step": 334809 + }, + { + "epoch": 9.01, + "learning_rate": 0.0005746243083059465, + "loss": 2.9542, + "step": 335000 + }, + { + "epoch": 9.03, + "learning_rate": 0.0005732212578298362, + "loss": 2.891, + "step": 336000 + }, + { + "epoch": 9.06, + "learning_rate": 0.000571816802898795, + "loss": 2.9009, + "step": 337000 + }, + { + "epoch": 9.09, + "learning_rate": 0.0005704123479677537, + "loss": 2.8991, + "step": 338000 + }, + { + "epoch": 9.11, + "learning_rate": 0.0005690078930367125, + "loss": 2.9084, + "step": 339000 + }, + { + "epoch": 9.14, + "learning_rate": 0.0005676048425606023, + "loss": 2.91, + "step": 340000 + }, + { + "epoch": 9.17, + "learning_rate": 0.0005662003876295609, + "loss": 2.9123, + "step": 341000 + }, + { + "epoch": 9.19, + "learning_rate": 0.0005647973371534507, + "loss": 2.9128, + "step": 342000 + }, + { + "epoch": 9.22, + "learning_rate": 0.0005633942866773404, + "loss": 2.9138, + "step": 343000 + }, + { + "epoch": 9.25, + "learning_rate": 0.0005619898317462993, + "loss": 2.9208, + "step": 344000 + }, + { + "epoch": 9.27, + "learning_rate": 0.000560585376815258, + "loss": 2.9197, + "step": 345000 + }, + { + "epoch": 9.3, + "learning_rate": 0.0005591809218842168, + "loss": 2.919, + "step": 346000 + }, + { + "epoch": 9.33, + "learning_rate": 0.0005577778714081066, + "loss": 2.9241, + "step": 347000 + }, + { + "epoch": 9.35, + "learning_rate": 0.0005563734164770653, + "loss": 2.9236, + "step": 348000 + }, + { + "epoch": 9.38, + "learning_rate": 0.000554970366000955, + "loss": 2.9224, + "step": 349000 + }, + { + "epoch": 9.41, + "learning_rate": 0.0005535673155248449, + "loss": 2.9247, + "step": 350000 + }, + { + "epoch": 9.44, + "learning_rate": 0.0005521628605938036, + "loss": 2.9262, + "step": 351000 + }, + { + "epoch": 9.46, + "learning_rate": 0.0005507584056627622, + "loss": 2.9309, + "step": 352000 + }, + { + "epoch": 9.49, + "learning_rate": 0.000549353950731721, + "loss": 2.9299, + "step": 353000 + }, + { + "epoch": 9.52, + "learning_rate": 0.0005479509002556108, + "loss": 2.9349, + "step": 354000 + }, + { + "epoch": 9.54, + "learning_rate": 0.0005465464453245695, + "loss": 2.9367, + "step": 355000 + }, + { + "epoch": 9.57, + "learning_rate": 0.0005451433948484594, + "loss": 2.933, + "step": 356000 + }, + { + "epoch": 9.6, + "learning_rate": 0.0005437389399174181, + "loss": 2.9336, + "step": 357000 + }, + { + "epoch": 9.62, + "learning_rate": 0.0005423344849863768, + "loss": 2.9349, + "step": 358000 + }, + { + "epoch": 9.65, + "learning_rate": 0.0005409314345102666, + "loss": 2.9415, + "step": 359000 + }, + { + "epoch": 9.68, + "learning_rate": 0.0005395269795792254, + "loss": 2.9328, + "step": 360000 + }, + { + "epoch": 9.7, + "learning_rate": 0.0005381239291031151, + "loss": 2.9346, + "step": 361000 + }, + { + "epoch": 9.73, + "learning_rate": 0.0005367194741720737, + "loss": 2.9391, + "step": 362000 + }, + { + "epoch": 9.76, + "learning_rate": 0.0005353150192410326, + "loss": 2.9393, + "step": 363000 + }, + { + "epoch": 9.78, + "learning_rate": 0.0005339119687649223, + "loss": 2.9419, + "step": 364000 + }, + { + "epoch": 9.81, + "learning_rate": 0.000532507513833881, + "loss": 2.9377, + "step": 365000 + }, + { + "epoch": 9.84, + "learning_rate": 0.0005311058678127018, + "loss": 2.9373, + "step": 366000 + }, + { + "epoch": 9.87, + "learning_rate": 0.0005297014128816607, + "loss": 2.9416, + "step": 367000 + }, + { + "epoch": 9.89, + "learning_rate": 0.0005282969579506194, + "loss": 2.9433, + "step": 368000 + }, + { + "epoch": 9.92, + "learning_rate": 0.0005268925030195782, + "loss": 2.9406, + "step": 369000 + }, + { + "epoch": 9.95, + "learning_rate": 0.000525489452543468, + "loss": 2.9419, + "step": 370000 + }, + { + "epoch": 9.97, + "learning_rate": 0.0005240849976124267, + "loss": 2.9411, + "step": 371000 + }, + { + "epoch": 10.0, + "learning_rate": 0.0005226805426813853, + "loss": 2.9399, + "step": 372000 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.41210269901326396, + "eval_loss": 3.345149278640747, + "eval_runtime": 148.3396, + "eval_samples_per_second": 390.455, + "eval_steps_per_second": 6.101, + "step": 372010 + }, + { + "epoch": 10.03, + "learning_rate": 0.0005212774922052751, + "loss": 2.8712, + "step": 373000 + }, + { + "epoch": 10.05, + "learning_rate": 0.0005198730372742338, + "loss": 2.8767, + "step": 374000 + }, + { + "epoch": 10.08, + "learning_rate": 0.0005184699867981236, + "loss": 2.8784, + "step": 375000 + }, + { + "epoch": 10.11, + "learning_rate": 0.0005170655318670824, + "loss": 2.8827, + "step": 376000 + }, + { + "epoch": 10.13, + "learning_rate": 0.0005156624813909722, + "loss": 2.8869, + "step": 377000 + }, + { + "epoch": 10.16, + "learning_rate": 0.0005142594309148619, + "loss": 2.8899, + "step": 378000 + }, + { + "epoch": 10.19, + "learning_rate": 0.0005128549759838207, + "loss": 2.8921, + "step": 379000 + }, + { + "epoch": 10.21, + "learning_rate": 0.0005114505210527795, + "loss": 2.8914, + "step": 380000 + }, + { + "epoch": 10.24, + "learning_rate": 0.0005100460661217382, + "loss": 2.8919, + "step": 381000 + }, + { + "epoch": 10.27, + "learning_rate": 0.0005086430156456279, + "loss": 2.9003, + "step": 382000 + }, + { + "epoch": 10.3, + "learning_rate": 0.0005072385607145867, + "loss": 2.8972, + "step": 383000 + }, + { + "epoch": 10.32, + "learning_rate": 0.0005058355102384764, + "loss": 2.8954, + "step": 384000 + }, + { + "epoch": 10.35, + "learning_rate": 0.0005044310553074351, + "loss": 2.9014, + "step": 385000 + }, + { + "epoch": 10.38, + "learning_rate": 0.000503026600376394, + "loss": 2.903, + "step": 386000 + }, + { + "epoch": 10.4, + "learning_rate": 0.0005016235499002837, + "loss": 2.9039, + "step": 387000 + }, + { + "epoch": 10.43, + "learning_rate": 0.0005002190949692424, + "loss": 2.8998, + "step": 388000 + }, + { + "epoch": 10.46, + "learning_rate": 0.0004988160444931322, + "loss": 2.9079, + "step": 389000 + }, + { + "epoch": 10.48, + "learning_rate": 0.000497412994017022, + "loss": 2.9058, + "step": 390000 + }, + { + "epoch": 10.51, + "learning_rate": 0.0004960085390859808, + "loss": 2.9025, + "step": 391000 + }, + { + "epoch": 10.54, + "learning_rate": 0.0004946040841549395, + "loss": 2.9084, + "step": 392000 + }, + { + "epoch": 10.56, + "learning_rate": 0.0004931996292238982, + "loss": 2.9051, + "step": 393000 + }, + { + "epoch": 10.59, + "learning_rate": 0.0004917965787477879, + "loss": 2.9119, + "step": 394000 + }, + { + "epoch": 10.62, + "learning_rate": 0.0004903921238167468, + "loss": 2.909, + "step": 395000 + }, + { + "epoch": 10.64, + "learning_rate": 0.0004889876688857055, + "loss": 2.9127, + "step": 396000 + }, + { + "epoch": 10.67, + "learning_rate": 0.00048758461840959523, + "loss": 2.9129, + "step": 397000 + }, + { + "epoch": 10.7, + "learning_rate": 0.000486180163478554, + "loss": 2.9112, + "step": 398000 + }, + { + "epoch": 10.73, + "learning_rate": 0.00048477711300244376, + "loss": 2.9202, + "step": 399000 + }, + { + "epoch": 10.75, + "learning_rate": 0.0004833726580714025, + "loss": 2.9161, + "step": 400000 + }, + { + "epoch": 10.78, + "learning_rate": 0.00048196820314036124, + "loss": 2.9211, + "step": 401000 + }, + { + "epoch": 10.81, + "learning_rate": 0.00048056374820931995, + "loss": 2.9192, + "step": 402000 + }, + { + "epoch": 10.83, + "learning_rate": 0.00047916069773320976, + "loss": 2.9145, + "step": 403000 + }, + { + "epoch": 10.86, + "learning_rate": 0.00047775624280216853, + "loss": 2.9171, + "step": 404000 + }, + { + "epoch": 10.89, + "learning_rate": 0.0004763517878711272, + "loss": 2.9132, + "step": 405000 + }, + { + "epoch": 10.91, + "learning_rate": 0.00047495014184994805, + "loss": 2.9178, + "step": 406000 + }, + { + "epoch": 10.94, + "learning_rate": 0.00047354568691890676, + "loss": 2.9177, + "step": 407000 + }, + { + "epoch": 10.97, + "learning_rate": 0.00047214123198786553, + "loss": 2.9154, + "step": 408000 + }, + { + "epoch": 10.99, + "learning_rate": 0.0004707367770568243, + "loss": 2.9207, + "step": 409000 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.4129671679171056, + "eval_loss": 3.358556032180786, + "eval_runtime": 148.219, + "eval_samples_per_second": 390.773, + "eval_steps_per_second": 6.106, + "step": 409211 + }, + { + "epoch": 11.02, + "learning_rate": 0.00046933232212578296, + "loss": 2.8685, + "step": 410000 + }, + { + "epoch": 11.05, + "learning_rate": 0.00046792927164967277, + "loss": 2.8573, + "step": 411000 + }, + { + "epoch": 11.07, + "learning_rate": 0.0004665248167186315, + "loss": 2.8612, + "step": 412000 + }, + { + "epoch": 11.1, + "learning_rate": 0.0004651217662425213, + "loss": 2.862, + "step": 413000 + }, + { + "epoch": 11.13, + "learning_rate": 0.00046371871576641105, + "loss": 2.8641, + "step": 414000 + }, + { + "epoch": 11.16, + "learning_rate": 0.0004623142608353698, + "loss": 2.8669, + "step": 415000 + }, + { + "epoch": 11.18, + "learning_rate": 0.00046090980590432853, + "loss": 2.8682, + "step": 416000 + }, + { + "epoch": 11.21, + "learning_rate": 0.0004595053509732873, + "loss": 2.8753, + "step": 417000 + }, + { + "epoch": 11.24, + "learning_rate": 0.00045810230049717705, + "loss": 2.8688, + "step": 418000 + }, + { + "epoch": 11.26, + "learning_rate": 0.0004566978455661358, + "loss": 2.8752, + "step": 419000 + }, + { + "epoch": 11.29, + "learning_rate": 0.0004552947950900256, + "loss": 2.8755, + "step": 420000 + }, + { + "epoch": 11.32, + "learning_rate": 0.0004538903401589843, + "loss": 2.8753, + "step": 421000 + }, + { + "epoch": 11.34, + "learning_rate": 0.00045248728968287405, + "loss": 2.8777, + "step": 422000 + }, + { + "epoch": 11.37, + "learning_rate": 0.0004510828347518328, + "loss": 2.8822, + "step": 423000 + }, + { + "epoch": 11.4, + "learning_rate": 0.00044968118873065367, + "loss": 2.8774, + "step": 424000 + }, + { + "epoch": 11.42, + "learning_rate": 0.0004482767337996124, + "loss": 2.8873, + "step": 425000 + }, + { + "epoch": 11.45, + "learning_rate": 0.0004468722788685711, + "loss": 2.8866, + "step": 426000 + }, + { + "epoch": 11.48, + "learning_rate": 0.00044546782393752987, + "loss": 2.8853, + "step": 427000 + }, + { + "epoch": 11.51, + "learning_rate": 0.0004440647734614197, + "loss": 2.8821, + "step": 428000 + }, + { + "epoch": 11.53, + "learning_rate": 0.00044266031853037834, + "loss": 2.8884, + "step": 429000 + }, + { + "epoch": 11.56, + "learning_rate": 0.0004412558635993371, + "loss": 2.8865, + "step": 430000 + }, + { + "epoch": 11.59, + "learning_rate": 0.0004398528131232269, + "loss": 2.8887, + "step": 431000 + }, + { + "epoch": 11.61, + "learning_rate": 0.00043844835819218563, + "loss": 2.8915, + "step": 432000 + }, + { + "epoch": 11.64, + "learning_rate": 0.00043704530771607544, + "loss": 2.8884, + "step": 433000 + }, + { + "epoch": 11.67, + "learning_rate": 0.0004356408527850341, + "loss": 2.8851, + "step": 434000 + }, + { + "epoch": 11.69, + "learning_rate": 0.0004342378023089239, + "loss": 2.8882, + "step": 435000 + }, + { + "epoch": 11.72, + "learning_rate": 0.0004328333473778827, + "loss": 2.8893, + "step": 436000 + }, + { + "epoch": 11.75, + "learning_rate": 0.00043143029690177244, + "loss": 2.8898, + "step": 437000 + }, + { + "epoch": 11.77, + "learning_rate": 0.0004300258419707312, + "loss": 2.8875, + "step": 438000 + }, + { + "epoch": 11.8, + "learning_rate": 0.0004286213870396899, + "loss": 2.8952, + "step": 439000 + }, + { + "epoch": 11.83, + "learning_rate": 0.0004272183365635797, + "loss": 2.8938, + "step": 440000 + }, + { + "epoch": 11.85, + "learning_rate": 0.00042581388163253844, + "loss": 2.8965, + "step": 441000 + }, + { + "epoch": 11.88, + "learning_rate": 0.00042440942670149716, + "loss": 2.8929, + "step": 442000 + }, + { + "epoch": 11.91, + "learning_rate": 0.00042300637622538697, + "loss": 2.9017, + "step": 443000 + }, + { + "epoch": 11.94, + "learning_rate": 0.0004216019212943457, + "loss": 2.895, + "step": 444000 + }, + { + "epoch": 11.96, + "learning_rate": 0.00042019887081823544, + "loss": 2.892, + "step": 445000 + }, + { + "epoch": 11.99, + "learning_rate": 0.0004187944158871942, + "loss": 2.8987, + "step": 446000 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.4122773663391878, + "eval_loss": 3.355417013168335, + "eval_runtime": 147.56, + "eval_samples_per_second": 392.518, + "eval_steps_per_second": 6.133, + "step": 446412 + }, + { + "epoch": 12.02, + "learning_rate": 0.000417389960956153, + "loss": 2.8605, + "step": 447000 + }, + { + "epoch": 12.04, + "learning_rate": 0.0004159883149349737, + "loss": 2.8334, + "step": 448000 + }, + { + "epoch": 12.07, + "learning_rate": 0.0004145838600039325, + "loss": 2.8413, + "step": 449000 + }, + { + "epoch": 12.1, + "learning_rate": 0.0004131794050728912, + "loss": 2.8457, + "step": 450000 + }, + { + "epoch": 12.12, + "learning_rate": 0.00041177495014184997, + "loss": 2.8426, + "step": 451000 + }, + { + "epoch": 12.15, + "learning_rate": 0.0004103718996657397, + "loss": 2.8498, + "step": 452000 + }, + { + "epoch": 12.18, + "learning_rate": 0.0004089674447346985, + "loss": 2.8513, + "step": 453000 + }, + { + "epoch": 12.2, + "learning_rate": 0.0004075629898036572, + "loss": 2.8469, + "step": 454000 + }, + { + "epoch": 12.23, + "learning_rate": 0.0004061585348726159, + "loss": 2.8472, + "step": 455000 + }, + { + "epoch": 12.26, + "learning_rate": 0.00040475548439650573, + "loss": 2.8486, + "step": 456000 + }, + { + "epoch": 12.28, + "learning_rate": 0.0004033510294654645, + "loss": 2.8561, + "step": 457000 + }, + { + "epoch": 12.31, + "learning_rate": 0.00040194797898935426, + "loss": 2.8519, + "step": 458000 + }, + { + "epoch": 12.34, + "learning_rate": 0.00040054352405831297, + "loss": 2.8522, + "step": 459000 + }, + { + "epoch": 12.37, + "learning_rate": 0.00039913906912727174, + "loss": 2.8585, + "step": 460000 + }, + { + "epoch": 12.39, + "learning_rate": 0.00039773461419623045, + "loss": 2.8596, + "step": 461000 + }, + { + "epoch": 12.42, + "learning_rate": 0.00039633156372012026, + "loss": 2.8608, + "step": 462000 + }, + { + "epoch": 12.45, + "learning_rate": 0.00039492851324401, + "loss": 2.8572, + "step": 463000 + }, + { + "epoch": 12.47, + "learning_rate": 0.00039352405831296873, + "loss": 2.8622, + "step": 464000 + }, + { + "epoch": 12.5, + "learning_rate": 0.0003921196033819275, + "loss": 2.8632, + "step": 465000 + }, + { + "epoch": 12.53, + "learning_rate": 0.00039071655290581726, + "loss": 2.8635, + "step": 466000 + }, + { + "epoch": 12.55, + "learning_rate": 0.000389312097974776, + "loss": 2.8693, + "step": 467000 + }, + { + "epoch": 12.58, + "learning_rate": 0.00038790764304373474, + "loss": 2.8685, + "step": 468000 + }, + { + "epoch": 12.61, + "learning_rate": 0.00038650318811269345, + "loss": 2.863, + "step": 469000 + }, + { + "epoch": 12.63, + "learning_rate": 0.00038510013763658327, + "loss": 2.8722, + "step": 470000 + }, + { + "epoch": 12.66, + "learning_rate": 0.000383695682705542, + "loss": 2.8671, + "step": 471000 + }, + { + "epoch": 12.69, + "learning_rate": 0.0003822912277745007, + "loss": 2.8703, + "step": 472000 + }, + { + "epoch": 12.71, + "learning_rate": 0.0003808881772983905, + "loss": 2.8754, + "step": 473000 + }, + { + "epoch": 12.74, + "learning_rate": 0.0003794837223673492, + "loss": 2.8687, + "step": 474000 + }, + { + "epoch": 12.77, + "learning_rate": 0.00037808067189123903, + "loss": 2.8734, + "step": 475000 + }, + { + "epoch": 12.8, + "learning_rate": 0.0003766762169601978, + "loss": 2.8748, + "step": 476000 + }, + { + "epoch": 12.82, + "learning_rate": 0.00037527316648408755, + "loss": 2.8773, + "step": 477000 + }, + { + "epoch": 12.85, + "learning_rate": 0.00037386871155304627, + "loss": 2.8739, + "step": 478000 + }, + { + "epoch": 12.88, + "learning_rate": 0.000372465661076936, + "loss": 2.8759, + "step": 479000 + }, + { + "epoch": 12.9, + "learning_rate": 0.0003710612061458948, + "loss": 2.8762, + "step": 480000 + }, + { + "epoch": 12.93, + "learning_rate": 0.00036965675121485356, + "loss": 2.8766, + "step": 481000 + }, + { + "epoch": 12.96, + "learning_rate": 0.0003682522962838122, + "loss": 2.8768, + "step": 482000 + }, + { + "epoch": 12.98, + "learning_rate": 0.000366847841352771, + "loss": 2.8779, + "step": 483000 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.41304986617795647, + "eval_loss": 3.3615658283233643, + "eval_runtime": 148.2115, + "eval_samples_per_second": 390.793, + "eval_steps_per_second": 6.106, + "step": 483613 + }, + { + "epoch": 13.01, + "learning_rate": 0.00036544479087666074, + "loss": 2.8498, + "step": 484000 + }, + { + "epoch": 13.04, + "learning_rate": 0.00036404174040055055, + "loss": 2.8192, + "step": 485000 + }, + { + "epoch": 13.06, + "learning_rate": 0.0003626372854695093, + "loss": 2.8181, + "step": 486000 + }, + { + "epoch": 13.09, + "learning_rate": 0.000361232830538468, + "loss": 2.8195, + "step": 487000 + }, + { + "epoch": 13.12, + "learning_rate": 0.00035982837560742675, + "loss": 2.8275, + "step": 488000 + }, + { + "epoch": 13.14, + "learning_rate": 0.0003584239206763855, + "loss": 2.8255, + "step": 489000 + }, + { + "epoch": 13.17, + "learning_rate": 0.0003570208702002753, + "loss": 2.8286, + "step": 490000 + }, + { + "epoch": 13.2, + "learning_rate": 0.000355616415269234, + "loss": 2.8343, + "step": 491000 + }, + { + "epoch": 13.23, + "learning_rate": 0.00035421336479312375, + "loss": 2.8334, + "step": 492000 + }, + { + "epoch": 13.25, + "learning_rate": 0.0003528089098620825, + "loss": 2.8299, + "step": 493000 + }, + { + "epoch": 13.28, + "learning_rate": 0.0003514044549310413, + "loss": 2.8365, + "step": 494000 + }, + { + "epoch": 13.31, + "learning_rate": 0.00035, + "loss": 2.8353, + "step": 495000 + }, + { + "epoch": 13.33, + "learning_rate": 0.00034859694952388975, + "loss": 2.8377, + "step": 496000 + }, + { + "epoch": 13.36, + "learning_rate": 0.0003471924945928485, + "loss": 2.8407, + "step": 497000 + }, + { + "epoch": 13.39, + "learning_rate": 0.0003457880396618073, + "loss": 2.8428, + "step": 498000 + }, + { + "epoch": 13.41, + "learning_rate": 0.00034438498918569705, + "loss": 2.8405, + "step": 499000 + }, + { + "epoch": 13.44, + "learning_rate": 0.00034298053425465576, + "loss": 2.8419, + "step": 500000 + }, + { + "epoch": 13.47, + "learning_rate": 0.0003415774837785455, + "loss": 2.8469, + "step": 501000 + }, + { + "epoch": 13.49, + "learning_rate": 0.0003401730288475043, + "loss": 2.8488, + "step": 502000 + }, + { + "epoch": 13.52, + "learning_rate": 0.00033876997837139404, + "loss": 2.8434, + "step": 503000 + }, + { + "epoch": 13.55, + "learning_rate": 0.0003373655234403528, + "loss": 2.8472, + "step": 504000 + }, + { + "epoch": 13.57, + "learning_rate": 0.0003359610685093115, + "loss": 2.8471, + "step": 505000 + }, + { + "epoch": 13.6, + "learning_rate": 0.0003345566135782703, + "loss": 2.8512, + "step": 506000 + }, + { + "epoch": 13.63, + "learning_rate": 0.00033315496755709114, + "loss": 2.8477, + "step": 507000 + }, + { + "epoch": 13.66, + "learning_rate": 0.0003317505126260498, + "loss": 2.8482, + "step": 508000 + }, + { + "epoch": 13.68, + "learning_rate": 0.00033034605769500857, + "loss": 2.8487, + "step": 509000 + }, + { + "epoch": 13.71, + "learning_rate": 0.0003289416027639673, + "loss": 2.8496, + "step": 510000 + }, + { + "epoch": 13.74, + "learning_rate": 0.0003275385522878571, + "loss": 2.8543, + "step": 511000 + }, + { + "epoch": 13.76, + "learning_rate": 0.0003261355018117469, + "loss": 2.851, + "step": 512000 + }, + { + "epoch": 13.79, + "learning_rate": 0.00032473104688070557, + "loss": 2.853, + "step": 513000 + }, + { + "epoch": 13.82, + "learning_rate": 0.0003233279964045954, + "loss": 2.8558, + "step": 514000 + }, + { + "epoch": 13.84, + "learning_rate": 0.00032192354147355415, + "loss": 2.8448, + "step": 515000 + }, + { + "epoch": 13.87, + "learning_rate": 0.00032051908654251286, + "loss": 2.8517, + "step": 516000 + }, + { + "epoch": 13.9, + "learning_rate": 0.0003191146316114716, + "loss": 2.8543, + "step": 517000 + }, + { + "epoch": 13.92, + "learning_rate": 0.0003177115811353614, + "loss": 2.8519, + "step": 518000 + }, + { + "epoch": 13.95, + "learning_rate": 0.0003163071262043201, + "loss": 2.8479, + "step": 519000 + }, + { + "epoch": 13.98, + "learning_rate": 0.00031490267127327887, + "loss": 2.8519, + "step": 520000 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.41285833673710687, + "eval_loss": 3.369619846343994, + "eval_runtime": 147.964, + "eval_samples_per_second": 391.446, + "eval_steps_per_second": 6.116, + "step": 520814 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003134982163422376, + "loss": 2.8511, + "step": 521000 + }, + { + "epoch": 14.03, + "learning_rate": 0.00031209376141119635, + "loss": 2.802, + "step": 522000 + }, + { + "epoch": 14.06, + "learning_rate": 0.0003106907109350861, + "loss": 2.7993, + "step": 523000 + }, + { + "epoch": 14.09, + "learning_rate": 0.0003092862560040449, + "loss": 2.8106, + "step": 524000 + }, + { + "epoch": 14.11, + "learning_rate": 0.0003078818010730036, + "loss": 2.8026, + "step": 525000 + }, + { + "epoch": 14.14, + "learning_rate": 0.00030647875059689334, + "loss": 2.8074, + "step": 526000 + }, + { + "epoch": 14.17, + "learning_rate": 0.0003050742956658521, + "loss": 2.8076, + "step": 527000 + }, + { + "epoch": 14.19, + "learning_rate": 0.00030367124518974187, + "loss": 2.8154, + "step": 528000 + }, + { + "epoch": 14.22, + "learning_rate": 0.00030226679025870064, + "loss": 2.8121, + "step": 529000 + }, + { + "epoch": 14.25, + "learning_rate": 0.0003008637397825904, + "loss": 2.816, + "step": 530000 + }, + { + "epoch": 14.27, + "learning_rate": 0.0002994592848515491, + "loss": 2.8158, + "step": 531000 + }, + { + "epoch": 14.3, + "learning_rate": 0.00029805623437543886, + "loss": 2.8182, + "step": 532000 + }, + { + "epoch": 14.33, + "learning_rate": 0.0002966531838993287, + "loss": 2.8169, + "step": 533000 + }, + { + "epoch": 14.35, + "learning_rate": 0.00029524872896828744, + "loss": 2.8197, + "step": 534000 + }, + { + "epoch": 14.38, + "learning_rate": 0.00029384427403724616, + "loss": 2.818, + "step": 535000 + }, + { + "epoch": 14.41, + "learning_rate": 0.00029243981910620487, + "loss": 2.821, + "step": 536000 + }, + { + "epoch": 14.44, + "learning_rate": 0.0002910367686300946, + "loss": 2.8227, + "step": 537000 + }, + { + "epoch": 14.46, + "learning_rate": 0.0002896323136990534, + "loss": 2.8222, + "step": 538000 + }, + { + "epoch": 14.49, + "learning_rate": 0.0002882292632229432, + "loss": 2.8308, + "step": 539000 + }, + { + "epoch": 14.52, + "learning_rate": 0.0002868248082919019, + "loss": 2.8315, + "step": 540000 + }, + { + "epoch": 14.54, + "learning_rate": 0.00028542035336086063, + "loss": 2.8244, + "step": 541000 + }, + { + "epoch": 14.57, + "learning_rate": 0.0002840173028847504, + "loss": 2.8245, + "step": 542000 + }, + { + "epoch": 14.6, + "learning_rate": 0.00028261284795370916, + "loss": 2.8289, + "step": 543000 + }, + { + "epoch": 14.62, + "learning_rate": 0.00028120979747759897, + "loss": 2.8252, + "step": 544000 + }, + { + "epoch": 14.65, + "learning_rate": 0.0002798053425465577, + "loss": 2.8265, + "step": 545000 + }, + { + "epoch": 14.68, + "learning_rate": 0.0002784022920704475, + "loss": 2.8309, + "step": 546000 + }, + { + "epoch": 14.7, + "learning_rate": 0.00027699924159433725, + "loss": 2.8286, + "step": 547000 + }, + { + "epoch": 14.73, + "learning_rate": 0.00027559478666329596, + "loss": 2.8289, + "step": 548000 + }, + { + "epoch": 14.76, + "learning_rate": 0.00027419033173225473, + "loss": 2.8297, + "step": 549000 + }, + { + "epoch": 14.78, + "learning_rate": 0.00027278587680121345, + "loss": 2.8295, + "step": 550000 + }, + { + "epoch": 14.81, + "learning_rate": 0.00027138282632510326, + "loss": 2.8369, + "step": 551000 + }, + { + "epoch": 14.84, + "learning_rate": 0.000269979775848993, + "loss": 2.8354, + "step": 552000 + }, + { + "epoch": 14.87, + "learning_rate": 0.00026857532091795173, + "loss": 2.8305, + "step": 553000 + }, + { + "epoch": 14.89, + "learning_rate": 0.0002671708659869105, + "loss": 2.8355, + "step": 554000 + }, + { + "epoch": 14.92, + "learning_rate": 0.00026576641105586926, + "loss": 2.8353, + "step": 555000 + }, + { + "epoch": 14.95, + "learning_rate": 0.000264363360579759, + "loss": 2.8427, + "step": 556000 + }, + { + "epoch": 14.97, + "learning_rate": 0.00026295890564871773, + "loss": 2.8361, + "step": 557000 + }, + { + "epoch": 15.0, + "learning_rate": 0.00026155445071767645, + "loss": 2.8395, + "step": 558000 + }, + { + "epoch": 15.0, + "eval_accuracy": 0.41281084066040374, + "eval_loss": 3.3729231357574463, + "eval_runtime": 147.9664, + "eval_samples_per_second": 391.44, + "eval_steps_per_second": 6.116, + "step": 558015 + }, + { + "epoch": 15.03, + "learning_rate": 0.00026015140024156626, + "loss": 2.7847, + "step": 559000 + }, + { + "epoch": 15.05, + "learning_rate": 0.000258746945310525, + "loss": 2.7891, + "step": 560000 + }, + { + "epoch": 15.08, + "learning_rate": 0.0002573424903794837, + "loss": 2.788, + "step": 561000 + }, + { + "epoch": 15.11, + "learning_rate": 0.0002559394399033735, + "loss": 2.7885, + "step": 562000 + }, + { + "epoch": 15.13, + "learning_rate": 0.00025453498497233227, + "loss": 2.7939, + "step": 563000 + }, + { + "epoch": 15.16, + "learning_rate": 0.000253133338951153, + "loss": 2.7933, + "step": 564000 + }, + { + "epoch": 15.19, + "learning_rate": 0.0002517288840201118, + "loss": 2.7946, + "step": 565000 + }, + { + "epoch": 15.21, + "learning_rate": 0.00025032442908907055, + "loss": 2.7977, + "step": 566000 + }, + { + "epoch": 15.24, + "learning_rate": 0.0002489213786129603, + "loss": 2.7946, + "step": 567000 + }, + { + "epoch": 15.27, + "learning_rate": 0.00024751692368191907, + "loss": 2.7985, + "step": 568000 + }, + { + "epoch": 15.3, + "learning_rate": 0.0002461124687508778, + "loss": 2.7984, + "step": 569000 + }, + { + "epoch": 15.32, + "learning_rate": 0.00024470801381983655, + "loss": 2.7972, + "step": 570000 + }, + { + "epoch": 15.35, + "learning_rate": 0.0002433049633437263, + "loss": 2.7978, + "step": 571000 + }, + { + "epoch": 15.38, + "learning_rate": 0.00024190050841268505, + "loss": 2.8039, + "step": 572000 + }, + { + "epoch": 15.4, + "learning_rate": 0.00024049605348164377, + "loss": 2.8002, + "step": 573000 + }, + { + "epoch": 15.43, + "learning_rate": 0.00023909300300553355, + "loss": 2.8051, + "step": 574000 + }, + { + "epoch": 15.46, + "learning_rate": 0.00023768854807449232, + "loss": 2.8069, + "step": 575000 + }, + { + "epoch": 15.48, + "learning_rate": 0.00023628409314345103, + "loss": 2.8039, + "step": 576000 + }, + { + "epoch": 15.51, + "learning_rate": 0.00023488104266734081, + "loss": 2.8068, + "step": 577000 + }, + { + "epoch": 15.54, + "learning_rate": 0.00023347658773629953, + "loss": 2.8093, + "step": 578000 + }, + { + "epoch": 15.56, + "learning_rate": 0.0002320735372601893, + "loss": 2.8067, + "step": 579000 + }, + { + "epoch": 15.59, + "learning_rate": 0.00023066908232914808, + "loss": 2.8073, + "step": 580000 + }, + { + "epoch": 15.62, + "learning_rate": 0.0002292646273981068, + "loss": 2.8129, + "step": 581000 + }, + { + "epoch": 15.64, + "learning_rate": 0.00022786017246706554, + "loss": 2.8102, + "step": 582000 + }, + { + "epoch": 15.67, + "learning_rate": 0.00022645712199095532, + "loss": 2.812, + "step": 583000 + }, + { + "epoch": 15.7, + "learning_rate": 0.00022505266705991406, + "loss": 2.8093, + "step": 584000 + }, + { + "epoch": 15.73, + "learning_rate": 0.00022364961658380384, + "loss": 2.8139, + "step": 585000 + }, + { + "epoch": 15.75, + "learning_rate": 0.00022224516165276256, + "loss": 2.8115, + "step": 586000 + }, + { + "epoch": 15.78, + "learning_rate": 0.0002208407067217213, + "loss": 2.8157, + "step": 587000 + }, + { + "epoch": 15.81, + "learning_rate": 0.00021943765624561108, + "loss": 2.8138, + "step": 588000 + }, + { + "epoch": 15.83, + "learning_rate": 0.00021803320131456982, + "loss": 2.8146, + "step": 589000 + }, + { + "epoch": 15.86, + "learning_rate": 0.0002166301508384596, + "loss": 2.8138, + "step": 590000 + }, + { + "epoch": 15.89, + "learning_rate": 0.00021522569590741835, + "loss": 2.8195, + "step": 591000 + }, + { + "epoch": 15.91, + "learning_rate": 0.0002138226454313081, + "loss": 2.8192, + "step": 592000 + }, + { + "epoch": 15.94, + "learning_rate": 0.00021241819050026685, + "loss": 2.8169, + "step": 593000 + }, + { + "epoch": 15.97, + "learning_rate": 0.00021101373556922559, + "loss": 2.8174, + "step": 594000 + }, + { + "epoch": 15.99, + "learning_rate": 0.00020961068509311537, + "loss": 2.8151, + "step": 595000 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.4140438576219447, + "eval_loss": 3.3717539310455322, + "eval_runtime": 148.2583, + "eval_samples_per_second": 390.669, + "eval_steps_per_second": 6.104, + "step": 595216 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002082062301620741, + "loss": 2.7799, + "step": 596000 + }, + { + "epoch": 16.05, + "learning_rate": 0.00020680177523103282, + "loss": 2.7671, + "step": 597000 + }, + { + "epoch": 16.07, + "learning_rate": 0.0002053987247549226, + "loss": 2.772, + "step": 598000 + }, + { + "epoch": 16.1, + "learning_rate": 0.00020399426982388135, + "loss": 2.7732, + "step": 599000 + }, + { + "epoch": 16.13, + "learning_rate": 0.00020259121934777113, + "loss": 2.7791, + "step": 600000 + }, + { + "epoch": 16.16, + "learning_rate": 0.00020118676441672987, + "loss": 2.7742, + "step": 601000 + }, + { + "epoch": 16.18, + "learning_rate": 0.0001997823094856886, + "loss": 2.7786, + "step": 602000 + }, + { + "epoch": 16.21, + "learning_rate": 0.0001983792590095784, + "loss": 2.7834, + "step": 603000 + }, + { + "epoch": 16.24, + "learning_rate": 0.00019697620853346818, + "loss": 2.7824, + "step": 604000 + }, + { + "epoch": 16.26, + "learning_rate": 0.0001955717536024269, + "loss": 2.7857, + "step": 605000 + }, + { + "epoch": 16.29, + "learning_rate": 0.00019416729867138564, + "loss": 2.7824, + "step": 606000 + }, + { + "epoch": 16.32, + "learning_rate": 0.00019276284374034438, + "loss": 2.7849, + "step": 607000 + }, + { + "epoch": 16.34, + "learning_rate": 0.00019135838880930312, + "loss": 2.7853, + "step": 608000 + }, + { + "epoch": 16.37, + "learning_rate": 0.0001899553383331929, + "loss": 2.7886, + "step": 609000 + }, + { + "epoch": 16.4, + "learning_rate": 0.00018855088340215162, + "loss": 2.7843, + "step": 610000 + }, + { + "epoch": 16.42, + "learning_rate": 0.0001871478329260414, + "loss": 2.7929, + "step": 611000 + }, + { + "epoch": 16.45, + "learning_rate": 0.00018574337799500017, + "loss": 2.7879, + "step": 612000 + }, + { + "epoch": 16.48, + "learning_rate": 0.00018434032751888993, + "loss": 2.7893, + "step": 613000 + }, + { + "epoch": 16.5, + "learning_rate": 0.0001829372770427797, + "loss": 2.791, + "step": 614000 + }, + { + "epoch": 16.53, + "learning_rate": 0.00018153282211173845, + "loss": 2.7879, + "step": 615000 + }, + { + "epoch": 16.56, + "learning_rate": 0.0001801297716356282, + "loss": 2.7904, + "step": 616000 + }, + { + "epoch": 16.59, + "learning_rate": 0.00017872531670458695, + "loss": 2.7892, + "step": 617000 + }, + { + "epoch": 16.61, + "learning_rate": 0.0001773208617735457, + "loss": 2.7929, + "step": 618000 + }, + { + "epoch": 16.64, + "learning_rate": 0.00017591640684250443, + "loss": 2.7952, + "step": 619000 + }, + { + "epoch": 16.67, + "learning_rate": 0.00017451195191146317, + "loss": 2.7835, + "step": 620000 + }, + { + "epoch": 16.69, + "learning_rate": 0.00017311030589028397, + "loss": 2.793, + "step": 621000 + }, + { + "epoch": 16.72, + "learning_rate": 0.0001717058509592427, + "loss": 2.7948, + "step": 622000 + }, + { + "epoch": 16.75, + "learning_rate": 0.00017030139602820148, + "loss": 2.7917, + "step": 623000 + }, + { + "epoch": 16.77, + "learning_rate": 0.0001688969410971602, + "loss": 2.7938, + "step": 624000 + }, + { + "epoch": 16.8, + "learning_rate": 0.00016749248616611893, + "loss": 2.7978, + "step": 625000 + }, + { + "epoch": 16.83, + "learning_rate": 0.0001660894356900087, + "loss": 2.7945, + "step": 626000 + }, + { + "epoch": 16.85, + "learning_rate": 0.00016468498075896746, + "loss": 2.7943, + "step": 627000 + }, + { + "epoch": 16.88, + "learning_rate": 0.00016328193028285724, + "loss": 2.7918, + "step": 628000 + }, + { + "epoch": 16.91, + "learning_rate": 0.00016187747535181596, + "loss": 2.7988, + "step": 629000 + }, + { + "epoch": 16.94, + "learning_rate": 0.00016047442487570574, + "loss": 2.7968, + "step": 630000 + }, + { + "epoch": 16.96, + "learning_rate": 0.00015906996994466445, + "loss": 2.7929, + "step": 631000 + }, + { + "epoch": 16.99, + "learning_rate": 0.00015766551501362322, + "loss": 2.798, + "step": 632000 + }, + { + "epoch": 17.0, + "eval_accuracy": 0.41277355590429304, + "eval_loss": 3.385791063308716, + "eval_runtime": 147.8055, + "eval_samples_per_second": 391.866, + "eval_steps_per_second": 6.123, + "step": 632417 + }, + { + "epoch": 17.02, + "learning_rate": 0.000156262464537513, + "loss": 2.7792, + "step": 633000 + }, + { + "epoch": 17.04, + "learning_rate": 0.00015485800960647172, + "loss": 2.7608, + "step": 634000 + }, + { + "epoch": 17.07, + "learning_rate": 0.0001534549591303615, + "loss": 2.7645, + "step": 635000 + }, + { + "epoch": 17.1, + "learning_rate": 0.00015205050419932024, + "loss": 2.7656, + "step": 636000 + }, + { + "epoch": 17.12, + "learning_rate": 0.00015064604926827898, + "loss": 2.7604, + "step": 637000 + }, + { + "epoch": 17.15, + "learning_rate": 0.00014924299879216877, + "loss": 2.7683, + "step": 638000 + }, + { + "epoch": 17.18, + "learning_rate": 0.00014783854386112748, + "loss": 2.7652, + "step": 639000 + }, + { + "epoch": 17.2, + "learning_rate": 0.00014643408893008622, + "loss": 2.7589, + "step": 640000 + }, + { + "epoch": 17.23, + "learning_rate": 0.000145029633999045, + "loss": 2.7709, + "step": 641000 + }, + { + "epoch": 17.26, + "learning_rate": 0.00014362658352293475, + "loss": 2.7638, + "step": 642000 + }, + { + "epoch": 17.28, + "learning_rate": 0.00014222353304682453, + "loss": 2.7667, + "step": 643000 + }, + { + "epoch": 17.31, + "learning_rate": 0.00014081907811578327, + "loss": 2.7637, + "step": 644000 + }, + { + "epoch": 17.34, + "learning_rate": 0.00013941462318474201, + "loss": 2.7752, + "step": 645000 + }, + { + "epoch": 17.37, + "learning_rate": 0.0001380115727086318, + "loss": 2.7716, + "step": 646000 + }, + { + "epoch": 17.39, + "learning_rate": 0.0001366071177775905, + "loss": 2.7665, + "step": 647000 + }, + { + "epoch": 17.42, + "learning_rate": 0.00013520266284654925, + "loss": 2.7669, + "step": 648000 + }, + { + "epoch": 17.45, + "learning_rate": 0.00013379820791550802, + "loss": 2.7743, + "step": 649000 + }, + { + "epoch": 17.47, + "learning_rate": 0.00013239375298446673, + "loss": 2.7733, + "step": 650000 + }, + { + "epoch": 17.5, + "learning_rate": 0.00013099070250835652, + "loss": 2.7713, + "step": 651000 + }, + { + "epoch": 17.53, + "learning_rate": 0.0001295876520322463, + "loss": 2.7694, + "step": 652000 + }, + { + "epoch": 17.55, + "learning_rate": 0.00012818319710120502, + "loss": 2.767, + "step": 653000 + }, + { + "epoch": 17.58, + "learning_rate": 0.00012677874217016378, + "loss": 2.7736, + "step": 654000 + }, + { + "epoch": 17.61, + "learning_rate": 0.0001253742872391225, + "loss": 2.7743, + "step": 655000 + }, + { + "epoch": 17.63, + "learning_rate": 0.00012396983230808124, + "loss": 2.7739, + "step": 656000 + }, + { + "epoch": 17.66, + "learning_rate": 0.000122566781831971, + "loss": 2.7722, + "step": 657000 + }, + { + "epoch": 17.69, + "learning_rate": 0.00012116232690092975, + "loss": 2.776, + "step": 658000 + }, + { + "epoch": 17.71, + "learning_rate": 0.00011975787196988849, + "loss": 2.7807, + "step": 659000 + }, + { + "epoch": 17.74, + "learning_rate": 0.00011835482149377827, + "loss": 2.7719, + "step": 660000 + }, + { + "epoch": 17.77, + "learning_rate": 0.000116950366562737, + "loss": 2.7747, + "step": 661000 + }, + { + "epoch": 17.8, + "learning_rate": 0.00011554731608662679, + "loss": 2.7782, + "step": 662000 + }, + { + "epoch": 17.82, + "learning_rate": 0.00011414286115558551, + "loss": 2.7738, + "step": 663000 + }, + { + "epoch": 17.85, + "learning_rate": 0.0001127398106794753, + "loss": 2.7756, + "step": 664000 + }, + { + "epoch": 17.88, + "learning_rate": 0.00011133535574843404, + "loss": 2.7715, + "step": 665000 + }, + { + "epoch": 17.9, + "learning_rate": 0.00010993090081739278, + "loss": 2.7809, + "step": 666000 + }, + { + "epoch": 17.93, + "learning_rate": 0.00010852785034128255, + "loss": 2.7813, + "step": 667000 + }, + { + "epoch": 17.96, + "learning_rate": 0.00010712339541024129, + "loss": 2.7748, + "step": 668000 + }, + { + "epoch": 17.98, + "learning_rate": 0.00010571894047920003, + "loss": 2.7738, + "step": 669000 + }, + { + "epoch": 18.0, + "eval_accuracy": 0.41297422178988324, + "eval_loss": 3.407961368560791, + "eval_runtime": 148.1896, + "eval_samples_per_second": 390.851, + "eval_steps_per_second": 6.107, + "step": 669618 + }, + { + "epoch": 18.01, + "learning_rate": 0.0001043158900030898, + "loss": 2.7651, + "step": 670000 + }, + { + "epoch": 18.04, + "learning_rate": 0.00010291143507204854, + "loss": 2.7487, + "step": 671000 + }, + { + "epoch": 18.06, + "learning_rate": 0.00010150838459593833, + "loss": 2.7489, + "step": 672000 + }, + { + "epoch": 18.09, + "learning_rate": 0.00010010392966489705, + "loss": 2.7467, + "step": 673000 + }, + { + "epoch": 18.12, + "learning_rate": 9.870087918878684e-05, + "loss": 2.7491, + "step": 674000 + }, + { + "epoch": 18.14, + "learning_rate": 9.729642425774556e-05, + "loss": 2.7511, + "step": 675000 + }, + { + "epoch": 18.17, + "learning_rate": 9.58919693267043e-05, + "loss": 2.745, + "step": 676000 + }, + { + "epoch": 18.2, + "learning_rate": 9.448891885059409e-05, + "loss": 2.7478, + "step": 677000 + }, + { + "epoch": 18.23, + "learning_rate": 9.308446391955282e-05, + "loss": 2.755, + "step": 678000 + }, + { + "epoch": 18.25, + "learning_rate": 9.168000898851156e-05, + "loss": 2.7538, + "step": 679000 + }, + { + "epoch": 18.28, + "learning_rate": 9.027695851240134e-05, + "loss": 2.755, + "step": 680000 + }, + { + "epoch": 18.31, + "learning_rate": 8.887250358136008e-05, + "loss": 2.7577, + "step": 681000 + }, + { + "epoch": 18.33, + "learning_rate": 8.746945310524985e-05, + "loss": 2.7505, + "step": 682000 + }, + { + "epoch": 18.36, + "learning_rate": 8.60649981742086e-05, + "loss": 2.7591, + "step": 683000 + }, + { + "epoch": 18.39, + "learning_rate": 8.466194769809838e-05, + "loss": 2.7601, + "step": 684000 + }, + { + "epoch": 18.41, + "learning_rate": 8.32574927670571e-05, + "loss": 2.7567, + "step": 685000 + }, + { + "epoch": 18.44, + "learning_rate": 8.185444229094687e-05, + "loss": 2.7547, + "step": 686000 + }, + { + "epoch": 18.47, + "learning_rate": 8.044998735990562e-05, + "loss": 2.7584, + "step": 687000 + }, + { + "epoch": 18.49, + "learning_rate": 7.904553242886437e-05, + "loss": 2.7554, + "step": 688000 + }, + { + "epoch": 18.52, + "learning_rate": 7.764388640768517e-05, + "loss": 2.756, + "step": 689000 + }, + { + "epoch": 18.55, + "learning_rate": 7.623943147664391e-05, + "loss": 2.7581, + "step": 690000 + }, + { + "epoch": 18.57, + "learning_rate": 7.483497654560266e-05, + "loss": 2.7593, + "step": 691000 + }, + { + "epoch": 18.6, + "learning_rate": 7.343192606949243e-05, + "loss": 2.7549, + "step": 692000 + }, + { + "epoch": 18.63, + "learning_rate": 7.202747113845116e-05, + "loss": 2.761, + "step": 693000 + }, + { + "epoch": 18.66, + "learning_rate": 7.06230162074099e-05, + "loss": 2.7556, + "step": 694000 + }, + { + "epoch": 18.68, + "learning_rate": 6.921856127636864e-05, + "loss": 2.7513, + "step": 695000 + }, + { + "epoch": 18.71, + "learning_rate": 6.781410634532737e-05, + "loss": 2.7577, + "step": 696000 + }, + { + "epoch": 18.74, + "learning_rate": 6.641105586921716e-05, + "loss": 2.759, + "step": 697000 + }, + { + "epoch": 18.76, + "learning_rate": 6.50066009381759e-05, + "loss": 2.7603, + "step": 698000 + }, + { + "epoch": 18.79, + "learning_rate": 6.360355046206568e-05, + "loss": 2.7598, + "step": 699000 + }, + { + "epoch": 18.82, + "learning_rate": 6.219909553102441e-05, + "loss": 2.7545, + "step": 700000 + }, + { + "epoch": 18.84, + "learning_rate": 6.079464059998314e-05, + "loss": 2.7603, + "step": 701000 + }, + { + "epoch": 18.87, + "learning_rate": 5.939018566894189e-05, + "loss": 2.7558, + "step": 702000 + }, + { + "epoch": 18.9, + "learning_rate": 5.798713519283167e-05, + "loss": 2.7559, + "step": 703000 + }, + { + "epoch": 18.92, + "learning_rate": 5.65826802617904e-05, + "loss": 2.7584, + "step": 704000 + }, + { + "epoch": 18.95, + "learning_rate": 5.5178225330749135e-05, + "loss": 2.7614, + "step": 705000 + }, + { + "epoch": 18.98, + "learning_rate": 5.377517485463892e-05, + "loss": 2.7555, + "step": 706000 + }, + { + "epoch": 19.0, + "eval_accuracy": 0.41307418524410433, + "eval_loss": 3.4066617488861084, + "eval_runtime": 148.4411, + "eval_samples_per_second": 390.189, + "eval_steps_per_second": 6.097, + "step": 706819 + }, + { + "epoch": 19.0, + "learning_rate": 5.237071992359765e-05, + "loss": 2.7536, + "step": 707000 + }, + { + "epoch": 19.03, + "learning_rate": 5.096766944748743e-05, + "loss": 2.7341, + "step": 708000 + }, + { + "epoch": 19.06, + "learning_rate": 4.956461897137721e-05, + "loss": 2.7413, + "step": 709000 + }, + { + "epoch": 19.09, + "learning_rate": 4.816016404033595e-05, + "loss": 2.7406, + "step": 710000 + }, + { + "epoch": 19.11, + "learning_rate": 4.675570910929468e-05, + "loss": 2.7417, + "step": 711000 + }, + { + "epoch": 19.14, + "learning_rate": 4.535265863318446e-05, + "loss": 2.7401, + "step": 712000 + }, + { + "epoch": 19.17, + "learning_rate": 4.39482037021432e-05, + "loss": 2.741, + "step": 713000 + }, + { + "epoch": 19.19, + "learning_rate": 4.254374877110194e-05, + "loss": 2.7379, + "step": 714000 + }, + { + "epoch": 19.22, + "learning_rate": 4.1139293840060675e-05, + "loss": 2.7369, + "step": 715000 + }, + { + "epoch": 19.25, + "learning_rate": 3.973483890901941e-05, + "loss": 2.7397, + "step": 716000 + }, + { + "epoch": 19.27, + "learning_rate": 3.8330383977978144e-05, + "loss": 2.7431, + "step": 717000 + }, + { + "epoch": 19.3, + "learning_rate": 3.692873795679897e-05, + "loss": 2.7404, + "step": 718000 + }, + { + "epoch": 19.33, + "learning_rate": 3.5524283025757704e-05, + "loss": 2.7413, + "step": 719000 + }, + { + "epoch": 19.35, + "learning_rate": 3.411982809471644e-05, + "loss": 2.7379, + "step": 720000 + }, + { + "epoch": 19.38, + "learning_rate": 3.271537316367518e-05, + "loss": 2.7436, + "step": 721000 + }, + { + "epoch": 19.41, + "learning_rate": 3.1312322687564956e-05, + "loss": 2.7363, + "step": 722000 + }, + { + "epoch": 19.43, + "learning_rate": 2.9907867756523694e-05, + "loss": 2.7387, + "step": 723000 + }, + { + "epoch": 19.46, + "learning_rate": 2.850481728041347e-05, + "loss": 2.7431, + "step": 724000 + }, + { + "epoch": 19.49, + "learning_rate": 2.710036234937221e-05, + "loss": 2.7458, + "step": 725000 + }, + { + "epoch": 19.52, + "learning_rate": 2.569731187326199e-05, + "loss": 2.7419, + "step": 726000 + }, + { + "epoch": 19.54, + "learning_rate": 2.4292856942220723e-05, + "loss": 2.7415, + "step": 727000 + }, + { + "epoch": 19.57, + "learning_rate": 2.2888402011179464e-05, + "loss": 2.7392, + "step": 728000 + }, + { + "epoch": 19.6, + "learning_rate": 2.1485351535069238e-05, + "loss": 2.7467, + "step": 729000 + }, + { + "epoch": 19.62, + "learning_rate": 2.008089660402798e-05, + "loss": 2.7393, + "step": 730000 + }, + { + "epoch": 19.65, + "learning_rate": 1.8677846127917755e-05, + "loss": 2.7378, + "step": 731000 + }, + { + "epoch": 19.68, + "learning_rate": 1.7273391196876493e-05, + "loss": 2.747, + "step": 732000 + }, + { + "epoch": 19.7, + "learning_rate": 1.587034072076627e-05, + "loss": 2.7419, + "step": 733000 + }, + { + "epoch": 19.73, + "learning_rate": 1.4465885789725008e-05, + "loss": 2.7466, + "step": 734000 + }, + { + "epoch": 19.76, + "learning_rate": 1.3062835313614786e-05, + "loss": 2.7354, + "step": 735000 + }, + { + "epoch": 19.78, + "learning_rate": 1.1658380382573524e-05, + "loss": 2.7457, + "step": 736000 + }, + { + "epoch": 19.81, + "learning_rate": 1.025392545153226e-05, + "loss": 2.7398, + "step": 737000 + }, + { + "epoch": 19.84, + "learning_rate": 8.850874975422038e-06, + "loss": 2.7415, + "step": 738000 + }, + { + "epoch": 19.87, + "learning_rate": 7.446420044380776e-06, + "loss": 2.7381, + "step": 739000 + }, + { + "epoch": 19.89, + "learning_rate": 6.043369568270554e-06, + "loss": 2.7407, + "step": 740000 + }, + { + "epoch": 19.92, + "learning_rate": 4.638914637229291e-06, + "loss": 2.7397, + "step": 741000 + }, + { + "epoch": 19.95, + "learning_rate": 3.2344597061880285e-06, + "loss": 2.7412, + "step": 742000 + }, + { + "epoch": 19.97, + "learning_rate": 1.831409230077807e-06, + "loss": 2.7409, + "step": 743000 + }, + { + "epoch": 20.0, + "learning_rate": 4.2695429903654394e-07, + "loss": 2.7434, + "step": 744000 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.41252109443859236, + "eval_loss": 3.417576313018799, + "eval_runtime": 148.878, + "eval_samples_per_second": 389.043, + "eval_steps_per_second": 6.079, + "step": 744020 + }, + { + "epoch": 20.0, + "step": 744020, + "total_flos": 1.56740238729216e+18, + "train_loss": 2.994195082282441, + "train_runtime": 55239.7487, + "train_samples_per_second": 215.503, + "train_steps_per_second": 13.469 + } + ], + "logging_steps": 1000, + "max_steps": 744020, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 5000, + "total_flos": 1.56740238729216e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}