diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "global_step": 676224, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.00019998669080068145, + "loss": 10.1742, + "step": 500 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019997338160136289, + "loss": 5.5137, + "step": 1000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001999600724020443, + "loss": 4.8097, + "step": 1500 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019994676320272574, + "loss": 4.638, + "step": 2000 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019993345400340717, + "loss": 4.2904, + "step": 2500 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001999201448040886, + "loss": 4.1718, + "step": 3000 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019990683560477005, + "loss": 4.1966, + "step": 3500 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019989352640545146, + "loss": 4.0892, + "step": 4000 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019988021720613287, + "loss": 4.0691, + "step": 4500 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001998669080068143, + "loss": 3.9735, + "step": 5000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019985359880749575, + "loss": 3.8196, + "step": 5500 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019984028960817719, + "loss": 3.8869, + "step": 6000 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001998269804088586, + "loss": 3.6496, + "step": 6500 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019981367120954004, + "loss": 3.787, + "step": 7000 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019980036201022147, + "loss": 3.679, + "step": 7500 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001997870528109029, + "loss": 3.5967, + "step": 8000 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019977374361158435, + "loss": 3.459, + "step": 8500 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019976043441226576, + "loss": 3.563, + "step": 9000 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001997471252129472, + "loss": 3.4225, + "step": 9500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019973381601362864, + "loss": 3.3701, + "step": 10000 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019972050681431008, + "loss": 3.4006, + "step": 10500 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019970719761499149, + "loss": 3.4251, + "step": 11000 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019969388841567292, + "loss": 3.4543, + "step": 11500 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019968057921635434, + "loss": 3.2496, + "step": 12000 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019966727001703577, + "loss": 3.2102, + "step": 12500 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001996539608177172, + "loss": 3.3174, + "step": 13000 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019964065161839865, + "loss": 3.2625, + "step": 13500 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001996273424190801, + "loss": 3.1891, + "step": 14000 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001996140332197615, + "loss": 3.1652, + "step": 14500 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019960072402044294, + "loss": 3.1784, + "step": 15000 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019958741482112438, + "loss": 3.1258, + "step": 15500 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019957410562180581, + "loss": 3.0577, + "step": 16000 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019956079642248725, + "loss": 3.121, + "step": 16500 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019954748722316866, + "loss": 3.0403, + "step": 17000 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001995341780238501, + "loss": 3.0235, + "step": 17500 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001995208688245315, + "loss": 2.9968, + "step": 18000 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019950755962521295, + "loss": 3.0215, + "step": 18500 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001994942504258944, + "loss": 2.9497, + "step": 19000 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001994809412265758, + "loss": 2.9783, + "step": 19500 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019946763202725724, + "loss": 2.8336, + "step": 20000 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019945432282793868, + "loss": 2.8082, + "step": 20500 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019944101362862011, + "loss": 2.9229, + "step": 21000 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019942770442930155, + "loss": 2.8394, + "step": 21500 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019941439522998296, + "loss": 2.8064, + "step": 22000 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001994010860306644, + "loss": 2.8957, + "step": 22500 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019938777683134584, + "loss": 2.8115, + "step": 23000 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019937446763202728, + "loss": 2.8356, + "step": 23500 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019936115843270872, + "loss": 2.7868, + "step": 24000 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019934784923339013, + "loss": 2.8692, + "step": 24500 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019933454003407156, + "loss": 2.7465, + "step": 25000 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019932123083475298, + "loss": 2.6548, + "step": 25500 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019930792163543441, + "loss": 2.7644, + "step": 26000 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019929461243611585, + "loss": 2.7362, + "step": 26500 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001992813032367973, + "loss": 2.773, + "step": 27000 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001992679940374787, + "loss": 2.6761, + "step": 27500 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019925468483816014, + "loss": 2.6885, + "step": 28000 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019924137563884158, + "loss": 2.79, + "step": 28500 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019922806643952302, + "loss": 2.5552, + "step": 29000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019921475724020445, + "loss": 2.6489, + "step": 29500 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019920144804088586, + "loss": 2.6368, + "step": 30000 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001991881388415673, + "loss": 2.6442, + "step": 30500 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019917482964224874, + "loss": 2.594, + "step": 31000 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019916152044293018, + "loss": 2.6498, + "step": 31500 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019914821124361162, + "loss": 2.5306, + "step": 32000 + }, + { + "epoch": 0.43, + "learning_rate": 0.000199134902044293, + "loss": 2.6558, + "step": 32500 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019912159284497444, + "loss": 2.5459, + "step": 33000 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019910828364565588, + "loss": 2.6651, + "step": 33500 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019909497444633732, + "loss": 2.559, + "step": 34000 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019908166524701875, + "loss": 2.6479, + "step": 34500 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019906835604770017, + "loss": 2.6017, + "step": 35000 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001990550468483816, + "loss": 2.4582, + "step": 35500 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019904173764906304, + "loss": 2.4567, + "step": 36000 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019902842844974448, + "loss": 2.4983, + "step": 36500 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019901511925042592, + "loss": 2.5343, + "step": 37000 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019900181005110733, + "loss": 2.4445, + "step": 37500 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019898850085178877, + "loss": 2.5262, + "step": 38000 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001989751916524702, + "loss": 2.4907, + "step": 38500 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019896188245315164, + "loss": 2.6002, + "step": 39000 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019894857325383305, + "loss": 2.4953, + "step": 39500 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001989352640545145, + "loss": 2.4148, + "step": 40000 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001989219548551959, + "loss": 2.4096, + "step": 40500 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019890864565587734, + "loss": 2.3756, + "step": 41000 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019889533645655878, + "loss": 2.4706, + "step": 41500 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019888202725724022, + "loss": 2.4323, + "step": 42000 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019886871805792166, + "loss": 2.39, + "step": 42500 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019885540885860307, + "loss": 2.3738, + "step": 43000 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001988420996592845, + "loss": 2.3418, + "step": 43500 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019882879045996594, + "loss": 2.451, + "step": 44000 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019881548126064738, + "loss": 2.4559, + "step": 44500 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019880217206132882, + "loss": 2.3337, + "step": 45000 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019878886286201023, + "loss": 2.3844, + "step": 45500 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019877555366269167, + "loss": 2.365, + "step": 46000 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001987622444633731, + "loss": 2.4264, + "step": 46500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019874893526405452, + "loss": 2.4404, + "step": 47000 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019873562606473596, + "loss": 2.3269, + "step": 47500 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019872231686541737, + "loss": 2.2934, + "step": 48000 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001987090076660988, + "loss": 2.2699, + "step": 48500 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019869569846678024, + "loss": 2.3035, + "step": 49000 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019868238926746168, + "loss": 2.3646, + "step": 49500 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019866908006814312, + "loss": 2.3598, + "step": 50000 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019865577086882453, + "loss": 2.2457, + "step": 50500 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019864246166950597, + "loss": 2.3047, + "step": 51000 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001986291524701874, + "loss": 2.3037, + "step": 51500 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019861584327086885, + "loss": 2.1934, + "step": 52000 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019860253407155028, + "loss": 2.2807, + "step": 52500 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001985892248722317, + "loss": 2.4267, + "step": 53000 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019857591567291313, + "loss": 2.2867, + "step": 53500 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019856260647359454, + "loss": 2.2614, + "step": 54000 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019854929727427598, + "loss": 2.2015, + "step": 54500 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019853598807495742, + "loss": 2.1686, + "step": 55000 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019852267887563886, + "loss": 2.2909, + "step": 55500 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019850936967632027, + "loss": 2.2395, + "step": 56000 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001984960604770017, + "loss": 2.2176, + "step": 56500 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019848275127768315, + "loss": 2.228, + "step": 57000 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019846944207836458, + "loss": 2.2459, + "step": 57500 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019845613287904602, + "loss": 2.277, + "step": 58000 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019844282367972743, + "loss": 2.3463, + "step": 58500 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019842951448040887, + "loss": 2.2644, + "step": 59000 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001984162052810903, + "loss": 2.2869, + "step": 59500 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019840289608177175, + "loss": 2.1746, + "step": 60000 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019838958688245316, + "loss": 2.2261, + "step": 60500 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019837627768313457, + "loss": 2.2695, + "step": 61000 + }, + { + "epoch": 0.82, + "learning_rate": 0.000198362968483816, + "loss": 2.2722, + "step": 61500 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019834965928449745, + "loss": 2.082, + "step": 62000 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019833635008517888, + "loss": 2.2108, + "step": 62500 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019832304088586032, + "loss": 2.2651, + "step": 63000 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019830973168654173, + "loss": 2.1761, + "step": 63500 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019829642248722317, + "loss": 2.2045, + "step": 64000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001982831132879046, + "loss": 2.2311, + "step": 64500 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019826980408858605, + "loss": 2.2373, + "step": 65000 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019825649488926749, + "loss": 2.165, + "step": 65500 + }, + { + "epoch": 0.88, + "learning_rate": 0.0001982431856899489, + "loss": 2.2365, + "step": 66000 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019822987649063033, + "loss": 2.3064, + "step": 66500 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019821656729131177, + "loss": 2.0983, + "step": 67000 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001982032580919932, + "loss": 2.0742, + "step": 67500 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019818994889267465, + "loss": 2.1454, + "step": 68000 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019817663969335606, + "loss": 2.2284, + "step": 68500 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019816333049403747, + "loss": 2.1694, + "step": 69000 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001981500212947189, + "loss": 2.1209, + "step": 69500 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019813671209540035, + "loss": 2.0298, + "step": 70000 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019812340289608179, + "loss": 2.1896, + "step": 70500 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001981100936967632, + "loss": 2.1888, + "step": 71000 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019809678449744464, + "loss": 2.0871, + "step": 71500 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019808347529812607, + "loss": 2.0753, + "step": 72000 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001980701660988075, + "loss": 2.1047, + "step": 72500 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019805685689948895, + "loss": 2.08, + "step": 73000 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019804354770017036, + "loss": 2.1196, + "step": 73500 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001980302385008518, + "loss": 2.0891, + "step": 74000 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019801692930153324, + "loss": 2.0471, + "step": 74500 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019800362010221468, + "loss": 2.1587, + "step": 75000 + }, + { + "epoch": 1.0, + "eval_loss": 1.852659821510315, + "eval_runtime": 152.8059, + "eval_samples_per_second": 61.47, + "eval_steps_per_second": 61.47, + "step": 75136 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019799031090289609, + "loss": 2.0473, + "step": 75500 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019797700170357752, + "loss": 2.03, + "step": 76000 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019796369250425894, + "loss": 2.01, + "step": 76500 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019795038330494037, + "loss": 2.1045, + "step": 77000 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001979370741056218, + "loss": 1.9704, + "step": 77500 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019792376490630325, + "loss": 1.9728, + "step": 78000 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001979104557069847, + "loss": 2.0347, + "step": 78500 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001978971465076661, + "loss": 2.029, + "step": 79000 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019788383730834754, + "loss": 2.0278, + "step": 79500 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019787052810902898, + "loss": 1.9847, + "step": 80000 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001978572189097104, + "loss": 2.0862, + "step": 80500 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019784390971039185, + "loss": 2.0602, + "step": 81000 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019783060051107326, + "loss": 1.9392, + "step": 81500 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001978172913117547, + "loss": 2.0476, + "step": 82000 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001978039821124361, + "loss": 2.022, + "step": 82500 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019779067291311755, + "loss": 1.9251, + "step": 83000 + }, + { + "epoch": 1.11, + "learning_rate": 0.000197777363713799, + "loss": 1.9334, + "step": 83500 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001977640545144804, + "loss": 1.9369, + "step": 84000 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019775074531516184, + "loss": 2.0145, + "step": 84500 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019773743611584328, + "loss": 2.0389, + "step": 85000 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019772412691652471, + "loss": 2.0435, + "step": 85500 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019771081771720615, + "loss": 1.9536, + "step": 86000 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019769750851788756, + "loss": 2.0962, + "step": 86500 + }, + { + "epoch": 1.16, + "learning_rate": 0.000197684199318569, + "loss": 1.9748, + "step": 87000 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019767089011925044, + "loss": 1.9873, + "step": 87500 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019765758091993188, + "loss": 1.9304, + "step": 88000 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019764427172061332, + "loss": 1.9514, + "step": 88500 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019763096252129473, + "loss": 1.9115, + "step": 89000 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019761765332197614, + "loss": 2.015, + "step": 89500 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019760434412265758, + "loss": 1.8767, + "step": 90000 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019759103492333901, + "loss": 1.9433, + "step": 90500 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019757772572402045, + "loss": 1.8935, + "step": 91000 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001975644165247019, + "loss": 1.9653, + "step": 91500 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001975511073253833, + "loss": 1.927, + "step": 92000 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019753779812606474, + "loss": 1.9577, + "step": 92500 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019752448892674618, + "loss": 1.9963, + "step": 93000 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019751117972742762, + "loss": 1.8967, + "step": 93500 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019749787052810905, + "loss": 1.9731, + "step": 94000 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019748456132879046, + "loss": 1.9941, + "step": 94500 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001974712521294719, + "loss": 1.9225, + "step": 95000 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019745794293015334, + "loss": 1.9181, + "step": 95500 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019744463373083478, + "loss": 1.9383, + "step": 96000 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019743132453151622, + "loss": 1.981, + "step": 96500 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001974180153321976, + "loss": 1.9374, + "step": 97000 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019740470613287904, + "loss": 1.9689, + "step": 97500 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019739139693356048, + "loss": 1.9001, + "step": 98000 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019737808773424192, + "loss": 1.8901, + "step": 98500 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019736477853492335, + "loss": 1.9768, + "step": 99000 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019735146933560476, + "loss": 1.8889, + "step": 99500 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001973381601362862, + "loss": 1.7857, + "step": 100000 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019732485093696764, + "loss": 1.89, + "step": 100500 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019731154173764908, + "loss": 1.9971, + "step": 101000 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019729823253833052, + "loss": 1.9359, + "step": 101500 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019728492333901193, + "loss": 1.787, + "step": 102000 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019727161413969337, + "loss": 2.013, + "step": 102500 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001972583049403748, + "loss": 1.9847, + "step": 103000 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019724499574105624, + "loss": 1.9503, + "step": 103500 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019723168654173765, + "loss": 1.9554, + "step": 104000 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001972183773424191, + "loss": 1.9334, + "step": 104500 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001972050681431005, + "loss": 1.8504, + "step": 105000 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019719175894378194, + "loss": 1.9733, + "step": 105500 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019717844974446338, + "loss": 1.8735, + "step": 106000 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019716514054514482, + "loss": 1.9075, + "step": 106500 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019715183134582626, + "loss": 1.8733, + "step": 107000 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019713852214650767, + "loss": 2.0068, + "step": 107500 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001971252129471891, + "loss": 1.8536, + "step": 108000 + }, + { + "epoch": 1.44, + "learning_rate": 0.00019711190374787054, + "loss": 1.8932, + "step": 108500 + }, + { + "epoch": 1.45, + "learning_rate": 0.00019709859454855198, + "loss": 1.9241, + "step": 109000 + }, + { + "epoch": 1.46, + "learning_rate": 0.00019708528534923342, + "loss": 1.9382, + "step": 109500 + }, + { + "epoch": 1.46, + "learning_rate": 0.00019707197614991483, + "loss": 1.8566, + "step": 110000 + }, + { + "epoch": 1.47, + "learning_rate": 0.00019705866695059627, + "loss": 1.9342, + "step": 110500 + }, + { + "epoch": 1.48, + "learning_rate": 0.00019704535775127768, + "loss": 1.8722, + "step": 111000 + }, + { + "epoch": 1.48, + "learning_rate": 0.00019703204855195912, + "loss": 1.8558, + "step": 111500 + }, + { + "epoch": 1.49, + "learning_rate": 0.00019701873935264056, + "loss": 1.8737, + "step": 112000 + }, + { + "epoch": 1.5, + "learning_rate": 0.00019700543015332197, + "loss": 1.9147, + "step": 112500 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001969921209540034, + "loss": 1.958, + "step": 113000 + }, + { + "epoch": 1.51, + "learning_rate": 0.00019697881175468484, + "loss": 1.805, + "step": 113500 + }, + { + "epoch": 1.52, + "learning_rate": 0.00019696550255536628, + "loss": 1.7978, + "step": 114000 + }, + { + "epoch": 1.52, + "learning_rate": 0.00019695219335604772, + "loss": 1.8977, + "step": 114500 + }, + { + "epoch": 1.53, + "learning_rate": 0.00019693888415672913, + "loss": 1.9328, + "step": 115000 + }, + { + "epoch": 1.54, + "learning_rate": 0.00019692557495741057, + "loss": 1.8249, + "step": 115500 + }, + { + "epoch": 1.54, + "learning_rate": 0.000196912265758092, + "loss": 1.9425, + "step": 116000 + }, + { + "epoch": 1.55, + "learning_rate": 0.00019689895655877345, + "loss": 1.7741, + "step": 116500 + }, + { + "epoch": 1.56, + "learning_rate": 0.00019688564735945488, + "loss": 1.925, + "step": 117000 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001968723381601363, + "loss": 1.8892, + "step": 117500 + }, + { + "epoch": 1.57, + "learning_rate": 0.00019685902896081773, + "loss": 1.832, + "step": 118000 + }, + { + "epoch": 1.58, + "learning_rate": 0.00019684571976149914, + "loss": 1.774, + "step": 118500 + }, + { + "epoch": 1.58, + "learning_rate": 0.00019683241056218058, + "loss": 1.8796, + "step": 119000 + }, + { + "epoch": 1.59, + "learning_rate": 0.00019681910136286202, + "loss": 1.8493, + "step": 119500 + }, + { + "epoch": 1.6, + "learning_rate": 0.00019680579216354346, + "loss": 1.8939, + "step": 120000 + }, + { + "epoch": 1.6, + "learning_rate": 0.00019679248296422487, + "loss": 1.8701, + "step": 120500 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001967791737649063, + "loss": 1.8433, + "step": 121000 + }, + { + "epoch": 1.62, + "learning_rate": 0.00019676586456558775, + "loss": 1.9049, + "step": 121500 + }, + { + "epoch": 1.62, + "learning_rate": 0.00019675255536626918, + "loss": 1.7999, + "step": 122000 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001967392461669506, + "loss": 1.918, + "step": 122500 + }, + { + "epoch": 1.64, + "learning_rate": 0.00019672593696763203, + "loss": 1.8511, + "step": 123000 + }, + { + "epoch": 1.64, + "learning_rate": 0.00019671262776831347, + "loss": 1.8173, + "step": 123500 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001966993185689949, + "loss": 1.8211, + "step": 124000 + }, + { + "epoch": 1.66, + "learning_rate": 0.00019668600936967635, + "loss": 1.7833, + "step": 124500 + }, + { + "epoch": 1.66, + "learning_rate": 0.00019667270017035776, + "loss": 1.7822, + "step": 125000 + }, + { + "epoch": 1.67, + "learning_rate": 0.00019665939097103917, + "loss": 2.0203, + "step": 125500 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001966460817717206, + "loss": 1.8281, + "step": 126000 + }, + { + "epoch": 1.68, + "learning_rate": 0.00019663277257240205, + "loss": 1.8007, + "step": 126500 + }, + { + "epoch": 1.69, + "learning_rate": 0.00019661946337308348, + "loss": 1.8349, + "step": 127000 + }, + { + "epoch": 1.7, + "learning_rate": 0.00019660615417376492, + "loss": 1.8092, + "step": 127500 + }, + { + "epoch": 1.7, + "learning_rate": 0.00019659284497444633, + "loss": 1.8903, + "step": 128000 + }, + { + "epoch": 1.71, + "learning_rate": 0.00019657953577512777, + "loss": 1.9048, + "step": 128500 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001965662265758092, + "loss": 1.796, + "step": 129000 + }, + { + "epoch": 1.72, + "learning_rate": 0.00019655291737649065, + "loss": 1.8246, + "step": 129500 + }, + { + "epoch": 1.73, + "learning_rate": 0.00019653960817717209, + "loss": 1.8512, + "step": 130000 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001965262989778535, + "loss": 1.7868, + "step": 130500 + }, + { + "epoch": 1.74, + "learning_rate": 0.00019651298977853493, + "loss": 1.8051, + "step": 131000 + }, + { + "epoch": 1.75, + "learning_rate": 0.00019649968057921637, + "loss": 1.8618, + "step": 131500 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001964863713798978, + "loss": 1.8571, + "step": 132000 + }, + { + "epoch": 1.76, + "learning_rate": 0.00019647306218057922, + "loss": 1.8886, + "step": 132500 + }, + { + "epoch": 1.77, + "learning_rate": 0.00019645975298126066, + "loss": 1.9144, + "step": 133000 + }, + { + "epoch": 1.78, + "learning_rate": 0.00019644644378194207, + "loss": 1.804, + "step": 133500 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001964331345826235, + "loss": 1.7999, + "step": 134000 + }, + { + "epoch": 1.79, + "learning_rate": 0.00019641982538330495, + "loss": 1.8522, + "step": 134500 + }, + { + "epoch": 1.8, + "learning_rate": 0.00019640651618398639, + "loss": 1.6701, + "step": 135000 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001963932069846678, + "loss": 1.8355, + "step": 135500 + }, + { + "epoch": 1.81, + "learning_rate": 0.00019637989778534923, + "loss": 1.7975, + "step": 136000 + }, + { + "epoch": 1.82, + "learning_rate": 0.00019636658858603067, + "loss": 1.8187, + "step": 136500 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001963532793867121, + "loss": 1.8684, + "step": 137000 + }, + { + "epoch": 1.83, + "learning_rate": 0.00019633997018739355, + "loss": 1.829, + "step": 137500 + }, + { + "epoch": 1.84, + "learning_rate": 0.00019632666098807496, + "loss": 1.8728, + "step": 138000 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001963133517887564, + "loss": 1.7919, + "step": 138500 + }, + { + "epoch": 1.85, + "learning_rate": 0.00019630004258943784, + "loss": 1.8923, + "step": 139000 + }, + { + "epoch": 1.86, + "learning_rate": 0.00019628673339011927, + "loss": 1.8129, + "step": 139500 + }, + { + "epoch": 1.86, + "learning_rate": 0.00019627342419080069, + "loss": 1.7684, + "step": 140000 + }, + { + "epoch": 1.87, + "learning_rate": 0.00019626011499148212, + "loss": 1.7987, + "step": 140500 + }, + { + "epoch": 1.88, + "learning_rate": 0.00019624680579216354, + "loss": 1.806, + "step": 141000 + }, + { + "epoch": 1.88, + "learning_rate": 0.00019623349659284497, + "loss": 1.7626, + "step": 141500 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001962201873935264, + "loss": 1.7647, + "step": 142000 + }, + { + "epoch": 1.9, + "learning_rate": 0.00019620687819420785, + "loss": 1.7437, + "step": 142500 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001961935689948893, + "loss": 1.7604, + "step": 143000 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001961802597955707, + "loss": 1.7737, + "step": 143500 + }, + { + "epoch": 1.92, + "learning_rate": 0.00019616695059625214, + "loss": 1.8022, + "step": 144000 + }, + { + "epoch": 1.92, + "learning_rate": 0.00019615364139693358, + "loss": 1.7409, + "step": 144500 + }, + { + "epoch": 1.93, + "learning_rate": 0.000196140332197615, + "loss": 1.7658, + "step": 145000 + }, + { + "epoch": 1.94, + "learning_rate": 0.00019612702299829645, + "loss": 1.68, + "step": 145500 + }, + { + "epoch": 1.94, + "learning_rate": 0.00019611371379897786, + "loss": 1.8822, + "step": 146000 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001961004045996593, + "loss": 1.8034, + "step": 146500 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001960870954003407, + "loss": 1.7649, + "step": 147000 + }, + { + "epoch": 1.96, + "learning_rate": 0.00019607378620102215, + "loss": 1.7675, + "step": 147500 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001960604770017036, + "loss": 1.7329, + "step": 148000 + }, + { + "epoch": 1.98, + "learning_rate": 0.000196047167802385, + "loss": 1.7883, + "step": 148500 + }, + { + "epoch": 1.98, + "learning_rate": 0.00019603385860306644, + "loss": 1.7564, + "step": 149000 + }, + { + "epoch": 1.99, + "learning_rate": 0.00019602054940374788, + "loss": 1.7273, + "step": 149500 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001960072402044293, + "loss": 1.8357, + "step": 150000 + }, + { + "epoch": 2.0, + "eval_loss": 1.5852015018463135, + "eval_runtime": 154.5122, + "eval_samples_per_second": 60.791, + "eval_steps_per_second": 60.791, + "step": 150272 + }, + { + "epoch": 2.0, + "learning_rate": 0.00019599393100511075, + "loss": 1.752, + "step": 150500 + }, + { + "epoch": 2.01, + "learning_rate": 0.00019598062180579216, + "loss": 1.7735, + "step": 151000 + }, + { + "epoch": 2.02, + "learning_rate": 0.0001959673126064736, + "loss": 1.7071, + "step": 151500 + }, + { + "epoch": 2.02, + "learning_rate": 0.00019595400340715504, + "loss": 1.6139, + "step": 152000 + }, + { + "epoch": 2.03, + "learning_rate": 0.00019594069420783648, + "loss": 1.7057, + "step": 152500 + }, + { + "epoch": 2.04, + "learning_rate": 0.00019592738500851792, + "loss": 1.645, + "step": 153000 + }, + { + "epoch": 2.04, + "learning_rate": 0.00019591407580919933, + "loss": 1.5747, + "step": 153500 + }, + { + "epoch": 2.05, + "learning_rate": 0.00019590076660988074, + "loss": 1.6816, + "step": 154000 + }, + { + "epoch": 2.06, + "learning_rate": 0.00019588745741056218, + "loss": 1.6257, + "step": 154500 + }, + { + "epoch": 2.06, + "learning_rate": 0.00019587414821124361, + "loss": 1.694, + "step": 155000 + }, + { + "epoch": 2.07, + "learning_rate": 0.00019586083901192505, + "loss": 1.543, + "step": 155500 + }, + { + "epoch": 2.08, + "learning_rate": 0.0001958475298126065, + "loss": 1.6464, + "step": 156000 + }, + { + "epoch": 2.08, + "learning_rate": 0.0001958342206132879, + "loss": 1.6821, + "step": 156500 + }, + { + "epoch": 2.09, + "learning_rate": 0.00019582091141396934, + "loss": 1.6968, + "step": 157000 + }, + { + "epoch": 2.1, + "learning_rate": 0.00019580760221465078, + "loss": 1.6546, + "step": 157500 + }, + { + "epoch": 2.1, + "learning_rate": 0.00019579429301533222, + "loss": 1.6658, + "step": 158000 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019578098381601365, + "loss": 1.6306, + "step": 158500 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019576767461669506, + "loss": 1.6541, + "step": 159000 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001957543654173765, + "loss": 1.6013, + "step": 159500 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019574105621805794, + "loss": 1.7092, + "step": 160000 + }, + { + "epoch": 2.14, + "learning_rate": 0.00019572774701873938, + "loss": 1.6469, + "step": 160500 + }, + { + "epoch": 2.14, + "learning_rate": 0.00019571443781942082, + "loss": 1.7672, + "step": 161000 + }, + { + "epoch": 2.15, + "learning_rate": 0.0001957011286201022, + "loss": 1.5536, + "step": 161500 + }, + { + "epoch": 2.16, + "learning_rate": 0.00019568781942078364, + "loss": 1.6306, + "step": 162000 + }, + { + "epoch": 2.16, + "learning_rate": 0.00019567451022146508, + "loss": 1.6919, + "step": 162500 + }, + { + "epoch": 2.17, + "learning_rate": 0.00019566120102214652, + "loss": 1.5898, + "step": 163000 + }, + { + "epoch": 2.18, + "learning_rate": 0.00019564789182282795, + "loss": 1.6933, + "step": 163500 + }, + { + "epoch": 2.18, + "learning_rate": 0.00019563458262350936, + "loss": 1.6183, + "step": 164000 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001956212734241908, + "loss": 1.6215, + "step": 164500 + }, + { + "epoch": 2.2, + "learning_rate": 0.00019560796422487224, + "loss": 1.6982, + "step": 165000 + }, + { + "epoch": 2.2, + "learning_rate": 0.00019559465502555368, + "loss": 1.6352, + "step": 165500 + }, + { + "epoch": 2.21, + "learning_rate": 0.00019558134582623512, + "loss": 1.659, + "step": 166000 + }, + { + "epoch": 2.22, + "learning_rate": 0.00019556803662691653, + "loss": 1.7314, + "step": 166500 + }, + { + "epoch": 2.22, + "learning_rate": 0.00019555472742759797, + "loss": 1.679, + "step": 167000 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001955414182282794, + "loss": 1.6426, + "step": 167500 + }, + { + "epoch": 2.24, + "learning_rate": 0.00019552810902896084, + "loss": 1.7539, + "step": 168000 + }, + { + "epoch": 2.24, + "learning_rate": 0.00019551479982964225, + "loss": 1.6371, + "step": 168500 + }, + { + "epoch": 2.25, + "learning_rate": 0.0001955014906303237, + "loss": 1.5544, + "step": 169000 + }, + { + "epoch": 2.26, + "learning_rate": 0.0001954881814310051, + "loss": 1.7934, + "step": 169500 + }, + { + "epoch": 2.26, + "learning_rate": 0.00019547487223168654, + "loss": 1.6451, + "step": 170000 + }, + { + "epoch": 2.27, + "learning_rate": 0.00019546156303236798, + "loss": 1.6743, + "step": 170500 + }, + { + "epoch": 2.28, + "learning_rate": 0.00019544825383304942, + "loss": 1.7578, + "step": 171000 + }, + { + "epoch": 2.28, + "learning_rate": 0.00019543494463373086, + "loss": 1.6256, + "step": 171500 + }, + { + "epoch": 2.29, + "learning_rate": 0.00019542163543441227, + "loss": 1.6422, + "step": 172000 + }, + { + "epoch": 2.3, + "learning_rate": 0.0001954083262350937, + "loss": 1.6602, + "step": 172500 + }, + { + "epoch": 2.3, + "learning_rate": 0.00019539501703577514, + "loss": 1.6107, + "step": 173000 + }, + { + "epoch": 2.31, + "learning_rate": 0.00019538170783645658, + "loss": 1.6988, + "step": 173500 + }, + { + "epoch": 2.32, + "learning_rate": 0.00019536839863713802, + "loss": 1.6518, + "step": 174000 + }, + { + "epoch": 2.32, + "learning_rate": 0.00019535508943781943, + "loss": 1.6225, + "step": 174500 + }, + { + "epoch": 2.33, + "learning_rate": 0.00019534178023850087, + "loss": 1.6268, + "step": 175000 + }, + { + "epoch": 2.34, + "learning_rate": 0.00019532847103918228, + "loss": 1.6202, + "step": 175500 + }, + { + "epoch": 2.34, + "learning_rate": 0.00019531516183986372, + "loss": 1.5834, + "step": 176000 + }, + { + "epoch": 2.35, + "learning_rate": 0.00019530185264054516, + "loss": 1.6375, + "step": 176500 + }, + { + "epoch": 2.36, + "learning_rate": 0.00019528854344122657, + "loss": 1.6556, + "step": 177000 + }, + { + "epoch": 2.36, + "learning_rate": 0.000195275234241908, + "loss": 1.634, + "step": 177500 + }, + { + "epoch": 2.37, + "learning_rate": 0.00019526192504258944, + "loss": 1.5574, + "step": 178000 + }, + { + "epoch": 2.38, + "learning_rate": 0.00019524861584327088, + "loss": 1.6763, + "step": 178500 + }, + { + "epoch": 2.38, + "learning_rate": 0.00019523530664395232, + "loss": 1.6812, + "step": 179000 + }, + { + "epoch": 2.39, + "learning_rate": 0.00019522199744463373, + "loss": 1.7062, + "step": 179500 + }, + { + "epoch": 2.4, + "learning_rate": 0.00019520868824531517, + "loss": 1.6767, + "step": 180000 + }, + { + "epoch": 2.4, + "learning_rate": 0.0001951953790459966, + "loss": 1.7315, + "step": 180500 + }, + { + "epoch": 2.41, + "learning_rate": 0.00019518206984667805, + "loss": 1.6142, + "step": 181000 + }, + { + "epoch": 2.42, + "learning_rate": 0.00019516876064735948, + "loss": 1.6534, + "step": 181500 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001951554514480409, + "loss": 1.7295, + "step": 182000 + }, + { + "epoch": 2.43, + "learning_rate": 0.00019514214224872233, + "loss": 1.5371, + "step": 182500 + }, + { + "epoch": 2.44, + "learning_rate": 0.00019512883304940374, + "loss": 1.7184, + "step": 183000 + }, + { + "epoch": 2.44, + "learning_rate": 0.00019511552385008518, + "loss": 1.6156, + "step": 183500 + }, + { + "epoch": 2.45, + "learning_rate": 0.00019510221465076662, + "loss": 1.6871, + "step": 184000 + }, + { + "epoch": 2.46, + "learning_rate": 0.00019508890545144806, + "loss": 1.6362, + "step": 184500 + }, + { + "epoch": 2.46, + "learning_rate": 0.00019507559625212947, + "loss": 1.5297, + "step": 185000 + }, + { + "epoch": 2.47, + "learning_rate": 0.0001950622870528109, + "loss": 1.6025, + "step": 185500 + }, + { + "epoch": 2.48, + "learning_rate": 0.00019504897785349235, + "loss": 1.659, + "step": 186000 + }, + { + "epoch": 2.48, + "learning_rate": 0.00019503566865417378, + "loss": 1.6596, + "step": 186500 + }, + { + "epoch": 2.49, + "learning_rate": 0.0001950223594548552, + "loss": 1.6745, + "step": 187000 + }, + { + "epoch": 2.5, + "learning_rate": 0.00019500905025553663, + "loss": 1.5484, + "step": 187500 + }, + { + "epoch": 2.5, + "learning_rate": 0.00019499574105621807, + "loss": 1.6971, + "step": 188000 + }, + { + "epoch": 2.51, + "learning_rate": 0.0001949824318568995, + "loss": 1.5821, + "step": 188500 + }, + { + "epoch": 2.52, + "learning_rate": 0.00019496912265758095, + "loss": 1.602, + "step": 189000 + }, + { + "epoch": 2.52, + "learning_rate": 0.00019495581345826236, + "loss": 1.6921, + "step": 189500 + }, + { + "epoch": 2.53, + "learning_rate": 0.00019494250425894377, + "loss": 1.6905, + "step": 190000 + }, + { + "epoch": 2.54, + "learning_rate": 0.0001949291950596252, + "loss": 1.7078, + "step": 190500 + }, + { + "epoch": 2.54, + "learning_rate": 0.00019491588586030665, + "loss": 1.6151, + "step": 191000 + }, + { + "epoch": 2.55, + "learning_rate": 0.00019490257666098808, + "loss": 1.6262, + "step": 191500 + }, + { + "epoch": 2.56, + "learning_rate": 0.00019488926746166952, + "loss": 1.7011, + "step": 192000 + }, + { + "epoch": 2.56, + "learning_rate": 0.00019487595826235093, + "loss": 1.6587, + "step": 192500 + }, + { + "epoch": 2.57, + "learning_rate": 0.00019486264906303237, + "loss": 1.6809, + "step": 193000 + }, + { + "epoch": 2.58, + "learning_rate": 0.0001948493398637138, + "loss": 1.5958, + "step": 193500 + }, + { + "epoch": 2.58, + "learning_rate": 0.00019483603066439525, + "loss": 1.6319, + "step": 194000 + }, + { + "epoch": 2.59, + "learning_rate": 0.00019482272146507669, + "loss": 1.5954, + "step": 194500 + }, + { + "epoch": 2.6, + "learning_rate": 0.0001948094122657581, + "loss": 1.6271, + "step": 195000 + }, + { + "epoch": 2.6, + "learning_rate": 0.00019479610306643953, + "loss": 1.7188, + "step": 195500 + }, + { + "epoch": 2.61, + "learning_rate": 0.00019478279386712097, + "loss": 1.5758, + "step": 196000 + }, + { + "epoch": 2.62, + "learning_rate": 0.0001947694846678024, + "loss": 1.6076, + "step": 196500 + }, + { + "epoch": 2.62, + "learning_rate": 0.00019475617546848382, + "loss": 1.6151, + "step": 197000 + }, + { + "epoch": 2.63, + "learning_rate": 0.00019474286626916526, + "loss": 1.6977, + "step": 197500 + }, + { + "epoch": 2.64, + "learning_rate": 0.00019472955706984667, + "loss": 1.5838, + "step": 198000 + }, + { + "epoch": 2.64, + "learning_rate": 0.0001947162478705281, + "loss": 1.708, + "step": 198500 + }, + { + "epoch": 2.65, + "learning_rate": 0.00019470293867120955, + "loss": 1.6652, + "step": 199000 + }, + { + "epoch": 2.66, + "learning_rate": 0.00019468962947189099, + "loss": 1.5048, + "step": 199500 + }, + { + "epoch": 2.66, + "learning_rate": 0.0001946763202725724, + "loss": 1.5494, + "step": 200000 + }, + { + "epoch": 2.67, + "learning_rate": 0.00019466301107325383, + "loss": 1.6802, + "step": 200500 + }, + { + "epoch": 2.68, + "learning_rate": 0.00019464970187393527, + "loss": 1.5599, + "step": 201000 + }, + { + "epoch": 2.68, + "learning_rate": 0.0001946363926746167, + "loss": 1.6305, + "step": 201500 + }, + { + "epoch": 2.69, + "learning_rate": 0.00019462308347529815, + "loss": 1.6988, + "step": 202000 + }, + { + "epoch": 2.7, + "learning_rate": 0.00019460977427597956, + "loss": 1.5617, + "step": 202500 + }, + { + "epoch": 2.7, + "learning_rate": 0.000194596465076661, + "loss": 1.6741, + "step": 203000 + }, + { + "epoch": 2.71, + "learning_rate": 0.00019458315587734244, + "loss": 1.7877, + "step": 203500 + }, + { + "epoch": 2.72, + "learning_rate": 0.00019456984667802387, + "loss": 1.698, + "step": 204000 + }, + { + "epoch": 2.72, + "learning_rate": 0.00019455653747870529, + "loss": 1.6675, + "step": 204500 + }, + { + "epoch": 2.73, + "learning_rate": 0.00019454322827938672, + "loss": 1.6722, + "step": 205000 + }, + { + "epoch": 2.74, + "learning_rate": 0.00019452991908006813, + "loss": 1.6445, + "step": 205500 + }, + { + "epoch": 2.74, + "learning_rate": 0.00019451660988074957, + "loss": 1.6236, + "step": 206000 + }, + { + "epoch": 2.75, + "learning_rate": 0.000194503300681431, + "loss": 1.5769, + "step": 206500 + }, + { + "epoch": 2.76, + "learning_rate": 0.00019448999148211245, + "loss": 1.6174, + "step": 207000 + }, + { + "epoch": 2.76, + "learning_rate": 0.0001944766822827939, + "loss": 1.6012, + "step": 207500 + }, + { + "epoch": 2.77, + "learning_rate": 0.0001944633730834753, + "loss": 1.5659, + "step": 208000 + }, + { + "epoch": 2.77, + "learning_rate": 0.00019445006388415674, + "loss": 1.6962, + "step": 208500 + }, + { + "epoch": 2.78, + "learning_rate": 0.00019443675468483817, + "loss": 1.6272, + "step": 209000 + }, + { + "epoch": 2.79, + "learning_rate": 0.0001944234454855196, + "loss": 1.6406, + "step": 209500 + }, + { + "epoch": 2.79, + "learning_rate": 0.00019441013628620105, + "loss": 1.5913, + "step": 210000 + }, + { + "epoch": 2.8, + "learning_rate": 0.00019439682708688246, + "loss": 1.627, + "step": 210500 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001943835178875639, + "loss": 1.6659, + "step": 211000 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001943702086882453, + "loss": 1.6649, + "step": 211500 + }, + { + "epoch": 2.82, + "learning_rate": 0.00019435689948892675, + "loss": 1.5073, + "step": 212000 + }, + { + "epoch": 2.83, + "learning_rate": 0.0001943435902896082, + "loss": 1.5495, + "step": 212500 + }, + { + "epoch": 2.83, + "learning_rate": 0.0001943302810902896, + "loss": 1.6255, + "step": 213000 + }, + { + "epoch": 2.84, + "learning_rate": 0.00019431697189097104, + "loss": 1.6997, + "step": 213500 + }, + { + "epoch": 2.85, + "learning_rate": 0.00019430366269165248, + "loss": 1.5781, + "step": 214000 + }, + { + "epoch": 2.85, + "learning_rate": 0.0001942903534923339, + "loss": 1.5791, + "step": 214500 + }, + { + "epoch": 2.86, + "learning_rate": 0.00019427704429301535, + "loss": 1.6821, + "step": 215000 + }, + { + "epoch": 2.87, + "learning_rate": 0.00019426373509369676, + "loss": 1.6547, + "step": 215500 + }, + { + "epoch": 2.87, + "learning_rate": 0.0001942504258943782, + "loss": 1.65, + "step": 216000 + }, + { + "epoch": 2.88, + "learning_rate": 0.00019423711669505964, + "loss": 1.591, + "step": 216500 + }, + { + "epoch": 2.89, + "learning_rate": 0.00019422380749574108, + "loss": 1.5678, + "step": 217000 + }, + { + "epoch": 2.89, + "learning_rate": 0.00019421049829642252, + "loss": 1.6103, + "step": 217500 + }, + { + "epoch": 2.9, + "learning_rate": 0.00019419718909710393, + "loss": 1.6536, + "step": 218000 + }, + { + "epoch": 2.91, + "learning_rate": 0.00019418387989778534, + "loss": 1.6381, + "step": 218500 + }, + { + "epoch": 2.91, + "learning_rate": 0.00019417057069846678, + "loss": 1.7339, + "step": 219000 + }, + { + "epoch": 2.92, + "learning_rate": 0.0001941572614991482, + "loss": 1.5957, + "step": 219500 + }, + { + "epoch": 2.93, + "learning_rate": 0.00019414395229982965, + "loss": 1.6919, + "step": 220000 + }, + { + "epoch": 2.93, + "learning_rate": 0.0001941306431005111, + "loss": 1.5275, + "step": 220500 + }, + { + "epoch": 2.94, + "learning_rate": 0.0001941173339011925, + "loss": 1.5781, + "step": 221000 + }, + { + "epoch": 2.95, + "learning_rate": 0.00019410402470187394, + "loss": 1.5521, + "step": 221500 + }, + { + "epoch": 2.95, + "learning_rate": 0.00019409071550255538, + "loss": 1.6003, + "step": 222000 + }, + { + "epoch": 2.96, + "learning_rate": 0.00019407740630323682, + "loss": 1.555, + "step": 222500 + }, + { + "epoch": 2.97, + "learning_rate": 0.00019406409710391825, + "loss": 1.5257, + "step": 223000 + }, + { + "epoch": 2.97, + "learning_rate": 0.00019405078790459966, + "loss": 1.6349, + "step": 223500 + }, + { + "epoch": 2.98, + "learning_rate": 0.0001940374787052811, + "loss": 1.5243, + "step": 224000 + }, + { + "epoch": 2.99, + "learning_rate": 0.00019402416950596254, + "loss": 1.599, + "step": 224500 + }, + { + "epoch": 2.99, + "learning_rate": 0.00019401086030664398, + "loss": 1.5874, + "step": 225000 + }, + { + "epoch": 3.0, + "eval_loss": 1.4729976654052734, + "eval_runtime": 154.2979, + "eval_samples_per_second": 60.876, + "eval_steps_per_second": 60.876, + "step": 225408 + }, + { + "epoch": 3.0, + "learning_rate": 0.0001939975511073254, + "loss": 1.5186, + "step": 225500 + }, + { + "epoch": 3.01, + "learning_rate": 0.0001939842419080068, + "loss": 1.5337, + "step": 226000 + }, + { + "epoch": 3.01, + "learning_rate": 0.00019397093270868824, + "loss": 1.5111, + "step": 226500 + }, + { + "epoch": 3.02, + "learning_rate": 0.00019395762350936968, + "loss": 1.4425, + "step": 227000 + }, + { + "epoch": 3.03, + "learning_rate": 0.00019394431431005112, + "loss": 1.4928, + "step": 227500 + }, + { + "epoch": 3.03, + "learning_rate": 0.00019393100511073255, + "loss": 1.4468, + "step": 228000 + }, + { + "epoch": 3.04, + "learning_rate": 0.00019391769591141396, + "loss": 1.5364, + "step": 228500 + }, + { + "epoch": 3.05, + "learning_rate": 0.0001939043867120954, + "loss": 1.5078, + "step": 229000 + }, + { + "epoch": 3.05, + "learning_rate": 0.00019389107751277684, + "loss": 1.5033, + "step": 229500 + }, + { + "epoch": 3.06, + "learning_rate": 0.00019387776831345828, + "loss": 1.4433, + "step": 230000 + }, + { + "epoch": 3.07, + "learning_rate": 0.00019386445911413972, + "loss": 1.5558, + "step": 230500 + }, + { + "epoch": 3.07, + "learning_rate": 0.00019385114991482113, + "loss": 1.4858, + "step": 231000 + }, + { + "epoch": 3.08, + "learning_rate": 0.00019383784071550257, + "loss": 1.4251, + "step": 231500 + }, + { + "epoch": 3.09, + "learning_rate": 0.000193824531516184, + "loss": 1.5372, + "step": 232000 + }, + { + "epoch": 3.09, + "learning_rate": 0.00019381122231686544, + "loss": 1.4567, + "step": 232500 + }, + { + "epoch": 3.1, + "learning_rate": 0.00019379791311754685, + "loss": 1.5426, + "step": 233000 + }, + { + "epoch": 3.11, + "learning_rate": 0.0001937846039182283, + "loss": 1.464, + "step": 233500 + }, + { + "epoch": 3.11, + "learning_rate": 0.0001937712947189097, + "loss": 1.4851, + "step": 234000 + }, + { + "epoch": 3.12, + "learning_rate": 0.00019375798551959114, + "loss": 1.4391, + "step": 234500 + }, + { + "epoch": 3.13, + "learning_rate": 0.00019374467632027258, + "loss": 1.4509, + "step": 235000 + }, + { + "epoch": 3.13, + "learning_rate": 0.00019373136712095402, + "loss": 1.5808, + "step": 235500 + }, + { + "epoch": 3.14, + "learning_rate": 0.00019371805792163546, + "loss": 1.4845, + "step": 236000 + }, + { + "epoch": 3.15, + "learning_rate": 0.00019370474872231687, + "loss": 1.3829, + "step": 236500 + }, + { + "epoch": 3.15, + "learning_rate": 0.0001936914395229983, + "loss": 1.5426, + "step": 237000 + }, + { + "epoch": 3.16, + "learning_rate": 0.00019367813032367974, + "loss": 1.573, + "step": 237500 + }, + { + "epoch": 3.17, + "learning_rate": 0.00019366482112436118, + "loss": 1.5125, + "step": 238000 + }, + { + "epoch": 3.17, + "learning_rate": 0.00019365151192504262, + "loss": 1.5845, + "step": 238500 + }, + { + "epoch": 3.18, + "learning_rate": 0.00019363820272572403, + "loss": 1.4887, + "step": 239000 + }, + { + "epoch": 3.19, + "learning_rate": 0.00019362489352640547, + "loss": 1.6045, + "step": 239500 + }, + { + "epoch": 3.19, + "learning_rate": 0.00019361158432708688, + "loss": 1.4978, + "step": 240000 + }, + { + "epoch": 3.2, + "learning_rate": 0.00019359827512776832, + "loss": 1.4833, + "step": 240500 + }, + { + "epoch": 3.21, + "learning_rate": 0.00019358496592844976, + "loss": 1.6137, + "step": 241000 + }, + { + "epoch": 3.21, + "learning_rate": 0.00019357165672913117, + "loss": 1.4635, + "step": 241500 + }, + { + "epoch": 3.22, + "learning_rate": 0.0001935583475298126, + "loss": 1.4702, + "step": 242000 + }, + { + "epoch": 3.23, + "learning_rate": 0.00019354503833049404, + "loss": 1.5509, + "step": 242500 + }, + { + "epoch": 3.23, + "learning_rate": 0.00019353172913117548, + "loss": 1.4055, + "step": 243000 + }, + { + "epoch": 3.24, + "learning_rate": 0.00019351841993185692, + "loss": 1.481, + "step": 243500 + }, + { + "epoch": 3.25, + "learning_rate": 0.00019350511073253833, + "loss": 1.475, + "step": 244000 + }, + { + "epoch": 3.25, + "learning_rate": 0.00019349180153321977, + "loss": 1.4319, + "step": 244500 + }, + { + "epoch": 3.26, + "learning_rate": 0.0001934784923339012, + "loss": 1.4761, + "step": 245000 + }, + { + "epoch": 3.27, + "learning_rate": 0.00019346518313458264, + "loss": 1.5438, + "step": 245500 + }, + { + "epoch": 3.27, + "learning_rate": 0.00019345187393526408, + "loss": 1.5407, + "step": 246000 + }, + { + "epoch": 3.28, + "learning_rate": 0.0001934385647359455, + "loss": 1.5398, + "step": 246500 + }, + { + "epoch": 3.29, + "learning_rate": 0.0001934252555366269, + "loss": 1.4815, + "step": 247000 + }, + { + "epoch": 3.29, + "learning_rate": 0.00019341194633730834, + "loss": 1.5605, + "step": 247500 + }, + { + "epoch": 3.3, + "learning_rate": 0.00019339863713798978, + "loss": 1.4427, + "step": 248000 + }, + { + "epoch": 3.31, + "learning_rate": 0.00019338532793867122, + "loss": 1.5884, + "step": 248500 + }, + { + "epoch": 3.31, + "learning_rate": 0.00019337201873935266, + "loss": 1.4853, + "step": 249000 + }, + { + "epoch": 3.32, + "learning_rate": 0.00019335870954003407, + "loss": 1.4497, + "step": 249500 + }, + { + "epoch": 3.33, + "learning_rate": 0.0001933454003407155, + "loss": 1.4862, + "step": 250000 + }, + { + "epoch": 3.33, + "learning_rate": 0.00019333209114139695, + "loss": 1.4691, + "step": 250500 + }, + { + "epoch": 3.34, + "learning_rate": 0.00019331878194207838, + "loss": 1.4709, + "step": 251000 + }, + { + "epoch": 3.35, + "learning_rate": 0.0001933054727427598, + "loss": 1.4811, + "step": 251500 + }, + { + "epoch": 3.35, + "learning_rate": 0.00019329216354344123, + "loss": 1.4936, + "step": 252000 + }, + { + "epoch": 3.36, + "learning_rate": 0.00019327885434412267, + "loss": 1.4381, + "step": 252500 + }, + { + "epoch": 3.37, + "learning_rate": 0.0001932655451448041, + "loss": 1.5663, + "step": 253000 + }, + { + "epoch": 3.37, + "learning_rate": 0.00019325223594548555, + "loss": 1.5392, + "step": 253500 + }, + { + "epoch": 3.38, + "learning_rate": 0.00019323892674616696, + "loss": 1.455, + "step": 254000 + }, + { + "epoch": 3.39, + "learning_rate": 0.00019322561754684837, + "loss": 1.5874, + "step": 254500 + }, + { + "epoch": 3.39, + "learning_rate": 0.0001932123083475298, + "loss": 1.5114, + "step": 255000 + }, + { + "epoch": 3.4, + "learning_rate": 0.00019319899914821125, + "loss": 1.5489, + "step": 255500 + }, + { + "epoch": 3.41, + "learning_rate": 0.00019318568994889268, + "loss": 1.4672, + "step": 256000 + }, + { + "epoch": 3.41, + "learning_rate": 0.00019317238074957412, + "loss": 1.5463, + "step": 256500 + }, + { + "epoch": 3.42, + "learning_rate": 0.00019315907155025553, + "loss": 1.4405, + "step": 257000 + }, + { + "epoch": 3.43, + "learning_rate": 0.00019314576235093697, + "loss": 1.4718, + "step": 257500 + }, + { + "epoch": 3.43, + "learning_rate": 0.0001931324531516184, + "loss": 1.5496, + "step": 258000 + }, + { + "epoch": 3.44, + "learning_rate": 0.00019311914395229985, + "loss": 1.4641, + "step": 258500 + }, + { + "epoch": 3.45, + "learning_rate": 0.00019310583475298129, + "loss": 1.4768, + "step": 259000 + }, + { + "epoch": 3.45, + "learning_rate": 0.0001930925255536627, + "loss": 1.4985, + "step": 259500 + }, + { + "epoch": 3.46, + "learning_rate": 0.00019307921635434413, + "loss": 1.497, + "step": 260000 + }, + { + "epoch": 3.47, + "learning_rate": 0.00019306590715502557, + "loss": 1.4342, + "step": 260500 + }, + { + "epoch": 3.47, + "learning_rate": 0.000193052597955707, + "loss": 1.5205, + "step": 261000 + }, + { + "epoch": 3.48, + "learning_rate": 0.00019303928875638842, + "loss": 1.5208, + "step": 261500 + }, + { + "epoch": 3.49, + "learning_rate": 0.00019302597955706986, + "loss": 1.4806, + "step": 262000 + }, + { + "epoch": 3.49, + "learning_rate": 0.00019301267035775127, + "loss": 1.4631, + "step": 262500 + }, + { + "epoch": 3.5, + "learning_rate": 0.0001929993611584327, + "loss": 1.4191, + "step": 263000 + }, + { + "epoch": 3.51, + "learning_rate": 0.00019298605195911415, + "loss": 1.5351, + "step": 263500 + }, + { + "epoch": 3.51, + "learning_rate": 0.00019297274275979559, + "loss": 1.4531, + "step": 264000 + }, + { + "epoch": 3.52, + "learning_rate": 0.000192959433560477, + "loss": 1.4089, + "step": 264500 + }, + { + "epoch": 3.53, + "learning_rate": 0.00019294612436115843, + "loss": 1.4426, + "step": 265000 + }, + { + "epoch": 3.53, + "learning_rate": 0.00019293281516183987, + "loss": 1.4957, + "step": 265500 + }, + { + "epoch": 3.54, + "learning_rate": 0.0001929195059625213, + "loss": 1.6548, + "step": 266000 + }, + { + "epoch": 3.55, + "learning_rate": 0.00019290619676320275, + "loss": 1.5277, + "step": 266500 + }, + { + "epoch": 3.55, + "learning_rate": 0.00019289288756388416, + "loss": 1.5082, + "step": 267000 + }, + { + "epoch": 3.56, + "learning_rate": 0.0001928795783645656, + "loss": 1.4819, + "step": 267500 + }, + { + "epoch": 3.57, + "learning_rate": 0.00019286626916524704, + "loss": 1.4152, + "step": 268000 + }, + { + "epoch": 3.57, + "learning_rate": 0.00019285295996592845, + "loss": 1.4693, + "step": 268500 + }, + { + "epoch": 3.58, + "learning_rate": 0.00019283965076660989, + "loss": 1.5178, + "step": 269000 + }, + { + "epoch": 3.59, + "learning_rate": 0.00019282634156729132, + "loss": 1.4476, + "step": 269500 + }, + { + "epoch": 3.59, + "learning_rate": 0.00019281303236797273, + "loss": 1.4921, + "step": 270000 + }, + { + "epoch": 3.6, + "learning_rate": 0.00019279972316865417, + "loss": 1.4735, + "step": 270500 + }, + { + "epoch": 3.61, + "learning_rate": 0.0001927864139693356, + "loss": 1.4243, + "step": 271000 + }, + { + "epoch": 3.61, + "learning_rate": 0.00019277310477001705, + "loss": 1.4855, + "step": 271500 + }, + { + "epoch": 3.62, + "learning_rate": 0.0001927597955706985, + "loss": 1.5052, + "step": 272000 + }, + { + "epoch": 3.63, + "learning_rate": 0.0001927464863713799, + "loss": 1.4815, + "step": 272500 + }, + { + "epoch": 3.63, + "learning_rate": 0.00019273317717206134, + "loss": 1.4478, + "step": 273000 + }, + { + "epoch": 3.64, + "learning_rate": 0.00019271986797274277, + "loss": 1.5841, + "step": 273500 + }, + { + "epoch": 3.65, + "learning_rate": 0.0001927065587734242, + "loss": 1.4793, + "step": 274000 + }, + { + "epoch": 3.65, + "learning_rate": 0.00019269324957410565, + "loss": 1.503, + "step": 274500 + }, + { + "epoch": 3.66, + "learning_rate": 0.00019267994037478706, + "loss": 1.4324, + "step": 275000 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001926666311754685, + "loss": 1.471, + "step": 275500 + }, + { + "epoch": 3.67, + "learning_rate": 0.0001926533219761499, + "loss": 1.4935, + "step": 276000 + }, + { + "epoch": 3.68, + "learning_rate": 0.00019264001277683135, + "loss": 1.3821, + "step": 276500 + }, + { + "epoch": 3.69, + "learning_rate": 0.0001926267035775128, + "loss": 1.4417, + "step": 277000 + }, + { + "epoch": 3.69, + "learning_rate": 0.0001926133943781942, + "loss": 1.4789, + "step": 277500 + }, + { + "epoch": 3.7, + "learning_rate": 0.00019260008517887564, + "loss": 1.5533, + "step": 278000 + }, + { + "epoch": 3.71, + "learning_rate": 0.00019258677597955707, + "loss": 1.5845, + "step": 278500 + }, + { + "epoch": 3.71, + "learning_rate": 0.0001925734667802385, + "loss": 1.3698, + "step": 279000 + }, + { + "epoch": 3.72, + "learning_rate": 0.00019256015758091995, + "loss": 1.5313, + "step": 279500 + }, + { + "epoch": 3.73, + "learning_rate": 0.00019254684838160136, + "loss": 1.3552, + "step": 280000 + }, + { + "epoch": 3.73, + "learning_rate": 0.0001925335391822828, + "loss": 1.3905, + "step": 280500 + }, + { + "epoch": 3.74, + "learning_rate": 0.00019252022998296424, + "loss": 1.5076, + "step": 281000 + }, + { + "epoch": 3.75, + "learning_rate": 0.00019250692078364568, + "loss": 1.5277, + "step": 281500 + }, + { + "epoch": 3.75, + "learning_rate": 0.00019249361158432711, + "loss": 1.4739, + "step": 282000 + }, + { + "epoch": 3.76, + "learning_rate": 0.00019248030238500853, + "loss": 1.5227, + "step": 282500 + }, + { + "epoch": 3.77, + "learning_rate": 0.00019246699318568994, + "loss": 1.4798, + "step": 283000 + }, + { + "epoch": 3.77, + "learning_rate": 0.00019245368398637138, + "loss": 1.4321, + "step": 283500 + }, + { + "epoch": 3.78, + "learning_rate": 0.0001924403747870528, + "loss": 1.5775, + "step": 284000 + }, + { + "epoch": 3.79, + "learning_rate": 0.00019242706558773425, + "loss": 1.5437, + "step": 284500 + }, + { + "epoch": 3.79, + "learning_rate": 0.0001924137563884157, + "loss": 1.4227, + "step": 285000 + }, + { + "epoch": 3.8, + "learning_rate": 0.0001924004471890971, + "loss": 1.4115, + "step": 285500 + }, + { + "epoch": 3.81, + "learning_rate": 0.00019238713798977854, + "loss": 1.5419, + "step": 286000 + }, + { + "epoch": 3.81, + "learning_rate": 0.00019237382879045998, + "loss": 1.5094, + "step": 286500 + }, + { + "epoch": 3.82, + "learning_rate": 0.00019236051959114142, + "loss": 1.4809, + "step": 287000 + }, + { + "epoch": 3.83, + "learning_rate": 0.00019234721039182285, + "loss": 1.5076, + "step": 287500 + }, + { + "epoch": 3.83, + "learning_rate": 0.00019233390119250426, + "loss": 1.4439, + "step": 288000 + }, + { + "epoch": 3.84, + "learning_rate": 0.0001923205919931857, + "loss": 1.4485, + "step": 288500 + }, + { + "epoch": 3.85, + "learning_rate": 0.00019230728279386714, + "loss": 1.5172, + "step": 289000 + }, + { + "epoch": 3.85, + "learning_rate": 0.00019229397359454858, + "loss": 1.4831, + "step": 289500 + }, + { + "epoch": 3.86, + "learning_rate": 0.00019228066439523, + "loss": 1.494, + "step": 290000 + }, + { + "epoch": 3.87, + "learning_rate": 0.0001922673551959114, + "loss": 1.4732, + "step": 290500 + }, + { + "epoch": 3.87, + "learning_rate": 0.00019225404599659284, + "loss": 1.5131, + "step": 291000 + }, + { + "epoch": 3.88, + "learning_rate": 0.00019224073679727428, + "loss": 1.4725, + "step": 291500 + }, + { + "epoch": 3.89, + "learning_rate": 0.00019222742759795572, + "loss": 1.5446, + "step": 292000 + }, + { + "epoch": 3.89, + "learning_rate": 0.00019221411839863715, + "loss": 1.508, + "step": 292500 + }, + { + "epoch": 3.9, + "learning_rate": 0.00019220080919931856, + "loss": 1.4328, + "step": 293000 + }, + { + "epoch": 3.91, + "learning_rate": 0.0001921875, + "loss": 1.4796, + "step": 293500 + }, + { + "epoch": 3.91, + "learning_rate": 0.00019217419080068144, + "loss": 1.4528, + "step": 294000 + }, + { + "epoch": 3.92, + "learning_rate": 0.00019216088160136288, + "loss": 1.5422, + "step": 294500 + }, + { + "epoch": 3.93, + "learning_rate": 0.00019214757240204432, + "loss": 1.5243, + "step": 295000 + }, + { + "epoch": 3.93, + "learning_rate": 0.00019213426320272573, + "loss": 1.452, + "step": 295500 + }, + { + "epoch": 3.94, + "learning_rate": 0.00019212095400340717, + "loss": 1.4417, + "step": 296000 + }, + { + "epoch": 3.95, + "learning_rate": 0.0001921076448040886, + "loss": 1.4921, + "step": 296500 + }, + { + "epoch": 3.95, + "learning_rate": 0.00019209433560477004, + "loss": 1.4464, + "step": 297000 + }, + { + "epoch": 3.96, + "learning_rate": 0.00019208102640545145, + "loss": 1.5396, + "step": 297500 + }, + { + "epoch": 3.97, + "learning_rate": 0.0001920677172061329, + "loss": 1.5577, + "step": 298000 + }, + { + "epoch": 3.97, + "learning_rate": 0.0001920544080068143, + "loss": 1.4185, + "step": 298500 + }, + { + "epoch": 3.98, + "learning_rate": 0.00019204109880749574, + "loss": 1.4259, + "step": 299000 + }, + { + "epoch": 3.99, + "learning_rate": 0.00019202778960817718, + "loss": 1.4868, + "step": 299500 + }, + { + "epoch": 3.99, + "learning_rate": 0.00019201448040885862, + "loss": 1.4656, + "step": 300000 + }, + { + "epoch": 4.0, + "learning_rate": 0.00019200117120954006, + "loss": 1.5002, + "step": 300500 + }, + { + "epoch": 4.0, + "eval_loss": 1.3984296321868896, + "eval_runtime": 153.5164, + "eval_samples_per_second": 61.186, + "eval_steps_per_second": 61.186, + "step": 300544 + }, + { + "epoch": 4.01, + "learning_rate": 0.00019198786201022147, + "loss": 1.3097, + "step": 301000 + }, + { + "epoch": 4.01, + "learning_rate": 0.0001919745528109029, + "loss": 1.3614, + "step": 301500 + }, + { + "epoch": 4.02, + "learning_rate": 0.00019196124361158434, + "loss": 1.3087, + "step": 302000 + }, + { + "epoch": 4.03, + "learning_rate": 0.00019194793441226578, + "loss": 1.3089, + "step": 302500 + }, + { + "epoch": 4.03, + "learning_rate": 0.00019193462521294722, + "loss": 1.2949, + "step": 303000 + }, + { + "epoch": 4.04, + "learning_rate": 0.00019192131601362863, + "loss": 1.4222, + "step": 303500 + }, + { + "epoch": 4.05, + "learning_rate": 0.00019190800681431007, + "loss": 1.3124, + "step": 304000 + }, + { + "epoch": 4.05, + "learning_rate": 0.00019189469761499148, + "loss": 1.4448, + "step": 304500 + }, + { + "epoch": 4.06, + "learning_rate": 0.00019188138841567292, + "loss": 1.3385, + "step": 305000 + }, + { + "epoch": 4.07, + "learning_rate": 0.00019186807921635436, + "loss": 1.4459, + "step": 305500 + }, + { + "epoch": 4.07, + "learning_rate": 0.00019185477001703577, + "loss": 1.3182, + "step": 306000 + }, + { + "epoch": 4.08, + "learning_rate": 0.0001918414608177172, + "loss": 1.4025, + "step": 306500 + }, + { + "epoch": 4.09, + "learning_rate": 0.00019182815161839864, + "loss": 1.3668, + "step": 307000 + }, + { + "epoch": 4.09, + "learning_rate": 0.00019181484241908008, + "loss": 1.3769, + "step": 307500 + }, + { + "epoch": 4.1, + "learning_rate": 0.00019180153321976152, + "loss": 1.3798, + "step": 308000 + }, + { + "epoch": 4.11, + "learning_rate": 0.00019178822402044293, + "loss": 1.3717, + "step": 308500 + }, + { + "epoch": 4.11, + "learning_rate": 0.00019177491482112437, + "loss": 1.2967, + "step": 309000 + }, + { + "epoch": 4.12, + "learning_rate": 0.0001917616056218058, + "loss": 1.2975, + "step": 309500 + }, + { + "epoch": 4.13, + "learning_rate": 0.00019174829642248724, + "loss": 1.3321, + "step": 310000 + }, + { + "epoch": 4.13, + "learning_rate": 0.00019173498722316868, + "loss": 1.3499, + "step": 310500 + }, + { + "epoch": 4.14, + "learning_rate": 0.0001917216780238501, + "loss": 1.3556, + "step": 311000 + }, + { + "epoch": 4.15, + "learning_rate": 0.0001917083688245315, + "loss": 1.409, + "step": 311500 + }, + { + "epoch": 4.15, + "learning_rate": 0.00019169505962521294, + "loss": 1.3666, + "step": 312000 + }, + { + "epoch": 4.16, + "learning_rate": 0.00019168175042589438, + "loss": 1.4045, + "step": 312500 + }, + { + "epoch": 4.17, + "learning_rate": 0.00019166844122657582, + "loss": 1.4168, + "step": 313000 + }, + { + "epoch": 4.17, + "learning_rate": 0.00019165513202725726, + "loss": 1.3961, + "step": 313500 + }, + { + "epoch": 4.18, + "learning_rate": 0.00019164182282793867, + "loss": 1.3621, + "step": 314000 + }, + { + "epoch": 4.19, + "learning_rate": 0.0001916285136286201, + "loss": 1.3018, + "step": 314500 + }, + { + "epoch": 4.19, + "learning_rate": 0.00019161520442930154, + "loss": 1.341, + "step": 315000 + }, + { + "epoch": 4.2, + "learning_rate": 0.00019160189522998298, + "loss": 1.4319, + "step": 315500 + }, + { + "epoch": 4.21, + "learning_rate": 0.0001915885860306644, + "loss": 1.3935, + "step": 316000 + }, + { + "epoch": 4.21, + "learning_rate": 0.00019157527683134583, + "loss": 1.3445, + "step": 316500 + }, + { + "epoch": 4.22, + "learning_rate": 0.00019156196763202727, + "loss": 1.4346, + "step": 317000 + }, + { + "epoch": 4.23, + "learning_rate": 0.0001915486584327087, + "loss": 1.4014, + "step": 317500 + }, + { + "epoch": 4.23, + "learning_rate": 0.00019153534923339015, + "loss": 1.4154, + "step": 318000 + }, + { + "epoch": 4.24, + "learning_rate": 0.00019152204003407156, + "loss": 1.2909, + "step": 318500 + }, + { + "epoch": 4.25, + "learning_rate": 0.00019150873083475297, + "loss": 1.352, + "step": 319000 + }, + { + "epoch": 4.25, + "learning_rate": 0.0001914954216354344, + "loss": 1.3872, + "step": 319500 + }, + { + "epoch": 4.26, + "learning_rate": 0.00019148211243611585, + "loss": 1.3701, + "step": 320000 + }, + { + "epoch": 4.27, + "learning_rate": 0.00019146880323679728, + "loss": 1.4155, + "step": 320500 + }, + { + "epoch": 4.27, + "learning_rate": 0.00019145549403747872, + "loss": 1.4313, + "step": 321000 + }, + { + "epoch": 4.28, + "learning_rate": 0.00019144218483816013, + "loss": 1.3959, + "step": 321500 + }, + { + "epoch": 4.29, + "learning_rate": 0.00019142887563884157, + "loss": 1.3712, + "step": 322000 + }, + { + "epoch": 4.29, + "learning_rate": 0.000191415566439523, + "loss": 1.2681, + "step": 322500 + }, + { + "epoch": 4.3, + "learning_rate": 0.00019140225724020445, + "loss": 1.4065, + "step": 323000 + }, + { + "epoch": 4.31, + "learning_rate": 0.00019138894804088589, + "loss": 1.4301, + "step": 323500 + }, + { + "epoch": 4.31, + "learning_rate": 0.0001913756388415673, + "loss": 1.4, + "step": 324000 + }, + { + "epoch": 4.32, + "learning_rate": 0.00019136232964224873, + "loss": 1.3669, + "step": 324500 + }, + { + "epoch": 4.33, + "learning_rate": 0.00019134902044293017, + "loss": 1.4134, + "step": 325000 + }, + { + "epoch": 4.33, + "learning_rate": 0.0001913357112436116, + "loss": 1.3596, + "step": 325500 + }, + { + "epoch": 4.34, + "learning_rate": 0.00019132240204429302, + "loss": 1.325, + "step": 326000 + }, + { + "epoch": 4.35, + "learning_rate": 0.00019130909284497446, + "loss": 1.415, + "step": 326500 + }, + { + "epoch": 4.35, + "learning_rate": 0.00019129578364565587, + "loss": 1.3677, + "step": 327000 + }, + { + "epoch": 4.36, + "learning_rate": 0.0001912824744463373, + "loss": 1.4166, + "step": 327500 + }, + { + "epoch": 4.37, + "learning_rate": 0.00019126916524701875, + "loss": 1.3797, + "step": 328000 + }, + { + "epoch": 4.37, + "learning_rate": 0.00019125585604770019, + "loss": 1.4002, + "step": 328500 + }, + { + "epoch": 4.38, + "learning_rate": 0.0001912425468483816, + "loss": 1.3016, + "step": 329000 + }, + { + "epoch": 4.39, + "learning_rate": 0.00019122923764906303, + "loss": 1.2929, + "step": 329500 + }, + { + "epoch": 4.39, + "learning_rate": 0.00019121592844974447, + "loss": 1.303, + "step": 330000 + }, + { + "epoch": 4.4, + "learning_rate": 0.0001912026192504259, + "loss": 1.387, + "step": 330500 + }, + { + "epoch": 4.41, + "learning_rate": 0.00019118931005110735, + "loss": 1.3859, + "step": 331000 + }, + { + "epoch": 4.41, + "learning_rate": 0.00019117600085178876, + "loss": 1.3026, + "step": 331500 + }, + { + "epoch": 4.42, + "learning_rate": 0.0001911626916524702, + "loss": 1.3718, + "step": 332000 + }, + { + "epoch": 4.43, + "learning_rate": 0.00019114938245315164, + "loss": 1.3643, + "step": 332500 + }, + { + "epoch": 4.43, + "learning_rate": 0.00019113607325383305, + "loss": 1.3631, + "step": 333000 + }, + { + "epoch": 4.44, + "learning_rate": 0.00019112276405451449, + "loss": 1.3937, + "step": 333500 + }, + { + "epoch": 4.45, + "learning_rate": 0.00019110945485519592, + "loss": 1.3724, + "step": 334000 + }, + { + "epoch": 4.45, + "learning_rate": 0.00019109614565587733, + "loss": 1.4076, + "step": 334500 + }, + { + "epoch": 4.46, + "learning_rate": 0.00019108283645655877, + "loss": 1.4288, + "step": 335000 + }, + { + "epoch": 4.47, + "learning_rate": 0.0001910695272572402, + "loss": 1.362, + "step": 335500 + }, + { + "epoch": 4.47, + "learning_rate": 0.00019105621805792165, + "loss": 1.3388, + "step": 336000 + }, + { + "epoch": 4.48, + "learning_rate": 0.0001910429088586031, + "loss": 1.3455, + "step": 336500 + }, + { + "epoch": 4.49, + "learning_rate": 0.0001910295996592845, + "loss": 1.3507, + "step": 337000 + }, + { + "epoch": 4.49, + "learning_rate": 0.00019101629045996594, + "loss": 1.387, + "step": 337500 + }, + { + "epoch": 4.5, + "learning_rate": 0.00019100298126064737, + "loss": 1.3889, + "step": 338000 + }, + { + "epoch": 4.51, + "learning_rate": 0.0001909896720613288, + "loss": 1.4924, + "step": 338500 + }, + { + "epoch": 4.51, + "learning_rate": 0.00019097636286201025, + "loss": 1.3709, + "step": 339000 + }, + { + "epoch": 4.52, + "learning_rate": 0.00019096305366269166, + "loss": 1.3821, + "step": 339500 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001909497444633731, + "loss": 1.45, + "step": 340000 + }, + { + "epoch": 4.53, + "learning_rate": 0.0001909364352640545, + "loss": 1.432, + "step": 340500 + }, + { + "epoch": 4.54, + "learning_rate": 0.00019092312606473595, + "loss": 1.2902, + "step": 341000 + }, + { + "epoch": 4.55, + "learning_rate": 0.0001909098168654174, + "loss": 1.393, + "step": 341500 + }, + { + "epoch": 4.55, + "learning_rate": 0.0001908965076660988, + "loss": 1.382, + "step": 342000 + }, + { + "epoch": 4.56, + "learning_rate": 0.00019088319846678024, + "loss": 1.4208, + "step": 342500 + }, + { + "epoch": 4.57, + "learning_rate": 0.00019086988926746167, + "loss": 1.3832, + "step": 343000 + }, + { + "epoch": 4.57, + "learning_rate": 0.0001908565800681431, + "loss": 1.3807, + "step": 343500 + }, + { + "epoch": 4.58, + "learning_rate": 0.00019084327086882455, + "loss": 1.4388, + "step": 344000 + }, + { + "epoch": 4.59, + "learning_rate": 0.00019082996166950596, + "loss": 1.3082, + "step": 344500 + }, + { + "epoch": 4.59, + "learning_rate": 0.0001908166524701874, + "loss": 1.4276, + "step": 345000 + }, + { + "epoch": 4.6, + "learning_rate": 0.00019080334327086884, + "loss": 1.4532, + "step": 345500 + }, + { + "epoch": 4.6, + "learning_rate": 0.00019079003407155028, + "loss": 1.3624, + "step": 346000 + }, + { + "epoch": 4.61, + "learning_rate": 0.00019077672487223171, + "loss": 1.526, + "step": 346500 + }, + { + "epoch": 4.62, + "learning_rate": 0.00019076341567291313, + "loss": 1.3321, + "step": 347000 + }, + { + "epoch": 4.62, + "learning_rate": 0.00019075010647359454, + "loss": 1.4103, + "step": 347500 + }, + { + "epoch": 4.63, + "learning_rate": 0.00019073679727427598, + "loss": 1.3803, + "step": 348000 + }, + { + "epoch": 4.64, + "learning_rate": 0.0001907234880749574, + "loss": 1.3081, + "step": 348500 + }, + { + "epoch": 4.64, + "learning_rate": 0.00019071017887563885, + "loss": 1.3709, + "step": 349000 + }, + { + "epoch": 4.65, + "learning_rate": 0.0001906968696763203, + "loss": 1.4199, + "step": 349500 + }, + { + "epoch": 4.66, + "learning_rate": 0.0001906835604770017, + "loss": 1.3941, + "step": 350000 + }, + { + "epoch": 4.66, + "learning_rate": 0.00019067025127768314, + "loss": 1.3927, + "step": 350500 + }, + { + "epoch": 4.67, + "learning_rate": 0.00019065694207836458, + "loss": 1.3579, + "step": 351000 + }, + { + "epoch": 4.68, + "learning_rate": 0.00019064363287904601, + "loss": 1.2922, + "step": 351500 + }, + { + "epoch": 4.68, + "learning_rate": 0.00019063032367972745, + "loss": 1.3024, + "step": 352000 + }, + { + "epoch": 4.69, + "learning_rate": 0.00019061701448040886, + "loss": 1.4132, + "step": 352500 + }, + { + "epoch": 4.7, + "learning_rate": 0.0001906037052810903, + "loss": 1.5121, + "step": 353000 + }, + { + "epoch": 4.7, + "learning_rate": 0.00019059039608177174, + "loss": 1.4538, + "step": 353500 + }, + { + "epoch": 4.71, + "learning_rate": 0.00019057708688245318, + "loss": 1.5066, + "step": 354000 + }, + { + "epoch": 4.72, + "learning_rate": 0.0001905637776831346, + "loss": 1.3405, + "step": 354500 + }, + { + "epoch": 4.72, + "learning_rate": 0.000190550468483816, + "loss": 1.33, + "step": 355000 + }, + { + "epoch": 4.73, + "learning_rate": 0.00019053715928449744, + "loss": 1.2941, + "step": 355500 + }, + { + "epoch": 4.74, + "learning_rate": 0.00019052385008517888, + "loss": 1.4025, + "step": 356000 + }, + { + "epoch": 4.74, + "learning_rate": 0.00019051054088586032, + "loss": 1.3327, + "step": 356500 + }, + { + "epoch": 4.75, + "learning_rate": 0.00019049723168654175, + "loss": 1.4117, + "step": 357000 + }, + { + "epoch": 4.76, + "learning_rate": 0.00019048392248722316, + "loss": 1.3084, + "step": 357500 + }, + { + "epoch": 4.76, + "learning_rate": 0.0001904706132879046, + "loss": 1.339, + "step": 358000 + }, + { + "epoch": 4.77, + "learning_rate": 0.00019045730408858604, + "loss": 1.4461, + "step": 358500 + }, + { + "epoch": 4.78, + "learning_rate": 0.00019044399488926748, + "loss": 1.2968, + "step": 359000 + }, + { + "epoch": 4.78, + "learning_rate": 0.00019043068568994892, + "loss": 1.312, + "step": 359500 + }, + { + "epoch": 4.79, + "learning_rate": 0.00019041737649063033, + "loss": 1.3812, + "step": 360000 + }, + { + "epoch": 4.8, + "learning_rate": 0.00019040406729131177, + "loss": 1.4189, + "step": 360500 + }, + { + "epoch": 4.8, + "learning_rate": 0.0001903907580919932, + "loss": 1.43, + "step": 361000 + }, + { + "epoch": 4.81, + "learning_rate": 0.00019037744889267464, + "loss": 1.382, + "step": 361500 + }, + { + "epoch": 4.82, + "learning_rate": 0.00019036413969335605, + "loss": 1.4704, + "step": 362000 + }, + { + "epoch": 4.82, + "learning_rate": 0.0001903508304940375, + "loss": 1.3853, + "step": 362500 + }, + { + "epoch": 4.83, + "learning_rate": 0.0001903375212947189, + "loss": 1.3542, + "step": 363000 + }, + { + "epoch": 4.84, + "learning_rate": 0.00019032421209540034, + "loss": 1.3554, + "step": 363500 + }, + { + "epoch": 4.84, + "learning_rate": 0.00019031090289608178, + "loss": 1.528, + "step": 364000 + }, + { + "epoch": 4.85, + "learning_rate": 0.00019029759369676322, + "loss": 1.4008, + "step": 364500 + }, + { + "epoch": 4.86, + "learning_rate": 0.00019028428449744466, + "loss": 1.3436, + "step": 365000 + }, + { + "epoch": 4.86, + "learning_rate": 0.00019027097529812607, + "loss": 1.3648, + "step": 365500 + }, + { + "epoch": 4.87, + "learning_rate": 0.0001902576660988075, + "loss": 1.491, + "step": 366000 + }, + { + "epoch": 4.88, + "learning_rate": 0.00019024435689948894, + "loss": 1.3592, + "step": 366500 + }, + { + "epoch": 4.88, + "learning_rate": 0.00019023104770017038, + "loss": 1.4573, + "step": 367000 + }, + { + "epoch": 4.89, + "learning_rate": 0.0001902177385008518, + "loss": 1.4006, + "step": 367500 + }, + { + "epoch": 4.9, + "learning_rate": 0.00019020442930153323, + "loss": 1.3101, + "step": 368000 + }, + { + "epoch": 4.9, + "learning_rate": 0.00019019112010221467, + "loss": 1.3908, + "step": 368500 + }, + { + "epoch": 4.91, + "learning_rate": 0.00019017781090289608, + "loss": 1.4466, + "step": 369000 + }, + { + "epoch": 4.92, + "learning_rate": 0.00019016450170357752, + "loss": 1.3716, + "step": 369500 + }, + { + "epoch": 4.92, + "learning_rate": 0.00019015119250425896, + "loss": 1.3971, + "step": 370000 + }, + { + "epoch": 4.93, + "learning_rate": 0.00019013788330494037, + "loss": 1.438, + "step": 370500 + }, + { + "epoch": 4.94, + "learning_rate": 0.0001901245741056218, + "loss": 1.3733, + "step": 371000 + }, + { + "epoch": 4.94, + "learning_rate": 0.00019011126490630324, + "loss": 1.3873, + "step": 371500 + }, + { + "epoch": 4.95, + "learning_rate": 0.00019009795570698468, + "loss": 1.3727, + "step": 372000 + }, + { + "epoch": 4.96, + "learning_rate": 0.00019008464650766612, + "loss": 1.3708, + "step": 372500 + }, + { + "epoch": 4.96, + "learning_rate": 0.00019007133730834753, + "loss": 1.3796, + "step": 373000 + }, + { + "epoch": 4.97, + "learning_rate": 0.00019005802810902897, + "loss": 1.3348, + "step": 373500 + }, + { + "epoch": 4.98, + "learning_rate": 0.0001900447189097104, + "loss": 1.2869, + "step": 374000 + }, + { + "epoch": 4.98, + "learning_rate": 0.00019003140971039184, + "loss": 1.4122, + "step": 374500 + }, + { + "epoch": 4.99, + "learning_rate": 0.00019001810051107328, + "loss": 1.4599, + "step": 375000 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001900047913117547, + "loss": 1.482, + "step": 375500 + }, + { + "epoch": 5.0, + "eval_loss": 1.3631517887115479, + "eval_runtime": 151.4049, + "eval_samples_per_second": 62.039, + "eval_steps_per_second": 62.039, + "step": 375680 + }, + { + "epoch": 5.0, + "learning_rate": 0.0001899914821124361, + "loss": 1.2407, + "step": 376000 + }, + { + "epoch": 5.01, + "learning_rate": 0.00018997817291311754, + "loss": 1.2129, + "step": 376500 + }, + { + "epoch": 5.02, + "learning_rate": 0.00018996486371379898, + "loss": 1.3004, + "step": 377000 + }, + { + "epoch": 5.02, + "learning_rate": 0.00018995155451448042, + "loss": 1.2392, + "step": 377500 + }, + { + "epoch": 5.03, + "learning_rate": 0.00018993824531516186, + "loss": 1.353, + "step": 378000 + }, + { + "epoch": 5.04, + "learning_rate": 0.00018992493611584327, + "loss": 1.2066, + "step": 378500 + }, + { + "epoch": 5.04, + "learning_rate": 0.0001899116269165247, + "loss": 1.2607, + "step": 379000 + }, + { + "epoch": 5.05, + "learning_rate": 0.00018989831771720614, + "loss": 1.2476, + "step": 379500 + }, + { + "epoch": 5.06, + "learning_rate": 0.00018988500851788758, + "loss": 1.2611, + "step": 380000 + }, + { + "epoch": 5.06, + "learning_rate": 0.000189871699318569, + "loss": 1.3003, + "step": 380500 + }, + { + "epoch": 5.07, + "learning_rate": 0.00018985839011925043, + "loss": 1.2301, + "step": 381000 + }, + { + "epoch": 5.08, + "learning_rate": 0.00018984508091993187, + "loss": 1.1755, + "step": 381500 + }, + { + "epoch": 5.08, + "learning_rate": 0.0001898317717206133, + "loss": 1.2484, + "step": 382000 + }, + { + "epoch": 5.09, + "learning_rate": 0.00018981846252129475, + "loss": 1.3084, + "step": 382500 + }, + { + "epoch": 5.1, + "learning_rate": 0.00018980515332197616, + "loss": 1.2391, + "step": 383000 + }, + { + "epoch": 5.1, + "learning_rate": 0.00018979184412265757, + "loss": 1.2445, + "step": 383500 + }, + { + "epoch": 5.11, + "learning_rate": 0.000189778534923339, + "loss": 1.3115, + "step": 384000 + }, + { + "epoch": 5.12, + "learning_rate": 0.00018976522572402044, + "loss": 1.2671, + "step": 384500 + }, + { + "epoch": 5.12, + "learning_rate": 0.00018975191652470188, + "loss": 1.3634, + "step": 385000 + }, + { + "epoch": 5.13, + "learning_rate": 0.00018973860732538332, + "loss": 1.318, + "step": 385500 + }, + { + "epoch": 5.14, + "learning_rate": 0.00018972529812606473, + "loss": 1.3099, + "step": 386000 + }, + { + "epoch": 5.14, + "learning_rate": 0.00018971198892674617, + "loss": 1.2595, + "step": 386500 + }, + { + "epoch": 5.15, + "learning_rate": 0.0001896986797274276, + "loss": 1.3852, + "step": 387000 + }, + { + "epoch": 5.16, + "learning_rate": 0.00018968537052810905, + "loss": 1.2959, + "step": 387500 + }, + { + "epoch": 5.16, + "learning_rate": 0.00018967206132879048, + "loss": 1.2599, + "step": 388000 + }, + { + "epoch": 5.17, + "learning_rate": 0.0001896587521294719, + "loss": 1.2856, + "step": 388500 + }, + { + "epoch": 5.18, + "learning_rate": 0.00018964544293015333, + "loss": 1.3148, + "step": 389000 + }, + { + "epoch": 5.18, + "learning_rate": 0.00018963213373083477, + "loss": 1.2934, + "step": 389500 + }, + { + "epoch": 5.19, + "learning_rate": 0.0001896188245315162, + "loss": 1.2729, + "step": 390000 + }, + { + "epoch": 5.2, + "learning_rate": 0.00018960551533219762, + "loss": 1.3682, + "step": 390500 + }, + { + "epoch": 5.2, + "learning_rate": 0.00018959220613287906, + "loss": 1.3046, + "step": 391000 + }, + { + "epoch": 5.21, + "learning_rate": 0.00018957889693356047, + "loss": 1.2826, + "step": 391500 + }, + { + "epoch": 5.22, + "learning_rate": 0.0001895655877342419, + "loss": 1.3338, + "step": 392000 + }, + { + "epoch": 5.22, + "learning_rate": 0.00018955227853492335, + "loss": 1.3051, + "step": 392500 + }, + { + "epoch": 5.23, + "learning_rate": 0.00018953896933560479, + "loss": 1.279, + "step": 393000 + }, + { + "epoch": 5.24, + "learning_rate": 0.0001895256601362862, + "loss": 1.2571, + "step": 393500 + }, + { + "epoch": 5.24, + "learning_rate": 0.00018951235093696763, + "loss": 1.2967, + "step": 394000 + }, + { + "epoch": 5.25, + "learning_rate": 0.00018949904173764907, + "loss": 1.3146, + "step": 394500 + }, + { + "epoch": 5.26, + "learning_rate": 0.0001894857325383305, + "loss": 1.3376, + "step": 395000 + }, + { + "epoch": 5.26, + "learning_rate": 0.00018947242333901195, + "loss": 1.307, + "step": 395500 + }, + { + "epoch": 5.27, + "learning_rate": 0.00018945911413969336, + "loss": 1.29, + "step": 396000 + }, + { + "epoch": 5.28, + "learning_rate": 0.0001894458049403748, + "loss": 1.2576, + "step": 396500 + }, + { + "epoch": 5.28, + "learning_rate": 0.00018943249574105624, + "loss": 1.2077, + "step": 397000 + }, + { + "epoch": 5.29, + "learning_rate": 0.00018941918654173765, + "loss": 1.3651, + "step": 397500 + }, + { + "epoch": 5.3, + "learning_rate": 0.00018940587734241909, + "loss": 1.3213, + "step": 398000 + }, + { + "epoch": 5.3, + "learning_rate": 0.00018939256814310052, + "loss": 1.2586, + "step": 398500 + }, + { + "epoch": 5.31, + "learning_rate": 0.00018937925894378193, + "loss": 1.3278, + "step": 399000 + }, + { + "epoch": 5.32, + "learning_rate": 0.00018936594974446337, + "loss": 1.2745, + "step": 399500 + }, + { + "epoch": 5.32, + "learning_rate": 0.0001893526405451448, + "loss": 1.2649, + "step": 400000 + }, + { + "epoch": 5.33, + "learning_rate": 0.00018933933134582625, + "loss": 1.3187, + "step": 400500 + }, + { + "epoch": 5.34, + "learning_rate": 0.0001893260221465077, + "loss": 1.2126, + "step": 401000 + }, + { + "epoch": 5.34, + "learning_rate": 0.0001893127129471891, + "loss": 1.281, + "step": 401500 + }, + { + "epoch": 5.35, + "learning_rate": 0.00018929940374787054, + "loss": 1.1913, + "step": 402000 + }, + { + "epoch": 5.36, + "learning_rate": 0.00018928609454855197, + "loss": 1.3565, + "step": 402500 + }, + { + "epoch": 5.36, + "learning_rate": 0.0001892727853492334, + "loss": 1.337, + "step": 403000 + }, + { + "epoch": 5.37, + "learning_rate": 0.00018925947614991485, + "loss": 1.2379, + "step": 403500 + }, + { + "epoch": 5.38, + "learning_rate": 0.00018924616695059626, + "loss": 1.2413, + "step": 404000 + }, + { + "epoch": 5.38, + "learning_rate": 0.00018923285775127767, + "loss": 1.3328, + "step": 404500 + }, + { + "epoch": 5.39, + "learning_rate": 0.0001892195485519591, + "loss": 1.2681, + "step": 405000 + }, + { + "epoch": 5.4, + "learning_rate": 0.00018920623935264055, + "loss": 1.332, + "step": 405500 + }, + { + "epoch": 5.4, + "learning_rate": 0.000189192930153322, + "loss": 1.3436, + "step": 406000 + }, + { + "epoch": 5.41, + "learning_rate": 0.0001891796209540034, + "loss": 1.2292, + "step": 406500 + }, + { + "epoch": 5.42, + "learning_rate": 0.00018916631175468484, + "loss": 1.3298, + "step": 407000 + }, + { + "epoch": 5.42, + "learning_rate": 0.00018915300255536627, + "loss": 1.323, + "step": 407500 + }, + { + "epoch": 5.43, + "learning_rate": 0.0001891396933560477, + "loss": 1.2168, + "step": 408000 + }, + { + "epoch": 5.44, + "learning_rate": 0.00018912638415672915, + "loss": 1.2748, + "step": 408500 + }, + { + "epoch": 5.44, + "learning_rate": 0.00018911307495741056, + "loss": 1.2818, + "step": 409000 + }, + { + "epoch": 5.45, + "learning_rate": 0.000189099765758092, + "loss": 1.3531, + "step": 409500 + }, + { + "epoch": 5.46, + "learning_rate": 0.00018908645655877344, + "loss": 1.2883, + "step": 410000 + }, + { + "epoch": 5.46, + "learning_rate": 0.00018907314735945488, + "loss": 1.2805, + "step": 410500 + }, + { + "epoch": 5.47, + "learning_rate": 0.00018905983816013631, + "loss": 1.1821, + "step": 411000 + }, + { + "epoch": 5.48, + "learning_rate": 0.00018904652896081773, + "loss": 1.3083, + "step": 411500 + }, + { + "epoch": 5.48, + "learning_rate": 0.00018903321976149914, + "loss": 1.2781, + "step": 412000 + }, + { + "epoch": 5.49, + "learning_rate": 0.00018901991056218057, + "loss": 1.3032, + "step": 412500 + }, + { + "epoch": 5.5, + "learning_rate": 0.000189006601362862, + "loss": 1.3755, + "step": 413000 + }, + { + "epoch": 5.5, + "learning_rate": 0.00018899329216354345, + "loss": 1.3112, + "step": 413500 + }, + { + "epoch": 5.51, + "learning_rate": 0.0001889799829642249, + "loss": 1.2157, + "step": 414000 + }, + { + "epoch": 5.52, + "learning_rate": 0.0001889666737649063, + "loss": 1.3974, + "step": 414500 + }, + { + "epoch": 5.52, + "learning_rate": 0.00018895336456558774, + "loss": 1.2776, + "step": 415000 + }, + { + "epoch": 5.53, + "learning_rate": 0.00018894005536626918, + "loss": 1.3196, + "step": 415500 + }, + { + "epoch": 5.54, + "learning_rate": 0.00018892674616695061, + "loss": 1.2642, + "step": 416000 + }, + { + "epoch": 5.54, + "learning_rate": 0.00018891343696763205, + "loss": 1.3229, + "step": 416500 + }, + { + "epoch": 5.55, + "learning_rate": 0.00018890012776831346, + "loss": 1.3571, + "step": 417000 + }, + { + "epoch": 5.56, + "learning_rate": 0.0001888868185689949, + "loss": 1.2576, + "step": 417500 + }, + { + "epoch": 5.56, + "learning_rate": 0.00018887350936967634, + "loss": 1.2513, + "step": 418000 + }, + { + "epoch": 5.57, + "learning_rate": 0.00018886020017035778, + "loss": 1.2904, + "step": 418500 + }, + { + "epoch": 5.58, + "learning_rate": 0.0001888468909710392, + "loss": 1.2498, + "step": 419000 + }, + { + "epoch": 5.58, + "learning_rate": 0.0001888335817717206, + "loss": 1.3343, + "step": 419500 + }, + { + "epoch": 5.59, + "learning_rate": 0.00018882027257240204, + "loss": 1.287, + "step": 420000 + }, + { + "epoch": 5.6, + "learning_rate": 0.00018880696337308348, + "loss": 1.2978, + "step": 420500 + }, + { + "epoch": 5.6, + "learning_rate": 0.00018879365417376491, + "loss": 1.4191, + "step": 421000 + }, + { + "epoch": 5.61, + "learning_rate": 0.00018878034497444635, + "loss": 1.2907, + "step": 421500 + }, + { + "epoch": 5.62, + "learning_rate": 0.00018876703577512776, + "loss": 1.2843, + "step": 422000 + }, + { + "epoch": 5.62, + "learning_rate": 0.0001887537265758092, + "loss": 1.2897, + "step": 422500 + }, + { + "epoch": 5.63, + "learning_rate": 0.00018874041737649064, + "loss": 1.2947, + "step": 423000 + }, + { + "epoch": 5.64, + "learning_rate": 0.00018872710817717208, + "loss": 1.2929, + "step": 423500 + }, + { + "epoch": 5.64, + "learning_rate": 0.00018871379897785352, + "loss": 1.3531, + "step": 424000 + }, + { + "epoch": 5.65, + "learning_rate": 0.00018870048977853493, + "loss": 1.2122, + "step": 424500 + }, + { + "epoch": 5.66, + "learning_rate": 0.00018868718057921637, + "loss": 1.3542, + "step": 425000 + }, + { + "epoch": 5.66, + "learning_rate": 0.0001886738713798978, + "loss": 1.2293, + "step": 425500 + }, + { + "epoch": 5.67, + "learning_rate": 0.00018866056218057922, + "loss": 1.3058, + "step": 426000 + }, + { + "epoch": 5.68, + "learning_rate": 0.00018864725298126065, + "loss": 1.3417, + "step": 426500 + }, + { + "epoch": 5.68, + "learning_rate": 0.0001886339437819421, + "loss": 1.3006, + "step": 427000 + }, + { + "epoch": 5.69, + "learning_rate": 0.0001886206345826235, + "loss": 1.3495, + "step": 427500 + }, + { + "epoch": 5.7, + "learning_rate": 0.00018860732538330494, + "loss": 1.2705, + "step": 428000 + }, + { + "epoch": 5.7, + "learning_rate": 0.00018859401618398638, + "loss": 1.2444, + "step": 428500 + }, + { + "epoch": 5.71, + "learning_rate": 0.00018858070698466782, + "loss": 1.3491, + "step": 429000 + }, + { + "epoch": 5.72, + "learning_rate": 0.00018856739778534926, + "loss": 1.3384, + "step": 429500 + }, + { + "epoch": 5.72, + "learning_rate": 0.00018855408858603067, + "loss": 1.3292, + "step": 430000 + }, + { + "epoch": 5.73, + "learning_rate": 0.0001885407793867121, + "loss": 1.3075, + "step": 430500 + }, + { + "epoch": 5.74, + "learning_rate": 0.00018852747018739354, + "loss": 1.3825, + "step": 431000 + }, + { + "epoch": 5.74, + "learning_rate": 0.00018851416098807498, + "loss": 1.3393, + "step": 431500 + }, + { + "epoch": 5.75, + "learning_rate": 0.0001885008517887564, + "loss": 1.2794, + "step": 432000 + }, + { + "epoch": 5.76, + "learning_rate": 0.00018848754258943783, + "loss": 1.3184, + "step": 432500 + }, + { + "epoch": 5.76, + "learning_rate": 0.00018847423339011927, + "loss": 1.312, + "step": 433000 + }, + { + "epoch": 5.77, + "learning_rate": 0.00018846092419080068, + "loss": 1.2864, + "step": 433500 + }, + { + "epoch": 5.78, + "learning_rate": 0.00018844761499148212, + "loss": 1.264, + "step": 434000 + }, + { + "epoch": 5.78, + "learning_rate": 0.00018843430579216356, + "loss": 1.2549, + "step": 434500 + }, + { + "epoch": 5.79, + "learning_rate": 0.00018842099659284497, + "loss": 1.397, + "step": 435000 + }, + { + "epoch": 5.8, + "learning_rate": 0.0001884076873935264, + "loss": 1.4127, + "step": 435500 + }, + { + "epoch": 5.8, + "learning_rate": 0.00018839437819420784, + "loss": 1.28, + "step": 436000 + }, + { + "epoch": 5.81, + "learning_rate": 0.00018838106899488928, + "loss": 1.2579, + "step": 436500 + }, + { + "epoch": 5.82, + "learning_rate": 0.00018836775979557072, + "loss": 1.3153, + "step": 437000 + }, + { + "epoch": 5.82, + "learning_rate": 0.00018835445059625213, + "loss": 1.2852, + "step": 437500 + }, + { + "epoch": 5.83, + "learning_rate": 0.00018834114139693357, + "loss": 1.3078, + "step": 438000 + }, + { + "epoch": 5.84, + "learning_rate": 0.000188327832197615, + "loss": 1.198, + "step": 438500 + }, + { + "epoch": 5.84, + "learning_rate": 0.00018831452299829644, + "loss": 1.294, + "step": 439000 + }, + { + "epoch": 5.85, + "learning_rate": 0.00018830121379897788, + "loss": 1.3666, + "step": 439500 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001882879045996593, + "loss": 1.2163, + "step": 440000 + }, + { + "epoch": 5.86, + "learning_rate": 0.0001882745954003407, + "loss": 1.3198, + "step": 440500 + }, + { + "epoch": 5.87, + "learning_rate": 0.00018826128620102214, + "loss": 1.2969, + "step": 441000 + }, + { + "epoch": 5.88, + "learning_rate": 0.00018824797700170358, + "loss": 1.2432, + "step": 441500 + }, + { + "epoch": 5.88, + "learning_rate": 0.00018823466780238502, + "loss": 1.2734, + "step": 442000 + }, + { + "epoch": 5.89, + "learning_rate": 0.00018822135860306646, + "loss": 1.2472, + "step": 442500 + }, + { + "epoch": 5.9, + "learning_rate": 0.00018820804940374787, + "loss": 1.3459, + "step": 443000 + }, + { + "epoch": 5.9, + "learning_rate": 0.0001881947402044293, + "loss": 1.2772, + "step": 443500 + }, + { + "epoch": 5.91, + "learning_rate": 0.00018818143100511074, + "loss": 1.2499, + "step": 444000 + }, + { + "epoch": 5.92, + "learning_rate": 0.00018816812180579218, + "loss": 1.2926, + "step": 444500 + }, + { + "epoch": 5.92, + "learning_rate": 0.0001881548126064736, + "loss": 1.3315, + "step": 445000 + }, + { + "epoch": 5.93, + "learning_rate": 0.00018814150340715503, + "loss": 1.3512, + "step": 445500 + }, + { + "epoch": 5.94, + "learning_rate": 0.00018812819420783647, + "loss": 1.309, + "step": 446000 + }, + { + "epoch": 5.94, + "learning_rate": 0.0001881148850085179, + "loss": 1.3363, + "step": 446500 + }, + { + "epoch": 5.95, + "learning_rate": 0.00018810157580919935, + "loss": 1.2856, + "step": 447000 + }, + { + "epoch": 5.96, + "learning_rate": 0.00018808826660988076, + "loss": 1.302, + "step": 447500 + }, + { + "epoch": 5.96, + "learning_rate": 0.00018807495741056217, + "loss": 1.2676, + "step": 448000 + }, + { + "epoch": 5.97, + "learning_rate": 0.0001880616482112436, + "loss": 1.2811, + "step": 448500 + }, + { + "epoch": 5.98, + "learning_rate": 0.00018804833901192504, + "loss": 1.3252, + "step": 449000 + }, + { + "epoch": 5.98, + "learning_rate": 0.00018803502981260648, + "loss": 1.2677, + "step": 449500 + }, + { + "epoch": 5.99, + "learning_rate": 0.00018802172061328792, + "loss": 1.367, + "step": 450000 + }, + { + "epoch": 6.0, + "learning_rate": 0.00018800841141396933, + "loss": 1.321, + "step": 450500 + }, + { + "epoch": 6.0, + "eval_loss": 1.3407889604568481, + "eval_runtime": 150.7509, + "eval_samples_per_second": 62.308, + "eval_steps_per_second": 62.308, + "step": 450816 + }, + { + "epoch": 6.0, + "learning_rate": 0.00018799510221465077, + "loss": 1.2992, + "step": 451000 + }, + { + "epoch": 6.01, + "learning_rate": 0.0001879817930153322, + "loss": 1.2075, + "step": 451500 + }, + { + "epoch": 6.02, + "learning_rate": 0.00018796848381601365, + "loss": 1.211, + "step": 452000 + }, + { + "epoch": 6.02, + "learning_rate": 0.00018795517461669508, + "loss": 1.1365, + "step": 452500 + }, + { + "epoch": 6.03, + "learning_rate": 0.0001879418654173765, + "loss": 1.1299, + "step": 453000 + }, + { + "epoch": 6.04, + "learning_rate": 0.00018792855621805793, + "loss": 1.233, + "step": 453500 + }, + { + "epoch": 6.04, + "learning_rate": 0.00018791524701873937, + "loss": 1.1957, + "step": 454000 + }, + { + "epoch": 6.05, + "learning_rate": 0.0001879019378194208, + "loss": 1.0896, + "step": 454500 + }, + { + "epoch": 6.06, + "learning_rate": 0.00018788862862010222, + "loss": 1.1323, + "step": 455000 + }, + { + "epoch": 6.06, + "learning_rate": 0.00018787531942078366, + "loss": 1.1648, + "step": 455500 + }, + { + "epoch": 6.07, + "learning_rate": 0.00018786201022146507, + "loss": 1.1893, + "step": 456000 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001878487010221465, + "loss": 1.2025, + "step": 456500 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018783539182282795, + "loss": 1.1842, + "step": 457000 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018782208262350938, + "loss": 1.2402, + "step": 457500 + }, + { + "epoch": 6.1, + "learning_rate": 0.0001878087734241908, + "loss": 1.1911, + "step": 458000 + }, + { + "epoch": 6.1, + "learning_rate": 0.00018779546422487223, + "loss": 1.1726, + "step": 458500 + }, + { + "epoch": 6.11, + "learning_rate": 0.00018778215502555367, + "loss": 1.1387, + "step": 459000 + }, + { + "epoch": 6.12, + "learning_rate": 0.0001877688458262351, + "loss": 1.1896, + "step": 459500 + }, + { + "epoch": 6.12, + "learning_rate": 0.00018775553662691655, + "loss": 1.3012, + "step": 460000 + }, + { + "epoch": 6.13, + "learning_rate": 0.00018774222742759796, + "loss": 1.2844, + "step": 460500 + }, + { + "epoch": 6.14, + "learning_rate": 0.0001877289182282794, + "loss": 1.2092, + "step": 461000 + }, + { + "epoch": 6.14, + "learning_rate": 0.00018771560902896084, + "loss": 1.2416, + "step": 461500 + }, + { + "epoch": 6.15, + "learning_rate": 0.00018770229982964225, + "loss": 1.1934, + "step": 462000 + }, + { + "epoch": 6.16, + "learning_rate": 0.00018768899063032369, + "loss": 1.2612, + "step": 462500 + }, + { + "epoch": 6.16, + "learning_rate": 0.00018767568143100512, + "loss": 1.2074, + "step": 463000 + }, + { + "epoch": 6.17, + "learning_rate": 0.00018766237223168653, + "loss": 1.1902, + "step": 463500 + }, + { + "epoch": 6.18, + "learning_rate": 0.00018764906303236797, + "loss": 1.2334, + "step": 464000 + }, + { + "epoch": 6.18, + "learning_rate": 0.0001876357538330494, + "loss": 1.2369, + "step": 464500 + }, + { + "epoch": 6.19, + "learning_rate": 0.00018762244463373085, + "loss": 1.2007, + "step": 465000 + }, + { + "epoch": 6.2, + "learning_rate": 0.0001876091354344123, + "loss": 1.2398, + "step": 465500 + }, + { + "epoch": 6.2, + "learning_rate": 0.0001875958262350937, + "loss": 1.1333, + "step": 466000 + }, + { + "epoch": 6.21, + "learning_rate": 0.00018758251703577514, + "loss": 1.3337, + "step": 466500 + }, + { + "epoch": 6.22, + "learning_rate": 0.00018756920783645657, + "loss": 1.2476, + "step": 467000 + }, + { + "epoch": 6.22, + "learning_rate": 0.000187555898637138, + "loss": 1.2868, + "step": 467500 + }, + { + "epoch": 6.23, + "learning_rate": 0.00018754258943781945, + "loss": 1.2082, + "step": 468000 + }, + { + "epoch": 6.24, + "learning_rate": 0.00018752928023850086, + "loss": 1.2373, + "step": 468500 + }, + { + "epoch": 6.24, + "learning_rate": 0.00018751597103918227, + "loss": 1.2823, + "step": 469000 + }, + { + "epoch": 6.25, + "learning_rate": 0.0001875026618398637, + "loss": 1.1542, + "step": 469500 + }, + { + "epoch": 6.26, + "learning_rate": 0.00018748935264054515, + "loss": 1.2151, + "step": 470000 + }, + { + "epoch": 6.26, + "learning_rate": 0.0001874760434412266, + "loss": 1.145, + "step": 470500 + }, + { + "epoch": 6.27, + "learning_rate": 0.000187462734241908, + "loss": 1.2274, + "step": 471000 + }, + { + "epoch": 6.28, + "learning_rate": 0.00018744942504258944, + "loss": 1.1398, + "step": 471500 + }, + { + "epoch": 6.28, + "learning_rate": 0.00018743611584327087, + "loss": 1.2637, + "step": 472000 + }, + { + "epoch": 6.29, + "learning_rate": 0.0001874228066439523, + "loss": 1.2776, + "step": 472500 + }, + { + "epoch": 6.3, + "learning_rate": 0.00018740949744463375, + "loss": 1.2042, + "step": 473000 + }, + { + "epoch": 6.3, + "learning_rate": 0.00018739618824531516, + "loss": 1.2968, + "step": 473500 + }, + { + "epoch": 6.31, + "learning_rate": 0.0001873828790459966, + "loss": 1.2255, + "step": 474000 + }, + { + "epoch": 6.32, + "learning_rate": 0.00018736956984667804, + "loss": 1.1385, + "step": 474500 + }, + { + "epoch": 6.32, + "learning_rate": 0.00018735626064735948, + "loss": 1.204, + "step": 475000 + }, + { + "epoch": 6.33, + "learning_rate": 0.00018734295144804091, + "loss": 1.1504, + "step": 475500 + }, + { + "epoch": 6.34, + "learning_rate": 0.00018732964224872233, + "loss": 1.2177, + "step": 476000 + }, + { + "epoch": 6.34, + "learning_rate": 0.00018731633304940374, + "loss": 1.2204, + "step": 476500 + }, + { + "epoch": 6.35, + "learning_rate": 0.00018730302385008517, + "loss": 1.1733, + "step": 477000 + }, + { + "epoch": 6.36, + "learning_rate": 0.0001872897146507666, + "loss": 1.1992, + "step": 477500 + }, + { + "epoch": 6.36, + "learning_rate": 0.00018727640545144805, + "loss": 1.1848, + "step": 478000 + }, + { + "epoch": 6.37, + "learning_rate": 0.0001872630962521295, + "loss": 1.2006, + "step": 478500 + }, + { + "epoch": 6.38, + "learning_rate": 0.0001872497870528109, + "loss": 1.1762, + "step": 479000 + }, + { + "epoch": 6.38, + "learning_rate": 0.00018723647785349234, + "loss": 1.326, + "step": 479500 + }, + { + "epoch": 6.39, + "learning_rate": 0.00018722316865417378, + "loss": 1.2996, + "step": 480000 + }, + { + "epoch": 6.4, + "learning_rate": 0.00018720985945485521, + "loss": 1.2594, + "step": 480500 + }, + { + "epoch": 6.4, + "learning_rate": 0.00018719655025553665, + "loss": 1.246, + "step": 481000 + }, + { + "epoch": 6.41, + "learning_rate": 0.00018718324105621806, + "loss": 1.1864, + "step": 481500 + }, + { + "epoch": 6.42, + "learning_rate": 0.0001871699318568995, + "loss": 1.2074, + "step": 482000 + }, + { + "epoch": 6.42, + "learning_rate": 0.00018715662265758094, + "loss": 1.1846, + "step": 482500 + }, + { + "epoch": 6.43, + "learning_rate": 0.00018714331345826238, + "loss": 1.2251, + "step": 483000 + }, + { + "epoch": 6.43, + "learning_rate": 0.0001871300042589438, + "loss": 1.2194, + "step": 483500 + }, + { + "epoch": 6.44, + "learning_rate": 0.0001871166950596252, + "loss": 1.1532, + "step": 484000 + }, + { + "epoch": 6.45, + "learning_rate": 0.00018710338586030664, + "loss": 1.1033, + "step": 484500 + }, + { + "epoch": 6.45, + "learning_rate": 0.00018709007666098808, + "loss": 1.2574, + "step": 485000 + }, + { + "epoch": 6.46, + "learning_rate": 0.00018707676746166951, + "loss": 1.177, + "step": 485500 + }, + { + "epoch": 6.47, + "learning_rate": 0.00018706345826235095, + "loss": 1.3113, + "step": 486000 + }, + { + "epoch": 6.47, + "learning_rate": 0.00018705014906303236, + "loss": 1.2113, + "step": 486500 + }, + { + "epoch": 6.48, + "learning_rate": 0.0001870368398637138, + "loss": 1.2341, + "step": 487000 + }, + { + "epoch": 6.49, + "learning_rate": 0.00018702353066439524, + "loss": 1.3178, + "step": 487500 + }, + { + "epoch": 6.49, + "learning_rate": 0.00018701022146507668, + "loss": 1.1443, + "step": 488000 + }, + { + "epoch": 6.5, + "learning_rate": 0.00018699691226575812, + "loss": 1.1875, + "step": 488500 + }, + { + "epoch": 6.51, + "learning_rate": 0.00018698360306643953, + "loss": 1.2945, + "step": 489000 + }, + { + "epoch": 6.51, + "learning_rate": 0.00018697029386712097, + "loss": 1.214, + "step": 489500 + }, + { + "epoch": 6.52, + "learning_rate": 0.0001869569846678024, + "loss": 1.1998, + "step": 490000 + }, + { + "epoch": 6.53, + "learning_rate": 0.00018694367546848382, + "loss": 1.2449, + "step": 490500 + }, + { + "epoch": 6.53, + "learning_rate": 0.00018693036626916525, + "loss": 1.2695, + "step": 491000 + }, + { + "epoch": 6.54, + "learning_rate": 0.0001869170570698467, + "loss": 1.2479, + "step": 491500 + }, + { + "epoch": 6.55, + "learning_rate": 0.0001869037478705281, + "loss": 1.1694, + "step": 492000 + }, + { + "epoch": 6.55, + "learning_rate": 0.00018689043867120954, + "loss": 1.2422, + "step": 492500 + }, + { + "epoch": 6.56, + "learning_rate": 0.00018687712947189098, + "loss": 1.2026, + "step": 493000 + }, + { + "epoch": 6.57, + "learning_rate": 0.00018686382027257242, + "loss": 1.2459, + "step": 493500 + }, + { + "epoch": 6.57, + "learning_rate": 0.00018685051107325385, + "loss": 1.1844, + "step": 494000 + }, + { + "epoch": 6.58, + "learning_rate": 0.00018683720187393527, + "loss": 1.2062, + "step": 494500 + }, + { + "epoch": 6.59, + "learning_rate": 0.0001868238926746167, + "loss": 1.262, + "step": 495000 + }, + { + "epoch": 6.59, + "learning_rate": 0.00018681058347529814, + "loss": 1.3093, + "step": 495500 + }, + { + "epoch": 6.6, + "learning_rate": 0.00018679727427597958, + "loss": 1.2191, + "step": 496000 + }, + { + "epoch": 6.61, + "learning_rate": 0.000186783965076661, + "loss": 1.2455, + "step": 496500 + }, + { + "epoch": 6.61, + "learning_rate": 0.00018677065587734243, + "loss": 1.2894, + "step": 497000 + }, + { + "epoch": 6.62, + "learning_rate": 0.00018675734667802387, + "loss": 1.2219, + "step": 497500 + }, + { + "epoch": 6.63, + "learning_rate": 0.00018674403747870528, + "loss": 1.2323, + "step": 498000 + }, + { + "epoch": 6.63, + "learning_rate": 0.00018673072827938672, + "loss": 1.2455, + "step": 498500 + }, + { + "epoch": 6.64, + "learning_rate": 0.00018671741908006816, + "loss": 1.2717, + "step": 499000 + }, + { + "epoch": 6.65, + "learning_rate": 0.00018670410988074957, + "loss": 1.3192, + "step": 499500 + }, + { + "epoch": 6.65, + "learning_rate": 0.000186690800681431, + "loss": 1.1968, + "step": 500000 + }, + { + "epoch": 6.66, + "learning_rate": 0.00018667749148211244, + "loss": 1.3435, + "step": 500500 + }, + { + "epoch": 6.67, + "learning_rate": 0.00018666418228279388, + "loss": 1.1948, + "step": 501000 + }, + { + "epoch": 6.67, + "learning_rate": 0.00018665087308347532, + "loss": 1.2478, + "step": 501500 + }, + { + "epoch": 6.68, + "learning_rate": 0.00018663756388415673, + "loss": 1.2542, + "step": 502000 + }, + { + "epoch": 6.69, + "learning_rate": 0.00018662425468483817, + "loss": 1.2587, + "step": 502500 + }, + { + "epoch": 6.69, + "learning_rate": 0.0001866109454855196, + "loss": 1.1819, + "step": 503000 + }, + { + "epoch": 6.7, + "learning_rate": 0.00018659763628620104, + "loss": 1.227, + "step": 503500 + }, + { + "epoch": 6.71, + "learning_rate": 0.00018658432708688248, + "loss": 1.2652, + "step": 504000 + }, + { + "epoch": 6.71, + "learning_rate": 0.0001865710178875639, + "loss": 1.2934, + "step": 504500 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001865577086882453, + "loss": 1.2275, + "step": 505000 + }, + { + "epoch": 6.73, + "learning_rate": 0.00018654439948892674, + "loss": 1.2533, + "step": 505500 + }, + { + "epoch": 6.73, + "learning_rate": 0.00018653109028960818, + "loss": 1.283, + "step": 506000 + }, + { + "epoch": 6.74, + "learning_rate": 0.00018651778109028962, + "loss": 1.2534, + "step": 506500 + }, + { + "epoch": 6.75, + "learning_rate": 0.00018650447189097106, + "loss": 1.2625, + "step": 507000 + }, + { + "epoch": 6.75, + "learning_rate": 0.00018649116269165247, + "loss": 1.1477, + "step": 507500 + }, + { + "epoch": 6.76, + "learning_rate": 0.0001864778534923339, + "loss": 1.2436, + "step": 508000 + }, + { + "epoch": 6.77, + "learning_rate": 0.00018646454429301534, + "loss": 1.2876, + "step": 508500 + }, + { + "epoch": 6.77, + "learning_rate": 0.00018645123509369678, + "loss": 1.1998, + "step": 509000 + }, + { + "epoch": 6.78, + "learning_rate": 0.0001864379258943782, + "loss": 1.2604, + "step": 509500 + }, + { + "epoch": 6.79, + "learning_rate": 0.00018642461669505963, + "loss": 1.2464, + "step": 510000 + }, + { + "epoch": 6.79, + "learning_rate": 0.00018641130749574107, + "loss": 1.2474, + "step": 510500 + }, + { + "epoch": 6.8, + "learning_rate": 0.0001863979982964225, + "loss": 1.2516, + "step": 511000 + }, + { + "epoch": 6.81, + "learning_rate": 0.00018638468909710395, + "loss": 1.1846, + "step": 511500 + }, + { + "epoch": 6.81, + "learning_rate": 0.00018637137989778536, + "loss": 1.2755, + "step": 512000 + }, + { + "epoch": 6.82, + "learning_rate": 0.00018635807069846677, + "loss": 1.2636, + "step": 512500 + }, + { + "epoch": 6.83, + "learning_rate": 0.0001863447614991482, + "loss": 1.2519, + "step": 513000 + }, + { + "epoch": 6.83, + "learning_rate": 0.00018633145229982964, + "loss": 1.3264, + "step": 513500 + }, + { + "epoch": 6.84, + "learning_rate": 0.00018631814310051108, + "loss": 1.2963, + "step": 514000 + }, + { + "epoch": 6.85, + "learning_rate": 0.00018630483390119252, + "loss": 1.3338, + "step": 514500 + }, + { + "epoch": 6.85, + "learning_rate": 0.00018629152470187393, + "loss": 1.3028, + "step": 515000 + }, + { + "epoch": 6.86, + "learning_rate": 0.00018627821550255537, + "loss": 1.2046, + "step": 515500 + }, + { + "epoch": 6.87, + "learning_rate": 0.0001862649063032368, + "loss": 1.2223, + "step": 516000 + }, + { + "epoch": 6.87, + "learning_rate": 0.00018625159710391825, + "loss": 1.257, + "step": 516500 + }, + { + "epoch": 6.88, + "learning_rate": 0.00018623828790459968, + "loss": 1.2178, + "step": 517000 + }, + { + "epoch": 6.89, + "learning_rate": 0.0001862249787052811, + "loss": 1.149, + "step": 517500 + }, + { + "epoch": 6.89, + "learning_rate": 0.00018621166950596253, + "loss": 1.2528, + "step": 518000 + }, + { + "epoch": 6.9, + "learning_rate": 0.00018619836030664397, + "loss": 1.2645, + "step": 518500 + }, + { + "epoch": 6.91, + "learning_rate": 0.00018618505110732538, + "loss": 1.2341, + "step": 519000 + }, + { + "epoch": 6.91, + "learning_rate": 0.00018617174190800682, + "loss": 1.2593, + "step": 519500 + }, + { + "epoch": 6.92, + "learning_rate": 0.00018615843270868826, + "loss": 1.2828, + "step": 520000 + }, + { + "epoch": 6.93, + "learning_rate": 0.00018614512350936967, + "loss": 1.2659, + "step": 520500 + }, + { + "epoch": 6.93, + "learning_rate": 0.0001861318143100511, + "loss": 1.2177, + "step": 521000 + }, + { + "epoch": 6.94, + "learning_rate": 0.00018611850511073255, + "loss": 1.2723, + "step": 521500 + }, + { + "epoch": 6.95, + "learning_rate": 0.00018610519591141398, + "loss": 1.2375, + "step": 522000 + }, + { + "epoch": 6.95, + "learning_rate": 0.0001860918867120954, + "loss": 1.1982, + "step": 522500 + }, + { + "epoch": 6.96, + "learning_rate": 0.00018607857751277683, + "loss": 1.288, + "step": 523000 + }, + { + "epoch": 6.97, + "learning_rate": 0.00018606526831345827, + "loss": 1.2791, + "step": 523500 + }, + { + "epoch": 6.97, + "learning_rate": 0.0001860519591141397, + "loss": 1.2297, + "step": 524000 + }, + { + "epoch": 6.98, + "learning_rate": 0.00018603864991482115, + "loss": 1.2586, + "step": 524500 + }, + { + "epoch": 6.99, + "learning_rate": 0.00018602534071550256, + "loss": 1.247, + "step": 525000 + }, + { + "epoch": 6.99, + "learning_rate": 0.000186012031516184, + "loss": 1.2443, + "step": 525500 + }, + { + "epoch": 7.0, + "eval_loss": 1.3239233493804932, + "eval_runtime": 151.989, + "eval_samples_per_second": 61.801, + "eval_steps_per_second": 61.801, + "step": 525952 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018599872231686544, + "loss": 1.3257, + "step": 526000 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018598541311754685, + "loss": 1.1216, + "step": 526500 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018597210391822829, + "loss": 1.0827, + "step": 527000 + }, + { + "epoch": 7.02, + "learning_rate": 0.00018595879471890972, + "loss": 1.1344, + "step": 527500 + }, + { + "epoch": 7.03, + "learning_rate": 0.00018594548551959113, + "loss": 1.147, + "step": 528000 + }, + { + "epoch": 7.03, + "learning_rate": 0.00018593217632027257, + "loss": 1.1411, + "step": 528500 + }, + { + "epoch": 7.04, + "learning_rate": 0.000185918867120954, + "loss": 1.1783, + "step": 529000 + }, + { + "epoch": 7.05, + "learning_rate": 0.00018590555792163545, + "loss": 1.199, + "step": 529500 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001858922487223169, + "loss": 1.1345, + "step": 530000 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001858789395229983, + "loss": 1.1102, + "step": 530500 + }, + { + "epoch": 7.07, + "learning_rate": 0.00018586563032367974, + "loss": 1.19, + "step": 531000 + }, + { + "epoch": 7.07, + "learning_rate": 0.00018585232112436117, + "loss": 1.1419, + "step": 531500 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001858390119250426, + "loss": 1.1731, + "step": 532000 + }, + { + "epoch": 7.09, + "learning_rate": 0.00018582570272572405, + "loss": 1.1318, + "step": 532500 + }, + { + "epoch": 7.09, + "learning_rate": 0.00018581239352640546, + "loss": 1.1764, + "step": 533000 + }, + { + "epoch": 7.1, + "learning_rate": 0.00018579908432708687, + "loss": 1.1271, + "step": 533500 + }, + { + "epoch": 7.11, + "learning_rate": 0.0001857857751277683, + "loss": 1.1208, + "step": 534000 + }, + { + "epoch": 7.11, + "learning_rate": 0.00018577246592844975, + "loss": 1.1597, + "step": 534500 + }, + { + "epoch": 7.12, + "learning_rate": 0.0001857591567291312, + "loss": 1.169, + "step": 535000 + }, + { + "epoch": 7.13, + "learning_rate": 0.0001857458475298126, + "loss": 1.1914, + "step": 535500 + }, + { + "epoch": 7.13, + "learning_rate": 0.00018573253833049404, + "loss": 1.1697, + "step": 536000 + }, + { + "epoch": 7.14, + "learning_rate": 0.00018571922913117547, + "loss": 1.135, + "step": 536500 + }, + { + "epoch": 7.15, + "learning_rate": 0.0001857059199318569, + "loss": 1.1092, + "step": 537000 + }, + { + "epoch": 7.15, + "learning_rate": 0.00018569261073253835, + "loss": 1.2167, + "step": 537500 + }, + { + "epoch": 7.16, + "learning_rate": 0.00018567930153321976, + "loss": 1.2271, + "step": 538000 + }, + { + "epoch": 7.17, + "learning_rate": 0.0001856659923339012, + "loss": 1.0969, + "step": 538500 + }, + { + "epoch": 7.17, + "learning_rate": 0.00018565268313458264, + "loss": 1.1361, + "step": 539000 + }, + { + "epoch": 7.18, + "learning_rate": 0.00018563937393526408, + "loss": 1.1424, + "step": 539500 + }, + { + "epoch": 7.19, + "learning_rate": 0.00018562606473594551, + "loss": 1.1361, + "step": 540000 + }, + { + "epoch": 7.19, + "learning_rate": 0.00018561275553662693, + "loss": 1.1466, + "step": 540500 + }, + { + "epoch": 7.2, + "learning_rate": 0.00018559944633730834, + "loss": 1.157, + "step": 541000 + }, + { + "epoch": 7.21, + "learning_rate": 0.00018558613713798977, + "loss": 1.0685, + "step": 541500 + }, + { + "epoch": 7.21, + "learning_rate": 0.0001855728279386712, + "loss": 1.0829, + "step": 542000 + }, + { + "epoch": 7.22, + "learning_rate": 0.00018555951873935265, + "loss": 1.1264, + "step": 542500 + }, + { + "epoch": 7.23, + "learning_rate": 0.0001855462095400341, + "loss": 1.1427, + "step": 543000 + }, + { + "epoch": 7.23, + "learning_rate": 0.0001855329003407155, + "loss": 1.2047, + "step": 543500 + }, + { + "epoch": 7.24, + "learning_rate": 0.00018551959114139694, + "loss": 1.0471, + "step": 544000 + }, + { + "epoch": 7.25, + "learning_rate": 0.00018550628194207838, + "loss": 1.2338, + "step": 544500 + }, + { + "epoch": 7.25, + "learning_rate": 0.00018549297274275981, + "loss": 1.1665, + "step": 545000 + }, + { + "epoch": 7.26, + "learning_rate": 0.00018547966354344125, + "loss": 1.1577, + "step": 545500 + }, + { + "epoch": 7.27, + "learning_rate": 0.00018546635434412266, + "loss": 1.2127, + "step": 546000 + }, + { + "epoch": 7.27, + "learning_rate": 0.0001854530451448041, + "loss": 1.1236, + "step": 546500 + }, + { + "epoch": 7.28, + "learning_rate": 0.00018543973594548554, + "loss": 1.1113, + "step": 547000 + }, + { + "epoch": 7.29, + "learning_rate": 0.00018542642674616698, + "loss": 1.1382, + "step": 547500 + }, + { + "epoch": 7.29, + "learning_rate": 0.0001854131175468484, + "loss": 1.1848, + "step": 548000 + }, + { + "epoch": 7.3, + "learning_rate": 0.0001853998083475298, + "loss": 1.2489, + "step": 548500 + }, + { + "epoch": 7.31, + "learning_rate": 0.00018538649914821124, + "loss": 1.1431, + "step": 549000 + }, + { + "epoch": 7.31, + "learning_rate": 0.00018537318994889268, + "loss": 1.1791, + "step": 549500 + }, + { + "epoch": 7.32, + "learning_rate": 0.00018535988074957411, + "loss": 1.102, + "step": 550000 + }, + { + "epoch": 7.33, + "learning_rate": 0.00018534657155025555, + "loss": 1.1928, + "step": 550500 + }, + { + "epoch": 7.33, + "learning_rate": 0.00018533326235093696, + "loss": 1.2256, + "step": 551000 + }, + { + "epoch": 7.34, + "learning_rate": 0.0001853199531516184, + "loss": 1.1329, + "step": 551500 + }, + { + "epoch": 7.35, + "learning_rate": 0.00018530664395229984, + "loss": 1.1367, + "step": 552000 + }, + { + "epoch": 7.35, + "learning_rate": 0.00018529333475298128, + "loss": 1.1584, + "step": 552500 + }, + { + "epoch": 7.36, + "learning_rate": 0.00018528002555366272, + "loss": 1.2093, + "step": 553000 + }, + { + "epoch": 7.37, + "learning_rate": 0.00018526671635434413, + "loss": 1.1427, + "step": 553500 + }, + { + "epoch": 7.37, + "learning_rate": 0.00018525340715502557, + "loss": 1.0839, + "step": 554000 + }, + { + "epoch": 7.38, + "learning_rate": 0.000185240097955707, + "loss": 1.1644, + "step": 554500 + }, + { + "epoch": 7.39, + "learning_rate": 0.00018522678875638841, + "loss": 1.1239, + "step": 555000 + }, + { + "epoch": 7.39, + "learning_rate": 0.00018521347955706985, + "loss": 1.2292, + "step": 555500 + }, + { + "epoch": 7.4, + "learning_rate": 0.0001852001703577513, + "loss": 1.1276, + "step": 556000 + }, + { + "epoch": 7.41, + "learning_rate": 0.0001851868611584327, + "loss": 1.1925, + "step": 556500 + }, + { + "epoch": 7.41, + "learning_rate": 0.00018517355195911414, + "loss": 1.1839, + "step": 557000 + }, + { + "epoch": 7.42, + "learning_rate": 0.00018516024275979558, + "loss": 1.1241, + "step": 557500 + }, + { + "epoch": 7.43, + "learning_rate": 0.00018514693356047702, + "loss": 1.1849, + "step": 558000 + }, + { + "epoch": 7.43, + "learning_rate": 0.00018513362436115845, + "loss": 1.1465, + "step": 558500 + }, + { + "epoch": 7.44, + "learning_rate": 0.00018512031516183987, + "loss": 1.2153, + "step": 559000 + }, + { + "epoch": 7.45, + "learning_rate": 0.0001851070059625213, + "loss": 1.2451, + "step": 559500 + }, + { + "epoch": 7.45, + "learning_rate": 0.00018509369676320274, + "loss": 1.1751, + "step": 560000 + }, + { + "epoch": 7.46, + "learning_rate": 0.00018508038756388418, + "loss": 1.111, + "step": 560500 + }, + { + "epoch": 7.47, + "learning_rate": 0.0001850670783645656, + "loss": 1.1836, + "step": 561000 + }, + { + "epoch": 7.47, + "learning_rate": 0.00018505376916524703, + "loss": 1.2, + "step": 561500 + }, + { + "epoch": 7.48, + "learning_rate": 0.00018504045996592844, + "loss": 1.1223, + "step": 562000 + }, + { + "epoch": 7.49, + "learning_rate": 0.00018502715076660988, + "loss": 1.1108, + "step": 562500 + }, + { + "epoch": 7.49, + "learning_rate": 0.00018501384156729132, + "loss": 1.2332, + "step": 563000 + }, + { + "epoch": 7.5, + "learning_rate": 0.00018500053236797275, + "loss": 1.2405, + "step": 563500 + }, + { + "epoch": 7.51, + "learning_rate": 0.00018498722316865417, + "loss": 1.1162, + "step": 564000 + }, + { + "epoch": 7.51, + "learning_rate": 0.0001849739139693356, + "loss": 1.1503, + "step": 564500 + }, + { + "epoch": 7.52, + "learning_rate": 0.00018496060477001704, + "loss": 1.1573, + "step": 565000 + }, + { + "epoch": 7.53, + "learning_rate": 0.00018494729557069848, + "loss": 1.1674, + "step": 565500 + }, + { + "epoch": 7.53, + "learning_rate": 0.00018493398637137992, + "loss": 1.2043, + "step": 566000 + }, + { + "epoch": 7.54, + "learning_rate": 0.00018492067717206133, + "loss": 1.0957, + "step": 566500 + }, + { + "epoch": 7.55, + "learning_rate": 0.00018490736797274277, + "loss": 1.1712, + "step": 567000 + }, + { + "epoch": 7.55, + "learning_rate": 0.0001848940587734242, + "loss": 1.2128, + "step": 567500 + }, + { + "epoch": 7.56, + "learning_rate": 0.00018488074957410564, + "loss": 1.147, + "step": 568000 + }, + { + "epoch": 7.57, + "learning_rate": 0.00018486744037478708, + "loss": 1.2303, + "step": 568500 + }, + { + "epoch": 7.57, + "learning_rate": 0.0001848541311754685, + "loss": 1.2177, + "step": 569000 + }, + { + "epoch": 7.58, + "learning_rate": 0.0001848408219761499, + "loss": 1.1713, + "step": 569500 + }, + { + "epoch": 7.59, + "learning_rate": 0.00018482751277683134, + "loss": 1.2166, + "step": 570000 + }, + { + "epoch": 7.59, + "learning_rate": 0.00018481420357751278, + "loss": 1.1373, + "step": 570500 + }, + { + "epoch": 7.6, + "learning_rate": 0.00018480089437819422, + "loss": 1.1508, + "step": 571000 + }, + { + "epoch": 7.61, + "learning_rate": 0.00018478758517887566, + "loss": 1.2145, + "step": 571500 + }, + { + "epoch": 7.61, + "learning_rate": 0.00018477427597955707, + "loss": 1.1326, + "step": 572000 + }, + { + "epoch": 7.62, + "learning_rate": 0.0001847609667802385, + "loss": 1.1433, + "step": 572500 + }, + { + "epoch": 7.63, + "learning_rate": 0.00018474765758091994, + "loss": 1.2325, + "step": 573000 + }, + { + "epoch": 7.63, + "learning_rate": 0.00018473434838160138, + "loss": 1.1889, + "step": 573500 + }, + { + "epoch": 7.64, + "learning_rate": 0.0001847210391822828, + "loss": 1.2153, + "step": 574000 + }, + { + "epoch": 7.65, + "learning_rate": 0.00018470772998296423, + "loss": 1.254, + "step": 574500 + }, + { + "epoch": 7.65, + "learning_rate": 0.00018469442078364567, + "loss": 1.194, + "step": 575000 + }, + { + "epoch": 7.66, + "learning_rate": 0.0001846811115843271, + "loss": 1.1518, + "step": 575500 + }, + { + "epoch": 7.67, + "learning_rate": 0.00018466780238500855, + "loss": 1.1641, + "step": 576000 + }, + { + "epoch": 7.67, + "learning_rate": 0.00018465449318568996, + "loss": 1.1449, + "step": 576500 + }, + { + "epoch": 7.68, + "learning_rate": 0.00018464118398637137, + "loss": 1.2422, + "step": 577000 + }, + { + "epoch": 7.69, + "learning_rate": 0.0001846278747870528, + "loss": 1.1855, + "step": 577500 + }, + { + "epoch": 7.69, + "learning_rate": 0.00018461456558773424, + "loss": 1.2294, + "step": 578000 + }, + { + "epoch": 7.7, + "learning_rate": 0.00018460125638841568, + "loss": 1.1216, + "step": 578500 + }, + { + "epoch": 7.71, + "learning_rate": 0.00018458794718909712, + "loss": 1.1729, + "step": 579000 + }, + { + "epoch": 7.71, + "learning_rate": 0.00018457463798977853, + "loss": 1.2155, + "step": 579500 + }, + { + "epoch": 7.72, + "learning_rate": 0.00018456132879045997, + "loss": 1.1955, + "step": 580000 + }, + { + "epoch": 7.73, + "learning_rate": 0.0001845480195911414, + "loss": 1.1139, + "step": 580500 + }, + { + "epoch": 7.73, + "learning_rate": 0.00018453471039182285, + "loss": 1.2305, + "step": 581000 + }, + { + "epoch": 7.74, + "learning_rate": 0.00018452140119250428, + "loss": 1.1756, + "step": 581500 + }, + { + "epoch": 7.75, + "learning_rate": 0.0001845080919931857, + "loss": 1.121, + "step": 582000 + }, + { + "epoch": 7.75, + "learning_rate": 0.00018449478279386713, + "loss": 1.2492, + "step": 582500 + }, + { + "epoch": 7.76, + "learning_rate": 0.00018448147359454857, + "loss": 1.1613, + "step": 583000 + }, + { + "epoch": 7.77, + "learning_rate": 0.00018446816439522998, + "loss": 1.1859, + "step": 583500 + }, + { + "epoch": 7.77, + "learning_rate": 0.00018445485519591142, + "loss": 1.116, + "step": 584000 + }, + { + "epoch": 7.78, + "learning_rate": 0.00018444154599659286, + "loss": 1.1553, + "step": 584500 + }, + { + "epoch": 7.79, + "learning_rate": 0.00018442823679727427, + "loss": 1.1895, + "step": 585000 + }, + { + "epoch": 7.79, + "learning_rate": 0.0001844149275979557, + "loss": 1.2087, + "step": 585500 + }, + { + "epoch": 7.8, + "learning_rate": 0.00018440161839863715, + "loss": 1.12, + "step": 586000 + }, + { + "epoch": 7.81, + "learning_rate": 0.00018438830919931858, + "loss": 1.1421, + "step": 586500 + }, + { + "epoch": 7.81, + "learning_rate": 0.000184375, + "loss": 1.2359, + "step": 587000 + }, + { + "epoch": 7.82, + "learning_rate": 0.00018436169080068143, + "loss": 1.2278, + "step": 587500 + }, + { + "epoch": 7.83, + "learning_rate": 0.00018434838160136287, + "loss": 1.1655, + "step": 588000 + }, + { + "epoch": 7.83, + "learning_rate": 0.0001843350724020443, + "loss": 1.1725, + "step": 588500 + }, + { + "epoch": 7.84, + "learning_rate": 0.00018432176320272575, + "loss": 1.1489, + "step": 589000 + }, + { + "epoch": 7.85, + "learning_rate": 0.00018430845400340716, + "loss": 1.1555, + "step": 589500 + }, + { + "epoch": 7.85, + "learning_rate": 0.0001842951448040886, + "loss": 1.1856, + "step": 590000 + }, + { + "epoch": 7.86, + "learning_rate": 0.00018428183560477004, + "loss": 1.1403, + "step": 590500 + }, + { + "epoch": 7.87, + "learning_rate": 0.00018426852640545145, + "loss": 1.1823, + "step": 591000 + }, + { + "epoch": 7.87, + "learning_rate": 0.00018425521720613288, + "loss": 1.217, + "step": 591500 + }, + { + "epoch": 7.88, + "learning_rate": 0.00018424190800681432, + "loss": 1.1805, + "step": 592000 + }, + { + "epoch": 7.89, + "learning_rate": 0.00018422859880749573, + "loss": 1.157, + "step": 592500 + }, + { + "epoch": 7.89, + "learning_rate": 0.00018421528960817717, + "loss": 1.2218, + "step": 593000 + }, + { + "epoch": 7.9, + "learning_rate": 0.0001842019804088586, + "loss": 1.142, + "step": 593500 + }, + { + "epoch": 7.91, + "learning_rate": 0.00018418867120954005, + "loss": 1.2264, + "step": 594000 + }, + { + "epoch": 7.91, + "learning_rate": 0.0001841753620102215, + "loss": 1.3256, + "step": 594500 + }, + { + "epoch": 7.92, + "learning_rate": 0.0001841620528109029, + "loss": 1.27, + "step": 595000 + }, + { + "epoch": 7.93, + "learning_rate": 0.00018414874361158434, + "loss": 1.1317, + "step": 595500 + }, + { + "epoch": 7.93, + "learning_rate": 0.00018413543441226577, + "loss": 1.2121, + "step": 596000 + }, + { + "epoch": 7.94, + "learning_rate": 0.0001841221252129472, + "loss": 1.1319, + "step": 596500 + }, + { + "epoch": 7.95, + "learning_rate": 0.00018410881601362865, + "loss": 1.233, + "step": 597000 + }, + { + "epoch": 7.95, + "learning_rate": 0.00018409550681431006, + "loss": 1.2116, + "step": 597500 + }, + { + "epoch": 7.96, + "learning_rate": 0.00018408219761499147, + "loss": 1.2027, + "step": 598000 + }, + { + "epoch": 7.97, + "learning_rate": 0.0001840688884156729, + "loss": 1.2243, + "step": 598500 + }, + { + "epoch": 7.97, + "learning_rate": 0.00018405557921635435, + "loss": 1.1878, + "step": 599000 + }, + { + "epoch": 7.98, + "learning_rate": 0.0001840422700170358, + "loss": 1.261, + "step": 599500 + }, + { + "epoch": 7.99, + "learning_rate": 0.0001840289608177172, + "loss": 1.1661, + "step": 600000 + }, + { + "epoch": 7.99, + "learning_rate": 0.00018401565161839864, + "loss": 1.1935, + "step": 600500 + }, + { + "epoch": 8.0, + "learning_rate": 0.00018400234241908007, + "loss": 1.2027, + "step": 601000 + }, + { + "epoch": 8.0, + "eval_loss": 1.3234707117080688, + "eval_runtime": 151.4681, + "eval_samples_per_second": 62.013, + "eval_steps_per_second": 62.013, + "step": 601088 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001839890332197615, + "loss": 1.0817, + "step": 601500 + }, + { + "epoch": 8.01, + "learning_rate": 0.00018397572402044295, + "loss": 1.0331, + "step": 602000 + }, + { + "epoch": 8.02, + "learning_rate": 0.00018396241482112436, + "loss": 1.0431, + "step": 602500 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001839491056218058, + "loss": 1.0799, + "step": 603000 + }, + { + "epoch": 8.03, + "learning_rate": 0.00018393579642248724, + "loss": 1.0249, + "step": 603500 + }, + { + "epoch": 8.04, + "learning_rate": 0.00018392248722316868, + "loss": 1.0811, + "step": 604000 + }, + { + "epoch": 8.05, + "learning_rate": 0.00018390917802385011, + "loss": 1.0711, + "step": 604500 + }, + { + "epoch": 8.05, + "learning_rate": 0.00018389586882453153, + "loss": 1.1798, + "step": 605000 + }, + { + "epoch": 8.06, + "learning_rate": 0.00018388255962521294, + "loss": 1.0025, + "step": 605500 + }, + { + "epoch": 8.07, + "learning_rate": 0.00018386925042589437, + "loss": 1.1001, + "step": 606000 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001838559412265758, + "loss": 1.0006, + "step": 606500 + }, + { + "epoch": 8.08, + "learning_rate": 0.00018384263202725725, + "loss": 1.0731, + "step": 607000 + }, + { + "epoch": 8.09, + "learning_rate": 0.0001838293228279387, + "loss": 1.0955, + "step": 607500 + }, + { + "epoch": 8.09, + "learning_rate": 0.0001838160136286201, + "loss": 1.066, + "step": 608000 + }, + { + "epoch": 8.1, + "learning_rate": 0.00018380270442930154, + "loss": 1.1392, + "step": 608500 + }, + { + "epoch": 8.11, + "learning_rate": 0.00018378939522998298, + "loss": 1.1349, + "step": 609000 + }, + { + "epoch": 8.11, + "learning_rate": 0.00018377608603066441, + "loss": 1.0779, + "step": 609500 + }, + { + "epoch": 8.12, + "learning_rate": 0.00018376277683134585, + "loss": 1.0935, + "step": 610000 + }, + { + "epoch": 8.13, + "learning_rate": 0.00018374946763202726, + "loss": 1.0643, + "step": 610500 + }, + { + "epoch": 8.13, + "learning_rate": 0.0001837361584327087, + "loss": 1.1145, + "step": 611000 + }, + { + "epoch": 8.14, + "learning_rate": 0.00018372284923339014, + "loss": 1.0131, + "step": 611500 + }, + { + "epoch": 8.15, + "learning_rate": 0.00018370954003407158, + "loss": 1.014, + "step": 612000 + }, + { + "epoch": 8.15, + "learning_rate": 0.000183696230834753, + "loss": 1.0756, + "step": 612500 + }, + { + "epoch": 8.16, + "learning_rate": 0.0001836829216354344, + "loss": 1.1036, + "step": 613000 + }, + { + "epoch": 8.17, + "learning_rate": 0.00018366961243611584, + "loss": 1.0628, + "step": 613500 + }, + { + "epoch": 8.17, + "learning_rate": 0.00018365630323679728, + "loss": 1.1678, + "step": 614000 + }, + { + "epoch": 8.18, + "learning_rate": 0.00018364299403747871, + "loss": 1.1626, + "step": 614500 + }, + { + "epoch": 8.19, + "learning_rate": 0.00018362968483816015, + "loss": 1.0559, + "step": 615000 + }, + { + "epoch": 8.19, + "learning_rate": 0.00018361637563884156, + "loss": 1.135, + "step": 615500 + }, + { + "epoch": 8.2, + "learning_rate": 0.000183603066439523, + "loss": 1.0733, + "step": 616000 + }, + { + "epoch": 8.21, + "learning_rate": 0.00018358975724020444, + "loss": 1.1718, + "step": 616500 + }, + { + "epoch": 8.21, + "learning_rate": 0.00018357644804088588, + "loss": 1.0518, + "step": 617000 + }, + { + "epoch": 8.22, + "learning_rate": 0.00018356313884156732, + "loss": 1.0494, + "step": 617500 + }, + { + "epoch": 8.23, + "learning_rate": 0.00018354982964224873, + "loss": 1.1376, + "step": 618000 + }, + { + "epoch": 8.23, + "learning_rate": 0.00018353652044293017, + "loss": 1.1724, + "step": 618500 + }, + { + "epoch": 8.24, + "learning_rate": 0.0001835232112436116, + "loss": 1.1045, + "step": 619000 + }, + { + "epoch": 8.25, + "learning_rate": 0.00018350990204429301, + "loss": 1.1085, + "step": 619500 + }, + { + "epoch": 8.25, + "learning_rate": 0.00018349659284497445, + "loss": 1.0354, + "step": 620000 + }, + { + "epoch": 8.26, + "learning_rate": 0.0001834832836456559, + "loss": 1.0804, + "step": 620500 + }, + { + "epoch": 8.27, + "learning_rate": 0.0001834699744463373, + "loss": 1.1119, + "step": 621000 + }, + { + "epoch": 8.27, + "learning_rate": 0.00018345666524701874, + "loss": 1.0813, + "step": 621500 + }, + { + "epoch": 8.28, + "learning_rate": 0.00018344335604770018, + "loss": 1.0846, + "step": 622000 + }, + { + "epoch": 8.28, + "learning_rate": 0.00018343004684838162, + "loss": 1.2261, + "step": 622500 + }, + { + "epoch": 8.29, + "learning_rate": 0.00018341673764906305, + "loss": 1.1004, + "step": 623000 + }, + { + "epoch": 8.3, + "learning_rate": 0.00018340342844974447, + "loss": 1.0597, + "step": 623500 + }, + { + "epoch": 8.3, + "learning_rate": 0.0001833901192504259, + "loss": 1.0623, + "step": 624000 + }, + { + "epoch": 8.31, + "learning_rate": 0.00018337681005110734, + "loss": 1.0565, + "step": 624500 + }, + { + "epoch": 8.32, + "learning_rate": 0.00018336350085178878, + "loss": 1.1088, + "step": 625000 + }, + { + "epoch": 8.32, + "learning_rate": 0.0001833501916524702, + "loss": 1.054, + "step": 625500 + }, + { + "epoch": 8.33, + "learning_rate": 0.00018333688245315163, + "loss": 1.0979, + "step": 626000 + }, + { + "epoch": 8.34, + "learning_rate": 0.00018332357325383304, + "loss": 1.0508, + "step": 626500 + }, + { + "epoch": 8.34, + "learning_rate": 0.00018331026405451448, + "loss": 1.0952, + "step": 627000 + }, + { + "epoch": 8.35, + "learning_rate": 0.00018329695485519592, + "loss": 1.1436, + "step": 627500 + }, + { + "epoch": 8.36, + "learning_rate": 0.00018328364565587735, + "loss": 1.1392, + "step": 628000 + }, + { + "epoch": 8.36, + "learning_rate": 0.00018327033645655877, + "loss": 1.1703, + "step": 628500 + }, + { + "epoch": 8.37, + "learning_rate": 0.0001832570272572402, + "loss": 1.0527, + "step": 629000 + }, + { + "epoch": 8.38, + "learning_rate": 0.00018324371805792164, + "loss": 1.0601, + "step": 629500 + }, + { + "epoch": 8.38, + "learning_rate": 0.00018323040885860308, + "loss": 1.1344, + "step": 630000 + }, + { + "epoch": 8.39, + "learning_rate": 0.00018321709965928452, + "loss": 1.0683, + "step": 630500 + }, + { + "epoch": 8.4, + "learning_rate": 0.00018320379045996593, + "loss": 1.0863, + "step": 631000 + }, + { + "epoch": 8.4, + "learning_rate": 0.00018319048126064737, + "loss": 1.0342, + "step": 631500 + }, + { + "epoch": 8.41, + "learning_rate": 0.0001831771720613288, + "loss": 1.0712, + "step": 632000 + }, + { + "epoch": 8.42, + "learning_rate": 0.00018316386286201024, + "loss": 1.1242, + "step": 632500 + }, + { + "epoch": 8.42, + "learning_rate": 0.00018315055366269168, + "loss": 1.1044, + "step": 633000 + }, + { + "epoch": 8.43, + "learning_rate": 0.0001831372444633731, + "loss": 1.0876, + "step": 633500 + }, + { + "epoch": 8.44, + "learning_rate": 0.0001831239352640545, + "loss": 1.1795, + "step": 634000 + }, + { + "epoch": 8.44, + "learning_rate": 0.00018311062606473594, + "loss": 1.0908, + "step": 634500 + }, + { + "epoch": 8.45, + "learning_rate": 0.00018309731686541738, + "loss": 1.1993, + "step": 635000 + }, + { + "epoch": 8.46, + "learning_rate": 0.00018308400766609882, + "loss": 1.243, + "step": 635500 + }, + { + "epoch": 8.46, + "learning_rate": 0.00018307069846678026, + "loss": 1.1208, + "step": 636000 + }, + { + "epoch": 8.47, + "learning_rate": 0.00018305738926746167, + "loss": 1.1218, + "step": 636500 + }, + { + "epoch": 8.48, + "learning_rate": 0.0001830440800681431, + "loss": 1.1829, + "step": 637000 + }, + { + "epoch": 8.48, + "learning_rate": 0.00018303077086882454, + "loss": 1.1467, + "step": 637500 + }, + { + "epoch": 8.49, + "learning_rate": 0.00018301746166950598, + "loss": 1.0848, + "step": 638000 + }, + { + "epoch": 8.5, + "learning_rate": 0.0001830041524701874, + "loss": 1.1364, + "step": 638500 + }, + { + "epoch": 8.5, + "learning_rate": 0.00018299084327086883, + "loss": 1.1178, + "step": 639000 + }, + { + "epoch": 8.51, + "learning_rate": 0.00018297753407155027, + "loss": 1.118, + "step": 639500 + }, + { + "epoch": 8.52, + "learning_rate": 0.0001829642248722317, + "loss": 1.1277, + "step": 640000 + }, + { + "epoch": 8.52, + "learning_rate": 0.00018295091567291315, + "loss": 1.1548, + "step": 640500 + }, + { + "epoch": 8.53, + "learning_rate": 0.00018293760647359456, + "loss": 1.1264, + "step": 641000 + }, + { + "epoch": 8.54, + "learning_rate": 0.00018292429727427597, + "loss": 1.1071, + "step": 641500 + }, + { + "epoch": 8.54, + "learning_rate": 0.0001829109880749574, + "loss": 1.0937, + "step": 642000 + }, + { + "epoch": 8.55, + "learning_rate": 0.00018289767887563884, + "loss": 1.1876, + "step": 642500 + }, + { + "epoch": 8.56, + "learning_rate": 0.00018288436967632028, + "loss": 1.1334, + "step": 643000 + }, + { + "epoch": 8.56, + "learning_rate": 0.00018287106047700172, + "loss": 1.2755, + "step": 643500 + }, + { + "epoch": 8.57, + "learning_rate": 0.00018285775127768313, + "loss": 1.1289, + "step": 644000 + }, + { + "epoch": 8.58, + "learning_rate": 0.00018284444207836457, + "loss": 1.0956, + "step": 644500 + }, + { + "epoch": 8.58, + "learning_rate": 0.000182831132879046, + "loss": 1.1023, + "step": 645000 + }, + { + "epoch": 8.59, + "learning_rate": 0.00018281782367972745, + "loss": 1.0352, + "step": 645500 + }, + { + "epoch": 8.6, + "learning_rate": 0.00018280451448040888, + "loss": 1.0956, + "step": 646000 + }, + { + "epoch": 8.6, + "learning_rate": 0.0001827912052810903, + "loss": 1.0834, + "step": 646500 + }, + { + "epoch": 8.61, + "learning_rate": 0.00018277789608177173, + "loss": 1.148, + "step": 647000 + }, + { + "epoch": 8.62, + "learning_rate": 0.00018276458688245317, + "loss": 1.125, + "step": 647500 + }, + { + "epoch": 8.62, + "learning_rate": 0.00018275127768313458, + "loss": 1.0742, + "step": 648000 + }, + { + "epoch": 8.63, + "learning_rate": 0.00018273796848381602, + "loss": 1.1637, + "step": 648500 + }, + { + "epoch": 8.64, + "learning_rate": 0.00018272465928449746, + "loss": 1.2304, + "step": 649000 + }, + { + "epoch": 8.64, + "learning_rate": 0.00018271135008517887, + "loss": 1.1194, + "step": 649500 + }, + { + "epoch": 8.65, + "learning_rate": 0.0001826980408858603, + "loss": 1.0773, + "step": 650000 + }, + { + "epoch": 8.66, + "learning_rate": 0.00018268473168654175, + "loss": 1.0997, + "step": 650500 + }, + { + "epoch": 8.66, + "learning_rate": 0.00018267142248722318, + "loss": 1.1384, + "step": 651000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0001826581132879046, + "loss": 1.0972, + "step": 651500 + }, + { + "epoch": 8.68, + "learning_rate": 0.00018264480408858603, + "loss": 1.1346, + "step": 652000 + }, + { + "epoch": 8.68, + "learning_rate": 0.00018263149488926747, + "loss": 1.1175, + "step": 652500 + }, + { + "epoch": 8.69, + "learning_rate": 0.0001826181856899489, + "loss": 1.0364, + "step": 653000 + }, + { + "epoch": 8.7, + "learning_rate": 0.00018260487649063035, + "loss": 1.1752, + "step": 653500 + }, + { + "epoch": 8.7, + "learning_rate": 0.00018259156729131176, + "loss": 1.0964, + "step": 654000 + }, + { + "epoch": 8.71, + "learning_rate": 0.0001825782580919932, + "loss": 1.0879, + "step": 654500 + }, + { + "epoch": 8.72, + "learning_rate": 0.00018256494889267464, + "loss": 1.1167, + "step": 655000 + }, + { + "epoch": 8.72, + "learning_rate": 0.00018255163969335605, + "loss": 1.1088, + "step": 655500 + }, + { + "epoch": 8.73, + "learning_rate": 0.00018253833049403748, + "loss": 1.1252, + "step": 656000 + }, + { + "epoch": 8.74, + "learning_rate": 0.00018252502129471892, + "loss": 1.1303, + "step": 656500 + }, + { + "epoch": 8.74, + "learning_rate": 0.00018251171209540033, + "loss": 1.0708, + "step": 657000 + }, + { + "epoch": 8.75, + "learning_rate": 0.00018249840289608177, + "loss": 1.1427, + "step": 657500 + }, + { + "epoch": 8.76, + "learning_rate": 0.0001824850936967632, + "loss": 1.0791, + "step": 658000 + }, + { + "epoch": 8.76, + "learning_rate": 0.00018247178449744465, + "loss": 1.0627, + "step": 658500 + }, + { + "epoch": 8.77, + "learning_rate": 0.0001824584752981261, + "loss": 1.1436, + "step": 659000 + }, + { + "epoch": 8.78, + "learning_rate": 0.0001824451660988075, + "loss": 1.1278, + "step": 659500 + }, + { + "epoch": 8.78, + "learning_rate": 0.00018243185689948894, + "loss": 1.1135, + "step": 660000 + }, + { + "epoch": 8.79, + "learning_rate": 0.00018241854770017037, + "loss": 1.1699, + "step": 660500 + }, + { + "epoch": 8.8, + "learning_rate": 0.0001824052385008518, + "loss": 1.0952, + "step": 661000 + }, + { + "epoch": 8.8, + "learning_rate": 0.00018239192930153325, + "loss": 1.1151, + "step": 661500 + }, + { + "epoch": 8.81, + "learning_rate": 0.00018237862010221466, + "loss": 1.1177, + "step": 662000 + }, + { + "epoch": 8.82, + "learning_rate": 0.00018236531090289607, + "loss": 1.0539, + "step": 662500 + }, + { + "epoch": 8.82, + "learning_rate": 0.0001823520017035775, + "loss": 1.244, + "step": 663000 + }, + { + "epoch": 8.83, + "learning_rate": 0.00018233869250425895, + "loss": 1.1707, + "step": 663500 + }, + { + "epoch": 8.84, + "learning_rate": 0.0001823253833049404, + "loss": 1.1065, + "step": 664000 + }, + { + "epoch": 8.84, + "learning_rate": 0.0001823120741056218, + "loss": 1.1775, + "step": 664500 + }, + { + "epoch": 8.85, + "learning_rate": 0.00018229876490630324, + "loss": 1.1488, + "step": 665000 + }, + { + "epoch": 8.86, + "learning_rate": 0.00018228545570698467, + "loss": 1.1673, + "step": 665500 + }, + { + "epoch": 8.86, + "learning_rate": 0.0001822721465076661, + "loss": 1.1918, + "step": 666000 + }, + { + "epoch": 8.87, + "learning_rate": 0.00018225883730834755, + "loss": 1.1079, + "step": 666500 + }, + { + "epoch": 8.88, + "learning_rate": 0.00018224552810902896, + "loss": 1.1284, + "step": 667000 + }, + { + "epoch": 8.88, + "learning_rate": 0.0001822322189097104, + "loss": 1.2182, + "step": 667500 + }, + { + "epoch": 8.89, + "learning_rate": 0.00018221890971039184, + "loss": 1.1137, + "step": 668000 + }, + { + "epoch": 8.9, + "learning_rate": 0.00018220560051107328, + "loss": 1.216, + "step": 668500 + }, + { + "epoch": 8.9, + "learning_rate": 0.00018219229131175471, + "loss": 1.0904, + "step": 669000 + }, + { + "epoch": 8.91, + "learning_rate": 0.00018217898211243613, + "loss": 1.2131, + "step": 669500 + }, + { + "epoch": 8.92, + "learning_rate": 0.00018216567291311754, + "loss": 1.1682, + "step": 670000 + }, + { + "epoch": 8.92, + "learning_rate": 0.00018215236371379897, + "loss": 1.0603, + "step": 670500 + }, + { + "epoch": 8.93, + "learning_rate": 0.0001821390545144804, + "loss": 1.0959, + "step": 671000 + }, + { + "epoch": 8.94, + "learning_rate": 0.00018212574531516185, + "loss": 1.2124, + "step": 671500 + }, + { + "epoch": 8.94, + "learning_rate": 0.0001821124361158433, + "loss": 1.0974, + "step": 672000 + }, + { + "epoch": 8.95, + "learning_rate": 0.0001820991269165247, + "loss": 1.1375, + "step": 672500 + }, + { + "epoch": 8.96, + "learning_rate": 0.00018208581771720614, + "loss": 1.1494, + "step": 673000 + }, + { + "epoch": 8.96, + "learning_rate": 0.00018207250851788758, + "loss": 1.0756, + "step": 673500 + }, + { + "epoch": 8.97, + "learning_rate": 0.00018205919931856901, + "loss": 1.139, + "step": 674000 + }, + { + "epoch": 8.98, + "learning_rate": 0.00018204589011925045, + "loss": 1.0752, + "step": 674500 + }, + { + "epoch": 8.98, + "learning_rate": 0.00018203258091993186, + "loss": 1.192, + "step": 675000 + }, + { + "epoch": 8.99, + "learning_rate": 0.0001820192717206133, + "loss": 1.1568, + "step": 675500 + }, + { + "epoch": 9.0, + "learning_rate": 0.00018200596252129474, + "loss": 1.2032, + "step": 676000 + }, + { + "epoch": 9.0, + "eval_loss": 1.3050340414047241, + "eval_runtime": 151.9979, + "eval_samples_per_second": 61.797, + "eval_steps_per_second": 61.797, + "step": 676224 + } + ], + "max_steps": 7513600, + "num_train_epochs": 100, + "total_flos": 6240548899325952.0, + "trial_name": null, + "trial_params": null +}