{ "best_metric": 6.020912170410156, "best_model_checkpoint": "./results/models/checkpoint-139568", "epoch": 5.0, "eval_steps": 500, "global_step": 174460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 0.0019994268027054914, "loss": 6.8812, "step": 500 }, { "epoch": 0.03, "learning_rate": 0.0019988536054109823, "loss": 6.7318, "step": 1000 }, { "epoch": 0.04, "learning_rate": 0.0019982804081164736, "loss": 6.7006, "step": 1500 }, { "epoch": 0.06, "learning_rate": 0.001997707210821965, "loss": 6.6761, "step": 2000 }, { "epoch": 0.07, "learning_rate": 0.0019971340135274563, "loss": 6.6478, "step": 2500 }, { "epoch": 0.09, "learning_rate": 0.001996560816232947, "loss": 6.6334, "step": 3000 }, { "epoch": 0.1, "learning_rate": 0.0019959876189384385, "loss": 6.6285, "step": 3500 }, { "epoch": 0.11, "learning_rate": 0.00199541442164393, "loss": 6.6122, "step": 4000 }, { "epoch": 0.13, "learning_rate": 0.0019948412243494212, "loss": 6.6007, "step": 4500 }, { "epoch": 0.14, "learning_rate": 0.001994268027054912, "loss": 6.5815, "step": 5000 }, { "epoch": 0.16, "learning_rate": 0.0019936948297604035, "loss": 6.5741, "step": 5500 }, { "epoch": 0.17, "learning_rate": 0.001993121632465895, "loss": 6.5711, "step": 6000 }, { "epoch": 0.19, "learning_rate": 0.001992548435171386, "loss": 6.5854, "step": 6500 }, { "epoch": 0.2, "learning_rate": 0.001991975237876877, "loss": 6.5759, "step": 7000 }, { "epoch": 0.21, "learning_rate": 0.0019914020405823684, "loss": 6.5474, "step": 7500 }, { "epoch": 0.23, "learning_rate": 0.0019908288432878597, "loss": 6.5431, "step": 8000 }, { "epoch": 0.24, "learning_rate": 0.001990255645993351, "loss": 6.5421, "step": 8500 }, { "epoch": 0.26, "learning_rate": 0.001989682448698842, "loss": 6.532, "step": 9000 }, { "epoch": 0.27, "learning_rate": 0.0019891092514043333, "loss": 6.5349, "step": 9500 }, { "epoch": 0.29, "learning_rate": 0.0019885360541098246, "loss": 6.5146, "step": 10000 }, { "epoch": 0.3, "learning_rate": 0.001987962856815316, "loss": 6.5103, "step": 10500 }, { "epoch": 0.32, "learning_rate": 0.0019873896595208073, "loss": 6.5063, "step": 11000 }, { "epoch": 0.33, "learning_rate": 0.0019868164622262982, "loss": 6.494, "step": 11500 }, { "epoch": 0.34, "learning_rate": 0.0019862432649317896, "loss": 6.5092, "step": 12000 }, { "epoch": 0.36, "learning_rate": 0.001985670067637281, "loss": 6.4969, "step": 12500 }, { "epoch": 0.37, "learning_rate": 0.0019850968703427722, "loss": 6.4886, "step": 13000 }, { "epoch": 0.39, "learning_rate": 0.001984523673048263, "loss": 6.4776, "step": 13500 }, { "epoch": 0.4, "learning_rate": 0.0019839504757537545, "loss": 6.4729, "step": 14000 }, { "epoch": 0.42, "learning_rate": 0.001983377278459246, "loss": 6.4706, "step": 14500 }, { "epoch": 0.43, "learning_rate": 0.001982804081164737, "loss": 6.4803, "step": 15000 }, { "epoch": 0.44, "learning_rate": 0.001982230883870228, "loss": 6.4827, "step": 15500 }, { "epoch": 0.46, "learning_rate": 0.0019816576865757194, "loss": 6.4716, "step": 16000 }, { "epoch": 0.47, "learning_rate": 0.0019810844892812107, "loss": 6.4559, "step": 16500 }, { "epoch": 0.49, "learning_rate": 0.001980511291986702, "loss": 6.4458, "step": 17000 }, { "epoch": 0.5, "learning_rate": 0.001979938094692193, "loss": 6.4338, "step": 17500 }, { "epoch": 0.52, "learning_rate": 0.0019793648973976843, "loss": 6.4282, "step": 18000 }, { "epoch": 0.53, "learning_rate": 0.0019787917001031757, "loss": 6.438, "step": 18500 }, { "epoch": 0.54, "learning_rate": 0.0019782185028086666, "loss": 6.4408, "step": 19000 }, { "epoch": 0.56, "learning_rate": 0.001977645305514158, "loss": 6.4282, "step": 19500 }, { "epoch": 0.57, "learning_rate": 0.0019770721082196492, "loss": 6.43, "step": 20000 }, { "epoch": 0.59, "learning_rate": 0.0019764989109251406, "loss": 6.4287, "step": 20500 }, { "epoch": 0.6, "learning_rate": 0.0019759257136306315, "loss": 6.4267, "step": 21000 }, { "epoch": 0.62, "learning_rate": 0.001975352516336123, "loss": 6.4148, "step": 21500 }, { "epoch": 0.63, "learning_rate": 0.001974779319041614, "loss": 6.4, "step": 22000 }, { "epoch": 0.64, "learning_rate": 0.0019742061217471055, "loss": 6.3894, "step": 22500 }, { "epoch": 0.66, "learning_rate": 0.0019736329244525964, "loss": 6.395, "step": 23000 }, { "epoch": 0.67, "learning_rate": 0.0019730597271580877, "loss": 6.3977, "step": 23500 }, { "epoch": 0.69, "learning_rate": 0.001972486529863579, "loss": 6.4017, "step": 24000 }, { "epoch": 0.7, "learning_rate": 0.0019719133325690704, "loss": 6.3995, "step": 24500 }, { "epoch": 0.72, "learning_rate": 0.0019713401352745613, "loss": 6.3949, "step": 25000 }, { "epoch": 0.73, "learning_rate": 0.0019707669379800527, "loss": 6.3947, "step": 25500 }, { "epoch": 0.75, "learning_rate": 0.001970193740685544, "loss": 6.4071, "step": 26000 }, { "epoch": 0.76, "learning_rate": 0.0019696205433910353, "loss": 6.3988, "step": 26500 }, { "epoch": 0.77, "learning_rate": 0.0019690473460965262, "loss": 6.3915, "step": 27000 }, { "epoch": 0.79, "learning_rate": 0.0019684741488020176, "loss": 6.3869, "step": 27500 }, { "epoch": 0.8, "learning_rate": 0.001967900951507509, "loss": 6.3724, "step": 28000 }, { "epoch": 0.82, "learning_rate": 0.0019673277542130003, "loss": 6.3722, "step": 28500 }, { "epoch": 0.83, "learning_rate": 0.0019667545569184916, "loss": 6.3742, "step": 29000 }, { "epoch": 0.85, "learning_rate": 0.0019661813596239825, "loss": 6.3672, "step": 29500 }, { "epoch": 0.86, "learning_rate": 0.001965608162329474, "loss": 6.3707, "step": 30000 }, { "epoch": 0.87, "learning_rate": 0.001965034965034965, "loss": 6.3687, "step": 30500 }, { "epoch": 0.89, "learning_rate": 0.0019644617677404565, "loss": 6.3644, "step": 31000 }, { "epoch": 0.9, "learning_rate": 0.0019638885704459474, "loss": 6.3596, "step": 31500 }, { "epoch": 0.92, "learning_rate": 0.0019633153731514388, "loss": 6.3613, "step": 32000 }, { "epoch": 0.93, "learning_rate": 0.00196274217585693, "loss": 6.3542, "step": 32500 }, { "epoch": 0.95, "learning_rate": 0.0019621689785624214, "loss": 6.3539, "step": 33000 }, { "epoch": 0.96, "learning_rate": 0.0019615957812679123, "loss": 6.3551, "step": 33500 }, { "epoch": 0.97, "learning_rate": 0.0019610225839734037, "loss": 6.3491, "step": 34000 }, { "epoch": 0.99, "learning_rate": 0.001960449386678895, "loss": 6.3431, "step": 34500 }, { "epoch": 1.0, "eval_loss": 6.34874963760376, "eval_runtime": 28.5, "eval_samples_per_second": 39.228, "eval_steps_per_second": 1.228, "step": 34892 }, { "epoch": 1.0, "learning_rate": 0.0019598761893843864, "loss": 6.3416, "step": 35000 }, { "epoch": 1.02, "learning_rate": 0.0019593029920898773, "loss": 6.343, "step": 35500 }, { "epoch": 1.03, "learning_rate": 0.0019587297947953686, "loss": 6.3372, "step": 36000 }, { "epoch": 1.05, "learning_rate": 0.0019581565975008595, "loss": 6.3392, "step": 36500 }, { "epoch": 1.06, "learning_rate": 0.001957583400206351, "loss": 6.3235, "step": 37000 }, { "epoch": 1.07, "learning_rate": 0.001957010202911842, "loss": 6.3273, "step": 37500 }, { "epoch": 1.09, "learning_rate": 0.0019564370056173335, "loss": 6.3301, "step": 38000 }, { "epoch": 1.1, "learning_rate": 0.001955863808322825, "loss": 6.3422, "step": 38500 }, { "epoch": 1.12, "learning_rate": 0.0019552906110283158, "loss": 6.3314, "step": 39000 }, { "epoch": 1.13, "learning_rate": 0.001954717413733807, "loss": 6.3208, "step": 39500 }, { "epoch": 1.15, "learning_rate": 0.0019541442164392984, "loss": 6.3132, "step": 40000 }, { "epoch": 1.16, "learning_rate": 0.00195357101914479, "loss": 6.3034, "step": 40500 }, { "epoch": 1.18, "learning_rate": 0.001952997821850281, "loss": 6.2984, "step": 41000 }, { "epoch": 1.19, "learning_rate": 0.0019524246245557722, "loss": 6.3067, "step": 41500 }, { "epoch": 1.2, "learning_rate": 0.0019518514272612634, "loss": 6.2983, "step": 42000 }, { "epoch": 1.22, "learning_rate": 0.0019512782299667547, "loss": 6.2971, "step": 42500 }, { "epoch": 1.23, "learning_rate": 0.0019507050326722458, "loss": 6.2949, "step": 43000 }, { "epoch": 1.25, "learning_rate": 0.0019501318353777372, "loss": 6.2861, "step": 43500 }, { "epoch": 1.26, "learning_rate": 0.0019495586380832283, "loss": 6.2903, "step": 44000 }, { "epoch": 1.28, "learning_rate": 0.0019489854407887196, "loss": 6.2845, "step": 44500 }, { "epoch": 1.29, "learning_rate": 0.0019484122434942107, "loss": 6.2811, "step": 45000 }, { "epoch": 1.3, "learning_rate": 0.001947839046199702, "loss": 6.2832, "step": 45500 }, { "epoch": 1.32, "learning_rate": 0.0019472658489051932, "loss": 6.2748, "step": 46000 }, { "epoch": 1.33, "learning_rate": 0.0019466926516106843, "loss": 6.2664, "step": 46500 }, { "epoch": 1.35, "learning_rate": 0.0019461194543161757, "loss": 6.2707, "step": 47000 }, { "epoch": 1.36, "learning_rate": 0.0019455462570216668, "loss": 6.2626, "step": 47500 }, { "epoch": 1.38, "learning_rate": 0.0019449730597271581, "loss": 6.2621, "step": 48000 }, { "epoch": 1.39, "learning_rate": 0.0019443998624326493, "loss": 6.2637, "step": 48500 }, { "epoch": 1.4, "learning_rate": 0.0019438266651381406, "loss": 6.2644, "step": 49000 }, { "epoch": 1.42, "learning_rate": 0.0019432534678436317, "loss": 6.2595, "step": 49500 }, { "epoch": 1.43, "learning_rate": 0.001942680270549123, "loss": 6.2502, "step": 50000 }, { "epoch": 1.45, "learning_rate": 0.0019421070732546142, "loss": 6.2373, "step": 50500 }, { "epoch": 1.46, "learning_rate": 0.0019415338759601055, "loss": 6.2397, "step": 51000 }, { "epoch": 1.48, "learning_rate": 0.0019409606786655966, "loss": 6.2358, "step": 51500 }, { "epoch": 1.49, "learning_rate": 0.001940387481371088, "loss": 6.2362, "step": 52000 }, { "epoch": 1.5, "learning_rate": 0.001939814284076579, "loss": 6.2348, "step": 52500 }, { "epoch": 1.52, "learning_rate": 0.0019392410867820704, "loss": 6.2401, "step": 53000 }, { "epoch": 1.53, "learning_rate": 0.0019386678894875616, "loss": 6.2407, "step": 53500 }, { "epoch": 1.55, "learning_rate": 0.0019380946921930529, "loss": 6.2371, "step": 54000 }, { "epoch": 1.56, "learning_rate": 0.001937521494898544, "loss": 6.2278, "step": 54500 }, { "epoch": 1.58, "learning_rate": 0.0019369482976040354, "loss": 6.222, "step": 55000 }, { "epoch": 1.59, "learning_rate": 0.0019363751003095267, "loss": 6.219, "step": 55500 }, { "epoch": 1.6, "learning_rate": 0.0019358019030150178, "loss": 6.2236, "step": 56000 }, { "epoch": 1.62, "learning_rate": 0.0019352287057205091, "loss": 6.2238, "step": 56500 }, { "epoch": 1.63, "learning_rate": 0.0019346555084260003, "loss": 6.2226, "step": 57000 }, { "epoch": 1.65, "learning_rate": 0.0019340823111314916, "loss": 6.2154, "step": 57500 }, { "epoch": 1.66, "learning_rate": 0.0019335091138369827, "loss": 6.2112, "step": 58000 }, { "epoch": 1.68, "learning_rate": 0.001932935916542474, "loss": 6.2077, "step": 58500 }, { "epoch": 1.69, "learning_rate": 0.0019323627192479652, "loss": 6.211, "step": 59000 }, { "epoch": 1.71, "learning_rate": 0.0019317895219534565, "loss": 6.2161, "step": 59500 }, { "epoch": 1.72, "learning_rate": 0.0019312163246589477, "loss": 6.2132, "step": 60000 }, { "epoch": 1.73, "learning_rate": 0.001930643127364439, "loss": 6.2179, "step": 60500 }, { "epoch": 1.75, "learning_rate": 0.0019300699300699301, "loss": 6.222, "step": 61000 }, { "epoch": 1.76, "learning_rate": 0.0019294967327754215, "loss": 6.2136, "step": 61500 }, { "epoch": 1.78, "learning_rate": 0.0019289235354809126, "loss": 6.2057, "step": 62000 }, { "epoch": 1.79, "learning_rate": 0.001928350338186404, "loss": 6.2021, "step": 62500 }, { "epoch": 1.81, "learning_rate": 0.001927777140891895, "loss": 6.1955, "step": 63000 }, { "epoch": 1.82, "learning_rate": 0.0019272039435973864, "loss": 6.2036, "step": 63500 }, { "epoch": 1.83, "learning_rate": 0.0019266307463028773, "loss": 6.196, "step": 64000 }, { "epoch": 1.85, "learning_rate": 0.0019260575490083686, "loss": 6.199, "step": 64500 }, { "epoch": 1.86, "learning_rate": 0.00192548435171386, "loss": 6.1965, "step": 65000 }, { "epoch": 1.88, "learning_rate": 0.001924911154419351, "loss": 6.199, "step": 65500 }, { "epoch": 1.89, "learning_rate": 0.0019243379571248424, "loss": 6.1923, "step": 66000 }, { "epoch": 1.91, "learning_rate": 0.0019237647598303335, "loss": 6.1953, "step": 66500 }, { "epoch": 1.92, "learning_rate": 0.0019231915625358249, "loss": 6.1907, "step": 67000 }, { "epoch": 1.93, "learning_rate": 0.001922618365241316, "loss": 6.195, "step": 67500 }, { "epoch": 1.95, "learning_rate": 0.0019220451679468073, "loss": 6.1954, "step": 68000 }, { "epoch": 1.96, "learning_rate": 0.0019214719706522985, "loss": 6.1866, "step": 68500 }, { "epoch": 1.98, "learning_rate": 0.0019208987733577898, "loss": 6.1891, "step": 69000 }, { "epoch": 1.99, "learning_rate": 0.001920325576063281, "loss": 6.1892, "step": 69500 }, { "epoch": 2.0, "eval_loss": 6.193855285644531, "eval_runtime": 29.902, "eval_samples_per_second": 37.389, "eval_steps_per_second": 1.17, "step": 69784 }, { "epoch": 2.01, "learning_rate": 0.0019197523787687723, "loss": 6.1899, "step": 70000 }, { "epoch": 2.02, "learning_rate": 0.0019191791814742634, "loss": 6.1933, "step": 70500 }, { "epoch": 2.03, "learning_rate": 0.0019186059841797547, "loss": 6.1924, "step": 71000 }, { "epoch": 2.05, "learning_rate": 0.0019180327868852458, "loss": 6.1888, "step": 71500 }, { "epoch": 2.06, "learning_rate": 0.0019174595895907372, "loss": 6.1943, "step": 72000 }, { "epoch": 2.08, "learning_rate": 0.0019168863922962283, "loss": 6.1907, "step": 72500 }, { "epoch": 2.09, "learning_rate": 0.0019163131950017196, "loss": 6.1904, "step": 73000 }, { "epoch": 2.11, "learning_rate": 0.001915739997707211, "loss": 6.1815, "step": 73500 }, { "epoch": 2.12, "learning_rate": 0.001915166800412702, "loss": 6.1816, "step": 74000 }, { "epoch": 2.14, "learning_rate": 0.0019145936031181934, "loss": 6.1742, "step": 74500 }, { "epoch": 2.15, "learning_rate": 0.0019140204058236846, "loss": 6.1747, "step": 75000 }, { "epoch": 2.16, "learning_rate": 0.001913447208529176, "loss": 6.1707, "step": 75500 }, { "epoch": 2.18, "learning_rate": 0.001912874011234667, "loss": 6.1718, "step": 76000 }, { "epoch": 2.19, "learning_rate": 0.0019123008139401584, "loss": 6.1677, "step": 76500 }, { "epoch": 2.21, "learning_rate": 0.0019117276166456495, "loss": 6.1584, "step": 77000 }, { "epoch": 2.22, "learning_rate": 0.0019111544193511408, "loss": 6.1722, "step": 77500 }, { "epoch": 2.24, "learning_rate": 0.001910581222056632, "loss": 6.1628, "step": 78000 }, { "epoch": 2.25, "learning_rate": 0.0019100080247621233, "loss": 6.156, "step": 78500 }, { "epoch": 2.26, "learning_rate": 0.0019094348274676144, "loss": 6.1512, "step": 79000 }, { "epoch": 2.28, "learning_rate": 0.0019088616301731057, "loss": 6.1462, "step": 79500 }, { "epoch": 2.29, "learning_rate": 0.0019082884328785969, "loss": 6.1388, "step": 80000 }, { "epoch": 2.31, "learning_rate": 0.0019077152355840882, "loss": 6.1404, "step": 80500 }, { "epoch": 2.32, "learning_rate": 0.0019071420382895793, "loss": 6.1397, "step": 81000 }, { "epoch": 2.34, "learning_rate": 0.0019065688409950707, "loss": 6.1493, "step": 81500 }, { "epoch": 2.35, "learning_rate": 0.0019059956437005616, "loss": 6.1431, "step": 82000 }, { "epoch": 2.36, "learning_rate": 0.001905422446406053, "loss": 6.1521, "step": 82500 }, { "epoch": 2.38, "learning_rate": 0.0019048492491115442, "loss": 6.1409, "step": 83000 }, { "epoch": 2.39, "learning_rate": 0.0019042760518170354, "loss": 6.1408, "step": 83500 }, { "epoch": 2.41, "learning_rate": 0.0019037028545225267, "loss": 6.1416, "step": 84000 }, { "epoch": 2.42, "learning_rate": 0.0019031296572280178, "loss": 6.1371, "step": 84500 }, { "epoch": 2.44, "learning_rate": 0.0019025564599335092, "loss": 6.1367, "step": 85000 }, { "epoch": 2.45, "learning_rate": 0.0019019832626390003, "loss": 6.1372, "step": 85500 }, { "epoch": 2.46, "learning_rate": 0.0019014100653444916, "loss": 6.1358, "step": 86000 }, { "epoch": 2.48, "learning_rate": 0.0019008368680499827, "loss": 6.1283, "step": 86500 }, { "epoch": 2.49, "learning_rate": 0.001900263670755474, "loss": 6.1347, "step": 87000 }, { "epoch": 2.51, "learning_rate": 0.0018996904734609652, "loss": 6.135, "step": 87500 }, { "epoch": 2.52, "learning_rate": 0.0018991172761664565, "loss": 6.1352, "step": 88000 }, { "epoch": 2.54, "learning_rate": 0.0018985440788719477, "loss": 6.1378, "step": 88500 }, { "epoch": 2.55, "learning_rate": 0.001897970881577439, "loss": 6.1335, "step": 89000 }, { "epoch": 2.57, "learning_rate": 0.0018973976842829301, "loss": 6.1267, "step": 89500 }, { "epoch": 2.58, "learning_rate": 0.0018968244869884215, "loss": 6.1266, "step": 90000 }, { "epoch": 2.59, "learning_rate": 0.0018962512896939126, "loss": 6.1195, "step": 90500 }, { "epoch": 2.61, "learning_rate": 0.001895678092399404, "loss": 6.1167, "step": 91000 }, { "epoch": 2.62, "learning_rate": 0.0018951048951048953, "loss": 6.1108, "step": 91500 }, { "epoch": 2.64, "learning_rate": 0.0018945316978103864, "loss": 6.1093, "step": 92000 }, { "epoch": 2.65, "learning_rate": 0.0018939585005158777, "loss": 6.1124, "step": 92500 }, { "epoch": 2.67, "learning_rate": 0.0018933853032213688, "loss": 6.1069, "step": 93000 }, { "epoch": 2.68, "learning_rate": 0.0018928121059268602, "loss": 6.1051, "step": 93500 }, { "epoch": 2.69, "learning_rate": 0.0018922389086323513, "loss": 6.1192, "step": 94000 }, { "epoch": 2.71, "learning_rate": 0.0018916657113378426, "loss": 6.1231, "step": 94500 }, { "epoch": 2.72, "learning_rate": 0.0018910925140433338, "loss": 6.1156, "step": 95000 }, { "epoch": 2.74, "learning_rate": 0.001890519316748825, "loss": 6.1098, "step": 95500 }, { "epoch": 2.75, "learning_rate": 0.0018899461194543162, "loss": 6.0978, "step": 96000 }, { "epoch": 2.77, "learning_rate": 0.0018893729221598076, "loss": 6.0956, "step": 96500 }, { "epoch": 2.78, "learning_rate": 0.0018887997248652987, "loss": 6.0932, "step": 97000 }, { "epoch": 2.79, "learning_rate": 0.00188822652757079, "loss": 6.0938, "step": 97500 }, { "epoch": 2.81, "learning_rate": 0.0018876533302762811, "loss": 6.0921, "step": 98000 }, { "epoch": 2.82, "learning_rate": 0.0018870801329817725, "loss": 6.0834, "step": 98500 }, { "epoch": 2.84, "learning_rate": 0.0018865069356872636, "loss": 6.0833, "step": 99000 }, { "epoch": 2.85, "learning_rate": 0.001885933738392755, "loss": 6.0827, "step": 99500 }, { "epoch": 2.87, "learning_rate": 0.001885360541098246, "loss": 6.0858, "step": 100000 }, { "epoch": 2.88, "learning_rate": 0.0018847873438037372, "loss": 6.0814, "step": 100500 }, { "epoch": 2.89, "learning_rate": 0.0018842141465092285, "loss": 6.0793, "step": 101000 }, { "epoch": 2.91, "learning_rate": 0.0018836409492147196, "loss": 6.0743, "step": 101500 }, { "epoch": 2.92, "learning_rate": 0.001883067751920211, "loss": 6.0821, "step": 102000 }, { "epoch": 2.94, "learning_rate": 0.001882494554625702, "loss": 6.0827, "step": 102500 }, { "epoch": 2.95, "learning_rate": 0.0018819213573311934, "loss": 6.0723, "step": 103000 }, { "epoch": 2.97, "learning_rate": 0.0018813481600366846, "loss": 6.0692, "step": 103500 }, { "epoch": 2.98, "learning_rate": 0.001880774962742176, "loss": 6.065, "step": 104000 }, { "epoch": 2.99, "learning_rate": 0.001880201765447667, "loss": 6.0677, "step": 104500 }, { "epoch": 3.0, "eval_loss": 6.062596321105957, "eval_runtime": 26.9494, "eval_samples_per_second": 41.485, "eval_steps_per_second": 1.299, "step": 104676 }, { "epoch": 3.01, "learning_rate": 0.0018796285681531584, "loss": 6.0603, "step": 105000 }, { "epoch": 3.02, "learning_rate": 0.0018790553708586495, "loss": 6.0606, "step": 105500 }, { "epoch": 3.04, "learning_rate": 0.0018784821735641408, "loss": 6.0603, "step": 106000 }, { "epoch": 3.05, "learning_rate": 0.001877908976269632, "loss": 6.066, "step": 106500 }, { "epoch": 3.07, "learning_rate": 0.0018773357789751233, "loss": 6.0745, "step": 107000 }, { "epoch": 3.08, "learning_rate": 0.0018767625816806144, "loss": 6.0723, "step": 107500 }, { "epoch": 3.1, "learning_rate": 0.0018761893843861057, "loss": 6.0678, "step": 108000 }, { "epoch": 3.11, "learning_rate": 0.0018756161870915969, "loss": 6.0639, "step": 108500 }, { "epoch": 3.12, "learning_rate": 0.0018750429897970882, "loss": 6.0588, "step": 109000 }, { "epoch": 3.14, "learning_rate": 0.0018744697925025793, "loss": 6.0602, "step": 109500 }, { "epoch": 3.15, "learning_rate": 0.0018738965952080707, "loss": 6.054, "step": 110000 }, { "epoch": 3.17, "learning_rate": 0.001873323397913562, "loss": 6.0581, "step": 110500 }, { "epoch": 3.18, "learning_rate": 0.0018727502006190531, "loss": 6.0562, "step": 111000 }, { "epoch": 3.2, "learning_rate": 0.0018721770033245445, "loss": 6.0482, "step": 111500 }, { "epoch": 3.21, "learning_rate": 0.0018716038060300356, "loss": 6.0504, "step": 112000 }, { "epoch": 3.22, "learning_rate": 0.001871030608735527, "loss": 6.0509, "step": 112500 }, { "epoch": 3.24, "learning_rate": 0.001870457411441018, "loss": 6.0545, "step": 113000 }, { "epoch": 3.25, "learning_rate": 0.0018698842141465094, "loss": 6.0529, "step": 113500 }, { "epoch": 3.27, "learning_rate": 0.0018693110168520005, "loss": 6.0509, "step": 114000 }, { "epoch": 3.28, "learning_rate": 0.0018687378195574918, "loss": 6.0604, "step": 114500 }, { "epoch": 3.3, "learning_rate": 0.001868164622262983, "loss": 6.0547, "step": 115000 }, { "epoch": 3.31, "learning_rate": 0.0018675914249684743, "loss": 6.0622, "step": 115500 }, { "epoch": 3.32, "learning_rate": 0.0018670182276739654, "loss": 6.0639, "step": 116000 }, { "epoch": 3.34, "learning_rate": 0.0018664450303794568, "loss": 6.0567, "step": 116500 }, { "epoch": 3.35, "learning_rate": 0.0018658718330849479, "loss": 6.0493, "step": 117000 }, { "epoch": 3.37, "learning_rate": 0.0018652986357904392, "loss": 6.0514, "step": 117500 }, { "epoch": 3.38, "learning_rate": 0.0018647254384959303, "loss": 6.0456, "step": 118000 }, { "epoch": 3.4, "learning_rate": 0.0018641522412014217, "loss": 6.0514, "step": 118500 }, { "epoch": 3.41, "learning_rate": 0.0018635790439069128, "loss": 6.0442, "step": 119000 }, { "epoch": 3.42, "learning_rate": 0.001863005846612404, "loss": 6.0402, "step": 119500 }, { "epoch": 3.44, "learning_rate": 0.0018624326493178953, "loss": 6.0444, "step": 120000 }, { "epoch": 3.45, "learning_rate": 0.0018618594520233864, "loss": 6.0423, "step": 120500 }, { "epoch": 3.47, "learning_rate": 0.0018612862547288777, "loss": 6.0527, "step": 121000 }, { "epoch": 3.48, "learning_rate": 0.0018607130574343688, "loss": 6.0419, "step": 121500 }, { "epoch": 3.5, "learning_rate": 0.0018601398601398602, "loss": 6.0386, "step": 122000 }, { "epoch": 3.51, "learning_rate": 0.0018595666628453513, "loss": 6.0389, "step": 122500 }, { "epoch": 3.53, "learning_rate": 0.0018589934655508426, "loss": 6.0372, "step": 123000 }, { "epoch": 3.54, "learning_rate": 0.0018584202682563338, "loss": 6.03, "step": 123500 }, { "epoch": 3.55, "learning_rate": 0.001857847070961825, "loss": 6.031, "step": 124000 }, { "epoch": 3.57, "learning_rate": 0.0018572738736673162, "loss": 6.038, "step": 124500 }, { "epoch": 3.58, "learning_rate": 0.0018567006763728076, "loss": 6.0285, "step": 125000 }, { "epoch": 3.6, "learning_rate": 0.0018561274790782987, "loss": 6.0309, "step": 125500 }, { "epoch": 3.61, "learning_rate": 0.00185555428178379, "loss": 6.0351, "step": 126000 }, { "epoch": 3.63, "learning_rate": 0.0018549810844892811, "loss": 6.0312, "step": 126500 }, { "epoch": 3.64, "learning_rate": 0.0018544078871947725, "loss": 6.0369, "step": 127000 }, { "epoch": 3.65, "learning_rate": 0.0018538346899002636, "loss": 6.0317, "step": 127500 }, { "epoch": 3.67, "learning_rate": 0.001853261492605755, "loss": 6.036, "step": 128000 }, { "epoch": 3.68, "learning_rate": 0.0018526882953112463, "loss": 6.038, "step": 128500 }, { "epoch": 3.7, "learning_rate": 0.0018521150980167374, "loss": 6.034, "step": 129000 }, { "epoch": 3.71, "learning_rate": 0.0018515419007222287, "loss": 6.0312, "step": 129500 }, { "epoch": 3.73, "learning_rate": 0.0018509687034277199, "loss": 6.0305, "step": 130000 }, { "epoch": 3.74, "learning_rate": 0.0018503955061332112, "loss": 6.0283, "step": 130500 }, { "epoch": 3.75, "learning_rate": 0.0018498223088387023, "loss": 6.0298, "step": 131000 }, { "epoch": 3.77, "learning_rate": 0.0018492491115441937, "loss": 6.0355, "step": 131500 }, { "epoch": 3.78, "learning_rate": 0.0018486759142496848, "loss": 6.0299, "step": 132000 }, { "epoch": 3.8, "learning_rate": 0.0018481027169551761, "loss": 6.0283, "step": 132500 }, { "epoch": 3.81, "learning_rate": 0.0018475295196606672, "loss": 6.0302, "step": 133000 }, { "epoch": 3.83, "learning_rate": 0.0018469563223661586, "loss": 6.022, "step": 133500 }, { "epoch": 3.84, "learning_rate": 0.0018463831250716497, "loss": 6.023, "step": 134000 }, { "epoch": 3.85, "learning_rate": 0.001845809927777141, "loss": 6.0229, "step": 134500 }, { "epoch": 3.87, "learning_rate": 0.0018452367304826322, "loss": 6.0253, "step": 135000 }, { "epoch": 3.88, "learning_rate": 0.0018446635331881235, "loss": 6.0343, "step": 135500 }, { "epoch": 3.9, "learning_rate": 0.0018440903358936146, "loss": 6.0315, "step": 136000 }, { "epoch": 3.91, "learning_rate": 0.001843517138599106, "loss": 6.031, "step": 136500 }, { "epoch": 3.93, "learning_rate": 0.0018429439413045969, "loss": 6.0269, "step": 137000 }, { "epoch": 3.94, "learning_rate": 0.0018423707440100882, "loss": 6.0263, "step": 137500 }, { "epoch": 3.96, "learning_rate": 0.0018417975467155795, "loss": 6.0277, "step": 138000 }, { "epoch": 3.97, "learning_rate": 0.0018412243494210707, "loss": 6.0346, "step": 138500 }, { "epoch": 3.98, "learning_rate": 0.001840651152126562, "loss": 6.0296, "step": 139000 }, { "epoch": 4.0, "learning_rate": 0.0018400779548320531, "loss": 6.0229, "step": 139500 }, { "epoch": 4.0, "eval_loss": 6.020912170410156, "eval_runtime": 27.2309, "eval_samples_per_second": 41.056, "eval_steps_per_second": 1.285, "step": 139568 }, { "epoch": 4.01, "learning_rate": 0.0018395047575375445, "loss": 6.0199, "step": 140000 }, { "epoch": 4.03, "learning_rate": 0.0018389315602430356, "loss": 6.0325, "step": 140500 }, { "epoch": 4.04, "learning_rate": 0.001838358362948527, "loss": 6.0448, "step": 141000 }, { "epoch": 4.06, "learning_rate": 0.001837785165654018, "loss": 6.0367, "step": 141500 }, { "epoch": 4.07, "learning_rate": 0.0018372119683595094, "loss": 6.0357, "step": 142000 }, { "epoch": 4.08, "learning_rate": 0.0018366387710650005, "loss": 6.0411, "step": 142500 }, { "epoch": 4.1, "learning_rate": 0.0018360655737704918, "loss": 6.0365, "step": 143000 }, { "epoch": 4.11, "learning_rate": 0.001835492376475983, "loss": 6.0269, "step": 143500 }, { "epoch": 4.13, "learning_rate": 0.0018349191791814743, "loss": 6.0342, "step": 144000 }, { "epoch": 4.14, "learning_rate": 0.0018343459818869654, "loss": 6.0291, "step": 144500 }, { "epoch": 4.16, "learning_rate": 0.0018337727845924568, "loss": 6.0281, "step": 145000 }, { "epoch": 4.17, "learning_rate": 0.0018331995872979479, "loss": 6.027, "step": 145500 }, { "epoch": 4.18, "learning_rate": 0.0018326263900034392, "loss": 6.0249, "step": 146000 }, { "epoch": 4.2, "learning_rate": 0.0018320531927089306, "loss": 6.0276, "step": 146500 }, { "epoch": 4.21, "learning_rate": 0.0018314799954144217, "loss": 6.0251, "step": 147000 }, { "epoch": 4.23, "learning_rate": 0.001830906798119913, "loss": 6.021, "step": 147500 }, { "epoch": 4.24, "learning_rate": 0.0018303336008254041, "loss": 6.0251, "step": 148000 }, { "epoch": 4.26, "learning_rate": 0.0018297604035308955, "loss": 6.0278, "step": 148500 }, { "epoch": 4.27, "learning_rate": 0.0018291872062363866, "loss": 6.0329, "step": 149000 }, { "epoch": 4.28, "learning_rate": 0.001828614008941878, "loss": 6.0398, "step": 149500 }, { "epoch": 4.3, "learning_rate": 0.001828040811647369, "loss": 6.0468, "step": 150000 }, { "epoch": 4.31, "learning_rate": 0.0018274676143528604, "loss": 6.0485, "step": 150500 }, { "epoch": 4.33, "learning_rate": 0.0018268944170583515, "loss": 6.0523, "step": 151000 }, { "epoch": 4.34, "learning_rate": 0.0018263212197638429, "loss": 6.0604, "step": 151500 }, { "epoch": 4.36, "learning_rate": 0.001825748022469334, "loss": 6.0486, "step": 152000 }, { "epoch": 4.37, "learning_rate": 0.0018251748251748253, "loss": 6.0446, "step": 152500 }, { "epoch": 4.38, "learning_rate": 0.0018246016278803164, "loss": 6.0368, "step": 153000 }, { "epoch": 4.4, "learning_rate": 0.0018240284305858078, "loss": 6.0323, "step": 153500 }, { "epoch": 4.41, "learning_rate": 0.001823455233291299, "loss": 6.0308, "step": 154000 }, { "epoch": 4.43, "learning_rate": 0.0018228820359967902, "loss": 6.0291, "step": 154500 }, { "epoch": 4.44, "learning_rate": 0.0018223088387022812, "loss": 6.0372, "step": 155000 }, { "epoch": 4.46, "learning_rate": 0.0018217356414077725, "loss": 6.0355, "step": 155500 }, { "epoch": 4.47, "learning_rate": 0.0018211624441132638, "loss": 6.0415, "step": 156000 }, { "epoch": 4.49, "learning_rate": 0.001820589246818755, "loss": 6.0396, "step": 156500 }, { "epoch": 4.5, "learning_rate": 0.0018200160495242463, "loss": 6.034, "step": 157000 }, { "epoch": 4.51, "learning_rate": 0.0018194428522297374, "loss": 6.0322, "step": 157500 }, { "epoch": 4.53, "learning_rate": 0.0018188696549352287, "loss": 6.0336, "step": 158000 }, { "epoch": 4.54, "learning_rate": 0.0018182964576407199, "loss": 6.0386, "step": 158500 }, { "epoch": 4.56, "learning_rate": 0.0018177232603462112, "loss": 6.0294, "step": 159000 }, { "epoch": 4.57, "learning_rate": 0.0018171500630517023, "loss": 6.0288, "step": 159500 }, { "epoch": 4.59, "learning_rate": 0.0018165768657571937, "loss": 6.0205, "step": 160000 }, { "epoch": 4.6, "learning_rate": 0.0018160036684626848, "loss": 6.0317, "step": 160500 }, { "epoch": 4.61, "learning_rate": 0.0018154304711681761, "loss": 6.0411, "step": 161000 }, { "epoch": 4.63, "learning_rate": 0.0018148572738736672, "loss": 6.0329, "step": 161500 }, { "epoch": 4.64, "learning_rate": 0.0018142840765791586, "loss": 6.0292, "step": 162000 }, { "epoch": 4.66, "learning_rate": 0.0018137108792846497, "loss": 6.0329, "step": 162500 }, { "epoch": 4.67, "learning_rate": 0.001813137681990141, "loss": 6.0332, "step": 163000 }, { "epoch": 4.69, "learning_rate": 0.0018125644846956322, "loss": 6.0351, "step": 163500 }, { "epoch": 4.7, "learning_rate": 0.0018119912874011235, "loss": 6.0342, "step": 164000 }, { "epoch": 4.71, "learning_rate": 0.0018114180901066146, "loss": 6.0428, "step": 164500 }, { "epoch": 4.73, "learning_rate": 0.001810844892812106, "loss": 6.039, "step": 165000 }, { "epoch": 4.74, "learning_rate": 0.0018102716955175973, "loss": 6.0341, "step": 165500 }, { "epoch": 4.76, "learning_rate": 0.0018096984982230884, "loss": 6.035, "step": 166000 }, { "epoch": 4.77, "learning_rate": 0.0018091253009285798, "loss": 6.0465, "step": 166500 }, { "epoch": 4.79, "learning_rate": 0.0018085521036340709, "loss": 6.0363, "step": 167000 }, { "epoch": 4.8, "learning_rate": 0.0018079789063395622, "loss": 6.045, "step": 167500 }, { "epoch": 4.81, "learning_rate": 0.0018074057090450533, "loss": 6.0442, "step": 168000 }, { "epoch": 4.83, "learning_rate": 0.0018068325117505447, "loss": 6.0453, "step": 168500 }, { "epoch": 4.84, "learning_rate": 0.0018062593144560358, "loss": 6.0351, "step": 169000 }, { "epoch": 4.86, "learning_rate": 0.0018056861171615271, "loss": 6.0323, "step": 169500 }, { "epoch": 4.87, "learning_rate": 0.0018051129198670183, "loss": 6.0308, "step": 170000 }, { "epoch": 4.89, "learning_rate": 0.0018045397225725096, "loss": 6.0335, "step": 170500 }, { "epoch": 4.9, "learning_rate": 0.0018039665252780007, "loss": 6.037, "step": 171000 }, { "epoch": 4.92, "learning_rate": 0.001803393327983492, "loss": 6.0301, "step": 171500 }, { "epoch": 4.93, "learning_rate": 0.0018028201306889832, "loss": 6.0363, "step": 172000 }, { "epoch": 4.94, "learning_rate": 0.0018022469333944745, "loss": 6.0376, "step": 172500 }, { "epoch": 4.96, "learning_rate": 0.0018016737360999656, "loss": 6.0319, "step": 173000 }, { "epoch": 4.97, "learning_rate": 0.0018011005388054568, "loss": 6.0311, "step": 173500 }, { "epoch": 4.99, "learning_rate": 0.0018005273415109481, "loss": 6.0321, "step": 174000 }, { "epoch": 5.0, "eval_loss": 6.036427974700928, "eval_runtime": 27.0381, "eval_samples_per_second": 41.349, "eval_steps_per_second": 1.294, "step": 174460 } ], "logging_steps": 500, "max_steps": 1744600, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 2.390379476809689e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }