{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.992481203007518, "global_step": 330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "learning_rate": 0.00011764705882352942, "loss": 5.1019, "step": 2 }, { "epoch": 0.12, "learning_rate": 0.00023529411764705883, "loss": 4.8202, "step": 4 }, { "epoch": 0.18, "learning_rate": 0.00035294117647058826, "loss": 4.4144, "step": 6 }, { "epoch": 0.24, "learning_rate": 0.00047058823529411766, "loss": 4.0763, "step": 8 }, { "epoch": 0.3, "learning_rate": 0.0005882352941176471, "loss": 3.7584, "step": 10 }, { "epoch": 0.36, "learning_rate": 0.0007058823529411765, "loss": 3.583, "step": 12 }, { "epoch": 0.42, "learning_rate": 0.0008235294117647058, "loss": 3.4246, "step": 14 }, { "epoch": 0.48, "learning_rate": 0.0009411764705882353, "loss": 3.301, "step": 16 }, { "epoch": 0.54, "learning_rate": 0.0009999748146823375, "loss": 3.225, "step": 18 }, { "epoch": 0.6, "learning_rate": 0.0009997733473639876, "loss": 3.1705, "step": 20 }, { "epoch": 0.66, "learning_rate": 0.0009993704939095377, "loss": 3.0495, "step": 22 }, { "epoch": 0.72, "learning_rate": 0.0009987664166507748, "loss": 2.9806, "step": 24 }, { "epoch": 0.78, "learning_rate": 0.0009979613590036108, "loss": 2.9235, "step": 26 }, { "epoch": 0.84, "learning_rate": 0.0009969556453699965, "loss": 2.8419, "step": 28 }, { "epoch": 0.9, "learning_rate": 0.0009957496810072027, "loss": 2.7978, "step": 30 }, { "epoch": 0.96, "learning_rate": 0.0009943439518645192, "loss": 2.7061, "step": 32 }, { "epoch": 0.99, "eval_loss": 2.585865020751953, "eval_runtime": 6.5169, "eval_samples_per_second": 68.13, "eval_steps_per_second": 17.033, "step": 33 }, { "epoch": 1.03, "learning_rate": 0.0009927390243874398, "loss": 2.9771, "step": 34 }, { "epoch": 1.09, "learning_rate": 0.0009909355452894098, "loss": 2.5679, "step": 36 }, { "epoch": 1.15, "learning_rate": 0.0009889342412912295, "loss": 2.5551, "step": 38 }, { "epoch": 1.21, "learning_rate": 0.0009867359188282193, "loss": 2.4842, "step": 40 }, { "epoch": 1.27, "learning_rate": 0.0009843414637252614, "loss": 2.4689, "step": 42 }, { "epoch": 1.33, "learning_rate": 0.0009817518408398536, "loss": 2.4216, "step": 44 }, { "epoch": 1.39, "learning_rate": 0.000978968093673314, "loss": 2.4006, "step": 46 }, { "epoch": 1.45, "learning_rate": 0.0009759913439502981, "loss": 2.342, "step": 48 }, { "epoch": 1.51, "learning_rate": 0.0009728227911667932, "loss": 2.3004, "step": 50 }, { "epoch": 1.57, "learning_rate": 0.0009694637121067764, "loss": 2.2644, "step": 52 }, { "epoch": 1.63, "learning_rate": 0.0009659154603277282, "loss": 2.2406, "step": 54 }, { "epoch": 1.69, "learning_rate": 0.0009621794656152091, "loss": 2.2076, "step": 56 }, { "epoch": 1.75, "learning_rate": 0.0009582572334067213, "loss": 2.1834, "step": 58 }, { "epoch": 1.81, "learning_rate": 0.0009541503441850843, "loss": 2.1652, "step": 60 }, { "epoch": 1.87, "learning_rate": 0.0009498604528415731, "loss": 2.1353, "step": 62 }, { "epoch": 1.93, "learning_rate": 0.0009453892880090695, "loss": 2.1394, "step": 64 }, { "epoch": 1.99, "learning_rate": 0.000940738651365503, "loss": 2.08, "step": 66 }, { "epoch": 1.99, "eval_loss": 1.996474027633667, "eval_runtime": 6.333, "eval_samples_per_second": 70.109, "eval_steps_per_second": 17.527, "step": 66 }, { "epoch": 2.06, "learning_rate": 0.000935910416907854, "loss": 2.2925, "step": 68 }, { "epoch": 2.12, "learning_rate": 0.0009309065301970192, "loss": 2.0167, "step": 70 }, { "epoch": 2.18, "learning_rate": 0.0009257290075738364, "loss": 1.9594, "step": 72 }, { "epoch": 2.24, "learning_rate": 0.0009203799353465918, "loss": 1.9508, "step": 74 }, { "epoch": 2.3, "learning_rate": 0.0009148614689503306, "loss": 1.9579, "step": 76 }, { "epoch": 2.36, "learning_rate": 0.0009091758320783139, "loss": 1.9166, "step": 78 }, { "epoch": 2.42, "learning_rate": 0.0009033253157859713, "loss": 1.8802, "step": 80 }, { "epoch": 2.48, "learning_rate": 0.0008973122775677078, "loss": 1.8642, "step": 82 }, { "epoch": 2.54, "learning_rate": 0.0008911391404069408, "loss": 1.8552, "step": 84 }, { "epoch": 2.6, "learning_rate": 0.0008848083917997462, "loss": 1.8637, "step": 86 }, { "epoch": 2.66, "learning_rate": 0.0008783225827525098, "loss": 1.852, "step": 88 }, { "epoch": 2.72, "learning_rate": 0.0008716843267539868, "loss": 1.7914, "step": 90 }, { "epoch": 2.78, "learning_rate": 0.0008648962987221837, "loss": 1.8048, "step": 92 }, { "epoch": 2.84, "learning_rate": 0.0008579612339264867, "loss": 1.7966, "step": 94 }, { "epoch": 2.9, "learning_rate": 0.0008508819268854713, "loss": 1.7871, "step": 96 }, { "epoch": 2.96, "learning_rate": 0.0008436612302408376, "loss": 1.7623, "step": 98 }, { "epoch": 2.99, "eval_loss": 1.724814772605896, "eval_runtime": 7.0329, "eval_samples_per_second": 63.132, "eval_steps_per_second": 15.783, "step": 99 }, { "epoch": 3.03, "learning_rate": 0.0008363020536079239, "loss": 1.9929, "step": 100 }, { "epoch": 3.09, "learning_rate": 0.0008288073624032633, "loss": 1.7159, "step": 102 }, { "epoch": 3.15, "learning_rate": 0.0008211801766496537, "loss": 1.6946, "step": 104 }, { "epoch": 3.21, "learning_rate": 0.000813423569759226, "loss": 1.6397, "step": 106 }, { "epoch": 3.27, "learning_rate": 0.0008055406672949956, "loss": 1.669, "step": 108 }, { "epoch": 3.33, "learning_rate": 0.0007975346457114034, "loss": 1.6531, "step": 110 }, { "epoch": 3.39, "learning_rate": 0.0007894087310743467, "loss": 1.6478, "step": 112 }, { "epoch": 3.45, "learning_rate": 0.0007811661977612201, "loss": 1.6231, "step": 114 }, { "epoch": 3.51, "learning_rate": 0.0007728103671414887, "loss": 1.6478, "step": 116 }, { "epoch": 3.57, "learning_rate": 0.0007643446062383273, "loss": 1.6287, "step": 118 }, { "epoch": 3.63, "learning_rate": 0.0007557723263718596, "loss": 1.5939, "step": 120 }, { "epoch": 3.69, "learning_rate": 0.0007470969817845518, "loss": 1.6309, "step": 122 }, { "epoch": 3.75, "learning_rate": 0.000738322068249308, "loss": 1.5665, "step": 124 }, { "epoch": 3.81, "learning_rate": 0.0007294511216608307, "loss": 1.5953, "step": 126 }, { "epoch": 3.87, "learning_rate": 0.0007204877166108151, "loss": 1.5987, "step": 128 }, { "epoch": 3.93, "learning_rate": 0.0007114354649475498, "loss": 1.5961, "step": 130 }, { "epoch": 3.99, "learning_rate": 0.0007022980143205046, "loss": 1.5408, "step": 132 }, { "epoch": 3.99, "eval_loss": 1.5449421405792236, "eval_runtime": 7.1026, "eval_samples_per_second": 62.512, "eval_steps_per_second": 15.628, "step": 132 }, { "epoch": 4.06, "learning_rate": 0.0006930790467104916, "loss": 1.7394, "step": 134 }, { "epoch": 4.12, "learning_rate": 0.0006837822769459941, "loss": 1.5015, "step": 136 }, { "epoch": 4.18, "learning_rate": 0.000674411451206257, "loss": 1.4962, "step": 138 }, { "epoch": 4.24, "learning_rate": 0.0006649703455117458, "loss": 1.496, "step": 140 }, { "epoch": 4.3, "learning_rate": 0.0006554627642025807, "loss": 1.4703, "step": 142 }, { "epoch": 4.36, "learning_rate": 0.0006458925384055585, "loss": 1.474, "step": 144 }, { "epoch": 4.42, "learning_rate": 0.0006362635244903819, "loss": 1.4663, "step": 146 }, { "epoch": 4.48, "learning_rate": 0.0006265796025157153, "loss": 1.4556, "step": 148 }, { "epoch": 4.54, "learning_rate": 0.0006168446746656973, "loss": 1.4779, "step": 150 }, { "epoch": 4.6, "learning_rate": 0.0006070626636775348, "loss": 1.4687, "step": 152 }, { "epoch": 4.66, "learning_rate": 0.0005972375112608181, "loss": 1.4614, "step": 154 }, { "epoch": 4.72, "learning_rate": 0.000587373176509189, "loss": 1.4615, "step": 156 }, { "epoch": 4.78, "learning_rate": 0.0005774736343050039, "loss": 1.4479, "step": 158 }, { "epoch": 4.84, "learning_rate": 0.0005675428737176367, "loss": 1.427, "step": 160 }, { "epoch": 4.9, "learning_rate": 0.000557584896396062, "loss": 1.4327, "step": 162 }, { "epoch": 4.96, "learning_rate": 0.0005476037149563726, "loss": 1.4147, "step": 164 }, { "epoch": 4.99, "eval_loss": 1.4437452554702759, "eval_runtime": 6.9311, "eval_samples_per_second": 64.059, "eval_steps_per_second": 16.015, "step": 165 }, { "epoch": 5.03, "learning_rate": 0.0005376033513648743, "loss": 1.5806, "step": 166 }, { "epoch": 5.09, "learning_rate": 0.0005275878353174165, "loss": 1.3567, "step": 168 }, { "epoch": 5.15, "learning_rate": 0.0005175612026156045, "loss": 1.3639, "step": 170 }, { "epoch": 5.21, "learning_rate": 0.0005075274935405553, "loss": 1.3578, "step": 172 }, { "epoch": 5.27, "learning_rate": 0.0004974907512248451, "loss": 1.3787, "step": 174 }, { "epoch": 5.33, "learning_rate": 0.0004874550200233085, "loss": 1.3406, "step": 176 }, { "epoch": 5.39, "learning_rate": 0.0004774243438833481, "loss": 1.368, "step": 178 }, { "epoch": 5.45, "learning_rate": 0.00046740276471540364, "loss": 1.3549, "step": 180 }, { "epoch": 5.51, "learning_rate": 0.00045739432076424515, "loss": 1.3655, "step": 182 }, { "epoch": 5.57, "learning_rate": 0.00044740304498174226, "loss": 1.35, "step": 184 }, { "epoch": 5.63, "learning_rate": 0.0004374329634017669, "loss": 1.3604, "step": 186 }, { "epoch": 5.69, "learning_rate": 0.00042748809351788165, "loss": 1.3692, "step": 188 }, { "epoch": 5.75, "learning_rate": 0.0004175724426644724, "loss": 1.3231, "step": 190 }, { "epoch": 5.81, "learning_rate": 0.00040769000640197205, "loss": 1.3361, "step": 192 }, { "epoch": 5.87, "learning_rate": 0.00039784476690683085, "loss": 1.3391, "step": 194 }, { "epoch": 5.93, "learning_rate": 0.0003880406913668777, "loss": 1.3259, "step": 196 }, { "epoch": 5.99, "learning_rate": 0.0003782817303827226, "loss": 1.3593, "step": 198 }, { "epoch": 5.99, "eval_loss": 1.3767662048339844, "eval_runtime": 6.7366, "eval_samples_per_second": 65.908, "eval_steps_per_second": 16.477, "step": 198 }, { "epoch": 6.06, "learning_rate": 0.0003685718163758427, "loss": 1.4657, "step": 200 }, { "epoch": 6.12, "learning_rate": 0.0003589148620039941, "loss": 1.2816, "step": 202 }, { "epoch": 6.18, "learning_rate": 0.00034931475858458635, "loss": 1.2989, "step": 204 }, { "epoch": 6.24, "learning_rate": 0.0003397753745266571, "loss": 1.3021, "step": 206 }, { "epoch": 6.3, "learning_rate": 0.0003303005537720778, "loss": 1.2478, "step": 208 }, { "epoch": 6.36, "learning_rate": 0.00032089411424661863, "loss": 1.2827, "step": 210 }, { "epoch": 6.42, "learning_rate": 0.0003115598463214956, "loss": 1.2458, "step": 212 }, { "epoch": 6.48, "learning_rate": 0.0003023015112860228, "loss": 1.2954, "step": 214 }, { "epoch": 6.54, "learning_rate": 0.00029312283983198097, "loss": 1.2782, "step": 216 }, { "epoch": 6.6, "learning_rate": 0.0002840275305503186, "loss": 1.2653, "step": 218 }, { "epoch": 6.66, "learning_rate": 0.00027501924844078535, "loss": 1.2701, "step": 220 }, { "epoch": 6.72, "learning_rate": 0.0002661016234351018, "loss": 1.2862, "step": 222 }, { "epoch": 6.78, "learning_rate": 0.00025727824893426166, "loss": 1.277, "step": 224 }, { "epoch": 6.84, "learning_rate": 0.00024855268036055346, "loss": 1.2791, "step": 226 }, { "epoch": 6.9, "learning_rate": 0.00023992843372488355, "loss": 1.266, "step": 228 }, { "epoch": 6.96, "learning_rate": 0.00023140898420998424, "loss": 1.2703, "step": 230 }, { "epoch": 6.99, "eval_loss": 1.3362174034118652, "eval_runtime": 6.9977, "eval_samples_per_second": 63.449, "eval_steps_per_second": 15.862, "step": 231 }, { "epoch": 7.03, "learning_rate": 0.0002229977647700707, "loss": 1.4282, "step": 232 }, { "epoch": 7.09, "learning_rate": 0.00021469816474751563, "loss": 1.2356, "step": 234 }, { "epoch": 7.15, "learning_rate": 0.00020651352850709653, "loss": 1.247, "step": 236 }, { "epoch": 7.21, "learning_rate": 0.00019844715408836789, "loss": 1.2564, "step": 238 }, { "epoch": 7.27, "learning_rate": 0.00019050229187669949, "loss": 1.2187, "step": 240 }, { "epoch": 7.33, "learning_rate": 0.00018268214329351796, "loss": 1.2388, "step": 242 }, { "epoch": 7.39, "learning_rate": 0.00017498985950627793, "loss": 1.2368, "step": 244 }, { "epoch": 7.45, "learning_rate": 0.00016742854015868347, "loss": 1.212, "step": 246 }, { "epoch": 7.51, "learning_rate": 0.00016000123212167155, "loss": 1.2377, "step": 248 }, { "epoch": 7.57, "learning_rate": 0.00015271092826566108, "loss": 1.2146, "step": 250 }, { "epoch": 7.63, "learning_rate": 0.0001455605662545592, "loss": 1.2209, "step": 252 }, { "epoch": 7.69, "learning_rate": 0.00013855302736201687, "loss": 1.2319, "step": 254 }, { "epoch": 7.75, "learning_rate": 0.00013169113531040461, "loss": 1.2271, "step": 256 }, { "epoch": 7.81, "learning_rate": 0.00012497765513297976, "loss": 1.2021, "step": 258 }, { "epoch": 7.87, "learning_rate": 0.00011841529205970281, "loss": 1.2264, "step": 260 }, { "epoch": 7.93, "learning_rate": 0.00011200669042715162, "loss": 1.2228, "step": 262 }, { "epoch": 7.99, "learning_rate": 0.00010575443261297229, "loss": 1.2528, "step": 264 }, { "epoch": 7.99, "eval_loss": 1.3175491094589233, "eval_runtime": 6.7551, "eval_samples_per_second": 65.729, "eval_steps_per_second": 16.432, "step": 264 }, { "epoch": 8.06, "learning_rate": 9.96610379952989e-05, "loss": 1.3448, "step": 266 }, { "epoch": 8.12, "learning_rate": 9.37289619375562e-05, "loss": 1.1886, "step": 268 }, { "epoch": 8.18, "learning_rate": 8.7960594799059e-05, "loss": 1.2062, "step": 270 }, { "epoch": 8.24, "learning_rate": 8.235826097180565e-05, "loss": 1.2207, "step": 272 }, { "epoch": 8.3, "learning_rate": 7.692421794385312e-05, "loss": 1.2095, "step": 274 }, { "epoch": 8.36, "learning_rate": 7.166065538964955e-05, "loss": 1.1986, "step": 276 }, { "epoch": 8.42, "learning_rate": 6.656969428769566e-05, "loss": 1.1962, "step": 278 }, { "epoch": 8.48, "learning_rate": 6.165338606588517e-05, "loss": 1.2164, "step": 280 }, { "epoch": 8.54, "learning_rate": 5.6913711774872144e-05, "loss": 1.1904, "step": 282 }, { "epoch": 8.6, "learning_rate": 5.235258128979675e-05, "loss": 1.2172, "step": 284 }, { "epoch": 8.66, "learning_rate": 4.797183254069176e-05, "loss": 1.2345, "step": 286 }, { "epoch": 8.72, "learning_rate": 4.3773230771879005e-05, "loss": 1.1994, "step": 288 }, { "epoch": 8.78, "learning_rate": 3.975846783065662e-05, "loss": 1.1963, "step": 290 }, { "epoch": 8.84, "learning_rate": 3.5929161485559694e-05, "loss": 1.1995, "step": 292 }, { "epoch": 8.9, "learning_rate": 3.2286854774472905e-05, "loss": 1.1779, "step": 294 }, { "epoch": 8.96, "learning_rate": 2.883301538285582e-05, "loss": 1.1981, "step": 296 }, { "epoch": 8.99, "eval_loss": 1.3090853691101074, "eval_runtime": 6.8294, "eval_samples_per_second": 65.013, "eval_steps_per_second": 16.253, "step": 297 }, { "epoch": 9.03, "learning_rate": 2.5569035052332156e-05, "loss": 1.3461, "step": 298 }, { "epoch": 9.09, "learning_rate": 2.2496229019879632e-05, "loss": 1.1812, "step": 300 }, { "epoch": 9.15, "learning_rate": 1.9615835487849675e-05, "loss": 1.177, "step": 302 }, { "epoch": 9.21, "learning_rate": 1.6929015125027312e-05, "loss": 1.1856, "step": 304 }, { "epoch": 9.27, "learning_rate": 1.443685059893396e-05, "loss": 1.1984, "step": 306 }, { "epoch": 9.33, "learning_rate": 1.2140346139561276e-05, "loss": 1.1711, "step": 308 }, { "epoch": 9.39, "learning_rate": 1.0040427134711649e-05, "loss": 1.1905, "step": 310 }, { "epoch": 9.45, "learning_rate": 8.137939757108525e-06, "loss": 1.1924, "step": 312 }, { "epoch": 9.51, "learning_rate": 6.433650623427378e-06, "loss": 1.2033, "step": 314 }, { "epoch": 9.57, "learning_rate": 4.928246485383147e-06, "loss": 1.2159, "step": 316 }, { "epoch": 9.63, "learning_rate": 3.6223339530006004e-06, "loss": 1.1831, "step": 318 }, { "epoch": 9.69, "learning_rate": 2.516439250177749e-06, "loss": 1.2011, "step": 320 }, { "epoch": 9.75, "learning_rate": 1.611008002641412e-06, "loss": 1.1991, "step": 322 }, { "epoch": 9.81, "learning_rate": 9.064050583800221e-07, "loss": 1.1786, "step": 324 }, { "epoch": 9.87, "learning_rate": 4.029143406262259e-07, "loss": 1.1945, "step": 326 }, { "epoch": 9.93, "learning_rate": 1.0073873344895734e-07, "loss": 1.1972, "step": 328 }, { "epoch": 9.99, "learning_rate": 0.0, "loss": 1.2117, "step": 330 }, { "epoch": 9.99, "eval_loss": 1.3089168071746826, "eval_runtime": 6.8863, "eval_samples_per_second": 64.476, "eval_steps_per_second": 16.119, "step": 330 }, { "epoch": 9.99, "step": 330, "total_flos": 1.1112428209176576e+16, "train_loss": 1.7149053530259566, "train_runtime": 770.4396, "train_samples_per_second": 110.534, "train_steps_per_second": 0.428 } ], "max_steps": 330, "num_train_epochs": 10, "total_flos": 1.1112428209176576e+16, "trial_name": null, "trial_params": null }