{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.99726094101893, "global_step": 512, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 1.3124999999999999e-05, "loss": 3.3565, "step": 3 }, { "epoch": 0.02, "learning_rate": 2.6249999999999998e-05, "loss": 3.1293, "step": 6 }, { "epoch": 0.04, "learning_rate": 3.9374999999999995e-05, "loss": 2.987, "step": 9 }, { "epoch": 0.05, "learning_rate": 5.2499999999999995e-05, "loss": 2.9036, "step": 12 }, { "epoch": 0.06, "learning_rate": 6.5625e-05, "loss": 2.7414, "step": 15 }, { "epoch": 0.07, "learning_rate": 6.99971917961865e-05, "loss": 2.7513, "step": 18 }, { "epoch": 0.08, "learning_rate": 6.998244995833964e-05, "loss": 2.6567, "step": 21 }, { "epoch": 0.09, "learning_rate": 6.995507775098683e-05, "loss": 2.6567, "step": 24 }, { "epoch": 0.11, "learning_rate": 6.991508505682909e-05, "loss": 2.651, "step": 27 }, { "epoch": 0.12, "learning_rate": 6.986248631517822e-05, "loss": 2.5459, "step": 30 }, { "epoch": 0.13, "learning_rate": 6.979730051674372e-05, "loss": 2.5653, "step": 33 }, { "epoch": 0.14, "learning_rate": 6.9719551196776e-05, "loss": 2.5159, "step": 36 }, { "epoch": 0.15, "learning_rate": 6.962926642656914e-05, "loss": 2.4596, "step": 39 }, { "epoch": 0.16, "learning_rate": 6.952647880332572e-05, "loss": 2.5057, "step": 42 }, { "epoch": 0.18, "learning_rate": 6.941122543838767e-05, "loss": 2.4062, "step": 45 }, { "epoch": 0.19, "learning_rate": 6.92835479438373e-05, "loss": 2.44, "step": 48 }, { "epoch": 0.2, "learning_rate": 6.914349241747322e-05, "loss": 2.3659, "step": 51 }, { "epoch": 0.21, "learning_rate": 6.899110942616686e-05, "loss": 2.3672, "step": 54 }, { "epoch": 0.22, "learning_rate": 6.882645398760536e-05, "loss": 2.3759, "step": 57 }, { "epoch": 0.23, "learning_rate": 6.864958555042743e-05, "loss": 2.3567, "step": 60 }, { "epoch": 0.25, "learning_rate": 6.846056797275964e-05, "loss": 2.3606, "step": 63 }, { "epoch": 0.26, "learning_rate": 6.825946949916035e-05, "loss": 2.3166, "step": 66 }, { "epoch": 0.27, "learning_rate": 6.804636273598024e-05, "loss": 2.2707, "step": 69 }, { "epoch": 0.28, "learning_rate": 6.782132462514781e-05, "loss": 2.2617, "step": 72 }, { "epoch": 0.29, "learning_rate": 6.758443641638958e-05, "loss": 2.2521, "step": 75 }, { "epoch": 0.3, "learning_rate": 6.733578363789503e-05, "loss": 2.2746, "step": 78 }, { "epoch": 0.32, "learning_rate": 6.707545606543672e-05, "loss": 2.2112, "step": 81 }, { "epoch": 0.33, "learning_rate": 6.680354768995687e-05, "loss": 2.2237, "step": 84 }, { "epoch": 0.34, "learning_rate": 6.652015668363205e-05, "loss": 2.2225, "step": 87 }, { "epoch": 0.35, "learning_rate": 6.622538536442822e-05, "loss": 2.1471, "step": 90 }, { "epoch": 0.36, "learning_rate": 6.5919340159159e-05, "loss": 2.0738, "step": 93 }, { "epoch": 0.37, "learning_rate": 6.560213156506037e-05, "loss": 2.1797, "step": 96 }, { "epoch": 0.39, "learning_rate": 6.527387410989579e-05, "loss": 2.2545, "step": 99 }, { "epoch": 0.4, "learning_rate": 6.493468631060607e-05, "loss": 2.214, "step": 102 }, { "epoch": 0.41, "learning_rate": 6.458469063051903e-05, "loss": 2.1769, "step": 105 }, { "epoch": 0.42, "learning_rate": 6.422401343513426e-05, "loss": 2.1163, "step": 108 }, { "epoch": 0.43, "learning_rate": 6.385278494649894e-05, "loss": 2.1318, "step": 111 }, { "epoch": 0.44, "learning_rate": 6.347113919619143e-05, "loss": 2.1464, "step": 114 }, { "epoch": 0.46, "learning_rate": 6.307921397692931e-05, "loss": 2.1739, "step": 117 }, { "epoch": 0.47, "learning_rate": 6.267715079281944e-05, "loss": 2.1177, "step": 120 }, { "epoch": 0.48, "learning_rate": 6.226509480826817e-05, "loss": 2.1126, "step": 123 }, { "epoch": 0.49, "learning_rate": 6.184319479556984e-05, "loss": 2.1321, "step": 126 }, { "epoch": 0.5, "learning_rate": 6.141160308119273e-05, "loss": 2.0559, "step": 129 }, { "epoch": 0.51, "learning_rate": 6.0970475490781874e-05, "loss": 2.1131, "step": 132 }, { "epoch": 0.53, "learning_rate": 6.0519971292898285e-05, "loss": 2.1012, "step": 135 }, { "epoch": 0.54, "learning_rate": 6.0060253141515295e-05, "loss": 2.076, "step": 138 }, { "epoch": 0.55, "learning_rate": 5.95914870172926e-05, "loss": 2.0437, "step": 141 }, { "epoch": 0.56, "learning_rate": 5.911384216764903e-05, "loss": 2.0269, "step": 144 }, { "epoch": 0.57, "learning_rate": 5.862749104565608e-05, "loss": 2.0285, "step": 147 }, { "epoch": 0.58, "learning_rate": 5.8132609247773926e-05, "loss": 2.0499, "step": 150 }, { "epoch": 0.6, "learning_rate": 5.762937545045251e-05, "loss": 2.0388, "step": 153 }, { "epoch": 0.61, "learning_rate": 5.711797134562063e-05, "loss": 1.9678, "step": 156 }, { "epoch": 0.62, "learning_rate": 5.6598581575086404e-05, "loss": 2.0444, "step": 159 }, { "epoch": 0.63, "learning_rate": 5.60713936638724e-05, "loss": 2.0038, "step": 162 }, { "epoch": 0.64, "learning_rate": 5.553659795251013e-05, "loss": 1.9812, "step": 165 }, { "epoch": 0.65, "learning_rate": 5.499438752831773e-05, "loss": 1.9962, "step": 168 }, { "epoch": 0.67, "learning_rate": 5.444495815568607e-05, "loss": 1.9857, "step": 171 }, { "epoch": 0.68, "learning_rate": 5.38885082053983e-05, "loss": 1.9288, "step": 174 }, { "epoch": 0.69, "learning_rate": 5.332523858300823e-05, "loss": 1.9271, "step": 177 }, { "epoch": 0.7, "learning_rate": 5.2755352656303755e-05, "loss": 1.9128, "step": 180 }, { "epoch": 0.71, "learning_rate": 5.217905618188108e-05, "loss": 2.0431, "step": 183 }, { "epoch": 0.72, "learning_rate": 5.1596557230856576e-05, "loss": 1.8937, "step": 186 }, { "epoch": 0.74, "learning_rate": 5.1008066113743024e-05, "loss": 1.9296, "step": 189 }, { "epoch": 0.75, "learning_rate": 5.04137953045172e-05, "loss": 1.9901, "step": 192 }, { "epoch": 0.76, "learning_rate": 4.981395936390644e-05, "loss": 1.9698, "step": 195 }, { "epoch": 0.77, "learning_rate": 4.920877486192174e-05, "loss": 1.9076, "step": 198 }, { "epoch": 0.78, "learning_rate": 4.85984602996655e-05, "loss": 1.8708, "step": 201 }, { "epoch": 0.79, "learning_rate": 4.798323603044187e-05, "loss": 1.9115, "step": 204 }, { "epoch": 0.81, "learning_rate": 4.736332418019853e-05, "loss": 1.8812, "step": 207 }, { "epoch": 0.82, "learning_rate": 4.67389485673284e-05, "loss": 1.9152, "step": 210 }, { "epoch": 0.83, "learning_rate": 4.6110334621860254e-05, "loss": 2.0206, "step": 213 }, { "epoch": 0.84, "learning_rate": 4.547770930406753e-05, "loss": 1.9291, "step": 216 }, { "epoch": 0.85, "learning_rate": 4.4841301022524574e-05, "loss": 1.9792, "step": 219 }, { "epoch": 0.86, "learning_rate": 4.420133955164006e-05, "loss": 1.9464, "step": 222 }, { "epoch": 0.88, "learning_rate": 4.3558055948697185e-05, "loss": 1.835, "step": 225 }, { "epoch": 0.89, "learning_rate": 4.291168247043072e-05, "loss": 1.771, "step": 228 }, { "epoch": 0.9, "learning_rate": 4.2262452489171054e-05, "loss": 1.9584, "step": 231 }, { "epoch": 0.91, "learning_rate": 4.1610600408585395e-05, "loss": 2.0061, "step": 234 }, { "epoch": 0.92, "learning_rate": 4.095636157904658e-05, "loss": 1.8876, "step": 237 }, { "epoch": 0.93, "learning_rate": 4.029997221266018e-05, "loss": 1.8706, "step": 240 }, { "epoch": 0.95, "learning_rate": 3.964166929798036e-05, "loss": 1.884, "step": 243 }, { "epoch": 0.96, "learning_rate": 3.898169051444552e-05, "loss": 1.8348, "step": 246 }, { "epoch": 0.97, "learning_rate": 3.8320274146564356e-05, "loss": 1.8026, "step": 249 }, { "epoch": 0.98, "learning_rate": 3.7657658997883615e-05, "loss": 1.847, "step": 252 }, { "epoch": 0.99, "learning_rate": 3.699408430476834e-05, "loss": 1.8465, "step": 255 }, { "epoch": 1.0, "eval_loss": 1.8656249046325684, "eval_runtime": 309.3485, "eval_samples_per_second": 13.629, "eval_steps_per_second": 13.629, "step": 256 }, { "epoch": 1.01, "learning_rate": 3.632978965002587e-05, "loss": 2.1828, "step": 258 }, { "epoch": 1.02, "learning_rate": 3.566501487640479e-05, "loss": 1.6617, "step": 261 }, { "epoch": 1.03, "learning_rate": 3.5e-05, "loss": 1.6496, "step": 264 }, { "epoch": 1.04, "learning_rate": 3.433498512359521e-05, "loss": 1.5581, "step": 267 }, { "epoch": 1.05, "learning_rate": 3.367021034997412e-05, "loss": 1.6121, "step": 270 }, { "epoch": 1.07, "learning_rate": 3.300591569523165e-05, "loss": 1.6664, "step": 273 }, { "epoch": 1.08, "learning_rate": 3.2342341002116385e-05, "loss": 1.5229, "step": 276 }, { "epoch": 1.09, "learning_rate": 3.1679725853435645e-05, "loss": 1.5276, "step": 279 }, { "epoch": 1.1, "learning_rate": 3.101830948555448e-05, "loss": 1.6506, "step": 282 }, { "epoch": 1.11, "learning_rate": 3.035833070201963e-05, "loss": 1.5594, "step": 285 }, { "epoch": 1.12, "learning_rate": 2.9700027787339826e-05, "loss": 1.4913, "step": 288 }, { "epoch": 1.14, "learning_rate": 2.904363842095341e-05, "loss": 1.6122, "step": 291 }, { "epoch": 1.15, "learning_rate": 2.838939959141461e-05, "loss": 1.6086, "step": 294 }, { "epoch": 1.16, "learning_rate": 2.7737547510828943e-05, "loss": 1.5963, "step": 297 }, { "epoch": 1.17, "learning_rate": 2.7088317529569277e-05, "loss": 1.6068, "step": 300 }, { "epoch": 1.18, "learning_rate": 2.6441944051302816e-05, "loss": 1.528, "step": 303 }, { "epoch": 1.19, "learning_rate": 2.5798660448359928e-05, "loss": 1.629, "step": 306 }, { "epoch": 1.21, "learning_rate": 2.5158698977475426e-05, "loss": 1.6335, "step": 309 }, { "epoch": 1.22, "learning_rate": 2.4522290695932468e-05, "loss": 1.5641, "step": 312 }, { "epoch": 1.23, "learning_rate": 2.3889665378139753e-05, "loss": 1.5514, "step": 315 }, { "epoch": 1.24, "learning_rate": 2.32610514326716e-05, "loss": 1.6076, "step": 318 }, { "epoch": 1.25, "learning_rate": 2.263667581980147e-05, "loss": 1.5815, "step": 321 }, { "epoch": 1.26, "learning_rate": 2.2016763969558128e-05, "loss": 1.5281, "step": 324 }, { "epoch": 1.28, "learning_rate": 2.140153970033449e-05, "loss": 1.5254, "step": 327 }, { "epoch": 1.29, "learning_rate": 2.0791225138078253e-05, "loss": 1.5966, "step": 330 }, { "epoch": 1.3, "learning_rate": 2.0186040636093567e-05, "loss": 1.5427, "step": 333 }, { "epoch": 1.31, "learning_rate": 1.9586204695482795e-05, "loss": 1.6041, "step": 336 }, { "epoch": 1.32, "learning_rate": 1.8991933886256963e-05, "loss": 1.4729, "step": 339 }, { "epoch": 1.34, "learning_rate": 1.840344276914342e-05, "loss": 1.564, "step": 342 }, { "epoch": 1.35, "learning_rate": 1.7820943818118924e-05, "loss": 1.5334, "step": 345 }, { "epoch": 1.36, "learning_rate": 1.7244647343696252e-05, "loss": 1.5026, "step": 348 }, { "epoch": 1.37, "learning_rate": 1.6674761416991767e-05, "loss": 1.586, "step": 351 }, { "epoch": 1.38, "learning_rate": 1.611149179460171e-05, "loss": 1.5368, "step": 354 }, { "epoch": 1.39, "learning_rate": 1.555504184431393e-05, "loss": 1.5588, "step": 357 }, { "epoch": 1.41, "learning_rate": 1.500561247168226e-05, "loss": 1.5276, "step": 360 }, { "epoch": 1.42, "learning_rate": 1.446340204748987e-05, "loss": 1.4957, "step": 363 }, { "epoch": 1.43, "learning_rate": 1.3928606336127589e-05, "loss": 1.5479, "step": 366 }, { "epoch": 1.44, "learning_rate": 1.34014184249136e-05, "loss": 1.4862, "step": 369 }, { "epoch": 1.45, "learning_rate": 1.2882028654379362e-05, "loss": 1.5849, "step": 372 }, { "epoch": 1.46, "learning_rate": 1.2370624549547507e-05, "loss": 1.5811, "step": 375 }, { "epoch": 1.48, "learning_rate": 1.186739075222608e-05, "loss": 1.4914, "step": 378 }, { "epoch": 1.49, "learning_rate": 1.1372508954343916e-05, "loss": 1.5442, "step": 381 }, { "epoch": 1.5, "learning_rate": 1.0886157832350968e-05, "loss": 1.528, "step": 384 }, { "epoch": 1.51, "learning_rate": 1.0408512982707408e-05, "loss": 1.51, "step": 387 }, { "epoch": 1.52, "learning_rate": 9.939746858484699e-06, "loss": 1.4226, "step": 390 }, { "epoch": 1.53, "learning_rate": 9.480028707101716e-06, "loss": 1.5474, "step": 393 }, { "epoch": 1.55, "learning_rate": 9.02952450921813e-06, "loss": 1.5436, "step": 396 }, { "epoch": 1.56, "learning_rate": 8.588396918807265e-06, "loss": 1.5102, "step": 399 }, { "epoch": 1.57, "learning_rate": 8.156805204430163e-06, "loss": 1.4774, "step": 402 }, { "epoch": 1.58, "learning_rate": 7.734905191731818e-06, "loss": 1.5731, "step": 405 }, { "epoch": 1.59, "learning_rate": 7.322849207180554e-06, "loss": 1.4854, "step": 408 }, { "epoch": 1.6, "learning_rate": 6.92078602307069e-06, "loss": 1.5024, "step": 411 }, { "epoch": 1.62, "learning_rate": 6.528860803808572e-06, "loss": 1.4762, "step": 414 }, { "epoch": 1.63, "learning_rate": 6.147215053501067e-06, "loss": 1.5015, "step": 417 }, { "epoch": 1.64, "learning_rate": 5.775986564865746e-06, "loss": 1.4655, "step": 420 }, { "epoch": 1.65, "learning_rate": 5.4153093694809615e-06, "loss": 1.3807, "step": 423 }, { "epoch": 1.66, "learning_rate": 5.065313689393926e-06, "loss": 1.5052, "step": 426 }, { "epoch": 1.67, "learning_rate": 4.7261258901042164e-06, "loss": 1.5502, "step": 429 }, { "epoch": 1.69, "learning_rate": 4.397868434939627e-06, "loss": 1.4234, "step": 432 }, { "epoch": 1.7, "learning_rate": 4.080659840841001e-06, "loss": 1.4875, "step": 435 }, { "epoch": 1.71, "learning_rate": 3.774614635571785e-06, "loss": 1.4858, "step": 438 }, { "epoch": 1.72, "learning_rate": 3.4798433163679534e-06, "loss": 1.4217, "step": 441 }, { "epoch": 1.73, "learning_rate": 3.1964523100431278e-06, "loss": 1.4572, "step": 444 }, { "epoch": 1.74, "learning_rate": 2.924543934563277e-06, "loss": 1.5039, "step": 447 }, { "epoch": 1.76, "learning_rate": 2.664216362104964e-06, "loss": 1.4897, "step": 450 }, { "epoch": 1.77, "learning_rate": 2.4155635836104113e-06, "loss": 1.5384, "step": 453 }, { "epoch": 1.78, "learning_rate": 2.178675374852189e-06, "loss": 1.505, "step": 456 }, { "epoch": 1.79, "learning_rate": 1.953637264019748e-06, "loss": 1.5019, "step": 459 }, { "epoch": 1.8, "learning_rate": 1.7405305008396436e-06, "loss": 1.5941, "step": 462 }, { "epoch": 1.81, "learning_rate": 1.5394320272403605e-06, "loss": 1.5265, "step": 465 }, { "epoch": 1.83, "learning_rate": 1.3504144495725661e-06, "loss": 1.5035, "step": 468 }, { "epoch": 1.84, "learning_rate": 1.1735460123946455e-06, "loss": 1.5002, "step": 471 }, { "epoch": 1.85, "learning_rate": 1.0088905738331372e-06, "loss": 1.4317, "step": 474 }, { "epoch": 1.86, "learning_rate": 8.565075825267759e-07, "loss": 1.4852, "step": 477 }, { "epoch": 1.87, "learning_rate": 7.16452056162693e-07, "loss": 1.5445, "step": 480 }, { "epoch": 1.88, "learning_rate": 5.887745616123169e-07, "loss": 1.4855, "step": 483 }, { "epoch": 1.9, "learning_rate": 4.735211966742819e-07, "loss": 1.4852, "step": 486 }, { "epoch": 1.91, "learning_rate": 3.7073357343086086e-07, "loss": 1.4545, "step": 489 }, { "epoch": 1.92, "learning_rate": 2.804488032239943e-07, "loss": 1.445, "step": 492 }, { "epoch": 1.93, "learning_rate": 2.0269948325627227e-07, "loss": 1.4484, "step": 495 }, { "epoch": 1.94, "learning_rate": 1.3751368482176727e-07, "loss": 1.4113, "step": 498 }, { "epoch": 1.95, "learning_rate": 8.491494317091229e-08, "loss": 1.5394, "step": 501 }, { "epoch": 1.97, "learning_rate": 4.492224901315344e-08, "loss": 1.4475, "step": 504 }, { "epoch": 1.98, "learning_rate": 1.7550041660350232e-08, "loss": 1.4934, "step": 507 }, { "epoch": 1.99, "learning_rate": 2.808203813499665e-09, "loss": 1.4903, "step": 510 }, { "epoch": 2.0, "eval_loss": 1.7396166324615479, "eval_runtime": 309.3927, "eval_samples_per_second": 13.627, "eval_steps_per_second": 13.627, "step": 512 }, { "epoch": 2.0, "step": 512, "total_flos": 2.7416092993388544e+17, "train_loss": 1.8553011305630207, "train_runtime": 15007.5766, "train_samples_per_second": 4.379, "train_steps_per_second": 0.034 } ], "max_steps": 512, "num_train_epochs": 2, "total_flos": 2.7416092993388544e+17, "trial_name": null, "trial_params": null }