{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5002344116268167, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 4.999956573574533e-05, "loss": 0.9781, "step": 5 }, { "epoch": 0.02, "learning_rate": 4.999826295806815e-05, "loss": 0.9708, "step": 10 }, { "epoch": 0.03, "learning_rate": 4.999609171222846e-05, "loss": 0.9112, "step": 15 }, { "epoch": 0.04, "learning_rate": 4.99930520736578e-05, "loss": 0.9353, "step": 20 }, { "epoch": 0.05, "learning_rate": 4.998914414795668e-05, "loss": 0.8337, "step": 25 }, { "epoch": 0.06, "learning_rate": 4.99843680708909e-05, "loss": 0.8133, "step": 30 }, { "epoch": 0.07, "learning_rate": 4.997872400838682e-05, "loss": 0.8195, "step": 35 }, { "epoch": 0.08, "learning_rate": 4.997221215652562e-05, "loss": 0.7842, "step": 40 }, { "epoch": 0.08, "learning_rate": 4.9964832741536444e-05, "loss": 0.7454, "step": 45 }, { "epoch": 0.09, "learning_rate": 4.9956586019788584e-05, "loss": 0.7732, "step": 50 }, { "epoch": 0.1, "learning_rate": 4.9947472277782584e-05, "loss": 0.7903, "step": 55 }, { "epoch": 0.11, "learning_rate": 4.993749183214021e-05, "loss": 0.8053, "step": 60 }, { "epoch": 0.12, "learning_rate": 4.992664502959351e-05, "loss": 0.7508, "step": 65 }, { "epoch": 0.13, "learning_rate": 4.991493224697281e-05, "loss": 0.7461, "step": 70 }, { "epoch": 0.14, "learning_rate": 4.990235389119352e-05, "loss": 0.747, "step": 75 }, { "epoch": 0.15, "learning_rate": 4.9888910399242065e-05, "loss": 0.734, "step": 80 }, { "epoch": 0.16, "learning_rate": 4.987460223816067e-05, "loss": 0.736, "step": 85 }, { "epoch": 0.17, "learning_rate": 4.985942990503119e-05, "loss": 0.7058, "step": 90 }, { "epoch": 0.18, "learning_rate": 4.984339392695777e-05, "loss": 0.6632, "step": 95 }, { "epoch": 0.19, "learning_rate": 4.9826494861048576e-05, "loss": 0.7428, "step": 100 }, { "epoch": 0.2, "learning_rate": 4.980873329439644e-05, "loss": 0.7101, "step": 105 }, { "epoch": 0.21, "learning_rate": 4.979010984405842e-05, "loss": 0.6687, "step": 110 }, { "epoch": 0.22, "learning_rate": 4.9770625157034436e-05, "loss": 0.6999, "step": 115 }, { "epoch": 0.23, "learning_rate": 4.975027991024473e-05, "loss": 0.6625, "step": 120 }, { "epoch": 0.23, "learning_rate": 4.972907481050637e-05, "loss": 0.7781, "step": 125 }, { "epoch": 0.24, "learning_rate": 4.970701059450872e-05, "loss": 0.6394, "step": 130 }, { "epoch": 0.25, "learning_rate": 4.968408802878778e-05, "loss": 0.6739, "step": 135 }, { "epoch": 0.26, "learning_rate": 4.9660307909699645e-05, "loss": 0.7285, "step": 140 }, { "epoch": 0.27, "learning_rate": 4.963567106339276e-05, "loss": 0.7181, "step": 145 }, { "epoch": 0.28, "learning_rate": 4.961017834577927e-05, "loss": 0.6995, "step": 150 }, { "epoch": 0.29, "learning_rate": 4.958383064250525e-05, "loss": 0.6791, "step": 155 }, { "epoch": 0.3, "learning_rate": 4.955662886891995e-05, "loss": 0.7085, "step": 160 }, { "epoch": 0.31, "learning_rate": 4.952857397004401e-05, "loss": 0.6656, "step": 165 }, { "epoch": 0.32, "learning_rate": 4.949966692053663e-05, "loss": 0.7306, "step": 170 }, { "epoch": 0.33, "learning_rate": 4.946990872466164e-05, "loss": 0.663, "step": 175 }, { "epoch": 0.34, "learning_rate": 4.943930041625272e-05, "loss": 0.6459, "step": 180 }, { "epoch": 0.35, "learning_rate": 4.940784305867741e-05, "loss": 0.6371, "step": 185 }, { "epoch": 0.36, "learning_rate": 4.937553774480018e-05, "loss": 0.7068, "step": 190 }, { "epoch": 0.37, "learning_rate": 4.934238559694448e-05, "loss": 0.7062, "step": 195 }, { "epoch": 0.38, "learning_rate": 4.9308387766853725e-05, "loss": 0.6465, "step": 200 }, { "epoch": 0.38, "learning_rate": 4.92735454356513e-05, "loss": 0.7055, "step": 205 }, { "epoch": 0.39, "learning_rate": 4.9237859813799535e-05, "loss": 0.6427, "step": 210 }, { "epoch": 0.4, "learning_rate": 4.9201332141057623e-05, "loss": 0.6281, "step": 215 }, { "epoch": 0.41, "learning_rate": 4.9163963686438575e-05, "loss": 0.6499, "step": 220 }, { "epoch": 0.42, "learning_rate": 4.912575574816511e-05, "loss": 0.6558, "step": 225 }, { "epoch": 0.43, "learning_rate": 4.908670965362457e-05, "loss": 0.6523, "step": 230 }, { "epoch": 0.44, "learning_rate": 4.9046826759322825e-05, "loss": 0.6378, "step": 235 }, { "epoch": 0.45, "learning_rate": 4.9006108450837095e-05, "loss": 0.6463, "step": 240 }, { "epoch": 0.46, "learning_rate": 4.8964556142767845e-05, "loss": 0.6825, "step": 245 }, { "epoch": 0.47, "learning_rate": 4.892217127868965e-05, "loss": 0.6576, "step": 250 }, { "epoch": 0.48, "learning_rate": 4.8878955331101026e-05, "loss": 0.687, "step": 255 }, { "epoch": 0.49, "learning_rate": 4.8834909801373264e-05, "loss": 0.6567, "step": 260 }, { "epoch": 0.5, "learning_rate": 4.879003621969831e-05, "loss": 0.6697, "step": 265 }, { "epoch": 0.51, "learning_rate": 4.874433614503554e-05, "loss": 0.6451, "step": 270 }, { "epoch": 0.52, "learning_rate": 4.869781116505768e-05, "loss": 0.6601, "step": 275 }, { "epoch": 0.53, "learning_rate": 4.8650462896095597e-05, "loss": 0.677, "step": 280 }, { "epoch": 0.53, "learning_rate": 4.860229298308213e-05, "loss": 0.676, "step": 285 }, { "epoch": 0.54, "learning_rate": 4.8553303099495e-05, "loss": 0.6286, "step": 290 }, { "epoch": 0.55, "learning_rate": 4.8503494947298634e-05, "loss": 0.6844, "step": 295 }, { "epoch": 0.56, "learning_rate": 4.845287025688503e-05, "loss": 0.6389, "step": 300 }, { "epoch": 0.57, "learning_rate": 4.8401430787013666e-05, "loss": 0.641, "step": 305 }, { "epoch": 0.58, "learning_rate": 4.8349178324750387e-05, "loss": 0.6959, "step": 310 }, { "epoch": 0.59, "learning_rate": 4.8296114685405324e-05, "loss": 0.6356, "step": 315 }, { "epoch": 0.6, "learning_rate": 4.824224171246981e-05, "loss": 0.6673, "step": 320 }, { "epoch": 0.61, "learning_rate": 4.8187561277552374e-05, "loss": 0.5884, "step": 325 }, { "epoch": 0.62, "learning_rate": 4.813207528031366e-05, "loss": 0.6729, "step": 330 }, { "epoch": 0.63, "learning_rate": 4.807578564840051e-05, "loss": 0.6228, "step": 335 }, { "epoch": 0.64, "learning_rate": 4.801869433737891e-05, "loss": 0.6136, "step": 340 }, { "epoch": 0.65, "learning_rate": 4.796080333066613e-05, "loss": 0.634, "step": 345 }, { "epoch": 0.66, "learning_rate": 4.790211463946174e-05, "loss": 0.6636, "step": 350 }, { "epoch": 0.67, "learning_rate": 4.784263030267781e-05, "loss": 0.64, "step": 355 }, { "epoch": 0.68, "learning_rate": 4.7782352386868035e-05, "loss": 0.6477, "step": 360 }, { "epoch": 0.68, "learning_rate": 4.7721282986155945e-05, "loss": 0.6591, "step": 365 }, { "epoch": 0.69, "learning_rate": 4.7659424222162165e-05, "loss": 0.626, "step": 370 }, { "epoch": 0.7, "learning_rate": 4.7596778243930694e-05, "loss": 0.6154, "step": 375 }, { "epoch": 0.71, "learning_rate": 4.7533347227854265e-05, "loss": 0.6612, "step": 380 }, { "epoch": 0.72, "learning_rate": 4.7469133377598695e-05, "loss": 0.6235, "step": 385 }, { "epoch": 0.73, "learning_rate": 4.740413892402639e-05, "loss": 0.6507, "step": 390 }, { "epoch": 0.74, "learning_rate": 4.7338366125118775e-05, "loss": 0.6008, "step": 395 }, { "epoch": 0.75, "learning_rate": 4.727181726589789e-05, "loss": 0.611, "step": 400 }, { "epoch": 0.76, "learning_rate": 4.7204494658346996e-05, "loss": 0.638, "step": 405 }, { "epoch": 0.77, "learning_rate": 4.713640064133025e-05, "loss": 0.6045, "step": 410 }, { "epoch": 0.78, "learning_rate": 4.706753758051145e-05, "loss": 0.6266, "step": 415 }, { "epoch": 0.79, "learning_rate": 4.699790786827188e-05, "loss": 0.6175, "step": 420 }, { "epoch": 0.8, "learning_rate": 4.6927513923627124e-05, "loss": 0.6393, "step": 425 }, { "epoch": 0.81, "learning_rate": 4.68563581921431e-05, "loss": 0.6792, "step": 430 }, { "epoch": 0.82, "learning_rate": 4.6784443145851074e-05, "loss": 0.6274, "step": 435 }, { "epoch": 0.83, "learning_rate": 4.671177128316176e-05, "loss": 0.6671, "step": 440 }, { "epoch": 0.83, "learning_rate": 4.663834512877853e-05, "loss": 0.6933, "step": 445 }, { "epoch": 0.84, "learning_rate": 4.6564167233609736e-05, "loss": 0.6553, "step": 450 }, { "epoch": 0.85, "learning_rate": 4.648924017468003e-05, "loss": 0.6261, "step": 455 }, { "epoch": 0.86, "learning_rate": 4.6413566555040896e-05, "loss": 0.6421, "step": 460 }, { "epoch": 0.87, "learning_rate": 4.633714900368018e-05, "loss": 0.6464, "step": 465 }, { "epoch": 0.88, "learning_rate": 4.625999017543075e-05, "loss": 0.6902, "step": 470 }, { "epoch": 0.89, "learning_rate": 4.618209275087829e-05, "loss": 0.655, "step": 475 }, { "epoch": 0.9, "learning_rate": 4.610345943626817e-05, "loss": 0.5977, "step": 480 }, { "epoch": 0.91, "learning_rate": 4.602409296341141e-05, "loss": 0.6449, "step": 485 }, { "epoch": 0.92, "learning_rate": 4.5943996089589775e-05, "loss": 0.6055, "step": 490 }, { "epoch": 0.93, "learning_rate": 4.586317159746001e-05, "loss": 0.6475, "step": 495 }, { "epoch": 0.94, "learning_rate": 4.5781622294957136e-05, "loss": 0.6492, "step": 500 }, { "epoch": 0.95, "learning_rate": 4.569935101519692e-05, "loss": 0.6209, "step": 505 }, { "epoch": 0.96, "learning_rate": 4.561636061637745e-05, "loss": 0.6437, "step": 510 }, { "epoch": 0.97, "learning_rate": 4.553265398167981e-05, "loss": 0.616, "step": 515 }, { "epoch": 0.98, "learning_rate": 4.5448234019167945e-05, "loss": 0.6512, "step": 520 }, { "epoch": 0.98, "learning_rate": 4.536310366168763e-05, "loss": 0.6408, "step": 525 }, { "epoch": 0.99, "learning_rate": 4.5277265866764565e-05, "loss": 0.6349, "step": 530 }, { "epoch": 1.0, "learning_rate": 4.519072361650163e-05, "loss": 0.6541, "step": 535 }, { "epoch": 1.01, "learning_rate": 4.5103479917475286e-05, "loss": 0.6365, "step": 540 }, { "epoch": 1.02, "learning_rate": 4.501553780063113e-05, "loss": 0.5719, "step": 545 }, { "epoch": 1.03, "learning_rate": 4.4926900321178595e-05, "loss": 0.5938, "step": 550 }, { "epoch": 1.04, "learning_rate": 4.483757055848479e-05, "loss": 0.6431, "step": 555 }, { "epoch": 1.05, "learning_rate": 4.4747551615967534e-05, "loss": 0.6333, "step": 560 }, { "epoch": 1.06, "learning_rate": 4.4656846620987557e-05, "loss": 0.6413, "step": 565 }, { "epoch": 1.07, "learning_rate": 4.4565458724739825e-05, "loss": 0.639, "step": 570 }, { "epoch": 1.08, "learning_rate": 4.447339110214405e-05, "loss": 0.6792, "step": 575 }, { "epoch": 1.09, "learning_rate": 4.438064695173446e-05, "loss": 0.606, "step": 580 }, { "epoch": 1.1, "learning_rate": 4.428722949554857e-05, "loss": 0.6392, "step": 585 }, { "epoch": 1.11, "learning_rate": 4.419314197901537e-05, "loss": 0.5808, "step": 590 }, { "epoch": 1.12, "learning_rate": 4.4098387670842466e-05, "loss": 0.5524, "step": 595 }, { "epoch": 1.13, "learning_rate": 4.400296986290258e-05, "loss": 0.5939, "step": 600 }, { "epoch": 1.13, "learning_rate": 4.390689187011917e-05, "loss": 0.6328, "step": 605 }, { "epoch": 1.14, "learning_rate": 4.3810157030351276e-05, "loss": 0.6542, "step": 610 }, { "epoch": 1.15, "learning_rate": 4.371276870427753e-05, "loss": 0.5911, "step": 615 }, { "epoch": 1.16, "learning_rate": 4.3614730275279457e-05, "loss": 0.6093, "step": 620 }, { "epoch": 1.17, "learning_rate": 4.351604514932387e-05, "loss": 0.5915, "step": 625 }, { "epoch": 1.18, "learning_rate": 4.341671675484459e-05, "loss": 0.6436, "step": 630 }, { "epoch": 1.19, "learning_rate": 4.331674854262331e-05, "loss": 0.6547, "step": 635 }, { "epoch": 1.2, "learning_rate": 4.321614398566972e-05, "loss": 0.6356, "step": 640 }, { "epoch": 1.21, "learning_rate": 4.3114906579100853e-05, "loss": 0.6458, "step": 645 }, { "epoch": 1.22, "learning_rate": 4.301303984001967e-05, "loss": 0.6527, "step": 650 }, { "epoch": 1.23, "learning_rate": 4.291054730739286e-05, "loss": 0.6872, "step": 655 }, { "epoch": 1.24, "learning_rate": 4.2807432541927865e-05, "loss": 0.5961, "step": 660 }, { "epoch": 1.25, "learning_rate": 4.2703699125949245e-05, "loss": 0.6227, "step": 665 }, { "epoch": 1.26, "learning_rate": 4.259935066327415e-05, "loss": 0.6541, "step": 670 }, { "epoch": 1.27, "learning_rate": 4.2494390779087187e-05, "loss": 0.654, "step": 675 }, { "epoch": 1.28, "learning_rate": 4.238882311981441e-05, "loss": 0.6677, "step": 680 }, { "epoch": 1.28, "learning_rate": 4.228265135299669e-05, "loss": 0.568, "step": 685 }, { "epoch": 1.29, "learning_rate": 4.2175879167162304e-05, "loss": 0.5655, "step": 690 }, { "epoch": 1.3, "learning_rate": 4.206851027169871e-05, "loss": 0.5781, "step": 695 }, { "epoch": 1.31, "learning_rate": 4.196054839672382e-05, "loss": 0.599, "step": 700 }, { "epoch": 1.32, "learning_rate": 4.1851997292956255e-05, "loss": 0.6432, "step": 705 }, { "epoch": 1.33, "learning_rate": 4.174286073158516e-05, "loss": 0.685, "step": 710 }, { "epoch": 1.34, "learning_rate": 4.163314250413913e-05, "loss": 0.6159, "step": 715 }, { "epoch": 1.35, "learning_rate": 4.152284642235452e-05, "loss": 0.6293, "step": 720 }, { "epoch": 1.36, "learning_rate": 4.141197631804298e-05, "loss": 0.608, "step": 725 }, { "epoch": 1.37, "learning_rate": 4.1300536042958354e-05, "loss": 0.6286, "step": 730 }, { "epoch": 1.38, "learning_rate": 4.118852946866291e-05, "loss": 0.6234, "step": 735 }, { "epoch": 1.39, "learning_rate": 4.107596048639274e-05, "loss": 0.6079, "step": 740 }, { "epoch": 1.4, "learning_rate": 4.0962833006922675e-05, "loss": 0.6374, "step": 745 }, { "epoch": 1.41, "learning_rate": 4.0849150960430356e-05, "loss": 0.6089, "step": 750 }, { "epoch": 1.42, "learning_rate": 4.0734918296359716e-05, "loss": 0.6256, "step": 755 }, { "epoch": 1.43, "learning_rate": 4.0620138983283785e-05, "loss": 0.59, "step": 760 }, { "epoch": 1.43, "learning_rate": 4.050481700876677e-05, "loss": 0.5918, "step": 765 }, { "epoch": 1.44, "learning_rate": 4.038895637922559e-05, "loss": 0.5828, "step": 770 }, { "epoch": 1.45, "learning_rate": 4.027256111979063e-05, "loss": 0.6087, "step": 775 }, { "epoch": 1.46, "learning_rate": 4.015563527416595e-05, "loss": 0.6005, "step": 780 }, { "epoch": 1.47, "learning_rate": 4.003818290448876e-05, "loss": 0.6306, "step": 785 }, { "epoch": 1.48, "learning_rate": 3.992020809118832e-05, "loss": 0.5853, "step": 790 }, { "epoch": 1.49, "learning_rate": 3.980171493284418e-05, "loss": 0.6681, "step": 795 }, { "epoch": 1.5, "learning_rate": 3.9682707546043785e-05, "loss": 0.6093, "step": 800 } ], "logging_steps": 5, "max_steps": 2665, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "total_flos": 6.268370894910259e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }