{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.913937547600914, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 6.756756756756757e-08, "loss": 2.0057, "step": 1 }, { "epoch": 0.01, "learning_rate": 3.378378378378379e-07, "loss": 2.0564, "step": 5 }, { "epoch": 0.01, "learning_rate": 6.756756756756758e-07, "loss": 2.0403, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0135135135135136e-06, "loss": 2.0396, "step": 15 }, { "epoch": 0.02, "learning_rate": 1.3513513513513515e-06, "loss": 1.9947, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.6891891891891894e-06, "loss": 2.0448, "step": 25 }, { "epoch": 0.03, "learning_rate": 2.0270270270270273e-06, "loss": 2.0019, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.364864864864865e-06, "loss": 1.9959, "step": 35 }, { "epoch": 0.04, "learning_rate": 2.702702702702703e-06, "loss": 2.0093, "step": 40 }, { "epoch": 0.05, "learning_rate": 3.040540540540541e-06, "loss": 2.0242, "step": 45 }, { "epoch": 0.05, "learning_rate": 3.3783783783783788e-06, "loss": 1.9715, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.7162162162162162e-06, "loss": 1.9773, "step": 55 }, { "epoch": 0.06, "learning_rate": 4.0540540540540545e-06, "loss": 1.9811, "step": 60 }, { "epoch": 0.07, "learning_rate": 4.391891891891892e-06, "loss": 1.9947, "step": 65 }, { "epoch": 0.07, "learning_rate": 4.72972972972973e-06, "loss": 1.988, "step": 70 }, { "epoch": 0.08, "learning_rate": 5.067567567567568e-06, "loss": 1.9718, "step": 75 }, { "epoch": 0.08, "learning_rate": 5.405405405405406e-06, "loss": 1.9505, "step": 80 }, { "epoch": 0.09, "learning_rate": 5.743243243243244e-06, "loss": 1.9706, "step": 85 }, { "epoch": 0.09, "learning_rate": 6.081081081081082e-06, "loss": 1.9506, "step": 90 }, { "epoch": 0.1, "learning_rate": 6.41891891891892e-06, "loss": 1.9243, "step": 95 }, { "epoch": 0.1, "learning_rate": 6.7567567567567575e-06, "loss": 1.9785, "step": 100 }, { "epoch": 0.11, "learning_rate": 7.0945945945945946e-06, "loss": 1.8531, "step": 105 }, { "epoch": 0.11, "learning_rate": 7.4324324324324324e-06, "loss": 1.9446, "step": 110 }, { "epoch": 0.12, "learning_rate": 7.77027027027027e-06, "loss": 1.9, "step": 115 }, { "epoch": 0.12, "learning_rate": 8.108108108108109e-06, "loss": 1.8533, "step": 120 }, { "epoch": 0.13, "learning_rate": 8.445945945945948e-06, "loss": 1.92, "step": 125 }, { "epoch": 0.13, "learning_rate": 8.783783783783785e-06, "loss": 1.8792, "step": 130 }, { "epoch": 0.14, "learning_rate": 9.121621621621622e-06, "loss": 1.9132, "step": 135 }, { "epoch": 0.14, "learning_rate": 9.45945945945946e-06, "loss": 1.8309, "step": 140 }, { "epoch": 0.15, "learning_rate": 9.797297297297298e-06, "loss": 1.8194, "step": 145 }, { "epoch": 0.15, "learning_rate": 1.0135135135135136e-05, "loss": 1.7605, "step": 150 }, { "epoch": 0.16, "learning_rate": 1.0472972972972975e-05, "loss": 1.806, "step": 155 }, { "epoch": 0.16, "learning_rate": 1.0810810810810812e-05, "loss": 1.879, "step": 160 }, { "epoch": 0.17, "learning_rate": 1.114864864864865e-05, "loss": 1.7906, "step": 165 }, { "epoch": 0.17, "learning_rate": 1.1486486486486488e-05, "loss": 1.7865, "step": 170 }, { "epoch": 0.18, "learning_rate": 1.1824324324324325e-05, "loss": 1.855, "step": 175 }, { "epoch": 0.18, "learning_rate": 1.2162162162162164e-05, "loss": 1.7428, "step": 180 }, { "epoch": 0.19, "learning_rate": 1.25e-05, "loss": 1.8219, "step": 185 }, { "epoch": 0.19, "learning_rate": 1.283783783783784e-05, "loss": 1.7607, "step": 190 }, { "epoch": 0.2, "learning_rate": 1.3175675675675676e-05, "loss": 1.8289, "step": 195 }, { "epoch": 0.2, "learning_rate": 1.3513513513513515e-05, "loss": 1.8311, "step": 200 }, { "epoch": 0.21, "learning_rate": 1.3851351351351352e-05, "loss": 1.7481, "step": 205 }, { "epoch": 0.21, "learning_rate": 1.4189189189189189e-05, "loss": 1.795, "step": 210 }, { "epoch": 0.22, "learning_rate": 1.4527027027027028e-05, "loss": 1.7709, "step": 215 }, { "epoch": 0.22, "learning_rate": 1.4864864864864865e-05, "loss": 1.7569, "step": 220 }, { "epoch": 0.23, "learning_rate": 1.5202702702702704e-05, "loss": 1.7625, "step": 225 }, { "epoch": 0.23, "learning_rate": 1.554054054054054e-05, "loss": 1.7773, "step": 230 }, { "epoch": 0.24, "learning_rate": 1.587837837837838e-05, "loss": 1.7449, "step": 235 }, { "epoch": 0.24, "learning_rate": 1.6216216216216218e-05, "loss": 1.7536, "step": 240 }, { "epoch": 0.25, "learning_rate": 1.6554054054054057e-05, "loss": 1.7815, "step": 245 }, { "epoch": 0.25, "learning_rate": 1.6891891891891896e-05, "loss": 1.81, "step": 250 }, { "epoch": 0.26, "learning_rate": 1.722972972972973e-05, "loss": 1.7928, "step": 255 }, { "epoch": 0.26, "learning_rate": 1.756756756756757e-05, "loss": 1.7427, "step": 260 }, { "epoch": 0.27, "learning_rate": 1.790540540540541e-05, "loss": 1.6618, "step": 265 }, { "epoch": 0.27, "learning_rate": 1.8243243243243244e-05, "loss": 1.7179, "step": 270 }, { "epoch": 0.28, "learning_rate": 1.8581081081081082e-05, "loss": 1.7514, "step": 275 }, { "epoch": 0.28, "learning_rate": 1.891891891891892e-05, "loss": 1.7457, "step": 280 }, { "epoch": 0.29, "learning_rate": 1.925675675675676e-05, "loss": 1.8168, "step": 285 }, { "epoch": 0.29, "learning_rate": 1.9594594594594595e-05, "loss": 1.7466, "step": 290 }, { "epoch": 0.3, "learning_rate": 1.9932432432432434e-05, "loss": 1.7226, "step": 295 }, { "epoch": 0.3, "learning_rate": 1.9999888073536733e-05, "loss": 1.7554, "step": 300 }, { "epoch": 0.31, "learning_rate": 1.9999433376573803e-05, "loss": 1.8218, "step": 305 }, { "epoch": 0.31, "learning_rate": 1.9998628929598956e-05, "loss": 1.7816, "step": 310 }, { "epoch": 0.32, "learning_rate": 1.9997474760749314e-05, "loss": 1.7414, "step": 315 }, { "epoch": 0.32, "learning_rate": 1.9995970910394228e-05, "loss": 1.778, "step": 320 }, { "epoch": 0.33, "learning_rate": 1.9994117431133843e-05, "loss": 1.7267, "step": 325 }, { "epoch": 0.34, "learning_rate": 1.9991914387797266e-05, "loss": 1.6588, "step": 330 }, { "epoch": 0.34, "learning_rate": 1.9989361857440318e-05, "loss": 1.7615, "step": 335 }, { "epoch": 0.35, "learning_rate": 1.99864599293428e-05, "loss": 1.7038, "step": 340 }, { "epoch": 0.35, "learning_rate": 1.998320870500541e-05, "loss": 1.751, "step": 345 }, { "epoch": 0.36, "learning_rate": 1.997960829814616e-05, "loss": 1.695, "step": 350 }, { "epoch": 0.36, "learning_rate": 1.997565883469643e-05, "loss": 1.6773, "step": 355 }, { "epoch": 0.37, "learning_rate": 1.9971360452796523e-05, "loss": 1.7159, "step": 360 }, { "epoch": 0.37, "learning_rate": 1.9966713302790875e-05, "loss": 1.7268, "step": 365 }, { "epoch": 0.38, "learning_rate": 1.9961717547222775e-05, "loss": 1.7641, "step": 370 }, { "epoch": 0.38, "learning_rate": 1.995637336082868e-05, "loss": 1.7376, "step": 375 }, { "epoch": 0.39, "learning_rate": 1.9950680930532107e-05, "loss": 1.7687, "step": 380 }, { "epoch": 0.39, "learning_rate": 1.994464045543709e-05, "loss": 1.7248, "step": 385 }, { "epoch": 0.4, "learning_rate": 1.9938252146821236e-05, "loss": 1.7281, "step": 390 }, { "epoch": 0.4, "learning_rate": 1.9931516228128295e-05, "loss": 1.7265, "step": 395 }, { "epoch": 0.41, "learning_rate": 1.9924432934960384e-05, "loss": 1.7459, "step": 400 }, { "epoch": 0.41, "learning_rate": 1.9917002515069732e-05, "loss": 1.719, "step": 405 }, { "epoch": 0.42, "learning_rate": 1.9909225228350007e-05, "loss": 1.6819, "step": 410 }, { "epoch": 0.42, "learning_rate": 1.9901101346827233e-05, "loss": 1.6849, "step": 415 }, { "epoch": 0.43, "learning_rate": 1.989263115465028e-05, "loss": 1.7905, "step": 420 }, { "epoch": 0.43, "learning_rate": 1.9883814948080918e-05, "loss": 1.7414, "step": 425 }, { "epoch": 0.44, "learning_rate": 1.987465303548345e-05, "loss": 1.7082, "step": 430 }, { "epoch": 0.44, "learning_rate": 1.9865145737313937e-05, "loss": 1.743, "step": 435 }, { "epoch": 0.45, "learning_rate": 1.9855293386108995e-05, "loss": 1.7089, "step": 440 }, { "epoch": 0.45, "learning_rate": 1.9845096326474135e-05, "loss": 1.7491, "step": 445 }, { "epoch": 0.46, "learning_rate": 1.9834554915071745e-05, "loss": 1.6897, "step": 450 }, { "epoch": 0.46, "learning_rate": 1.982366952060859e-05, "loss": 1.7759, "step": 455 }, { "epoch": 0.47, "learning_rate": 1.981244052382293e-05, "loss": 1.7774, "step": 460 }, { "epoch": 0.47, "learning_rate": 1.9800868317471196e-05, "loss": 1.6844, "step": 465 }, { "epoch": 0.48, "learning_rate": 1.978895330631425e-05, "loss": 1.7419, "step": 470 }, { "epoch": 0.48, "learning_rate": 1.977669590710324e-05, "loss": 1.7966, "step": 475 }, { "epoch": 0.49, "learning_rate": 1.976409654856501e-05, "loss": 1.6607, "step": 480 }, { "epoch": 0.49, "learning_rate": 1.975115567138711e-05, "loss": 1.8121, "step": 485 }, { "epoch": 0.5, "learning_rate": 1.9737873728202376e-05, "loss": 1.7282, "step": 490 }, { "epoch": 0.5, "learning_rate": 1.972425118357312e-05, "loss": 1.6864, "step": 495 }, { "epoch": 0.51, "learning_rate": 1.9710288513974846e-05, "loss": 1.7665, "step": 500 }, { "epoch": 0.51, "learning_rate": 1.969598620777962e-05, "loss": 1.6843, "step": 505 }, { "epoch": 0.52, "learning_rate": 1.9681344765238958e-05, "loss": 1.6675, "step": 510 }, { "epoch": 0.52, "learning_rate": 1.966636469846635e-05, "loss": 1.7891, "step": 515 }, { "epoch": 0.53, "learning_rate": 1.9651046531419335e-05, "loss": 1.6728, "step": 520 }, { "epoch": 0.53, "learning_rate": 1.9635390799881186e-05, "loss": 1.7289, "step": 525 }, { "epoch": 0.54, "learning_rate": 1.961939805144216e-05, "loss": 1.7241, "step": 530 }, { "epoch": 0.54, "learning_rate": 1.9603068845480347e-05, "loss": 1.7378, "step": 535 }, { "epoch": 0.55, "learning_rate": 1.9586403753142104e-05, "loss": 1.7717, "step": 540 }, { "epoch": 0.55, "learning_rate": 1.956940335732209e-05, "loss": 1.7401, "step": 545 }, { "epoch": 0.56, "learning_rate": 1.9552068252642858e-05, "loss": 1.6758, "step": 550 }, { "epoch": 0.56, "learning_rate": 1.9534399045434073e-05, "loss": 1.7241, "step": 555 }, { "epoch": 0.57, "learning_rate": 1.9516396353711297e-05, "loss": 1.7026, "step": 560 }, { "epoch": 0.57, "learning_rate": 1.9498060807154368e-05, "loss": 1.7662, "step": 565 }, { "epoch": 0.58, "learning_rate": 1.9479393047085392e-05, "loss": 1.6935, "step": 570 }, { "epoch": 0.58, "learning_rate": 1.94603937264463e-05, "loss": 1.7303, "step": 575 }, { "epoch": 0.59, "learning_rate": 1.9441063509776003e-05, "loss": 1.7758, "step": 580 }, { "epoch": 0.59, "learning_rate": 1.9421403073187162e-05, "loss": 1.7017, "step": 585 }, { "epoch": 0.6, "learning_rate": 1.9401413104342535e-05, "loss": 1.7726, "step": 590 }, { "epoch": 0.6, "learning_rate": 1.938109430243093e-05, "loss": 1.6689, "step": 595 }, { "epoch": 0.61, "learning_rate": 1.936044737814273e-05, "loss": 1.6734, "step": 600 }, { "epoch": 0.61, "learning_rate": 1.9339473053645056e-05, "loss": 1.656, "step": 605 }, { "epoch": 0.62, "learning_rate": 1.931817206255651e-05, "loss": 1.7196, "step": 610 }, { "epoch": 0.62, "learning_rate": 1.9296545149921488e-05, "loss": 1.686, "step": 615 }, { "epoch": 0.63, "learning_rate": 1.9274593072184152e-05, "loss": 1.7709, "step": 620 }, { "epoch": 0.63, "learning_rate": 1.9252316597161947e-05, "loss": 1.6834, "step": 625 }, { "epoch": 0.64, "learning_rate": 1.922971650401877e-05, "loss": 1.6787, "step": 630 }, { "epoch": 0.64, "learning_rate": 1.920679358323769e-05, "loss": 1.6922, "step": 635 }, { "epoch": 0.65, "learning_rate": 1.9183548636593322e-05, "loss": 1.7244, "step": 640 }, { "epoch": 0.65, "learning_rate": 1.9159982477123776e-05, "loss": 1.7187, "step": 645 }, { "epoch": 0.66, "learning_rate": 1.9136095929102204e-05, "loss": 1.6827, "step": 650 }, { "epoch": 0.67, "learning_rate": 1.9111889828007997e-05, "loss": 1.7466, "step": 655 }, { "epoch": 0.67, "learning_rate": 1.908736502049754e-05, "loss": 1.7892, "step": 660 }, { "epoch": 0.68, "learning_rate": 1.9062522364374617e-05, "loss": 1.744, "step": 665 }, { "epoch": 0.68, "learning_rate": 1.903736272856038e-05, "loss": 1.6438, "step": 670 }, { "epoch": 0.69, "learning_rate": 1.9011886993062994e-05, "loss": 1.7538, "step": 675 }, { "epoch": 0.69, "learning_rate": 1.8986096048946826e-05, "loss": 1.6889, "step": 680 }, { "epoch": 0.7, "learning_rate": 1.8959990798301286e-05, "loss": 1.7199, "step": 685 }, { "epoch": 0.7, "learning_rate": 1.893357215420929e-05, "loss": 1.7801, "step": 690 }, { "epoch": 0.71, "learning_rate": 1.8906841040715304e-05, "loss": 1.7559, "step": 695 }, { "epoch": 0.71, "learning_rate": 1.8879798392793033e-05, "loss": 1.6639, "step": 700 }, { "epoch": 0.72, "learning_rate": 1.8852445156312713e-05, "loss": 1.687, "step": 705 }, { "epoch": 0.72, "learning_rate": 1.8824782288008038e-05, "loss": 1.7619, "step": 710 }, { "epoch": 0.73, "learning_rate": 1.8796810755442675e-05, "loss": 1.7529, "step": 715 }, { "epoch": 0.73, "learning_rate": 1.8768531536976452e-05, "loss": 1.7513, "step": 720 }, { "epoch": 0.74, "learning_rate": 1.8739945621731114e-05, "loss": 1.7322, "step": 725 }, { "epoch": 0.74, "learning_rate": 1.8711054009555736e-05, "loss": 1.7762, "step": 730 }, { "epoch": 0.75, "learning_rate": 1.8681857710991745e-05, "loss": 1.6949, "step": 735 }, { "epoch": 0.75, "learning_rate": 1.865235774723758e-05, "loss": 1.6526, "step": 740 }, { "epoch": 0.76, "learning_rate": 1.862255515011298e-05, "loss": 1.7155, "step": 745 }, { "epoch": 0.76, "learning_rate": 1.8592450962022872e-05, "loss": 1.8251, "step": 750 }, { "epoch": 0.77, "learning_rate": 1.8562046235920938e-05, "loss": 1.7077, "step": 755 }, { "epoch": 0.77, "learning_rate": 1.8531342035272768e-05, "loss": 1.6627, "step": 760 }, { "epoch": 0.78, "learning_rate": 1.850033943401867e-05, "loss": 1.7163, "step": 765 }, { "epoch": 0.78, "learning_rate": 1.84690395165361e-05, "loss": 1.713, "step": 770 }, { "epoch": 0.79, "learning_rate": 1.8437443377601736e-05, "loss": 1.7024, "step": 775 }, { "epoch": 0.79, "learning_rate": 1.8405552122353212e-05, "loss": 1.7253, "step": 780 }, { "epoch": 0.8, "learning_rate": 1.8373366866250407e-05, "loss": 1.7109, "step": 785 }, { "epoch": 0.8, "learning_rate": 1.8340888735036485e-05, "loss": 1.7563, "step": 790 }, { "epoch": 0.81, "learning_rate": 1.830811886469849e-05, "loss": 1.7479, "step": 795 }, { "epoch": 0.81, "learning_rate": 1.8275058401427622e-05, "loss": 1.7315, "step": 800 }, { "epoch": 0.82, "learning_rate": 1.8241708501579146e-05, "loss": 1.7126, "step": 805 }, { "epoch": 0.82, "learning_rate": 1.8208070331631943e-05, "loss": 1.6866, "step": 810 }, { "epoch": 0.83, "learning_rate": 1.817414506814772e-05, "loss": 1.6708, "step": 815 }, { "epoch": 0.83, "learning_rate": 1.8139933897729833e-05, "loss": 1.6868, "step": 820 }, { "epoch": 0.84, "learning_rate": 1.8105438016981816e-05, "loss": 1.7493, "step": 825 }, { "epoch": 0.84, "learning_rate": 1.807065863246551e-05, "loss": 1.7707, "step": 830 }, { "epoch": 0.85, "learning_rate": 1.8035596960658856e-05, "loss": 1.7692, "step": 835 }, { "epoch": 0.85, "learning_rate": 1.8000254227913346e-05, "loss": 1.7761, "step": 840 }, { "epoch": 0.86, "learning_rate": 1.7964631670411154e-05, "loss": 1.6765, "step": 845 }, { "epoch": 0.86, "learning_rate": 1.7928730534121872e-05, "loss": 1.6437, "step": 850 }, { "epoch": 0.87, "learning_rate": 1.7892552074758932e-05, "loss": 1.6804, "step": 855 }, { "epoch": 0.87, "learning_rate": 1.7856097557735697e-05, "loss": 1.7373, "step": 860 }, { "epoch": 0.88, "learning_rate": 1.7819368258121188e-05, "loss": 1.7316, "step": 865 }, { "epoch": 0.88, "learning_rate": 1.77823654605955e-05, "loss": 1.6764, "step": 870 }, { "epoch": 0.89, "learning_rate": 1.7745090459404853e-05, "loss": 1.7456, "step": 875 }, { "epoch": 0.89, "learning_rate": 1.7707544558316332e-05, "loss": 1.6518, "step": 880 }, { "epoch": 0.9, "learning_rate": 1.766972907057229e-05, "loss": 1.7705, "step": 885 }, { "epoch": 0.9, "learning_rate": 1.763164531884439e-05, "loss": 1.7431, "step": 890 }, { "epoch": 0.91, "learning_rate": 1.7593294635187385e-05, "loss": 1.6928, "step": 895 }, { "epoch": 0.91, "learning_rate": 1.7554678360992475e-05, "loss": 1.7483, "step": 900 }, { "epoch": 0.91, "step": 900, "total_flos": 1.2656331456763658e+18, "train_loss": 0.0, "train_runtime": 3.9268, "train_samples_per_second": 12034.857, "train_steps_per_second": 187.937 } ], "logging_steps": 5, "max_steps": 738, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.2656331456763658e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }