{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15998240193578706, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.9999964908081455e-05, "loss": 0.7285, "step": 5 }, { "epoch": 0.0, "learning_rate": 4.999985963242432e-05, "loss": 0.6712, "step": 10 }, { "epoch": 0.0, "learning_rate": 4.999968417332415e-05, "loss": 0.6081, "step": 15 }, { "epoch": 0.0, "learning_rate": 4.999943853127351e-05, "loss": 0.6383, "step": 20 }, { "epoch": 0.0, "learning_rate": 4.999912270696202e-05, "loss": 0.6456, "step": 25 }, { "epoch": 0.0, "learning_rate": 4.9998736701276295e-05, "loss": 0.6228, "step": 30 }, { "epoch": 0.01, "learning_rate": 4.99982805153e-05, "loss": 0.6134, "step": 35 }, { "epoch": 0.01, "learning_rate": 4.9997754150313815e-05, "loss": 0.5975, "step": 40 }, { "epoch": 0.01, "learning_rate": 4.999715760779541e-05, "loss": 0.6053, "step": 45 }, { "epoch": 0.01, "learning_rate": 4.9996490889419514e-05, "loss": 0.6064, "step": 50 }, { "epoch": 0.01, "learning_rate": 4.999575399705783e-05, "loss": 0.5947, "step": 55 }, { "epoch": 0.01, "learning_rate": 4.999494693277907e-05, "loss": 0.5839, "step": 60 }, { "epoch": 0.01, "learning_rate": 4.999406969884897e-05, "loss": 0.6106, "step": 65 }, { "epoch": 0.01, "learning_rate": 4.999312229773022e-05, "loss": 0.6146, "step": 70 }, { "epoch": 0.01, "learning_rate": 4.99921047320825e-05, "loss": 0.5659, "step": 75 }, { "epoch": 0.01, "learning_rate": 4.9991017004762496e-05, "loss": 0.5682, "step": 80 }, { "epoch": 0.01, "learning_rate": 4.998985911882384e-05, "loss": 0.6352, "step": 85 }, { "epoch": 0.01, "learning_rate": 4.998863107751711e-05, "loss": 0.6018, "step": 90 }, { "epoch": 0.02, "learning_rate": 4.998733288428987e-05, "loss": 0.6342, "step": 95 }, { "epoch": 0.02, "learning_rate": 4.9985964542786614e-05, "loss": 0.5886, "step": 100 }, { "epoch": 0.02, "learning_rate": 4.998452605684874e-05, "loss": 0.6097, "step": 105 }, { "epoch": 0.02, "learning_rate": 4.998301743051459e-05, "loss": 0.5687, "step": 110 }, { "epoch": 0.02, "learning_rate": 4.998143866801942e-05, "loss": 0.5866, "step": 115 }, { "epoch": 0.02, "learning_rate": 4.997978977379536e-05, "loss": 0.5612, "step": 120 }, { "epoch": 0.02, "learning_rate": 4.997807075247146e-05, "loss": 0.6221, "step": 125 }, { "epoch": 0.02, "learning_rate": 4.997628160887361e-05, "loss": 0.5728, "step": 130 }, { "epoch": 0.02, "learning_rate": 4.997442234802456e-05, "loss": 0.6105, "step": 135 }, { "epoch": 0.02, "learning_rate": 4.997249297514394e-05, "loss": 0.6161, "step": 140 }, { "epoch": 0.02, "learning_rate": 4.997049349564814e-05, "loss": 0.6511, "step": 145 }, { "epoch": 0.02, "learning_rate": 4.996842391515044e-05, "loss": 0.6108, "step": 150 }, { "epoch": 0.02, "learning_rate": 4.996628423946087e-05, "loss": 0.5664, "step": 155 }, { "epoch": 0.03, "learning_rate": 4.996407447458626e-05, "loss": 0.6127, "step": 160 }, { "epoch": 0.03, "learning_rate": 4.99617946267302e-05, "loss": 0.6213, "step": 165 }, { "epoch": 0.03, "learning_rate": 4.995944470229302e-05, "loss": 0.596, "step": 170 }, { "epoch": 0.03, "learning_rate": 4.9957024707871806e-05, "loss": 0.5731, "step": 175 }, { "epoch": 0.03, "learning_rate": 4.995453465026032e-05, "loss": 0.5704, "step": 180 }, { "epoch": 0.03, "learning_rate": 4.995197453644905e-05, "loss": 0.5767, "step": 185 }, { "epoch": 0.03, "learning_rate": 4.994934437362513e-05, "loss": 0.6134, "step": 190 }, { "epoch": 0.03, "learning_rate": 4.9946644169172355e-05, "loss": 0.5919, "step": 195 }, { "epoch": 0.03, "learning_rate": 4.994387393067117e-05, "loss": 0.6031, "step": 200 }, { "epoch": 0.03, "learning_rate": 4.994103366589859e-05, "loss": 0.6236, "step": 205 }, { "epoch": 0.03, "learning_rate": 4.993812338282826e-05, "loss": 0.6307, "step": 210 }, { "epoch": 0.03, "learning_rate": 4.993514308963036e-05, "loss": 0.5618, "step": 215 }, { "epoch": 0.04, "learning_rate": 4.993209279467164e-05, "loss": 0.6093, "step": 220 }, { "epoch": 0.04, "learning_rate": 4.992897250651535e-05, "loss": 0.5744, "step": 225 }, { "epoch": 0.04, "learning_rate": 4.992578223392124e-05, "loss": 0.5844, "step": 230 }, { "epoch": 0.04, "learning_rate": 4.992252198584554e-05, "loss": 0.5358, "step": 235 }, { "epoch": 0.04, "learning_rate": 4.9919191771440905e-05, "loss": 0.6067, "step": 240 }, { "epoch": 0.04, "learning_rate": 4.991579160005644e-05, "loss": 0.6147, "step": 245 }, { "epoch": 0.04, "learning_rate": 4.991232148123761e-05, "loss": 0.6185, "step": 250 }, { "epoch": 0.04, "learning_rate": 4.990878142472628e-05, "loss": 0.5797, "step": 255 }, { "epoch": 0.04, "learning_rate": 4.990517144046064e-05, "loss": 0.58, "step": 260 }, { "epoch": 0.04, "learning_rate": 4.9901491538575185e-05, "loss": 0.6051, "step": 265 }, { "epoch": 0.04, "learning_rate": 4.9897741729400705e-05, "loss": 0.6199, "step": 270 }, { "epoch": 0.04, "learning_rate": 4.9893922023464236e-05, "loss": 0.6173, "step": 275 }, { "epoch": 0.04, "learning_rate": 4.989003243148904e-05, "loss": 0.5907, "step": 280 }, { "epoch": 0.05, "learning_rate": 4.988607296439458e-05, "loss": 0.5818, "step": 285 }, { "epoch": 0.05, "learning_rate": 4.988204363329648e-05, "loss": 0.5767, "step": 290 }, { "epoch": 0.05, "learning_rate": 4.987794444950651e-05, "loss": 0.579, "step": 295 }, { "epoch": 0.05, "learning_rate": 4.987377542453251e-05, "loss": 0.6454, "step": 300 }, { "epoch": 0.05, "learning_rate": 4.986953657007841e-05, "loss": 0.5777, "step": 305 }, { "epoch": 0.05, "learning_rate": 4.986522789804417e-05, "loss": 0.532, "step": 310 }, { "epoch": 0.05, "learning_rate": 4.9860849420525766e-05, "loss": 0.56, "step": 315 }, { "epoch": 0.05, "learning_rate": 4.9856401149815126e-05, "loss": 0.5624, "step": 320 }, { "epoch": 0.05, "learning_rate": 4.985188309840012e-05, "loss": 0.6008, "step": 325 }, { "epoch": 0.05, "learning_rate": 4.9847295278964514e-05, "loss": 0.6425, "step": 330 }, { "epoch": 0.05, "learning_rate": 4.984263770438793e-05, "loss": 0.5907, "step": 335 }, { "epoch": 0.05, "learning_rate": 4.9837910387745845e-05, "loss": 0.5926, "step": 340 }, { "epoch": 0.06, "learning_rate": 4.98331133423095e-05, "loss": 0.6115, "step": 345 }, { "epoch": 0.06, "learning_rate": 4.982824658154589e-05, "loss": 0.5685, "step": 350 }, { "epoch": 0.06, "learning_rate": 4.982331011911774e-05, "loss": 0.5412, "step": 355 }, { "epoch": 0.06, "learning_rate": 4.981830396888344e-05, "loss": 0.6032, "step": 360 }, { "epoch": 0.06, "learning_rate": 4.981322814489703e-05, "loss": 0.568, "step": 365 }, { "epoch": 0.06, "learning_rate": 4.980808266140813e-05, "loss": 0.5835, "step": 370 }, { "epoch": 0.06, "learning_rate": 4.980286753286195e-05, "loss": 0.5633, "step": 375 }, { "epoch": 0.06, "learning_rate": 4.979758277389919e-05, "loss": 0.574, "step": 380 }, { "epoch": 0.06, "learning_rate": 4.979222839935602e-05, "loss": 0.5774, "step": 385 }, { "epoch": 0.06, "learning_rate": 4.9786804424264085e-05, "loss": 0.5608, "step": 390 }, { "epoch": 0.06, "learning_rate": 4.9781310863850405e-05, "loss": 0.5659, "step": 395 }, { "epoch": 0.06, "learning_rate": 4.977574773353732e-05, "loss": 0.6167, "step": 400 }, { "epoch": 0.06, "learning_rate": 4.977011504894252e-05, "loss": 0.5523, "step": 405 }, { "epoch": 0.07, "learning_rate": 4.9764412825878943e-05, "loss": 0.5804, "step": 410 }, { "epoch": 0.07, "learning_rate": 4.975864108035474e-05, "loss": 0.5811, "step": 415 }, { "epoch": 0.07, "learning_rate": 4.975279982857324e-05, "loss": 0.5832, "step": 420 }, { "epoch": 0.07, "learning_rate": 4.9746889086932895e-05, "loss": 0.6035, "step": 425 }, { "epoch": 0.07, "learning_rate": 4.974090887202726e-05, "loss": 0.6077, "step": 430 }, { "epoch": 0.07, "learning_rate": 4.9734859200644905e-05, "loss": 0.6147, "step": 435 }, { "epoch": 0.07, "learning_rate": 4.97287400897694e-05, "loss": 0.6825, "step": 440 }, { "epoch": 0.07, "learning_rate": 4.972255155657925e-05, "loss": 0.5573, "step": 445 }, { "epoch": 0.07, "learning_rate": 4.971629361844785e-05, "loss": 0.5691, "step": 450 }, { "epoch": 0.07, "learning_rate": 4.9709966292943455e-05, "loss": 0.5768, "step": 455 }, { "epoch": 0.07, "learning_rate": 4.970356959782909e-05, "loss": 0.5953, "step": 460 }, { "epoch": 0.07, "learning_rate": 4.9697103551062556e-05, "loss": 0.6207, "step": 465 }, { "epoch": 0.08, "learning_rate": 4.969056817079633e-05, "loss": 0.5845, "step": 470 }, { "epoch": 0.08, "learning_rate": 4.968396347537751e-05, "loss": 0.5719, "step": 475 }, { "epoch": 0.08, "learning_rate": 4.967728948334784e-05, "loss": 0.5889, "step": 480 }, { "epoch": 0.08, "learning_rate": 4.967054621344356e-05, "loss": 0.5574, "step": 485 }, { "epoch": 0.08, "learning_rate": 4.966373368459541e-05, "loss": 0.6037, "step": 490 }, { "epoch": 0.08, "learning_rate": 4.965685191592859e-05, "loss": 0.5707, "step": 495 }, { "epoch": 0.08, "learning_rate": 4.964990092676263e-05, "loss": 0.6586, "step": 500 }, { "epoch": 0.08, "learning_rate": 4.964288073661142e-05, "loss": 0.5559, "step": 505 }, { "epoch": 0.08, "learning_rate": 4.963579136518312e-05, "loss": 0.5609, "step": 510 }, { "epoch": 0.08, "learning_rate": 4.96286328323801e-05, "loss": 0.6081, "step": 515 }, { "epoch": 0.08, "learning_rate": 4.96214051582989e-05, "loss": 0.5724, "step": 520 }, { "epoch": 0.08, "learning_rate": 4.9614108363230135e-05, "loss": 0.5758, "step": 525 }, { "epoch": 0.08, "learning_rate": 4.960674246765851e-05, "loss": 0.6191, "step": 530 }, { "epoch": 0.09, "learning_rate": 4.959930749226269e-05, "loss": 0.5638, "step": 535 }, { "epoch": 0.09, "learning_rate": 4.959180345791528e-05, "loss": 0.5698, "step": 540 }, { "epoch": 0.09, "learning_rate": 4.958423038568274e-05, "loss": 0.5878, "step": 545 }, { "epoch": 0.09, "learning_rate": 4.9576588296825386e-05, "loss": 0.5734, "step": 550 }, { "epoch": 0.09, "learning_rate": 4.956887721279726e-05, "loss": 0.606, "step": 555 }, { "epoch": 0.09, "learning_rate": 4.956109715524608e-05, "loss": 0.5871, "step": 560 }, { "epoch": 0.09, "learning_rate": 4.955324814601324e-05, "loss": 0.6306, "step": 565 }, { "epoch": 0.09, "learning_rate": 4.9545330207133664e-05, "loss": 0.6231, "step": 570 }, { "epoch": 0.09, "learning_rate": 4.953734336083583e-05, "loss": 0.6278, "step": 575 }, { "epoch": 0.09, "learning_rate": 4.952928762954161e-05, "loss": 0.5551, "step": 580 }, { "epoch": 0.09, "learning_rate": 4.952116303586631e-05, "loss": 0.6441, "step": 585 }, { "epoch": 0.09, "learning_rate": 4.951296960261853e-05, "loss": 0.5753, "step": 590 }, { "epoch": 0.1, "learning_rate": 4.9504707352800125e-05, "loss": 0.5458, "step": 595 }, { "epoch": 0.1, "learning_rate": 4.949637630960617e-05, "loss": 0.5668, "step": 600 }, { "epoch": 0.1, "learning_rate": 4.948797649642484e-05, "loss": 0.5727, "step": 605 }, { "epoch": 0.1, "learning_rate": 4.9479507936837364e-05, "loss": 0.628, "step": 610 }, { "epoch": 0.1, "learning_rate": 4.947097065461801e-05, "loss": 0.5958, "step": 615 }, { "epoch": 0.1, "learning_rate": 4.946236467373392e-05, "loss": 0.6173, "step": 620 }, { "epoch": 0.1, "learning_rate": 4.9453690018345144e-05, "loss": 0.5473, "step": 625 }, { "epoch": 0.1, "learning_rate": 4.9444946712804494e-05, "loss": 0.6012, "step": 630 }, { "epoch": 0.1, "learning_rate": 4.943613478165753e-05, "loss": 0.6303, "step": 635 }, { "epoch": 0.1, "learning_rate": 4.9427254249642444e-05, "loss": 0.6188, "step": 640 }, { "epoch": 0.1, "learning_rate": 4.941830514169004e-05, "loss": 0.6079, "step": 645 }, { "epoch": 0.1, "learning_rate": 4.940928748292363e-05, "loss": 0.5589, "step": 650 }, { "epoch": 0.1, "learning_rate": 4.940020129865895e-05, "loss": 0.5652, "step": 655 }, { "epoch": 0.11, "learning_rate": 4.939104661440415e-05, "loss": 0.5415, "step": 660 }, { "epoch": 0.11, "learning_rate": 4.938182345585966e-05, "loss": 0.5657, "step": 665 }, { "epoch": 0.11, "learning_rate": 4.9372531848918145e-05, "loss": 0.6403, "step": 670 }, { "epoch": 0.11, "learning_rate": 4.9363171819664434e-05, "loss": 0.6198, "step": 675 }, { "epoch": 0.11, "learning_rate": 4.935374339437543e-05, "loss": 0.5685, "step": 680 }, { "epoch": 0.11, "learning_rate": 4.934424659952006e-05, "loss": 0.5737, "step": 685 }, { "epoch": 0.11, "learning_rate": 4.933468146175918e-05, "loss": 0.5975, "step": 690 }, { "epoch": 0.11, "learning_rate": 4.9325048007945526e-05, "loss": 0.6033, "step": 695 }, { "epoch": 0.11, "learning_rate": 4.9315346265123594e-05, "loss": 0.6587, "step": 700 }, { "epoch": 0.11, "learning_rate": 4.9305576260529607e-05, "loss": 0.5903, "step": 705 }, { "epoch": 0.11, "learning_rate": 4.929573802159143e-05, "loss": 0.5883, "step": 710 }, { "epoch": 0.11, "learning_rate": 4.9285831575928465e-05, "loss": 0.6151, "step": 715 }, { "epoch": 0.12, "learning_rate": 4.927585695135162e-05, "loss": 0.5687, "step": 720 }, { "epoch": 0.12, "learning_rate": 4.9265814175863186e-05, "loss": 0.6008, "step": 725 }, { "epoch": 0.12, "learning_rate": 4.925570327765678e-05, "loss": 0.6045, "step": 730 }, { "epoch": 0.12, "learning_rate": 4.9245524285117274e-05, "loss": 0.5439, "step": 735 }, { "epoch": 0.12, "learning_rate": 4.9235277226820695e-05, "loss": 0.61, "step": 740 }, { "epoch": 0.12, "learning_rate": 4.922496213153416e-05, "loss": 0.5913, "step": 745 }, { "epoch": 0.12, "learning_rate": 4.9214579028215776e-05, "loss": 0.6255, "step": 750 }, { "epoch": 0.12, "learning_rate": 4.920412794601461e-05, "loss": 0.5834, "step": 755 }, { "epoch": 0.12, "learning_rate": 4.9193608914270515e-05, "loss": 0.6102, "step": 760 }, { "epoch": 0.12, "learning_rate": 4.918302196251415e-05, "loss": 0.5591, "step": 765 }, { "epoch": 0.12, "learning_rate": 4.917236712046682e-05, "loss": 0.556, "step": 770 }, { "epoch": 0.12, "learning_rate": 4.916164441804044e-05, "loss": 0.5774, "step": 775 }, { "epoch": 0.12, "learning_rate": 4.9150853885337426e-05, "loss": 0.6256, "step": 780 }, { "epoch": 0.13, "learning_rate": 4.913999555265062e-05, "loss": 0.6505, "step": 785 }, { "epoch": 0.13, "learning_rate": 4.9129069450463186e-05, "loss": 0.6012, "step": 790 }, { "epoch": 0.13, "learning_rate": 4.911807560944858e-05, "loss": 0.5913, "step": 795 }, { "epoch": 0.13, "learning_rate": 4.910701406047037e-05, "loss": 0.6963, "step": 800 }, { "epoch": 0.13, "learning_rate": 4.909588483458225e-05, "loss": 0.5588, "step": 805 }, { "epoch": 0.13, "learning_rate": 4.9084687963027894e-05, "loss": 0.6078, "step": 810 }, { "epoch": 0.13, "learning_rate": 4.907342347724087e-05, "loss": 0.5816, "step": 815 }, { "epoch": 0.13, "learning_rate": 4.906209140884459e-05, "loss": 0.5989, "step": 820 }, { "epoch": 0.13, "learning_rate": 4.905069178965215e-05, "loss": 0.5568, "step": 825 }, { "epoch": 0.13, "learning_rate": 4.9039224651666325e-05, "loss": 0.6021, "step": 830 }, { "epoch": 0.13, "learning_rate": 4.902769002707942e-05, "loss": 0.6009, "step": 835 }, { "epoch": 0.13, "learning_rate": 4.90160879482732e-05, "loss": 0.5801, "step": 840 }, { "epoch": 0.14, "learning_rate": 4.9004418447818815e-05, "loss": 0.5939, "step": 845 }, { "epoch": 0.14, "learning_rate": 4.899268155847667e-05, "loss": 0.6027, "step": 850 }, { "epoch": 0.14, "learning_rate": 4.898087731319636e-05, "loss": 0.5416, "step": 855 }, { "epoch": 0.14, "learning_rate": 4.896900574511657e-05, "loss": 0.5596, "step": 860 }, { "epoch": 0.14, "learning_rate": 4.8957066887565e-05, "loss": 0.6187, "step": 865 }, { "epoch": 0.14, "learning_rate": 4.894506077405824e-05, "loss": 0.5733, "step": 870 }, { "epoch": 0.14, "learning_rate": 4.893298743830168e-05, "loss": 0.5862, "step": 875 }, { "epoch": 0.14, "learning_rate": 4.892084691418947e-05, "loss": 0.6191, "step": 880 }, { "epoch": 0.14, "learning_rate": 4.8908639235804324e-05, "loss": 0.6258, "step": 885 }, { "epoch": 0.14, "learning_rate": 4.889636443741752e-05, "loss": 0.5504, "step": 890 }, { "epoch": 0.14, "learning_rate": 4.888402255348876e-05, "loss": 0.6241, "step": 895 }, { "epoch": 0.14, "learning_rate": 4.887161361866608e-05, "loss": 0.5839, "step": 900 }, { "epoch": 0.14, "learning_rate": 4.8859137667785735e-05, "loss": 0.5797, "step": 905 }, { "epoch": 0.15, "learning_rate": 4.884659473587213e-05, "loss": 0.6189, "step": 910 }, { "epoch": 0.15, "learning_rate": 4.8833984858137715e-05, "loss": 0.6194, "step": 915 }, { "epoch": 0.15, "learning_rate": 4.8821308069982867e-05, "loss": 0.585, "step": 920 }, { "epoch": 0.15, "learning_rate": 4.880856440699582e-05, "loss": 0.582, "step": 925 }, { "epoch": 0.15, "learning_rate": 4.8795753904952534e-05, "loss": 0.5789, "step": 930 }, { "epoch": 0.15, "learning_rate": 4.878287659981662e-05, "loss": 0.6236, "step": 935 }, { "epoch": 0.15, "learning_rate": 4.8769932527739225e-05, "loss": 0.5614, "step": 940 }, { "epoch": 0.15, "learning_rate": 4.8756921725058934e-05, "loss": 0.5972, "step": 945 }, { "epoch": 0.15, "learning_rate": 4.874384422830167e-05, "loss": 0.5682, "step": 950 }, { "epoch": 0.15, "learning_rate": 4.873070007418059e-05, "loss": 0.6327, "step": 955 }, { "epoch": 0.15, "learning_rate": 4.871748929959598e-05, "loss": 0.5577, "step": 960 }, { "epoch": 0.15, "learning_rate": 4.870421194163515e-05, "loss": 0.56, "step": 965 }, { "epoch": 0.16, "learning_rate": 4.8690868037572346e-05, "loss": 0.6324, "step": 970 }, { "epoch": 0.16, "learning_rate": 4.867745762486861e-05, "loss": 0.5764, "step": 975 }, { "epoch": 0.16, "learning_rate": 4.8663980741171724e-05, "loss": 0.564, "step": 980 }, { "epoch": 0.16, "learning_rate": 4.865043742431605e-05, "loss": 0.5534, "step": 985 }, { "epoch": 0.16, "learning_rate": 4.863682771232248e-05, "loss": 0.5953, "step": 990 }, { "epoch": 0.16, "learning_rate": 4.862315164339829e-05, "loss": 0.4992, "step": 995 }, { "epoch": 0.16, "learning_rate": 4.860940925593703e-05, "loss": 0.6061, "step": 1000 } ], "logging_steps": 5, "max_steps": 9375, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 2.2933185503035392e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }