{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 7.936507936507937e-08, "logits/chosen": 0.0711287260055542, "logits/rejected": 0.20400863885879517, "logps/chosen": -313.75396728515625, "logps/rejected": -420.73980712890625, "loss": 0.2285, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 7.936507936507937e-07, "logits/chosen": 0.2657856047153473, "logits/rejected": 0.23175562918186188, "logps/chosen": -354.52484130859375, "logps/rejected": -365.6746520996094, "loss": 0.203, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0005699184257537127, "rewards/margins": 2.8342270525172353e-05, "rewards/rejected": -0.0005982607253827155, "step": 10 }, { "epoch": 0.03, "learning_rate": 1.5873015873015873e-06, "logits/chosen": 0.16931554675102234, "logits/rejected": 0.265936940908432, "logps/chosen": -376.02166748046875, "logps/rejected": -396.56524658203125, "loss": 0.2032, "rewards/accuracies": 0.3062500059604645, "rewards/chosen": -0.0009135094587691128, "rewards/margins": -1.457883081457112e-05, "rewards/rejected": -0.000898930593393743, "step": 20 }, { "epoch": 0.05, "learning_rate": 2.380952380952381e-06, "logits/chosen": 0.1378197968006134, "logits/rejected": 0.2801808714866638, "logps/chosen": -394.52301025390625, "logps/rejected": -406.01971435546875, "loss": 0.2045, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0023785685189068317, "rewards/margins": -4.536235792329535e-05, "rewards/rejected": -0.0023332058917731047, "step": 30 }, { "epoch": 0.06, "learning_rate": 3.1746031746031746e-06, "logits/chosen": 0.22031190991401672, "logits/rejected": 0.2555253505706787, "logps/chosen": -413.9378356933594, "logps/rejected": -384.2980041503906, "loss": 0.2078, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0029828939586877823, "rewards/margins": 0.0004020760825369507, "rewards/rejected": -0.0033849701285362244, "step": 40 }, { "epoch": 0.08, "learning_rate": 3.968253968253968e-06, "logits/chosen": 0.17930440604686737, "logits/rejected": 0.27743226289749146, "logps/chosen": -437.57220458984375, "logps/rejected": -428.25726318359375, "loss": 0.2109, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0068945749662816525, "rewards/margins": 0.0003428836935199797, "rewards/rejected": -0.0072374590672552586, "step": 50 }, { "epoch": 0.1, "learning_rate": 4.761904761904762e-06, "logits/chosen": 0.1960189789533615, "logits/rejected": 0.20567241311073303, "logps/chosen": -412.0870666503906, "logps/rejected": -429.5210876464844, "loss": 0.2106, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01030620839446783, "rewards/margins": 0.0008126860484480858, "rewards/rejected": -0.011118894442915916, "step": 60 }, { "epoch": 0.11, "learning_rate": 4.998086282661188e-06, "logits/chosen": 0.18384099006652832, "logits/rejected": 0.16322512924671173, "logps/chosen": -480.0482482910156, "logps/rejected": -449.12872314453125, "loss": 0.2125, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.012364747002720833, "rewards/margins": 0.0012391259660944343, "rewards/rejected": -0.013603871688246727, "step": 70 }, { "epoch": 0.13, "learning_rate": 4.988720025682995e-06, "logits/chosen": 0.17162439227104187, "logits/rejected": 0.3024311661720276, "logps/chosen": -425.10821533203125, "logps/rejected": -405.69903564453125, "loss": 0.203, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.002422564197331667, "rewards/margins": 0.0019961665384471416, "rewards/rejected": -0.004418730735778809, "step": 80 }, { "epoch": 0.14, "learning_rate": 4.9715789537359126e-06, "logits/chosen": 0.14976395666599274, "logits/rejected": 0.293326199054718, "logps/chosen": -450.09283447265625, "logps/rejected": -463.3335876464844, "loss": 0.1928, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004968429449945688, "rewards/margins": 0.004259931854903698, "rewards/rejected": 0.0007084974786266685, "step": 90 }, { "epoch": 0.16, "learning_rate": 4.946716615897932e-06, "logits/chosen": 0.14262747764587402, "logits/rejected": 0.14231689274311066, "logps/chosen": -409.68719482421875, "logps/rejected": -385.9997253417969, "loss": 0.2044, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0011129004415124655, "rewards/margins": 0.0027904310263693333, "rewards/rejected": -0.001677530468441546, "step": 100 }, { "epoch": 0.18, "learning_rate": 4.9142106826480114e-06, "logits/chosen": 0.09172149002552032, "logits/rejected": 0.18723782896995544, "logps/chosen": -376.23382568359375, "logps/rejected": -382.14312744140625, "loss": 0.2117, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.0011799463536590338, "rewards/margins": 0.006289638578891754, "rewards/rejected": -0.005109691992402077, "step": 110 }, { "epoch": 0.19, "learning_rate": 4.874162703221823e-06, "logits/chosen": 0.21056421101093292, "logits/rejected": 0.10638086497783661, "logps/chosen": -455.8184509277344, "logps/rejected": -461.58575439453125, "loss": 0.2024, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0018554453272372484, "rewards/margins": 0.007339824922382832, "rewards/rejected": -0.005484379827976227, "step": 120 }, { "epoch": 0.21, "learning_rate": 4.826697788369752e-06, "logits/chosen": 0.0843835398554802, "logits/rejected": 0.10346312820911407, "logps/chosen": -420.7416076660156, "logps/rejected": -426.8565979003906, "loss": 0.1938, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0005354422028176486, "rewards/margins": 0.012502538040280342, "rewards/rejected": -0.011967095546424389, "step": 130 }, { "epoch": 0.22, "learning_rate": 4.7719642195082224e-06, "logits/chosen": 0.03641371801495552, "logits/rejected": 0.10479255020618439, "logps/chosen": -401.6028137207031, "logps/rejected": -464.2025451660156, "loss": 0.1958, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.024544300511479378, "rewards/margins": 0.018662814050912857, "rewards/rejected": -0.043207116425037384, "step": 140 }, { "epoch": 0.24, "learning_rate": 4.710132985485355e-06, "logits/chosen": 0.11580105125904083, "logits/rejected": 0.021813513711094856, "logps/chosen": -430.5687561035156, "logps/rejected": -515.4051513671875, "loss": 0.1976, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.04684633016586304, "rewards/margins": 0.043844569474458694, "rewards/rejected": -0.09069089591503143, "step": 150 }, { "epoch": 0.26, "learning_rate": 4.641397248408122e-06, "logits/chosen": -0.0415755994617939, "logits/rejected": 0.07411660254001617, "logps/chosen": -467.6756286621094, "logps/rejected": -472.20001220703125, "loss": 0.1972, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0470997616648674, "rewards/margins": 0.02540501020848751, "rewards/rejected": -0.07250477373600006, "step": 160 }, { "epoch": 0.27, "learning_rate": 4.5659717401997655e-06, "logits/chosen": -0.03600798547267914, "logits/rejected": 0.027341466397047043, "logps/chosen": -504.47894287109375, "logps/rejected": -516.9270629882812, "loss": 0.1948, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.05515031889081001, "rewards/margins": 0.03458093851804733, "rewards/rejected": -0.08973126113414764, "step": 170 }, { "epoch": 0.29, "learning_rate": 4.4840920917726425e-06, "logits/chosen": -0.039528343826532364, "logits/rejected": -0.019377198070287704, "logps/chosen": -484.7760314941406, "logps/rejected": -515.7113037109375, "loss": 0.1955, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.05891401320695877, "rewards/margins": 0.03348463773727417, "rewards/rejected": -0.09239865839481354, "step": 180 }, { "epoch": 0.3, "learning_rate": 4.396014096912182e-06, "logits/chosen": 0.041537586599588394, "logits/rejected": -0.029888898134231567, "logps/chosen": -474.3414001464844, "logps/rejected": -515.9990234375, "loss": 0.1919, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.07046758383512497, "rewards/margins": 0.03831402584910393, "rewards/rejected": -0.1087816134095192, "step": 190 }, { "epoch": 0.32, "learning_rate": 4.302012913171584e-06, "logits/chosen": -0.017491130158305168, "logits/rejected": -0.04960538074374199, "logps/chosen": -484.7154846191406, "logps/rejected": -524.5064086914062, "loss": 0.1935, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.07989688217639923, "rewards/margins": 0.04493844509124756, "rewards/rejected": -0.12483533471822739, "step": 200 }, { "epoch": 0.34, "learning_rate": 4.202382202273702e-06, "logits/chosen": -0.13748802244663239, "logits/rejected": -0.001641835318878293, "logps/chosen": -507.37506103515625, "logps/rejected": -535.3721923828125, "loss": 0.1804, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07145627588033676, "rewards/margins": 0.051401056349277496, "rewards/rejected": -0.12285731732845306, "step": 210 }, { "epoch": 0.35, "learning_rate": 4.097433212705492e-06, "logits/chosen": -0.021799543872475624, "logits/rejected": -0.074518583714962, "logps/chosen": -477.99981689453125, "logps/rejected": -518.2962646484375, "loss": 0.2019, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.09213604032993317, "rewards/margins": 0.04488346725702286, "rewards/rejected": -0.13701951503753662, "step": 220 }, { "epoch": 0.37, "learning_rate": 3.987493807371033e-06, "logits/chosen": -0.16067767143249512, "logits/rejected": 0.01047535240650177, "logps/chosen": -489.7818298339844, "logps/rejected": -525.8330688476562, "loss": 0.1852, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0610770583152771, "rewards/margins": 0.04805826395750046, "rewards/rejected": -0.10913531482219696, "step": 230 }, { "epoch": 0.38, "learning_rate": 3.872907439340758e-06, "logits/chosen": -0.035437412559986115, "logits/rejected": -0.07582642138004303, "logps/chosen": -440.7459411621094, "logps/rejected": -492.434326171875, "loss": 0.177, "rewards/accuracies": 0.46875, "rewards/chosen": -0.04167747497558594, "rewards/margins": 0.04895929619669914, "rewards/rejected": -0.09063677489757538, "step": 240 }, { "epoch": 0.4, "learning_rate": 3.75403207889666e-06, "logits/chosen": -0.09768973290920258, "logits/rejected": -0.0913015678524971, "logps/chosen": -445.75750732421875, "logps/rejected": -473.06671142578125, "loss": 0.1765, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.04659276083111763, "rewards/margins": 0.03554708510637283, "rewards/rejected": -0.08213984221220016, "step": 250 }, { "epoch": 0.42, "learning_rate": 3.631239095225417e-06, "logits/chosen": -0.002584155648946762, "logits/rejected": -0.0503731295466423, "logps/chosen": -443.8072204589844, "logps/rejected": -522.3037109375, "loss": 0.1903, "rewards/accuracies": 0.5, "rewards/chosen": -0.057579755783081055, "rewards/margins": 0.042577676475048065, "rewards/rejected": -0.10015741735696793, "step": 260 }, { "epoch": 0.43, "learning_rate": 3.5049120962530608e-06, "logits/chosen": -0.07919616252183914, "logits/rejected": -0.051559675484895706, "logps/chosen": -442.90020751953125, "logps/rejected": -506.71026611328125, "loss": 0.1801, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.04894575849175453, "rewards/margins": 0.060835689306259155, "rewards/rejected": -0.1097814291715622, "step": 270 }, { "epoch": 0.45, "learning_rate": 3.3754457302455464e-06, "logits/chosen": -0.05536097288131714, "logits/rejected": -0.05594850331544876, "logps/chosen": -491.963623046875, "logps/rejected": -476.1817932128906, "loss": 0.1987, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.05905281379818916, "rewards/margins": 0.03382394462823868, "rewards/rejected": -0.09287675470113754, "step": 280 }, { "epoch": 0.46, "learning_rate": 3.2432444529190714e-06, "logits/chosen": -0.06521332263946533, "logits/rejected": 0.04030764847993851, "logps/chosen": -503.37115478515625, "logps/rejected": -539.033935546875, "loss": 0.1758, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.06568136066198349, "rewards/margins": 0.04754118248820305, "rewards/rejected": -0.11322255432605743, "step": 290 }, { "epoch": 0.48, "learning_rate": 3.1087212639117057e-06, "logits/chosen": -0.007789143826812506, "logits/rejected": -0.08723556995391846, "logps/chosen": -448.15185546875, "logps/rejected": -472.66046142578125, "loss": 0.195, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.04991251975297928, "rewards/margins": 0.03361000493168831, "rewards/rejected": -0.08352252840995789, "step": 300 }, { "epoch": 0.5, "learning_rate": 2.9722964165636263e-06, "logits/chosen": -0.10513603687286377, "logits/rejected": -0.02427390217781067, "logps/chosen": -456.4302673339844, "logps/rejected": -480.01904296875, "loss": 0.1777, "rewards/accuracies": 0.46875, "rewards/chosen": -0.04909784346818924, "rewards/margins": 0.032266296446323395, "rewards/rejected": -0.08136413991451263, "step": 310 }, { "epoch": 0.51, "learning_rate": 2.8343961050366275e-06, "logits/chosen": -0.06911730021238327, "logits/rejected": -0.02331097424030304, "logps/chosen": -412.3163146972656, "logps/rejected": -419.2935485839844, "loss": 0.1807, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.031493835151195526, "rewards/margins": 0.03578699007630348, "rewards/rejected": -0.06728082150220871, "step": 320 }, { "epoch": 0.53, "learning_rate": 2.695451132874385e-06, "logits/chosen": 0.030731942504644394, "logits/rejected": -0.028069671243429184, "logps/chosen": -477.44512939453125, "logps/rejected": -501.154541015625, "loss": 0.1925, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.045379411429166794, "rewards/margins": 0.04541900008916855, "rewards/rejected": -0.09079841524362564, "step": 330 }, { "epoch": 0.54, "learning_rate": 2.5558955671628964e-06, "logits/chosen": -0.0826629176735878, "logits/rejected": -0.04008691757917404, "logps/chosen": -429.1664123535156, "logps/rejected": -467.46173095703125, "loss": 0.1838, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.05404701828956604, "rewards/margins": 0.04224228113889694, "rewards/rejected": -0.09628931432962418, "step": 340 }, { "epoch": 0.56, "learning_rate": 2.4161653824955654e-06, "logits/chosen": -0.056598931550979614, "logits/rejected": -0.10974061489105225, "logps/chosen": -513.0817260742188, "logps/rejected": -546.1875, "loss": 0.1864, "rewards/accuracies": 0.5, "rewards/chosen": -0.06718280166387558, "rewards/margins": 0.05270420387387276, "rewards/rejected": -0.11988700926303864, "step": 350 }, { "epoch": 0.58, "learning_rate": 2.2766970989791697e-06, "logits/chosen": -0.04662217199802399, "logits/rejected": -0.05574822425842285, "logps/chosen": -480.88848876953125, "logps/rejected": -530.1246337890625, "loss": 0.1915, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.06772033870220184, "rewards/margins": 0.03862085938453674, "rewards/rejected": -0.10634119808673859, "step": 360 }, { "epoch": 0.59, "learning_rate": 2.1379264185356545e-06, "logits/chosen": -0.020046677440404892, "logits/rejected": 0.09056108444929123, "logps/chosen": -486.93115234375, "logps/rejected": -528.6192626953125, "loss": 0.1814, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.054186802357435226, "rewards/margins": 0.05737306550145149, "rewards/rejected": -0.11155986785888672, "step": 370 }, { "epoch": 0.61, "learning_rate": 2.000286863759934e-06, "logits/chosen": -0.09826477617025375, "logits/rejected": -0.07946722954511642, "logps/chosen": -470.00811767578125, "logps/rejected": -511.309814453125, "loss": 0.1828, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.06328146904706955, "rewards/margins": 0.05404907464981079, "rewards/rejected": -0.11733055114746094, "step": 380 }, { "epoch": 0.62, "learning_rate": 1.8642084235859764e-06, "logits/chosen": -0.15358105301856995, "logits/rejected": -0.07723913341760635, "logps/chosen": -438.9100646972656, "logps/rejected": -479.63092041015625, "loss": 0.1972, "rewards/accuracies": 0.46875, "rewards/chosen": -0.05276118963956833, "rewards/margins": 0.04282676801085472, "rewards/rejected": -0.09558796137571335, "step": 390 }, { "epoch": 0.64, "learning_rate": 1.7301162099921013e-06, "logits/chosen": -0.06951303780078888, "logits/rejected": -0.16843798756599426, "logps/chosen": -422.1139221191406, "logps/rejected": -459.0763244628906, "loss": 0.1995, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.05717768147587776, "rewards/margins": 0.040271807461977005, "rewards/rejected": -0.09744948893785477, "step": 400 }, { "epoch": 0.66, "learning_rate": 1.5984291299420117e-06, "logits/chosen": -0.1256350576877594, "logits/rejected": -0.08233270049095154, "logps/chosen": -404.26263427734375, "logps/rejected": -455.4290466308594, "loss": 0.1892, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.04233751818537712, "rewards/margins": 0.05102572590112686, "rewards/rejected": -0.09336324036121368, "step": 410 }, { "epoch": 0.67, "learning_rate": 1.4695585767104092e-06, "logits/chosen": -0.0601225309073925, "logits/rejected": -0.12005972862243652, "logps/chosen": -482.4237365722656, "logps/rejected": -531.4822998046875, "loss": 0.1715, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.045725829899311066, "rewards/margins": 0.04561304301023483, "rewards/rejected": -0.0913388729095459, "step": 420 }, { "epoch": 0.69, "learning_rate": 1.3439071446815452e-06, "logits/chosen": -0.15923841297626495, "logits/rejected": 0.011970462277531624, "logps/chosen": -441.70648193359375, "logps/rejected": -458.97052001953125, "loss": 0.186, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.03852088749408722, "rewards/margins": 0.038460638374090195, "rewards/rejected": -0.07698152959346771, "step": 430 }, { "epoch": 0.7, "learning_rate": 1.2218673716356919e-06, "logits/chosen": -0.05337870121002197, "logits/rejected": -0.16203683614730835, "logps/chosen": -411.7693786621094, "logps/rejected": -472.7538146972656, "loss": 0.1868, "rewards/accuracies": 0.53125, "rewards/chosen": -0.04310641810297966, "rewards/margins": 0.053300343453884125, "rewards/rejected": -0.09640677273273468, "step": 440 }, { "epoch": 0.72, "learning_rate": 1.103820512452661e-06, "logits/chosen": -0.16054467856884003, "logits/rejected": -0.10418369621038437, "logps/chosen": -471.15740966796875, "logps/rejected": -499.28997802734375, "loss": 0.1866, "rewards/accuracies": 0.5, "rewards/chosen": -0.044102419167757034, "rewards/margins": 0.04977239668369293, "rewards/rejected": -0.09387481212615967, "step": 450 }, { "epoch": 0.74, "learning_rate": 9.901353480633468e-07, "logits/chosen": -0.01945580169558525, "logits/rejected": -0.08020860701799393, "logps/chosen": -478.95135498046875, "logps/rejected": -510.05645751953125, "loss": 0.1876, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.05099078267812729, "rewards/margins": 0.04500190168619156, "rewards/rejected": -0.09599269926548004, "step": 460 }, { "epoch": 0.75, "learning_rate": 8.811670333701544e-07, "logits/chosen": -0.09503830224275589, "logits/rejected": -0.12620458006858826, "logps/chosen": -442.7535095214844, "logps/rejected": -478.792236328125, "loss": 0.1797, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.039722852408885956, "rewards/margins": 0.04783398285508156, "rewards/rejected": -0.08755683898925781, "step": 470 }, { "epoch": 0.77, "learning_rate": 7.772559877354341e-07, "logits/chosen": -0.1489885151386261, "logits/rejected": -0.052212201058864594, "logps/chosen": -408.1684265136719, "logps/rejected": -442.21466064453125, "loss": 0.1972, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.05573519319295883, "rewards/margins": 0.033387087285518646, "rewards/rejected": -0.08912228047847748, "step": 480 }, { "epoch": 0.78, "learning_rate": 6.787268315040604e-07, "logits/chosen": -0.05187498405575752, "logits/rejected": -0.04689168184995651, "logps/chosen": -473.5567932128906, "logps/rejected": -553.9025268554688, "loss": 0.1722, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.047236062586307526, "rewards/margins": 0.06369349360466003, "rewards/rejected": -0.11092956364154816, "step": 490 }, { "epoch": 0.8, "learning_rate": 5.858873718824829e-07, "logits/chosen": -0.1269344836473465, "logits/rejected": -0.12827740609645844, "logps/chosen": -436.95001220703125, "logps/rejected": -530.323486328125, "loss": 0.1828, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03250400722026825, "rewards/margins": 0.06746237725019455, "rewards/rejected": -0.0999663770198822, "step": 500 }, { "epoch": 0.82, "learning_rate": 4.990276413423817e-07, "logits/chosen": -0.05357852578163147, "logits/rejected": -0.0927656888961792, "logps/chosen": -499.13262939453125, "logps/rejected": -548.8838500976562, "loss": 0.1697, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04802858829498291, "rewards/margins": 0.05423368886113167, "rewards/rejected": -0.10226227343082428, "step": 510 }, { "epoch": 0.83, "learning_rate": 4.184189915529796e-07, "logits/chosen": -0.11798721551895142, "logits/rejected": -0.09332814067602158, "logps/chosen": -465.55816650390625, "logps/rejected": -533.0569458007812, "loss": 0.1967, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.042215488851070404, "rewards/margins": 0.05858848616480827, "rewards/rejected": -0.10080397129058838, "step": 520 }, { "epoch": 0.85, "learning_rate": 3.4431324567258176e-07, "logits/chosen": -0.1948605328798294, "logits/rejected": -0.08446381241083145, "logps/chosen": -440.0462951660156, "logps/rejected": -479.0397033691406, "loss": 0.1885, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.045319609344005585, "rewards/margins": 0.04320630431175232, "rewards/rejected": -0.08852590620517731, "step": 530 }, { "epoch": 0.86, "learning_rate": 2.769419116476052e-07, "logits/chosen": -0.2041836529970169, "logits/rejected": -0.06282895803451538, "logps/chosen": -415.84576416015625, "logps/rejected": -452.8704528808594, "loss": 0.1829, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.050526998937129974, "rewards/margins": 0.03672551363706589, "rewards/rejected": -0.08725249767303467, "step": 540 }, { "epoch": 0.88, "learning_rate": 2.1651545897676512e-07, "logits/chosen": -0.056377578526735306, "logits/rejected": -0.0602828674018383, "logps/chosen": -482.2828674316406, "logps/rejected": -510.7273864746094, "loss": 0.194, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.04934944957494736, "rewards/margins": 0.04233521968126297, "rewards/rejected": -0.09168466180562973, "step": 550 }, { "epoch": 0.9, "learning_rate": 1.6322266119983222e-07, "logits/chosen": -0.08606644719839096, "logits/rejected": -0.05609310790896416, "logps/chosen": -435.7982482910156, "logps/rejected": -460.5846252441406, "loss": 0.185, "rewards/accuracies": 0.5, "rewards/chosen": -0.037545233964920044, "rewards/margins": 0.0525357648730278, "rewards/rejected": -0.09008099883794785, "step": 560 }, { "epoch": 0.91, "learning_rate": 1.1723000616502167e-07, "logits/chosen": -0.21670150756835938, "logits/rejected": -0.034546859562397, "logps/chosen": -498.1796875, "logps/rejected": -531.9613037109375, "loss": 0.1779, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.05661952495574951, "rewards/margins": 0.05806880071759224, "rewards/rejected": -0.11468833684921265, "step": 570 }, { "epoch": 0.93, "learning_rate": 7.868117591737585e-08, "logits/chosen": -0.15769873559474945, "logits/rejected": -0.11322434991598129, "logps/chosen": -501.90802001953125, "logps/rejected": -510.9298400878906, "loss": 0.1865, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0535421147942543, "rewards/margins": 0.04890746995806694, "rewards/rejected": -0.10244959592819214, "step": 580 }, { "epoch": 0.94, "learning_rate": 4.769659783295383e-08, "logits/chosen": -0.1612723022699356, "logits/rejected": -0.09152115881443024, "logps/chosen": -478.5709533691406, "logps/rejected": -501.6288146972656, "loss": 0.1926, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.04939908906817436, "rewards/margins": 0.04336509853601456, "rewards/rejected": -0.09276418387889862, "step": 590 }, { "epoch": 0.96, "learning_rate": 2.4373068401120358e-08, "logits/chosen": -0.16256621479988098, "logits/rejected": 0.01677759736776352, "logps/chosen": -480.29571533203125, "logps/rejected": -480.0000915527344, "loss": 0.1777, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.059523385018110275, "rewards/margins": 0.03656711056828499, "rewards/rejected": -0.09609050303697586, "step": 600 }, { "epoch": 0.98, "learning_rate": 8.78345083022425e-09, "logits/chosen": -0.2046932876110077, "logits/rejected": -0.010766489431262016, "logps/chosen": -467.3860778808594, "logps/rejected": -513.6978149414062, "loss": 0.195, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0569658987224102, "rewards/margins": 0.04467035457491875, "rewards/rejected": -0.10163625329732895, "step": 610 }, { "epoch": 0.99, "learning_rate": 9.764474213677654e-10, "logits/chosen": -0.15736037492752075, "logits/rejected": -0.16347061097621918, "logps/chosen": -454.09942626953125, "logps/rejected": -512.8070678710938, "loss": 0.1987, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.06590863317251205, "rewards/margins": 0.044044043868780136, "rewards/rejected": -0.10995267331600189, "step": 620 }, { "epoch": 1.0, "step": 625, "total_flos": 0.0, "train_loss": 0.1911162176847458, "train_runtime": 4612.7069, "train_samples_per_second": 4.336, "train_steps_per_second": 0.135 } ], "logging_steps": 10, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }