{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 500, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.319148936170213e-08, "logits/chosen": -0.31276124715805054, "logits/rejected": -0.11341337859630585, "logps/chosen": -559.525146484375, "logps/rejected": -486.2456970214844, "loss": 0.21, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.319148936170213e-07, "logits/chosen": -0.20243170857429504, "logits/rejected": -0.07215167582035065, "logps/chosen": -473.5186767578125, "logps/rejected": -507.1302185058594, "loss": 0.2065, "rewards/accuracies": 0.3541666567325592, "rewards/chosen": -7.249015470733866e-05, "rewards/margins": 0.00014273211127147079, "rewards/rejected": -0.0002152222878066823, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0638297872340427e-06, "logits/chosen": -0.18446393311023712, "logits/rejected": -0.09755989164113998, "logps/chosen": -501.7010803222656, "logps/rejected": -487.3160705566406, "loss": 0.2124, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -5.829105430166237e-05, "rewards/margins": 7.958527567097917e-05, "rewards/rejected": -0.0001378763117827475, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.595744680851064e-06, "logits/chosen": -0.15609130263328552, "logits/rejected": -0.04423709958791733, "logps/chosen": -560.1486206054688, "logps/rejected": -544.0206298828125, "loss": 0.2048, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0003287494764663279, "rewards/margins": 0.00016076143947429955, "rewards/rejected": -0.0004895109450444579, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.1276595744680853e-06, "logits/chosen": -0.2074490785598755, "logits/rejected": -0.14103737473487854, "logps/chosen": -507.80450439453125, "logps/rejected": -515.2080078125, "loss": 0.214, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0010981714585795999, "rewards/margins": 0.00048262160271406174, "rewards/rejected": -0.0015807930612936616, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.6595744680851065e-06, "logits/chosen": -0.12519846856594086, "logits/rejected": -0.1412961781024933, "logps/chosen": -461.9590759277344, "logps/rejected": -499.2351989746094, "loss": 0.2124, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0024143296759575605, "rewards/margins": 0.0007537025958299637, "rewards/rejected": -0.0031680327374488115, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.191489361702128e-06, "logits/chosen": -0.173623189330101, "logits/rejected": -0.03094838559627533, "logps/chosen": -551.9820556640625, "logps/rejected": -527.4284057617188, "loss": 0.2003, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.00582545343786478, "rewards/margins": 0.0019644282292574644, "rewards/rejected": -0.007789881434291601, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.723404255319149e-06, "logits/chosen": -0.161810502409935, "logits/rejected": -0.10678007453680038, "logps/chosen": -567.8081665039062, "logps/rejected": -562.3734130859375, "loss": 0.2098, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.012994857504963875, "rewards/margins": 0.003251770045608282, "rewards/rejected": -0.016246628016233444, "step": 70 }, { "epoch": 0.09, "learning_rate": 4.255319148936171e-06, "logits/chosen": -0.15964026749134064, "logits/rejected": -0.27652230858802795, "logps/chosen": -562.570556640625, "logps/rejected": -621.7036743164062, "loss": 0.2037, "rewards/accuracies": 0.5, "rewards/chosen": -0.026814639568328857, "rewards/margins": 0.0097076166421175, "rewards/rejected": -0.03652225807309151, "step": 80 }, { "epoch": 0.1, "learning_rate": 4.787234042553192e-06, "logits/chosen": -0.2600744664669037, "logits/rejected": -0.20050808787345886, "logps/chosen": -609.1525268554688, "logps/rejected": -612.4235229492188, "loss": 0.2067, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.059279996901750565, "rewards/margins": 0.004630334675312042, "rewards/rejected": -0.0639103353023529, "step": 90 }, { "epoch": 0.11, "learning_rate": 4.999375059004058e-06, "logits/chosen": -0.2565140724182129, "logits/rejected": -0.22637882828712463, "logps/chosen": -574.8885498046875, "logps/rejected": -590.8546142578125, "loss": 0.1998, "rewards/accuracies": 0.40625, "rewards/chosen": -0.07415835559368134, "rewards/margins": 0.01800454594194889, "rewards/rejected": -0.09216289967298508, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.9955571065548795e-06, "logits/chosen": -0.1685013473033905, "logits/rejected": -0.2401442974805832, "logps/chosen": -557.1212158203125, "logps/rejected": -602.7764892578125, "loss": 0.196, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09011422097682953, "rewards/margins": 0.019372332841157913, "rewards/rejected": -0.10948655754327774, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.9882736864879e-06, "logits/chosen": -0.2641439139842987, "logits/rejected": -0.2980344891548157, "logps/chosen": -588.050537109375, "logps/rejected": -627.3956298828125, "loss": 0.2053, "rewards/accuracies": 0.4375, "rewards/chosen": -0.10959631204605103, "rewards/margins": 0.014565527439117432, "rewards/rejected": -0.12416181713342667, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.977534912960124e-06, "logits/chosen": -0.2924054265022278, "logits/rejected": -0.08088915795087814, "logps/chosen": -576.1680297851562, "logps/rejected": -614.0890502929688, "loss": 0.1901, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.09112486243247986, "rewards/margins": 0.025440961122512817, "rewards/rejected": -0.11656580865383148, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.963355698422092e-06, "logits/chosen": -0.10601979494094849, "logits/rejected": -0.1950257569551468, "logps/chosen": -595.1011352539062, "logps/rejected": -659.9929809570312, "loss": 0.2058, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1052999347448349, "rewards/margins": 0.02551344595849514, "rewards/rejected": -0.1308133900165558, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.945755732909625e-06, "logits/chosen": -0.2408047914505005, "logits/rejected": -0.2040824145078659, "logps/chosen": -551.7179565429688, "logps/rejected": -606.5433959960938, "loss": 0.1955, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.07721008360385895, "rewards/margins": 0.026318836957216263, "rewards/rejected": -0.10352891683578491, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.924759456701167e-06, "logits/chosen": -0.21895582973957062, "logits/rejected": -0.2554505467414856, "logps/chosen": -608.0427856445312, "logps/rejected": -679.7128295898438, "loss": 0.2025, "rewards/accuracies": 0.40625, "rewards/chosen": -0.10357453674077988, "rewards/margins": 0.022874176502227783, "rewards/rejected": -0.12644873559474945, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.900396026378671e-06, "logits/chosen": -0.25241002440452576, "logits/rejected": -0.2686356008052826, "logps/chosen": -576.2278442382812, "logps/rejected": -611.9133911132812, "loss": 0.2044, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.1014503687620163, "rewards/margins": 0.020282840356230736, "rewards/rejected": -0.12173320353031158, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.872699274339169e-06, "logits/chosen": -0.24474278092384338, "logits/rejected": -0.19586482644081116, "logps/chosen": -570.9044189453125, "logps/rejected": -617.5431518554688, "loss": 0.1944, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.09906121343374252, "rewards/margins": 0.01674678549170494, "rewards/rejected": -0.11580799520015717, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.8417076618132434e-06, "logits/chosen": -0.2917916774749756, "logits/rejected": -0.20423956215381622, "logps/chosen": -567.7699584960938, "logps/rejected": -593.5147705078125, "loss": 0.2046, "rewards/accuracies": 0.40625, "rewards/chosen": -0.08719009160995483, "rewards/margins": 0.013276703655719757, "rewards/rejected": -0.10046680271625519, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.807464225455655e-06, "logits/chosen": -0.14698217809200287, "logits/rejected": -0.23266562819480896, "logps/chosen": -531.8690185546875, "logps/rejected": -583.5828857421875, "loss": 0.1964, "rewards/accuracies": 0.40625, "rewards/chosen": -0.07782838493585587, "rewards/margins": 0.0252009816467762, "rewards/rejected": -0.10302937030792236, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.770016517582283e-06, "logits/chosen": -0.21580150723457336, "logits/rejected": -0.18905040621757507, "logps/chosen": -626.87744140625, "logps/rejected": -649.6925659179688, "loss": 0.1977, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.104043148458004, "rewards/margins": 0.021797046065330505, "rewards/rejected": -0.1258401870727539, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.7294165401363616e-06, "logits/chosen": -0.12353191524744034, "logits/rejected": -0.2215413749217987, "logps/chosen": -633.0154418945312, "logps/rejected": -633.0941162109375, "loss": 0.2058, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.10003998130559921, "rewards/margins": 0.009050301276147366, "rewards/rejected": -0.10909029096364975, "step": 220 }, { "epoch": 0.25, "learning_rate": 4.68572067247573e-06, "logits/chosen": -0.16852374374866486, "logits/rejected": -0.21371085941791534, "logps/chosen": -614.1183471679688, "logps/rejected": -670.2012939453125, "loss": 0.2077, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08841963112354279, "rewards/margins": 0.02279593050479889, "rewards/rejected": -0.11121556907892227, "step": 230 }, { "epoch": 0.26, "learning_rate": 4.638989593081364e-06, "logits/chosen": -0.1663983315229416, "logits/rejected": -0.21970775723457336, "logps/chosen": -602.5869750976562, "logps/rejected": -618.7034912109375, "loss": 0.2061, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.07862231880426407, "rewards/margins": 0.021257968619465828, "rewards/rejected": -0.09988027811050415, "step": 240 }, { "epoch": 0.27, "learning_rate": 4.5892881952959015e-06, "logits/chosen": -0.21088270843029022, "logits/rejected": -0.14775848388671875, "logps/chosen": -577.7684326171875, "logps/rejected": -632.3033447265625, "loss": 0.2054, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0773148387670517, "rewards/margins": 0.026050010696053505, "rewards/rejected": -0.10336484014987946, "step": 250 }, { "epoch": 0.28, "learning_rate": 4.536685497209182e-06, "logits/chosen": -0.1055503636598587, "logits/rejected": -0.06379745155572891, "logps/chosen": -522.751708984375, "logps/rejected": -602.4344482421875, "loss": 0.2001, "rewards/accuracies": 0.4375, "rewards/chosen": -0.06098253279924393, "rewards/margins": 0.030480870977044106, "rewards/rejected": -0.09146340191364288, "step": 260 }, { "epoch": 0.29, "learning_rate": 4.481254545815943e-06, "logits/chosen": -0.15926873683929443, "logits/rejected": -0.04976898431777954, "logps/chosen": -529.4932250976562, "logps/rejected": -549.9386596679688, "loss": 0.1973, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.06077051907777786, "rewards/margins": 0.01582062616944313, "rewards/rejected": -0.0765911340713501, "step": 270 }, { "epoch": 0.3, "learning_rate": 4.42307231557875e-06, "logits/chosen": -0.07944826781749725, "logits/rejected": -0.05855567380785942, "logps/chosen": -512.50439453125, "logps/rejected": -543.458984375, "loss": 0.1986, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.06550983339548111, "rewards/margins": 0.023027174174785614, "rewards/rejected": -0.08853700011968613, "step": 280 }, { "epoch": 0.31, "learning_rate": 4.3622196015370305e-06, "logits/chosen": -0.12430046498775482, "logits/rejected": -0.06956211477518082, "logps/chosen": -550.2479248046875, "logps/rejected": -614.044189453125, "loss": 0.1944, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.056610800325870514, "rewards/margins": 0.029858995229005814, "rewards/rejected": -0.08646979182958603, "step": 290 }, { "epoch": 0.32, "learning_rate": 4.298780907110648e-06, "logits/chosen": -0.09455857425928116, "logits/rejected": -0.07383386790752411, "logps/chosen": -598.065185546875, "logps/rejected": -647.9603271484375, "loss": 0.1876, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.06337399780750275, "rewards/margins": 0.026696253567934036, "rewards/rejected": -0.09007024019956589, "step": 300 }, { "epoch": 0.33, "learning_rate": 4.23284432675381e-06, "logits/chosen": -0.19348487257957458, "logits/rejected": -0.1443384736776352, "logps/chosen": -539.6243896484375, "logps/rejected": -612.7183837890625, "loss": 0.1963, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.05517622083425522, "rewards/margins": 0.02591213583946228, "rewards/rejected": -0.0810883566737175, "step": 310 }, { "epoch": 0.34, "learning_rate": 4.164501423622277e-06, "logits/chosen": -0.19629542529582977, "logits/rejected": -0.13960464298725128, "logps/chosen": -516.0609130859375, "logps/rejected": -658.4205932617188, "loss": 0.1915, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.05958019569516182, "rewards/margins": 0.06007415056228638, "rewards/rejected": -0.1196543425321579, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.0938471024237355e-06, "logits/chosen": -0.1600683629512787, "logits/rejected": -0.10378336906433105, "logps/chosen": -590.7578125, "logps/rejected": -621.64697265625, "loss": 0.2007, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.08227936178445816, "rewards/margins": 0.01520558726042509, "rewards/rejected": -0.09748493880033493, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.020979477627907e-06, "logits/chosen": -0.19418606162071228, "logits/rejected": -0.1177397221326828, "logps/chosen": -586.6962890625, "logps/rejected": -654.0504150390625, "loss": 0.1894, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.07023846358060837, "rewards/margins": 0.03478557616472244, "rewards/rejected": -0.10502403974533081, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.9459997372194105e-06, "logits/chosen": -0.1304813176393509, "logits/rejected": -0.04862945154309273, "logps/chosen": -594.4133911132812, "logps/rejected": -617.715087890625, "loss": 0.192, "rewards/accuracies": 0.5, "rewards/chosen": -0.08139745891094208, "rewards/margins": 0.026553615927696228, "rewards/rejected": -0.10795106738805771, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.869012002182573e-06, "logits/chosen": -0.21274884045124054, "logits/rejected": -0.03855857998132706, "logps/chosen": -557.4656982421875, "logps/rejected": -637.321044921875, "loss": 0.1848, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.07546891272068024, "rewards/margins": 0.03727220743894577, "rewards/rejected": -0.1127411276102066, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.7901231819133104e-06, "logits/chosen": -0.10762195289134979, "logits/rejected": -0.10060106217861176, "logps/chosen": -599.8753051757812, "logps/rejected": -646.8792724609375, "loss": 0.1955, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.0741112157702446, "rewards/margins": 0.03268015384674072, "rewards/rejected": -0.10679137706756592, "step": 370 }, { "epoch": 0.41, "learning_rate": 3.709442825758875e-06, "logits/chosen": -0.12406639009714127, "logits/rejected": -0.053130537271499634, "logps/chosen": -587.0034790039062, "logps/rejected": -618.0760498046875, "loss": 0.19, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.07897321879863739, "rewards/margins": 0.025586843490600586, "rewards/rejected": -0.10456006228923798, "step": 380 }, { "epoch": 0.42, "learning_rate": 3.6270829708916113e-06, "logits/chosen": -0.11101411283016205, "logits/rejected": -0.08626400679349899, "logps/chosen": -569.6163330078125, "logps/rejected": -620.4082641601562, "loss": 0.1913, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.06503543257713318, "rewards/margins": 0.037478551268577576, "rewards/rejected": -0.10251398384571075, "step": 390 }, { "epoch": 0.43, "learning_rate": 3.543157986727991e-06, "logits/chosen": -0.11596628278493881, "logits/rejected": -0.09326865524053574, "logps/chosen": -569.7626342773438, "logps/rejected": -647.47119140625, "loss": 0.1913, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0574682354927063, "rewards/margins": 0.03390919789671898, "rewards/rejected": -0.09137743711471558, "step": 400 }, { "epoch": 0.44, "learning_rate": 3.4577844161089614e-06, "logits/chosen": -0.1688176691532135, "logits/rejected": -0.1762055903673172, "logps/chosen": -548.4512939453125, "logps/rejected": -596.2463989257812, "loss": 0.1879, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.054659001529216766, "rewards/margins": 0.025764942169189453, "rewards/rejected": -0.08042393624782562, "step": 410 }, { "epoch": 0.45, "learning_rate": 3.3710808134621577e-06, "logits/chosen": -0.12280504405498505, "logits/rejected": -0.018482182174921036, "logps/chosen": -567.9172973632812, "logps/rejected": -593.0560302734375, "loss": 0.189, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0538947694003582, "rewards/margins": 0.02232169173657894, "rewards/rejected": -0.07621645927429199, "step": 420 }, { "epoch": 0.46, "learning_rate": 3.2831675801707126e-06, "logits/chosen": -0.04735702648758888, "logits/rejected": -0.10849102586507797, "logps/chosen": -590.4489135742188, "logps/rejected": -649.82568359375, "loss": 0.1887, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.04551684111356735, "rewards/margins": 0.026576777920126915, "rewards/rejected": -0.07209362089633942, "step": 430 }, { "epoch": 0.47, "learning_rate": 3.194166797377289e-06, "logits/chosen": -0.08134131878614426, "logits/rejected": -0.1677294671535492, "logps/chosen": -574.8263549804688, "logps/rejected": -607.7601318359375, "loss": 0.1893, "rewards/accuracies": 0.46875, "rewards/chosen": -0.04221652075648308, "rewards/margins": 0.030459443107247353, "rewards/rejected": -0.07267596572637558, "step": 440 }, { "epoch": 0.48, "learning_rate": 3.104202056455501e-06, "logits/chosen": -0.0588027760386467, "logits/rejected": -0.1330319195985794, "logps/chosen": -547.6630249023438, "logps/rejected": -580.7600708007812, "loss": 0.1985, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.04689568281173706, "rewards/margins": 0.024683769792318344, "rewards/rejected": -0.07157944142818451, "step": 450 }, { "epoch": 0.49, "learning_rate": 3.013398287384144e-06, "logits/chosen": -0.0910586565732956, "logits/rejected": -0.13333860039710999, "logps/chosen": -520.99267578125, "logps/rejected": -608.8109130859375, "loss": 0.1948, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.04666762426495552, "rewards/margins": 0.04471370577812195, "rewards/rejected": -0.09138132631778717, "step": 460 }, { "epoch": 0.5, "learning_rate": 2.9218815852625717e-06, "logits/chosen": -0.09454444795846939, "logits/rejected": -0.04375922679901123, "logps/chosen": -620.7197265625, "logps/rejected": -636.3668212890625, "loss": 0.201, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.06732948869466782, "rewards/margins": 0.026028599590063095, "rewards/rejected": -0.09335808455944061, "step": 470 }, { "epoch": 0.51, "learning_rate": 2.829779035208113e-06, "logits/chosen": -0.09432949125766754, "logits/rejected": -0.08926217257976532, "logps/chosen": -597.0772705078125, "logps/rejected": -639.5493774414062, "loss": 0.1909, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.040321771055459976, "rewards/margins": 0.03370783478021622, "rewards/rejected": -0.07402960956096649, "step": 480 }, { "epoch": 0.52, "learning_rate": 2.737218535878705e-06, "logits/chosen": -0.1773318350315094, "logits/rejected": -0.07903443276882172, "logps/chosen": -552.8883666992188, "logps/rejected": -618.2833251953125, "loss": 0.2029, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.04510737583041191, "rewards/margins": 0.028245270252227783, "rewards/rejected": -0.07335264980792999, "step": 490 }, { "epoch": 0.53, "learning_rate": 2.64432862186579e-06, "logits/chosen": -0.07201124727725983, "logits/rejected": -0.04144411161541939, "logps/chosen": -526.00634765625, "logps/rejected": -577.3812255859375, "loss": 0.1891, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.03259889408946037, "rewards/margins": 0.028664156794548035, "rewards/rejected": -0.06126304715871811, "step": 500 }, { "epoch": 0.54, "learning_rate": 2.551238285204126e-06, "logits/chosen": -0.13225743174552917, "logits/rejected": -0.03518156707286835, "logps/chosen": -558.69970703125, "logps/rejected": -633.7002563476562, "loss": 0.1987, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.034947603940963745, "rewards/margins": 0.041034139692783356, "rewards/rejected": -0.0759817361831665, "step": 510 }, { "epoch": 0.55, "learning_rate": 2.4580767962463688e-06, "logits/chosen": -0.03775392845273018, "logits/rejected": -0.06259463727474213, "logps/chosen": -564.3277587890625, "logps/rejected": -616.877685546875, "loss": 0.1935, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.041550230234861374, "rewards/margins": 0.04528028517961502, "rewards/rejected": -0.0868305116891861, "step": 520 }, { "epoch": 0.57, "learning_rate": 2.3649735241511546e-06, "logits/chosen": -0.11865083128213882, "logits/rejected": -0.14535991847515106, "logps/chosen": -539.8975219726562, "logps/rejected": -628.8270263671875, "loss": 0.1988, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.06274162977933884, "rewards/margins": 0.050676118582487106, "rewards/rejected": -0.11341774463653564, "step": 530 }, { "epoch": 0.58, "learning_rate": 2.2720577572339914e-06, "logits/chosen": -0.1661374866962433, "logits/rejected": -0.10748039186000824, "logps/chosen": -546.2053833007812, "logps/rejected": -584.2305908203125, "loss": 0.1901, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.05626441910862923, "rewards/margins": 0.02776341699063778, "rewards/rejected": -0.08402784168720245, "step": 540 }, { "epoch": 0.59, "learning_rate": 2.1794585234303995e-06, "logits/chosen": -0.10749207437038422, "logits/rejected": -0.13697417080402374, "logps/chosen": -517.0869140625, "logps/rejected": -581.8153686523438, "loss": 0.1866, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.052382372319698334, "rewards/margins": 0.035972487181425095, "rewards/rejected": -0.08835486322641373, "step": 550 }, { "epoch": 0.6, "learning_rate": 2.0873044111206407e-06, "logits/chosen": -0.1282195746898651, "logits/rejected": -0.1339006870985031, "logps/chosen": -576.3350830078125, "logps/rejected": -666.8603515625, "loss": 0.1907, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04062817618250847, "rewards/margins": 0.03738432377576828, "rewards/rejected": -0.07801250368356705, "step": 560 }, { "epoch": 0.61, "learning_rate": 1.9957233905648293e-06, "logits/chosen": -0.10549817234277725, "logits/rejected": -0.11278073489665985, "logps/chosen": -566.6007080078125, "logps/rejected": -636.8270263671875, "loss": 0.1877, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.048470962792634964, "rewards/margins": 0.04373977333307266, "rewards/rejected": -0.09221073240041733, "step": 570 }, { "epoch": 0.62, "learning_rate": 1.904842636196402e-06, "logits/chosen": -0.0554957278072834, "logits/rejected": -0.13037823140621185, "logps/chosen": -597.04150390625, "logps/rejected": -615.6434326171875, "loss": 0.1909, "rewards/accuracies": 0.5, "rewards/chosen": -0.0562109649181366, "rewards/margins": 0.028234709054231644, "rewards/rejected": -0.08444567024707794, "step": 580 }, { "epoch": 0.63, "learning_rate": 1.814788350020726e-06, "logits/chosen": -0.0553332157433033, "logits/rejected": -0.14984294772148132, "logps/chosen": -511.7176818847656, "logps/rejected": -577.5421752929688, "loss": 0.1891, "rewards/accuracies": 0.46875, "rewards/chosen": -0.05183824896812439, "rewards/margins": 0.0338759571313858, "rewards/rejected": -0.08571420609951019, "step": 590 }, { "epoch": 0.64, "learning_rate": 1.725685586364051e-06, "logits/chosen": -0.1068972796201706, "logits/rejected": -0.13699831068515778, "logps/chosen": -547.6019897460938, "logps/rejected": -624.2053833007812, "loss": 0.1908, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04226940870285034, "rewards/margins": 0.04575734585523605, "rewards/rejected": -0.08802676200866699, "step": 600 }, { "epoch": 0.65, "learning_rate": 1.6376580782162172e-06, "logits/chosen": -0.12253417819738388, "logits/rejected": -0.09159277379512787, "logps/chosen": -534.8265380859375, "logps/rejected": -639.2476806640625, "loss": 0.1866, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.038840554654598236, "rewards/margins": 0.04929639771580696, "rewards/rejected": -0.0881369560956955, "step": 610 }, { "epoch": 0.66, "learning_rate": 1.550828065408227e-06, "logits/chosen": -0.11153294146060944, "logits/rejected": -0.0631122812628746, "logps/chosen": -581.9796142578125, "logps/rejected": -639.3689575195312, "loss": 0.1738, "rewards/accuracies": 0.53125, "rewards/chosen": -0.04295315593481064, "rewards/margins": 0.037230443209409714, "rewards/rejected": -0.08018360286951065, "step": 620 }, { "epoch": 0.67, "learning_rate": 1.4653161248633053e-06, "logits/chosen": -0.10305066406726837, "logits/rejected": -0.13783864676952362, "logps/chosen": -582.2150268554688, "logps/rejected": -607.2169799804688, "loss": 0.1865, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.05051354691386223, "rewards/margins": 0.02962956391274929, "rewards/rejected": -0.08014310896396637, "step": 630 }, { "epoch": 0.68, "learning_rate": 1.381241003157162e-06, "logits/chosen": -0.09553556144237518, "logits/rejected": -0.1049310564994812, "logps/chosen": -561.0845947265625, "logps/rejected": -615.9722900390625, "loss": 0.19, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.046824414283037186, "rewards/margins": 0.03598689287900925, "rewards/rejected": -0.08281131088733673, "step": 640 }, { "epoch": 0.69, "learning_rate": 1.298719451619979e-06, "logits/chosen": -0.1247280016541481, "logits/rejected": -0.0659816786646843, "logps/chosen": -560.4979858398438, "logps/rejected": -620.7578735351562, "loss": 0.2002, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.04652046412229538, "rewards/margins": 0.041216202080249786, "rewards/rejected": -0.08773668110370636, "step": 650 }, { "epoch": 0.7, "learning_rate": 1.2178660642091036e-06, "logits/chosen": -0.03698350116610527, "logits/rejected": -0.2196667492389679, "logps/chosen": -521.7525634765625, "logps/rejected": -626.46435546875, "loss": 0.1979, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.05702243372797966, "rewards/margins": 0.041262269020080566, "rewards/rejected": -0.09828470647335052, "step": 660 }, { "epoch": 0.71, "learning_rate": 1.1387931183775821e-06, "logits/chosen": -0.1309659779071808, "logits/rejected": -0.126008078455925, "logps/chosen": -526.6151123046875, "logps/rejected": -586.6326293945312, "loss": 0.1836, "rewards/accuracies": 0.5, "rewards/chosen": -0.0365142747759819, "rewards/margins": 0.039250634610652924, "rewards/rejected": -0.07576490938663483, "step": 670 }, { "epoch": 0.73, "learning_rate": 1.061610419159532e-06, "logits/chosen": -0.06580721586942673, "logits/rejected": -0.11697240173816681, "logps/chosen": -545.3971557617188, "logps/rejected": -590.3699340820312, "loss": 0.186, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.040514297783374786, "rewards/margins": 0.041993193328380585, "rewards/rejected": -0.08250749111175537, "step": 680 }, { "epoch": 0.74, "learning_rate": 9.864251466888364e-07, "logits/chosen": 0.015632059425115585, "logits/rejected": -0.14370284974575043, "logps/chosen": -527.1017456054688, "logps/rejected": -602.5015869140625, "loss": 0.1872, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.03584219887852669, "rewards/margins": 0.0341840498149395, "rewards/rejected": -0.07002625614404678, "step": 690 }, { "epoch": 0.75, "learning_rate": 9.133417073629288e-07, "logits/chosen": -0.1096029132604599, "logits/rejected": -0.09382790327072144, "logps/chosen": -552.9088745117188, "logps/rejected": -619.2091674804688, "loss": 0.1929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.04123011603951454, "rewards/margins": 0.03130009397864342, "rewards/rejected": -0.07253019511699677, "step": 700 }, { "epoch": 0.76, "learning_rate": 8.424615888583332e-07, "logits/chosen": -0.1330350786447525, "logits/rejected": -0.07537052035331726, "logps/chosen": -521.3177490234375, "logps/rejected": -601.4888305664062, "loss": 0.1829, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.037754353135824203, "rewards/margins": 0.041079822927713394, "rewards/rejected": -0.0788341760635376, "step": 710 }, { "epoch": 0.77, "learning_rate": 7.738832191993092e-07, "logits/chosen": -0.13393089175224304, "logits/rejected": -0.07735292613506317, "logps/chosen": -589.1104736328125, "logps/rejected": -623.0423583984375, "loss": 0.1937, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04533671587705612, "rewards/margins": 0.03662148863077164, "rewards/rejected": -0.08195820450782776, "step": 720 }, { "epoch": 0.78, "learning_rate": 7.077018300752917e-07, "logits/chosen": -0.09014391899108887, "logits/rejected": -0.02712271548807621, "logps/chosen": -550.0320434570312, "logps/rejected": -605.1174926757812, "loss": 0.1961, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.05133052542805672, "rewards/margins": 0.041539210826158524, "rewards/rejected": -0.09286972880363464, "step": 730 }, { "epoch": 0.79, "learning_rate": 6.440093245969342e-07, "logits/chosen": -0.08313737064599991, "logits/rejected": -0.1943168193101883, "logps/chosen": -516.8920288085938, "logps/rejected": -601.4186401367188, "loss": 0.1848, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.04221433773636818, "rewards/margins": 0.0475175604224205, "rewards/rejected": -0.08973188698291779, "step": 740 }, { "epoch": 0.8, "learning_rate": 5.828941496744075e-07, "logits/chosen": -0.11161942780017853, "logits/rejected": -0.0919300764799118, "logps/chosen": -563.8603515625, "logps/rejected": -619.1151733398438, "loss": 0.1903, "rewards/accuracies": 0.5, "rewards/chosen": -0.04418020322918892, "rewards/margins": 0.03953651711344719, "rewards/rejected": -0.08371671289205551, "step": 750 }, { "epoch": 0.81, "learning_rate": 5.244411731951671e-07, "logits/chosen": -0.13506890833377838, "logits/rejected": -0.033810555934906006, "logps/chosen": -605.5892944335938, "logps/rejected": -609.83544921875, "loss": 0.1878, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.03747162967920303, "rewards/margins": 0.02192925289273262, "rewards/rejected": -0.059400878846645355, "step": 760 }, { "epoch": 0.82, "learning_rate": 4.6873156617173594e-07, "logits/chosen": -0.07261113822460175, "logits/rejected": -0.16117814183235168, "logps/chosen": -553.5911254882812, "logps/rejected": -624.5232543945312, "loss": 0.1921, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.04296105355024338, "rewards/margins": 0.0388905294239521, "rewards/rejected": -0.08185158669948578, "step": 770 }, { "epoch": 0.83, "learning_rate": 4.1584269002318653e-07, "logits/chosen": -0.07403261959552765, "logits/rejected": -0.054157156497240067, "logps/chosen": -535.3461303710938, "logps/rejected": -585.4727783203125, "loss": 0.1828, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0406302735209465, "rewards/margins": 0.03608276695013046, "rewards/rejected": -0.07671303302049637, "step": 780 }, { "epoch": 0.84, "learning_rate": 3.658479891468258e-07, "logits/chosen": -0.1717700958251953, "logits/rejected": -0.08853835612535477, "logps/chosen": -527.3263549804688, "logps/rejected": -540.2444458007812, "loss": 0.1778, "rewards/accuracies": 0.4375, "rewards/chosen": -0.04036609083414078, "rewards/margins": 0.03141506761312485, "rewards/rejected": -0.07178115844726562, "step": 790 }, { "epoch": 0.85, "learning_rate": 3.18816888929272e-07, "logits/chosen": -0.09848084300756454, "logits/rejected": -0.06764743477106094, "logps/chosen": -563.3206787109375, "logps/rejected": -668.9093017578125, "loss": 0.2002, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.046812716871500015, "rewards/margins": 0.054834604263305664, "rewards/rejected": -0.10164730250835419, "step": 800 }, { "epoch": 0.86, "learning_rate": 2.748146993385484e-07, "logits/chosen": -0.09693370759487152, "logits/rejected": -0.07278673350811005, "logps/chosen": -522.9954833984375, "logps/rejected": -612.6608276367188, "loss": 0.1854, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.04407941550016403, "rewards/margins": 0.05026249960064888, "rewards/rejected": -0.09434191882610321, "step": 810 }, { "epoch": 0.87, "learning_rate": 2.3390252423108077e-07, "logits/chosen": -0.07084161043167114, "logits/rejected": -0.18225322663784027, "logps/chosen": -488.76483154296875, "logps/rejected": -558.3425903320312, "loss": 0.1939, "rewards/accuracies": 0.5, "rewards/chosen": -0.035873524844646454, "rewards/margins": 0.037640780210494995, "rewards/rejected": -0.07351429760456085, "step": 820 }, { "epoch": 0.89, "learning_rate": 1.961371764995243e-07, "logits/chosen": -0.11218070983886719, "logits/rejected": -0.143798828125, "logps/chosen": -548.5975341796875, "logps/rejected": -618.435302734375, "loss": 0.2009, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.03908687084913254, "rewards/margins": 0.042751066386699677, "rewards/rejected": -0.08183793723583221, "step": 830 }, { "epoch": 0.9, "learning_rate": 1.61571099179261e-07, "logits/chosen": -0.0712205171585083, "logits/rejected": -0.06110917776823044, "logps/chosen": -584.1240234375, "logps/rejected": -650.0173950195312, "loss": 0.1955, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.04009150713682175, "rewards/margins": 0.030330544337630272, "rewards/rejected": -0.07042204588651657, "step": 840 }, { "epoch": 0.91, "learning_rate": 1.3025229262312367e-07, "logits/chosen": -0.0935712531208992, "logits/rejected": -0.05454383045434952, "logps/chosen": -496.932861328125, "logps/rejected": -605.6661987304688, "loss": 0.1884, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.042653247714042664, "rewards/margins": 0.048957787454128265, "rewards/rejected": -0.09161103516817093, "step": 850 }, { "epoch": 0.92, "learning_rate": 1.0222424784546853e-07, "logits/chosen": -0.08921684324741364, "logits/rejected": -0.15163610875606537, "logps/chosen": -579.2117919921875, "logps/rejected": -619.4464111328125, "loss": 0.1904, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04733709245920181, "rewards/margins": 0.03301934152841568, "rewards/rejected": -0.08035643398761749, "step": 860 }, { "epoch": 0.93, "learning_rate": 7.752588612816553e-08, "logits/chosen": -0.04686546325683594, "logits/rejected": -0.15816907584667206, "logps/chosen": -509.0023498535156, "logps/rejected": -572.1159057617188, "loss": 0.1754, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.042182981967926025, "rewards/margins": 0.04292844608426094, "rewards/rejected": -0.08511142432689667, "step": 870 }, { "epoch": 0.94, "learning_rate": 5.619150497236991e-08, "logits/chosen": -0.07643123716115952, "logits/rejected": -0.16245657205581665, "logps/chosen": -535.0369873046875, "logps/rejected": -608.0992431640625, "loss": 0.192, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.04792182892560959, "rewards/margins": 0.03496783226728439, "rewards/rejected": -0.08288966119289398, "step": 880 }, { "epoch": 0.95, "learning_rate": 3.825073047112743e-08, "logits/chosen": -0.13168227672576904, "logits/rejected": -0.046010442078113556, "logps/chosen": -579.3240356445312, "logps/rejected": -674.3414306640625, "loss": 0.1964, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04349333792924881, "rewards/margins": 0.047455307096242905, "rewards/rejected": -0.09094865620136261, "step": 890 }, { "epoch": 0.96, "learning_rate": 2.372847616895685e-08, "logits/chosen": -0.04904794320464134, "logits/rejected": -0.019006099551916122, "logps/chosen": -542.4931640625, "logps/rejected": -638.1673583984375, "loss": 0.1889, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.04928978905081749, "rewards/margins": 0.03806794807314873, "rewards/rejected": -0.08735774457454681, "step": 900 }, { "epoch": 0.97, "learning_rate": 1.264490846553279e-08, "logits/chosen": -0.12707039713859558, "logits/rejected": -0.10833065211772919, "logps/chosen": -579.73681640625, "logps/rejected": -622.3654174804688, "loss": 0.1897, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.046609390527009964, "rewards/margins": 0.03541853651404381, "rewards/rejected": -0.08202792704105377, "step": 910 }, { "epoch": 0.98, "learning_rate": 5.015418611516165e-09, "logits/chosen": -0.0854305848479271, "logits/rejected": -0.11656080186367035, "logps/chosen": -616.4360961914062, "logps/rejected": -670.5054931640625, "loss": 0.1907, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.04680439084768295, "rewards/margins": 0.05593379586935043, "rewards/rejected": -0.10273818671703339, "step": 920 }, { "epoch": 0.99, "learning_rate": 8.506013354186993e-10, "logits/chosen": -0.11298644542694092, "logits/rejected": -0.03937912359833717, "logps/chosen": -532.8866577148438, "logps/rejected": -597.7803344726562, "loss": 0.2033, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.043054092675447464, "rewards/margins": 0.037277717143297195, "rewards/rejected": -0.08033180981874466, "step": 930 }, { "epoch": 1.0, "step": 937, "total_flos": 0.0, "train_loss": 0.19462941225971966, "train_runtime": 7972.3934, "train_samples_per_second": 3.763, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }