{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 500, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.319148936170213e-08, "logits/chosen": 0.06642268598079681, "logits/rejected": 0.23397813737392426, "logps/chosen": -587.28369140625, "logps/rejected": -568.082763671875, "loss": 0.279, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.319148936170213e-07, "logits/chosen": 0.16077889502048492, "logits/rejected": 0.28465068340301514, "logps/chosen": -462.99114990234375, "logps/rejected": -441.3485107421875, "loss": 0.303, "rewards/accuracies": 0.4236111044883728, "rewards/chosen": -0.0005749252159148455, "rewards/margins": 0.0002628265065141022, "rewards/rejected": -0.0008377517224289477, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0638297872340427e-06, "logits/chosen": 0.2244517058134079, "logits/rejected": 0.2146037071943283, "logps/chosen": -456.9951171875, "logps/rejected": -442.496826171875, "loss": 0.3007, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.0011023276019841433, "rewards/margins": -6.420163117581978e-05, "rewards/rejected": -0.0010381259489804506, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.595744680851064e-06, "logits/chosen": 0.34518542885780334, "logits/rejected": 0.15579931437969208, "logps/chosen": -414.3968200683594, "logps/rejected": -407.1432800292969, "loss": 0.3064, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.0004762631724588573, "rewards/margins": 6.935702549526468e-05, "rewards/rejected": -0.0005456201615743339, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.1276595744680853e-06, "logits/chosen": 0.19979876279830933, "logits/rejected": 0.1475386917591095, "logps/chosen": -387.1222839355469, "logps/rejected": -380.3912048339844, "loss": 0.3173, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": 8.398960198974237e-05, "rewards/margins": 0.0001377248700009659, "rewards/rejected": -5.3735253459308296e-05, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.6595744680851065e-06, "logits/chosen": 0.20050282776355743, "logits/rejected": 0.2853023409843445, "logps/chosen": -405.1722717285156, "logps/rejected": -433.797119140625, "loss": 0.2969, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.00031604920513927937, "rewards/margins": 9.358949318993837e-05, "rewards/rejected": -0.0004096386837773025, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.191489361702128e-06, "logits/chosen": 0.22685687243938446, "logits/rejected": 0.2761882245540619, "logps/chosen": -418.8284606933594, "logps/rejected": -444.300537109375, "loss": 0.2916, "rewards/accuracies": 0.5, "rewards/chosen": -0.0013369970256462693, "rewards/margins": 0.00045625813072547317, "rewards/rejected": -0.0017932550981640816, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.723404255319149e-06, "logits/chosen": 0.1869155615568161, "logits/rejected": 0.2700553834438324, "logps/chosen": -443.9104919433594, "logps/rejected": -423.21075439453125, "loss": 0.2976, "rewards/accuracies": 0.46875, "rewards/chosen": -0.003828343003988266, "rewards/margins": 0.0006897930870763958, "rewards/rejected": -0.004518135450780392, "step": 70 }, { "epoch": 0.09, "learning_rate": 4.255319148936171e-06, "logits/chosen": 0.23689034581184387, "logits/rejected": 0.21069273352622986, "logps/chosen": -386.25067138671875, "logps/rejected": -387.7801818847656, "loss": 0.3143, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.003951665014028549, "rewards/margins": 0.0011586709879338741, "rewards/rejected": -0.005110335536301136, "step": 80 }, { "epoch": 0.1, "learning_rate": 4.787234042553192e-06, "logits/chosen": 0.24118606746196747, "logits/rejected": 0.25480058789253235, "logps/chosen": -412.43499755859375, "logps/rejected": -408.15802001953125, "loss": 0.3065, "rewards/accuracies": 0.5, "rewards/chosen": -0.010140495374798775, "rewards/margins": 0.001615689368918538, "rewards/rejected": -0.01175618451088667, "step": 90 }, { "epoch": 0.11, "learning_rate": 4.999375059004058e-06, "logits/chosen": 0.1671404391527176, "logits/rejected": 0.2540619969367981, "logps/chosen": -389.2574157714844, "logps/rejected": -388.87408447265625, "loss": 0.3053, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.022764097899198532, "rewards/margins": 0.0024351924657821655, "rewards/rejected": -0.02519928850233555, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.9955571065548795e-06, "logits/chosen": 0.23305337131023407, "logits/rejected": 0.22437167167663574, "logps/chosen": -441.59771728515625, "logps/rejected": -446.51971435546875, "loss": 0.3081, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.04231434687972069, "rewards/margins": 0.003846729639917612, "rewards/rejected": -0.04616107791662216, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.9882736864879e-06, "logits/chosen": 0.17237094044685364, "logits/rejected": 0.20950445532798767, "logps/chosen": -497.03741455078125, "logps/rejected": -493.6482849121094, "loss": 0.3001, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.06084052473306656, "rewards/margins": 0.009210348129272461, "rewards/rejected": -0.07005088031291962, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.977534912960124e-06, "logits/chosen": 0.12596510350704193, "logits/rejected": 0.18595007061958313, "logps/chosen": -534.695556640625, "logps/rejected": -506.4364318847656, "loss": 0.285, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09366725385189056, "rewards/margins": 0.009929810650646687, "rewards/rejected": -0.10359706729650497, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.963355698422092e-06, "logits/chosen": 0.12804082036018372, "logits/rejected": 0.10300163924694061, "logps/chosen": -515.469970703125, "logps/rejected": -508.8036193847656, "loss": 0.3043, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.11144615709781647, "rewards/margins": 0.012134796939790249, "rewards/rejected": -0.1235809326171875, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.945755732909625e-06, "logits/chosen": 0.16220004856586456, "logits/rejected": 0.08028533309698105, "logps/chosen": -479.33184814453125, "logps/rejected": -524.7337646484375, "loss": 0.2802, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.1306999772787094, "rewards/margins": 0.03233319893479347, "rewards/rejected": -0.16303318738937378, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.924759456701167e-06, "logits/chosen": 0.18928228318691254, "logits/rejected": 0.14869533479213715, "logps/chosen": -573.3798828125, "logps/rejected": -610.5779418945312, "loss": 0.2894, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.168126180768013, "rewards/margins": 0.04517129063606262, "rewards/rejected": -0.21329745650291443, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.900396026378671e-06, "logits/chosen": 0.062172818928956985, "logits/rejected": 0.019718164578080177, "logps/chosen": -475.08551025390625, "logps/rejected": -561.6641845703125, "loss": 0.2631, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.1646568328142166, "rewards/margins": 0.06741134822368622, "rewards/rejected": -0.23206815123558044, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.872699274339169e-06, "logits/chosen": -0.0011120836716145277, "logits/rejected": 0.13960300385951996, "logps/chosen": -525.2228393554688, "logps/rejected": -511.014404296875, "loss": 0.2908, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.14511564373970032, "rewards/margins": 0.03634321317076683, "rewards/rejected": -0.18145884573459625, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.8417076618132434e-06, "logits/chosen": 0.11656410992145538, "logits/rejected": 0.08704119175672531, "logps/chosen": -514.813720703125, "logps/rejected": -595.0880126953125, "loss": 0.2617, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.14817455410957336, "rewards/margins": 0.04434273764491081, "rewards/rejected": -0.19251729547977448, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.807464225455655e-06, "logits/chosen": 0.06601261347532272, "logits/rejected": 0.12538839876651764, "logps/chosen": -538.46142578125, "logps/rejected": -592.1277465820312, "loss": 0.2739, "rewards/accuracies": 0.5, "rewards/chosen": -0.15786947309970856, "rewards/margins": 0.04001317173242569, "rewards/rejected": -0.19788263738155365, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.770016517582283e-06, "logits/chosen": 0.04595109820365906, "logits/rejected": 0.05397043749690056, "logps/chosen": -542.3662109375, "logps/rejected": -587.703125, "loss": 0.2803, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.15854400396347046, "rewards/margins": 0.037279583513736725, "rewards/rejected": -0.19582359492778778, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.7294165401363616e-06, "logits/chosen": 0.06908506900072098, "logits/rejected": 0.0783570259809494, "logps/chosen": -543.8673095703125, "logps/rejected": -552.3768920898438, "loss": 0.273, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.14275754988193512, "rewards/margins": 0.04143111780285835, "rewards/rejected": -0.18418867886066437, "step": 220 }, { "epoch": 0.25, "learning_rate": 4.68572067247573e-06, "logits/chosen": 0.07219888269901276, "logits/rejected": 0.0497373566031456, "logps/chosen": -572.8856201171875, "logps/rejected": -622.0572509765625, "loss": 0.2833, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.14573441445827484, "rewards/margins": 0.06495748460292816, "rewards/rejected": -0.210691899061203, "step": 230 }, { "epoch": 0.26, "learning_rate": 4.638989593081364e-06, "logits/chosen": -0.10666439682245255, "logits/rejected": 0.05354728549718857, "logps/chosen": -541.7586059570312, "logps/rejected": -586.1435546875, "loss": 0.2883, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.14508689939975739, "rewards/margins": 0.048395391553640366, "rewards/rejected": -0.19348229467868805, "step": 240 }, { "epoch": 0.27, "learning_rate": 4.5892881952959015e-06, "logits/chosen": 0.07505561411380768, "logits/rejected": -0.051252782344818115, "logps/chosen": -555.9990844726562, "logps/rejected": -585.9085083007812, "loss": 0.2815, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.14657357335090637, "rewards/margins": 0.04433682560920715, "rewards/rejected": -0.19091038405895233, "step": 250 }, { "epoch": 0.28, "learning_rate": 4.536685497209182e-06, "logits/chosen": -0.020656492561101913, "logits/rejected": 0.007626605220139027, "logps/chosen": -612.7149658203125, "logps/rejected": -613.846435546875, "loss": 0.2945, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1504904180765152, "rewards/margins": 0.03951232135295868, "rewards/rejected": -0.19000275433063507, "step": 260 }, { "epoch": 0.29, "learning_rate": 4.481254545815943e-06, "logits/chosen": 0.06314031779766083, "logits/rejected": 0.05012714862823486, "logps/chosen": -572.8782958984375, "logps/rejected": -599.4718627929688, "loss": 0.286, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.11262224614620209, "rewards/margins": 0.0400107316672802, "rewards/rejected": -0.15263298153877258, "step": 270 }, { "epoch": 0.3, "learning_rate": 4.42307231557875e-06, "logits/chosen": 4.419684410095215e-05, "logits/rejected": 0.1705075055360794, "logps/chosen": -478.0445251464844, "logps/rejected": -514.4085693359375, "loss": 0.2574, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08713702112436295, "rewards/margins": 0.0530150942504406, "rewards/rejected": -0.14015211164951324, "step": 280 }, { "epoch": 0.31, "learning_rate": 4.3622196015370305e-06, "logits/chosen": -0.05049672722816467, "logits/rejected": 0.10665085166692734, "logps/chosen": -562.5294799804688, "logps/rejected": -583.0447998046875, "loss": 0.2866, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.12877288460731506, "rewards/margins": 0.0415426567196846, "rewards/rejected": -0.17031553387641907, "step": 290 }, { "epoch": 0.32, "learning_rate": 4.298780907110648e-06, "logits/chosen": -0.13402745127677917, "logits/rejected": 0.04227043688297272, "logps/chosen": -570.83935546875, "logps/rejected": -597.737060546875, "loss": 0.2749, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.14190678298473358, "rewards/margins": 0.054900676012039185, "rewards/rejected": -0.19680745899677277, "step": 300 }, { "epoch": 0.33, "learning_rate": 4.23284432675381e-06, "logits/chosen": -0.07339149713516235, "logits/rejected": -0.040264565497636795, "logps/chosen": -619.7767333984375, "logps/rejected": -682.0263061523438, "loss": 0.2589, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.16093352437019348, "rewards/margins": 0.07607638835906982, "rewards/rejected": -0.2370099127292633, "step": 310 }, { "epoch": 0.34, "learning_rate": 4.164501423622277e-06, "logits/chosen": -0.028974998742341995, "logits/rejected": -0.06319359689950943, "logps/chosen": -549.9708862304688, "logps/rejected": -637.650390625, "loss": 0.264, "rewards/accuracies": 0.5, "rewards/chosen": -0.13445612788200378, "rewards/margins": 0.07859645038843155, "rewards/rejected": -0.21305255591869354, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.0938471024237355e-06, "logits/chosen": -0.11449748277664185, "logits/rejected": -0.03637564182281494, "logps/chosen": -601.0888671875, "logps/rejected": -669.8153076171875, "loss": 0.2698, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.1675841063261032, "rewards/margins": 0.0730680450797081, "rewards/rejected": -0.2406521737575531, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.020979477627907e-06, "logits/chosen": -0.08815717697143555, "logits/rejected": 0.015436625108122826, "logps/chosen": -504.43658447265625, "logps/rejected": -530.493408203125, "loss": 0.2631, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.11692949384450912, "rewards/margins": 0.04244539141654968, "rewards/rejected": -0.1593748927116394, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.9459997372194105e-06, "logits/chosen": -0.061802517622709274, "logits/rejected": 0.09361619502305984, "logps/chosen": -579.513671875, "logps/rejected": -639.0809936523438, "loss": 0.2755, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.16077642142772675, "rewards/margins": 0.05602121353149414, "rewards/rejected": -0.2167976200580597, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.869012002182573e-06, "logits/chosen": -0.10564370453357697, "logits/rejected": 0.02583186700940132, "logps/chosen": -590.0396728515625, "logps/rejected": -647.909423828125, "loss": 0.2786, "rewards/accuracies": 0.53125, "rewards/chosen": -0.15724439918994904, "rewards/margins": 0.0704963356256485, "rewards/rejected": -0.22774071991443634, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.7901231819133104e-06, "logits/chosen": -0.10458900034427643, "logits/rejected": -0.004534685518592596, "logps/chosen": -542.1465454101562, "logps/rejected": -573.3040771484375, "loss": 0.2668, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.1385766863822937, "rewards/margins": 0.05575231835246086, "rewards/rejected": -0.19432899355888367, "step": 370 }, { "epoch": 0.41, "learning_rate": 3.709442825758875e-06, "logits/chosen": -0.06990720331668854, "logits/rejected": 0.01404495257884264, "logps/chosen": -505.4917907714844, "logps/rejected": -572.2424926757812, "loss": 0.2636, "rewards/accuracies": 0.46875, "rewards/chosen": -0.14204895496368408, "rewards/margins": 0.050913404673337936, "rewards/rejected": -0.19296236336231232, "step": 380 }, { "epoch": 0.42, "learning_rate": 3.6270829708916113e-06, "logits/chosen": -0.040005024522542953, "logits/rejected": 0.017743710428476334, "logps/chosen": -548.9078369140625, "logps/rejected": -573.68408203125, "loss": 0.2815, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.1460207998752594, "rewards/margins": 0.041480742394924164, "rewards/rejected": -0.18750153481960297, "step": 390 }, { "epoch": 0.43, "learning_rate": 3.543157986727991e-06, "logits/chosen": -0.13051895797252655, "logits/rejected": 0.012003961019217968, "logps/chosen": -543.142578125, "logps/rejected": -594.59326171875, "loss": 0.2753, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.13953322172164917, "rewards/margins": 0.0578032024204731, "rewards/rejected": -0.19733640551567078, "step": 400 }, { "epoch": 0.44, "learning_rate": 3.4577844161089614e-06, "logits/chosen": -0.03122936561703682, "logits/rejected": 0.010242189280688763, "logps/chosen": -564.1265869140625, "logps/rejected": -600.2008666992188, "loss": 0.2742, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.14681796729564667, "rewards/margins": 0.06189022213220596, "rewards/rejected": -0.20870819687843323, "step": 410 }, { "epoch": 0.45, "learning_rate": 3.3710808134621577e-06, "logits/chosen": -0.010301386937499046, "logits/rejected": -0.039281733334064484, "logps/chosen": -603.3815307617188, "logps/rejected": -645.9063110351562, "loss": 0.2761, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.15229162573814392, "rewards/margins": 0.06186581775546074, "rewards/rejected": -0.21415743231773376, "step": 420 }, { "epoch": 0.46, "learning_rate": 3.2831675801707126e-06, "logits/chosen": -0.08892063051462173, "logits/rejected": -0.0773845762014389, "logps/chosen": -537.9869995117188, "logps/rejected": -586.5262451171875, "loss": 0.2721, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13618162274360657, "rewards/margins": 0.06536873430013657, "rewards/rejected": -0.20155039429664612, "step": 430 }, { "epoch": 0.47, "learning_rate": 3.194166797377289e-06, "logits/chosen": -0.03688632696866989, "logits/rejected": -0.00927029736340046, "logps/chosen": -540.3975219726562, "logps/rejected": -622.5618896484375, "loss": 0.2622, "rewards/accuracies": 0.53125, "rewards/chosen": -0.12767064571380615, "rewards/margins": 0.07470119744539261, "rewards/rejected": -0.20237183570861816, "step": 440 }, { "epoch": 0.48, "learning_rate": 3.104202056455501e-06, "logits/chosen": -0.05976264923810959, "logits/rejected": -0.16253043711185455, "logps/chosen": -514.6741943359375, "logps/rejected": -578.7728271484375, "loss": 0.2588, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13386496901512146, "rewards/margins": 0.05132218077778816, "rewards/rejected": -0.18518713116645813, "step": 450 }, { "epoch": 0.49, "learning_rate": 3.013398287384144e-06, "logits/chosen": -0.15890637040138245, "logits/rejected": -0.05094796419143677, "logps/chosen": -516.9619750976562, "logps/rejected": -604.88525390625, "loss": 0.2671, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.11203358322381973, "rewards/margins": 0.08909189701080322, "rewards/rejected": -0.20112547278404236, "step": 460 }, { "epoch": 0.5, "learning_rate": 2.9218815852625717e-06, "logits/chosen": -0.17138849198818207, "logits/rejected": -0.08313537389039993, "logps/chosen": -573.4813232421875, "logps/rejected": -608.16552734375, "loss": 0.2695, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.13088415563106537, "rewards/margins": 0.047856587916612625, "rewards/rejected": -0.1787407547235489, "step": 470 }, { "epoch": 0.51, "learning_rate": 2.829779035208113e-06, "logits/chosen": -0.12412846088409424, "logits/rejected": -0.11923656612634659, "logps/chosen": -472.38726806640625, "logps/rejected": -527.9046020507812, "loss": 0.2658, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.09405693411827087, "rewards/margins": 0.057601846754550934, "rewards/rejected": -0.1516587734222412, "step": 480 }, { "epoch": 0.52, "learning_rate": 2.737218535878705e-06, "logits/chosen": -0.11355652660131454, "logits/rejected": -0.06184381991624832, "logps/chosen": -522.3283081054688, "logps/rejected": -575.9847412109375, "loss": 0.2596, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.11173069477081299, "rewards/margins": 0.06776181608438492, "rewards/rejected": -0.1794925034046173, "step": 490 }, { "epoch": 0.53, "learning_rate": 2.64432862186579e-06, "logits/chosen": -0.1486915647983551, "logits/rejected": -0.1126946210861206, "logps/chosen": -472.10009765625, "logps/rejected": -565.5072021484375, "loss": 0.2578, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.10586412250995636, "rewards/margins": 0.08054044097661972, "rewards/rejected": -0.1864045411348343, "step": 500 }, { "epoch": 0.54, "learning_rate": 2.551238285204126e-06, "logits/chosen": -0.07071704417467117, "logits/rejected": -0.06895752251148224, "logps/chosen": -553.7777709960938, "logps/rejected": -641.0071411132812, "loss": 0.2755, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.12443629652261734, "rewards/margins": 0.08012167364358902, "rewards/rejected": -0.20455794036388397, "step": 510 }, { "epoch": 0.55, "learning_rate": 2.4580767962463688e-06, "logits/chosen": -0.07696928828954697, "logits/rejected": 0.0007179826498031616, "logps/chosen": -548.6107788085938, "logps/rejected": -587.250732421875, "loss": 0.2776, "rewards/accuracies": 0.46875, "rewards/chosen": -0.14552563428878784, "rewards/margins": 0.04918716475367546, "rewards/rejected": -0.1947127878665924, "step": 520 }, { "epoch": 0.57, "learning_rate": 2.3649735241511546e-06, "logits/chosen": -0.07807435840368271, "logits/rejected": 0.01831636391580105, "logps/chosen": -556.7833251953125, "logps/rejected": -576.1497192382812, "loss": 0.2592, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.13193278014659882, "rewards/margins": 0.05105576664209366, "rewards/rejected": -0.18298853933811188, "step": 530 }, { "epoch": 0.58, "learning_rate": 2.2720577572339914e-06, "logits/chosen": -0.014492440037429333, "logits/rejected": -0.10470409691333771, "logps/chosen": -520.7554931640625, "logps/rejected": -621.583251953125, "loss": 0.2589, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.1109011173248291, "rewards/margins": 0.08521705865859985, "rewards/rejected": -0.19611816108226776, "step": 540 }, { "epoch": 0.59, "learning_rate": 2.1794585234303995e-06, "logits/chosen": -0.09592770040035248, "logits/rejected": 0.03173860162496567, "logps/chosen": -527.00537109375, "logps/rejected": -587.9857177734375, "loss": 0.2698, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13213616609573364, "rewards/margins": 0.07161318510770798, "rewards/rejected": -0.2037493735551834, "step": 550 }, { "epoch": 0.6, "learning_rate": 2.0873044111206407e-06, "logits/chosen": -0.08211179822683334, "logits/rejected": -0.07464434206485748, "logps/chosen": -526.9556884765625, "logps/rejected": -597.501708984375, "loss": 0.263, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.12764397263526917, "rewards/margins": 0.07685311138629913, "rewards/rejected": -0.2044970691204071, "step": 560 }, { "epoch": 0.61, "learning_rate": 1.9957233905648293e-06, "logits/chosen": -0.17138975858688354, "logits/rejected": 0.06973910331726074, "logps/chosen": -606.2980346679688, "logps/rejected": -656.0491943359375, "loss": 0.2673, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13349631428718567, "rewards/margins": 0.08181539922952652, "rewards/rejected": -0.2153116911649704, "step": 570 }, { "epoch": 0.62, "learning_rate": 1.904842636196402e-06, "logits/chosen": -0.12237439304590225, "logits/rejected": 0.0009635284659452736, "logps/chosen": -486.32818603515625, "logps/rejected": -522.0555419921875, "loss": 0.2611, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09265846014022827, "rewards/margins": 0.06704847514629364, "rewards/rejected": -0.15970692038536072, "step": 580 }, { "epoch": 0.63, "learning_rate": 1.814788350020726e-06, "logits/chosen": -0.0846022367477417, "logits/rejected": 0.020748872309923172, "logps/chosen": -515.6177368164062, "logps/rejected": -512.121337890625, "loss": 0.288, "rewards/accuracies": 0.4375, "rewards/chosen": -0.10487208515405655, "rewards/margins": 0.0322984978556633, "rewards/rejected": -0.13717058300971985, "step": 590 }, { "epoch": 0.64, "learning_rate": 1.725685586364051e-06, "logits/chosen": -0.1460862159729004, "logits/rejected": -0.06654468178749084, "logps/chosen": -464.4956970214844, "logps/rejected": -489.59161376953125, "loss": 0.2786, "rewards/accuracies": 0.40625, "rewards/chosen": -0.09209474176168442, "rewards/margins": 0.04379875212907791, "rewards/rejected": -0.13589349389076233, "step": 600 }, { "epoch": 0.65, "learning_rate": 1.6376580782162172e-06, "logits/chosen": -0.1410539448261261, "logits/rejected": -0.08074741810560226, "logps/chosen": -483.46893310546875, "logps/rejected": -526.5164794921875, "loss": 0.2829, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09680439531803131, "rewards/margins": 0.06490226835012436, "rewards/rejected": -0.16170665621757507, "step": 610 }, { "epoch": 0.66, "learning_rate": 1.550828065408227e-06, "logits/chosen": -0.09655305743217468, "logits/rejected": -0.0762481540441513, "logps/chosen": -468.4337463378906, "logps/rejected": -525.4623413085938, "loss": 0.2619, "rewards/accuracies": 0.5, "rewards/chosen": -0.08364450931549072, "rewards/margins": 0.0691133439540863, "rewards/rejected": -0.15275785326957703, "step": 620 }, { "epoch": 0.67, "learning_rate": 1.4653161248633053e-06, "logits/chosen": -0.03865772485733032, "logits/rejected": -0.10341192781925201, "logps/chosen": -468.88763427734375, "logps/rejected": -524.7953491210938, "loss": 0.2589, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08644279092550278, "rewards/margins": 0.0591856949031353, "rewards/rejected": -0.14562849700450897, "step": 630 }, { "epoch": 0.68, "learning_rate": 1.381241003157162e-06, "logits/chosen": -0.08597133308649063, "logits/rejected": 0.0006229489808902144, "logps/chosen": -556.832275390625, "logps/rejected": -577.3524780273438, "loss": 0.2942, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1141686886548996, "rewards/margins": 0.03842983394861221, "rewards/rejected": -0.1525985300540924, "step": 640 }, { "epoch": 0.69, "learning_rate": 1.298719451619979e-06, "logits/chosen": -0.09541022032499313, "logits/rejected": -0.05003209039568901, "logps/chosen": -469.9234313964844, "logps/rejected": -529.8631591796875, "loss": 0.2683, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.09767267853021622, "rewards/margins": 0.054049454629421234, "rewards/rejected": -0.15172213315963745, "step": 650 }, { "epoch": 0.7, "learning_rate": 1.2178660642091036e-06, "logits/chosen": -0.2083953619003296, "logits/rejected": 0.05156536027789116, "logps/chosen": -587.299560546875, "logps/rejected": -581.7611694335938, "loss": 0.2608, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.10773968696594238, "rewards/margins": 0.06141304969787598, "rewards/rejected": -0.16915276646614075, "step": 660 }, { "epoch": 0.71, "learning_rate": 1.1387931183775821e-06, "logits/chosen": -0.09526301920413971, "logits/rejected": -0.06854981184005737, "logps/chosen": -553.5062255859375, "logps/rejected": -567.5850219726562, "loss": 0.2713, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.10804013162851334, "rewards/margins": 0.06674468517303467, "rewards/rejected": -0.1747848242521286, "step": 670 }, { "epoch": 0.73, "learning_rate": 1.061610419159532e-06, "logits/chosen": -0.15921640396118164, "logits/rejected": -0.22180967032909393, "logps/chosen": -464.23480224609375, "logps/rejected": -517.2071533203125, "loss": 0.2618, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.1085701733827591, "rewards/margins": 0.06596361845731735, "rewards/rejected": -0.17453376948833466, "step": 680 }, { "epoch": 0.74, "learning_rate": 9.864251466888364e-07, "logits/chosen": -0.2035103738307953, "logits/rejected": -0.13756130635738373, "logps/chosen": -533.3971557617188, "logps/rejected": -562.695068359375, "loss": 0.2758, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.12624426186084747, "rewards/margins": 0.05400489643216133, "rewards/rejected": -0.1802491694688797, "step": 690 }, { "epoch": 0.75, "learning_rate": 9.133417073629288e-07, "logits/chosen": -0.19000104069709778, "logits/rejected": -0.17901551723480225, "logps/chosen": -564.2060546875, "logps/rejected": -620.484619140625, "loss": 0.2765, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.12581291794776917, "rewards/margins": 0.06180128455162048, "rewards/rejected": -0.18761418759822845, "step": 700 }, { "epoch": 0.76, "learning_rate": 8.424615888583332e-07, "logits/chosen": -0.1307680755853653, "logits/rejected": -0.10078835487365723, "logps/chosen": -524.8638916015625, "logps/rejected": -553.104736328125, "loss": 0.2792, "rewards/accuracies": 0.4375, "rewards/chosen": -0.13305291533470154, "rewards/margins": 0.040464796125888824, "rewards/rejected": -0.17351767420768738, "step": 710 }, { "epoch": 0.77, "learning_rate": 7.738832191993092e-07, "logits/chosen": -0.11268335580825806, "logits/rejected": -0.05942006781697273, "logps/chosen": -497.02325439453125, "logps/rejected": -589.494384765625, "loss": 0.267, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.11972874402999878, "rewards/margins": 0.0718456357717514, "rewards/rejected": -0.19157439470291138, "step": 720 }, { "epoch": 0.78, "learning_rate": 7.077018300752917e-07, "logits/chosen": -0.14758452773094177, "logits/rejected": -0.0012020498979836702, "logps/chosen": -520.2757568359375, "logps/rejected": -585.8750610351562, "loss": 0.2769, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.11923079192638397, "rewards/margins": 0.06183774396777153, "rewards/rejected": -0.1810685396194458, "step": 730 }, { "epoch": 0.79, "learning_rate": 6.440093245969342e-07, "logits/chosen": -0.1631493866443634, "logits/rejected": -0.14262652397155762, "logps/chosen": -531.8765869140625, "logps/rejected": -570.99365234375, "loss": 0.2753, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.11439726501703262, "rewards/margins": 0.05860968679189682, "rewards/rejected": -0.17300695180892944, "step": 740 }, { "epoch": 0.8, "learning_rate": 5.828941496744075e-07, "logits/chosen": -0.13462567329406738, "logits/rejected": -0.10775252431631088, "logps/chosen": -528.6737060546875, "logps/rejected": -559.4716796875, "loss": 0.2773, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.11941119283437729, "rewards/margins": 0.06873499602079391, "rewards/rejected": -0.1881461888551712, "step": 750 }, { "epoch": 0.81, "learning_rate": 5.244411731951671e-07, "logits/chosen": -0.10648471117019653, "logits/rejected": -0.12774226069450378, "logps/chosen": -540.2073974609375, "logps/rejected": -609.1519165039062, "loss": 0.2717, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1273835450410843, "rewards/margins": 0.07286655902862549, "rewards/rejected": -0.2002500742673874, "step": 760 }, { "epoch": 0.82, "learning_rate": 4.6873156617173594e-07, "logits/chosen": -0.2068806141614914, "logits/rejected": -0.1860518753528595, "logps/chosen": -510.51983642578125, "logps/rejected": -575.1392822265625, "loss": 0.2802, "rewards/accuracies": 0.46875, "rewards/chosen": -0.1164456382393837, "rewards/margins": 0.06441595405340195, "rewards/rejected": -0.18086162209510803, "step": 770 }, { "epoch": 0.83, "learning_rate": 4.1584269002318653e-07, "logits/chosen": -0.23118607699871063, "logits/rejected": 0.012592856772243977, "logps/chosen": -493.88201904296875, "logps/rejected": -535.3624267578125, "loss": 0.2541, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.10904928296804428, "rewards/margins": 0.06981517374515533, "rewards/rejected": -0.17886444926261902, "step": 780 }, { "epoch": 0.84, "learning_rate": 3.658479891468258e-07, "logits/chosen": -0.13056764006614685, "logits/rejected": -0.06715533137321472, "logps/chosen": -526.5809326171875, "logps/rejected": -599.15771484375, "loss": 0.2635, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12021216005086899, "rewards/margins": 0.07990214973688126, "rewards/rejected": -0.20011429488658905, "step": 790 }, { "epoch": 0.85, "learning_rate": 3.18816888929272e-07, "logits/chosen": -0.2064342051744461, "logits/rejected": -0.1310141682624817, "logps/chosen": -503.92022705078125, "logps/rejected": -535.3073120117188, "loss": 0.2788, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1168203130364418, "rewards/margins": 0.04998582974076271, "rewards/rejected": -0.1668061465024948, "step": 800 }, { "epoch": 0.86, "learning_rate": 2.748146993385484e-07, "logits/chosen": -0.15658999979496002, "logits/rejected": -0.05465535447001457, "logps/chosen": -493.89666748046875, "logps/rejected": -565.4508056640625, "loss": 0.2593, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.11054392158985138, "rewards/margins": 0.062444061040878296, "rewards/rejected": -0.17298798263072968, "step": 810 }, { "epoch": 0.87, "learning_rate": 2.3390252423108077e-07, "logits/chosen": -0.12076146900653839, "logits/rejected": -0.07717995345592499, "logps/chosen": -557.0513305664062, "logps/rejected": -630.1217041015625, "loss": 0.26, "rewards/accuracies": 0.53125, "rewards/chosen": -0.12639924883842468, "rewards/margins": 0.06895993649959564, "rewards/rejected": -0.19535920023918152, "step": 820 }, { "epoch": 0.89, "learning_rate": 1.961371764995243e-07, "logits/chosen": -0.18750372529029846, "logits/rejected": -0.20195484161376953, "logps/chosen": -431.05615234375, "logps/rejected": -488.2513732910156, "loss": 0.2672, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.0998988226056099, "rewards/margins": 0.054485417902469635, "rewards/rejected": -0.15438422560691833, "step": 830 }, { "epoch": 0.9, "learning_rate": 1.61571099179261e-07, "logits/chosen": -0.17934174835681915, "logits/rejected": -0.13482218980789185, "logps/chosen": -465.31268310546875, "logps/rejected": -541.5718994140625, "loss": 0.2379, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.11106850951910019, "rewards/margins": 0.07404305785894394, "rewards/rejected": -0.18511156737804413, "step": 840 }, { "epoch": 0.91, "learning_rate": 1.3025229262312367e-07, "logits/chosen": -0.16116994619369507, "logits/rejected": -0.06133908033370972, "logps/chosen": -567.8401489257812, "logps/rejected": -617.7798461914062, "loss": 0.2591, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.12380583584308624, "rewards/margins": 0.07460357248783112, "rewards/rejected": -0.19840940833091736, "step": 850 }, { "epoch": 0.92, "learning_rate": 1.0222424784546853e-07, "logits/chosen": -0.09671641886234283, "logits/rejected": -0.10632093995809555, "logps/chosen": -526.3566284179688, "logps/rejected": -554.2623291015625, "loss": 0.271, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.1161075010895729, "rewards/margins": 0.05733795836567879, "rewards/rejected": -0.1734454333782196, "step": 860 }, { "epoch": 0.93, "learning_rate": 7.752588612816553e-08, "logits/chosen": -0.15584774315357208, "logits/rejected": -0.18242886662483215, "logps/chosen": -535.07373046875, "logps/rejected": -580.4075927734375, "loss": 0.2676, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.12142983824014664, "rewards/margins": 0.06167648360133171, "rewards/rejected": -0.18310633301734924, "step": 870 }, { "epoch": 0.94, "learning_rate": 5.619150497236991e-08, "logits/chosen": -0.09849689900875092, "logits/rejected": -0.10616960376501083, "logps/chosen": -480.7345275878906, "logps/rejected": -568.6452026367188, "loss": 0.2573, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.12242833524942398, "rewards/margins": 0.07691850513219833, "rewards/rejected": -0.19934681057929993, "step": 880 }, { "epoch": 0.95, "learning_rate": 3.825073047112743e-08, "logits/chosen": -0.1931258887052536, "logits/rejected": -0.14527785778045654, "logps/chosen": -525.9476318359375, "logps/rejected": -556.6074829101562, "loss": 0.2718, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1265556812286377, "rewards/margins": 0.05168802663683891, "rewards/rejected": -0.1782437115907669, "step": 890 }, { "epoch": 0.96, "learning_rate": 2.372847616895685e-08, "logits/chosen": -0.1395512819290161, "logits/rejected": -0.15351735055446625, "logps/chosen": -519.0070190429688, "logps/rejected": -630.4365234375, "loss": 0.25, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.12783722579479218, "rewards/margins": 0.08357492834329605, "rewards/rejected": -0.21141216158866882, "step": 900 }, { "epoch": 0.97, "learning_rate": 1.264490846553279e-08, "logits/chosen": -0.1655835509300232, "logits/rejected": -0.06148504465818405, "logps/chosen": -530.4078979492188, "logps/rejected": -588.1494140625, "loss": 0.2773, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13116273283958435, "rewards/margins": 0.060104191303253174, "rewards/rejected": -0.19126692414283752, "step": 910 }, { "epoch": 0.98, "learning_rate": 5.015418611516165e-09, "logits/chosen": -0.13306137919425964, "logits/rejected": -0.1015244722366333, "logps/chosen": -540.9669189453125, "logps/rejected": -633.6178588867188, "loss": 0.2619, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13009101152420044, "rewards/margins": 0.07983705401420593, "rewards/rejected": -0.20992806553840637, "step": 920 }, { "epoch": 0.99, "learning_rate": 8.506013354186993e-10, "logits/chosen": -0.16086629033088684, "logits/rejected": -0.07110301405191422, "logps/chosen": -540.5888061523438, "logps/rejected": -531.308349609375, "loss": 0.2795, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.11827180534601212, "rewards/margins": 0.032559461891651154, "rewards/rejected": -0.15083125233650208, "step": 930 }, { "epoch": 1.0, "step": 937, "total_flos": 0.0, "train_loss": 0.275421927202982, "train_runtime": 7850.8319, "train_samples_per_second": 3.821, "train_steps_per_second": 0.119 } ], "logging_steps": 10, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }