{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 593, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.201680672268907e-09, "logits/chosen": -0.6788080930709839, "logits/rejected": -1.1750900745391846, "logps/chosen": -702.8984985351562, "logps/rejected": -239.67630004882812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 8.403361344537815e-09, "logits/chosen": -1.6158480644226074, "logits/rejected": -1.2959809303283691, "logps/chosen": -112.90769958496094, "logps/rejected": -81.65785217285156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.01, "learning_rate": 1.2605042016806723e-08, "logits/chosen": -2.375753879547119, "logits/rejected": -2.5303637981414795, "logps/chosen": -105.81280517578125, "logps/rejected": -131.5235595703125, "loss": 0.6943, "rewards/accuracies": 0.5, "rewards/chosen": 0.023235511034727097, "rewards/margins": -0.002191734267398715, "rewards/rejected": 0.02542724646627903, "step": 3 }, { "epoch": 0.01, "learning_rate": 1.680672268907563e-08, "logits/chosen": -1.907819151878357, "logits/rejected": -1.9828282594680786, "logps/chosen": -243.6266326904297, "logps/rejected": -293.4872741699219, "loss": 0.6958, "rewards/accuracies": 0.5, "rewards/chosen": 0.013934326358139515, "rewards/margins": -0.03516464680433273, "rewards/rejected": 0.049098968505859375, "step": 4 }, { "epoch": 0.01, "learning_rate": 2.1008403361344538e-08, "logits/chosen": -1.5391994714736938, "logits/rejected": -1.6013704538345337, "logps/chosen": -514.83447265625, "logps/rejected": -273.52606201171875, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.04896850883960724, "rewards/margins": 0.09175796806812286, "rewards/rejected": -0.042789459228515625, "step": 5 }, { "epoch": 0.01, "learning_rate": 2.5210084033613446e-08, "logits/chosen": -2.251502513885498, "logits/rejected": -1.4788130521774292, "logps/chosen": -194.65187072753906, "logps/rejected": -230.2232666015625, "loss": 0.6948, "rewards/accuracies": 0.5, "rewards/chosen": 0.06929359585046768, "rewards/margins": 0.0685802549123764, "rewards/rejected": 0.000713348388671875, "step": 6 }, { "epoch": 0.01, "learning_rate": 2.941176470588235e-08, "logits/chosen": -1.6795597076416016, "logits/rejected": -1.6621124744415283, "logps/chosen": -188.00582885742188, "logps/rejected": -178.40765380859375, "loss": 0.6955, "rewards/accuracies": 0.5, "rewards/chosen": 0.004383087158203125, "rewards/margins": -0.015031430870294571, "rewards/rejected": 0.019414519891142845, "step": 7 }, { "epoch": 0.01, "learning_rate": 3.361344537815126e-08, "logits/chosen": -1.2061922550201416, "logits/rejected": -1.4656660556793213, "logps/chosen": -493.43206787109375, "logps/rejected": -74.92171478271484, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.07655182480812073, "rewards/margins": 0.10538730025291443, "rewards/rejected": -0.028835486620664597, "step": 8 }, { "epoch": 0.02, "learning_rate": 3.7815126050420164e-08, "logits/chosen": -1.5676227807998657, "logits/rejected": -1.5455267429351807, "logps/chosen": -228.5581817626953, "logps/rejected": -194.3417510986328, "loss": 0.6953, "rewards/accuracies": 0.5, "rewards/chosen": 0.03210144117474556, "rewards/margins": -0.005328751169145107, "rewards/rejected": 0.03743019327521324, "step": 9 }, { "epoch": 0.02, "learning_rate": 4.2016806722689076e-08, "logits/chosen": -1.2673882246017456, "logits/rejected": -1.175107717514038, "logps/chosen": -226.69273376464844, "logps/rejected": -170.93002319335938, "loss": 0.6932, "rewards/accuracies": 1.0, "rewards/chosen": 0.05511780083179474, "rewards/margins": 0.0763774886727333, "rewards/rejected": -0.021259689703583717, "step": 10 }, { "epoch": 0.02, "learning_rate": 4.621848739495798e-08, "logits/chosen": -1.7369565963745117, "logits/rejected": -2.0291335582733154, "logps/chosen": -134.85565185546875, "logps/rejected": -61.743980407714844, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022448543459177017, "rewards/margins": 0.011675357818603516, "rewards/rejected": -0.013920212164521217, "step": 11 }, { "epoch": 0.02, "learning_rate": 5.042016806722689e-08, "logits/chosen": -1.3196473121643066, "logits/rejected": -1.325734257698059, "logps/chosen": -80.28683471679688, "logps/rejected": -79.56066131591797, "loss": 0.6918, "rewards/accuracies": 0.5, "rewards/chosen": 0.006443023681640625, "rewards/margins": 0.021741105243563652, "rewards/rejected": -0.015298080630600452, "step": 12 }, { "epoch": 0.02, "learning_rate": 5.46218487394958e-08, "logits/chosen": -1.5383967161178589, "logits/rejected": -1.4319273233413696, "logps/chosen": -71.45745086669922, "logps/rejected": -93.32796478271484, "loss": 0.6971, "rewards/accuracies": 1.0, "rewards/chosen": -0.006036281585693359, "rewards/margins": 0.015392017550766468, "rewards/rejected": -0.021428298205137253, "step": 13 }, { "epoch": 0.02, "learning_rate": 5.88235294117647e-08, "logits/chosen": -1.968656301498413, "logits/rejected": -1.845158338546753, "logps/chosen": -168.4257049560547, "logps/rejected": -300.03240966796875, "loss": 0.6904, "rewards/accuracies": 0.5, "rewards/chosen": -0.027264786884188652, "rewards/margins": -0.05615234375, "rewards/rejected": 0.028887558728456497, "step": 14 }, { "epoch": 0.03, "learning_rate": 6.302521008403361e-08, "logits/chosen": -1.1591368913650513, "logits/rejected": -1.4170737266540527, "logps/chosen": -538.101806640625, "logps/rejected": -236.76358032226562, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.03155364841222763, "rewards/margins": 0.02361450158059597, "rewards/rejected": 0.00793914869427681, "step": 15 }, { "epoch": 0.03, "learning_rate": 6.722689075630252e-08, "logits/chosen": -1.7213101387023926, "logits/rejected": -1.8231241703033447, "logps/chosen": -196.15289306640625, "logps/rejected": -119.35342407226562, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.029254913330078125, "rewards/margins": 0.023168563842773438, "rewards/rejected": 0.0060863494873046875, "step": 16 }, { "epoch": 0.03, "learning_rate": 7.142857142857142e-08, "logits/chosen": -1.7846002578735352, "logits/rejected": -2.3181114196777344, "logps/chosen": -273.34564208984375, "logps/rejected": -146.905029296875, "loss": 0.6954, "rewards/accuracies": 1.0, "rewards/chosen": 0.03240509331226349, "rewards/margins": 0.0660804733633995, "rewards/rejected": -0.033675383776426315, "step": 17 }, { "epoch": 0.03, "learning_rate": 7.563025210084033e-08, "logits/chosen": -2.3839895725250244, "logits/rejected": -1.8420289754867554, "logps/chosen": -47.616455078125, "logps/rejected": -177.2080841064453, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": 0.0052467347122728825, "rewards/margins": 0.012156296521425247, "rewards/rejected": -0.006909562274813652, "step": 18 }, { "epoch": 0.03, "learning_rate": 7.983193277310923e-08, "logits/chosen": -2.3161439895629883, "logits/rejected": -1.8462892770767212, "logps/chosen": -96.58424377441406, "logps/rejected": -209.37664794921875, "loss": 0.6883, "rewards/accuracies": 0.5, "rewards/chosen": -0.0225248821079731, "rewards/margins": 0.016490697860717773, "rewards/rejected": -0.03901557996869087, "step": 19 }, { "epoch": 0.03, "learning_rate": 8.403361344537815e-08, "logits/chosen": -2.2219033241271973, "logits/rejected": -2.0519139766693115, "logps/chosen": -346.8481750488281, "logps/rejected": -1364.489990234375, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.08504104614257812, "rewards/margins": 0.16014480590820312, "rewards/rejected": -0.075103759765625, "step": 20 }, { "epoch": 0.04, "learning_rate": 8.823529411764706e-08, "logits/chosen": -1.4293802976608276, "logits/rejected": -1.661201000213623, "logps/chosen": -307.6474609375, "logps/rejected": -215.94967651367188, "loss": 0.6908, "rewards/accuracies": 0.5, "rewards/chosen": 0.004551697056740522, "rewards/margins": 0.012935257516801357, "rewards/rejected": -0.008383559994399548, "step": 21 }, { "epoch": 0.04, "learning_rate": 9.243697478991596e-08, "logits/chosen": -1.8076047897338867, "logits/rejected": -1.5782675743103027, "logps/chosen": -179.2224884033203, "logps/rejected": -232.96527099609375, "loss": 0.6916, "rewards/accuracies": 0.5, "rewards/chosen": 0.04394521564245224, "rewards/margins": 0.08951330184936523, "rewards/rejected": -0.045568086206912994, "step": 22 }, { "epoch": 0.04, "learning_rate": 9.663865546218488e-08, "logits/chosen": -1.7942882776260376, "logits/rejected": -1.0943225622177124, "logps/chosen": -55.99930191040039, "logps/rejected": -140.6543426513672, "loss": 0.6901, "rewards/accuracies": 0.5, "rewards/chosen": 0.023340702056884766, "rewards/margins": 0.0005490314215421677, "rewards/rejected": 0.022791672497987747, "step": 23 }, { "epoch": 0.04, "learning_rate": 1.0084033613445378e-07, "logits/chosen": -0.837689995765686, "logits/rejected": -1.8798249959945679, "logps/chosen": -213.58486938476562, "logps/rejected": -28.56793785095215, "loss": 0.6852, "rewards/accuracies": 0.5, "rewards/chosen": 0.026500703766942024, "rewards/margins": 0.03877449035644531, "rewards/rejected": -0.012273788452148438, "step": 24 }, { "epoch": 0.04, "learning_rate": 1.0504201680672269e-07, "logits/chosen": -2.5890495777130127, "logits/rejected": -1.7141728401184082, "logps/chosen": -12.43747329711914, "logps/rejected": -147.9033203125, "loss": 0.6846, "rewards/accuracies": 0.0, "rewards/chosen": -0.0024075033143162727, "rewards/margins": -0.05119595676660538, "rewards/rejected": 0.04878845438361168, "step": 25 }, { "epoch": 0.04, "learning_rate": 1.092436974789916e-07, "logits/chosen": -2.9379727840423584, "logits/rejected": -1.3671715259552002, "logps/chosen": -203.9562530517578, "logps/rejected": -130.590576171875, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.021196747198700905, "rewards/margins": 0.04621582105755806, "rewards/rejected": -0.025019073858857155, "step": 26 }, { "epoch": 0.05, "learning_rate": 1.134453781512605e-07, "logits/chosen": -1.2850056886672974, "logits/rejected": -1.527043104171753, "logps/chosen": -293.0238037109375, "logps/rejected": -110.18681335449219, "loss": 0.6859, "rewards/accuracies": 0.0, "rewards/chosen": -0.069427490234375, "rewards/margins": -0.04693755879998207, "rewards/rejected": -0.02248992957174778, "step": 27 }, { "epoch": 0.05, "learning_rate": 1.176470588235294e-07, "logits/chosen": -2.0202457904815674, "logits/rejected": -2.382385730743408, "logps/chosen": -323.6606750488281, "logps/rejected": -179.65538024902344, "loss": 0.6833, "rewards/accuracies": 0.0, "rewards/chosen": -0.028132058680057526, "rewards/margins": -0.018391229212284088, "rewards/rejected": -0.009740829467773438, "step": 28 }, { "epoch": 0.05, "learning_rate": 1.2184873949579832e-07, "logits/chosen": -1.9959008693695068, "logits/rejected": -1.408521294593811, "logps/chosen": -221.8274383544922, "logps/rejected": -225.3356475830078, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.023763515055179596, "rewards/margins": 0.14066720008850098, "rewards/rejected": -0.11690368503332138, "step": 29 }, { "epoch": 0.05, "learning_rate": 1.2605042016806723e-07, "logits/chosen": -1.8002355098724365, "logits/rejected": -1.521448016166687, "logps/chosen": -94.72344970703125, "logps/rejected": -110.32486724853516, "loss": 0.6821, "rewards/accuracies": 0.0, "rewards/chosen": -0.054589081555604935, "rewards/margins": -0.024533655494451523, "rewards/rejected": -0.03005542792379856, "step": 30 }, { "epoch": 0.05, "learning_rate": 1.3025210084033613e-07, "logits/chosen": -1.8377197980880737, "logits/rejected": -2.063385248184204, "logps/chosen": -62.164398193359375, "logps/rejected": -104.72893524169922, "loss": 0.679, "rewards/accuracies": 0.5, "rewards/chosen": 0.019611358642578125, "rewards/margins": -0.0029705059714615345, "rewards/rejected": 0.022581864148378372, "step": 31 }, { "epoch": 0.05, "learning_rate": 1.3445378151260504e-07, "logits/chosen": -2.392535924911499, "logits/rejected": -2.1506621837615967, "logps/chosen": -11.346028327941895, "logps/rejected": -71.765625, "loss": 0.6785, "rewards/accuracies": 0.5, "rewards/chosen": -0.015737399458885193, "rewards/margins": -0.004548127297312021, "rewards/rejected": -0.01118927076458931, "step": 32 }, { "epoch": 0.06, "learning_rate": 1.3865546218487394e-07, "logits/chosen": -2.0362868309020996, "logits/rejected": -2.1367034912109375, "logps/chosen": -266.005859375, "logps/rejected": -214.60191345214844, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.052748873829841614, "rewards/margins": 0.09725818783044815, "rewards/rejected": -0.04450931400060654, "step": 33 }, { "epoch": 0.06, "learning_rate": 1.4285714285714285e-07, "logits/chosen": -1.3824855089187622, "logits/rejected": -1.5640826225280762, "logps/chosen": -111.83187866210938, "logps/rejected": -36.405189514160156, "loss": 0.6693, "rewards/accuracies": 0.5, "rewards/chosen": -0.008962631225585938, "rewards/margins": 0.011541889980435371, "rewards/rejected": -0.02050452120602131, "step": 34 }, { "epoch": 0.06, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -2.272282361984253, "logits/rejected": -2.159532308578491, "logps/chosen": -43.7902717590332, "logps/rejected": -74.43631744384766, "loss": 0.6652, "rewards/accuracies": 1.0, "rewards/chosen": 0.01943950727581978, "rewards/margins": 0.03280620649456978, "rewards/rejected": -0.01336669921875, "step": 35 }, { "epoch": 0.06, "learning_rate": 1.5126050420168066e-07, "logits/chosen": -2.0161397457122803, "logits/rejected": -1.3697400093078613, "logps/chosen": -67.14362335205078, "logps/rejected": -123.745361328125, "loss": 0.6641, "rewards/accuracies": 1.0, "rewards/chosen": 0.0367613323032856, "rewards/margins": 0.04867387190461159, "rewards/rejected": -0.01191253773868084, "step": 36 }, { "epoch": 0.06, "learning_rate": 1.554621848739496e-07, "logits/chosen": -2.1913275718688965, "logits/rejected": -1.7024658918380737, "logps/chosen": -10.184264183044434, "logps/rejected": -107.8653793334961, "loss": 0.6593, "rewards/accuracies": 0.5, "rewards/chosen": -0.004525709431618452, "rewards/margins": -0.01936373859643936, "rewards/rejected": 0.014838028699159622, "step": 37 }, { "epoch": 0.06, "learning_rate": 1.5966386554621847e-07, "logits/chosen": -1.081247091293335, "logits/rejected": -2.124126434326172, "logps/chosen": -789.7781982421875, "logps/rejected": -147.115966796875, "loss": 0.659, "rewards/accuracies": 1.0, "rewards/chosen": 0.05167846754193306, "rewards/margins": 0.20409394800662994, "rewards/rejected": -0.152415469288826, "step": 38 }, { "epoch": 0.07, "learning_rate": 1.638655462184874e-07, "logits/chosen": -2.1789205074310303, "logits/rejected": -1.1509499549865723, "logps/chosen": -295.66265869140625, "logps/rejected": -394.5150451660156, "loss": 0.6517, "rewards/accuracies": 0.5, "rewards/chosen": -0.12732239067554474, "rewards/margins": 0.1430404633283615, "rewards/rejected": -0.27036285400390625, "step": 39 }, { "epoch": 0.07, "learning_rate": 1.680672268907563e-07, "logits/chosen": -2.1448452472686768, "logits/rejected": -2.1956920623779297, "logps/chosen": -62.93687438964844, "logps/rejected": -84.88615417480469, "loss": 0.6534, "rewards/accuracies": 0.5, "rewards/chosen": -0.008682060055434704, "rewards/margins": 0.0031255725771188736, "rewards/rejected": -0.011807632632553577, "step": 40 }, { "epoch": 0.07, "learning_rate": 1.722689075630252e-07, "logits/chosen": -1.4040545225143433, "logits/rejected": -0.7300827503204346, "logps/chosen": -326.945068359375, "logps/rejected": -324.3778076171875, "loss": 0.6435, "rewards/accuracies": 1.0, "rewards/chosen": -0.008136749267578125, "rewards/margins": 0.2589103579521179, "rewards/rejected": -0.26704710721969604, "step": 41 }, { "epoch": 0.07, "learning_rate": 1.764705882352941e-07, "logits/chosen": -1.4858357906341553, "logits/rejected": -2.0196330547332764, "logps/chosen": -459.7484130859375, "logps/rejected": -210.59243774414062, "loss": 0.6423, "rewards/accuracies": 0.5, "rewards/chosen": -0.13091735541820526, "rewards/margins": -0.0402679406106472, "rewards/rejected": -0.09064941853284836, "step": 42 }, { "epoch": 0.07, "learning_rate": 1.8067226890756302e-07, "logits/chosen": -1.4100688695907593, "logits/rejected": -2.2512903213500977, "logps/chosen": -263.8143615722656, "logps/rejected": -82.95572662353516, "loss": 0.6408, "rewards/accuracies": 0.0, "rewards/chosen": -0.052451327443122864, "rewards/margins": -0.04398571699857712, "rewards/rejected": -0.008465608581900597, "step": 43 }, { "epoch": 0.07, "learning_rate": 1.8487394957983192e-07, "logits/chosen": -1.654313325881958, "logits/rejected": -1.3700717687606812, "logps/chosen": -167.07334899902344, "logps/rejected": -133.7058868408203, "loss": 0.646, "rewards/accuracies": 0.5, "rewards/chosen": -0.036783602088689804, "rewards/margins": 0.10394057631492615, "rewards/rejected": -0.14072418212890625, "step": 44 }, { "epoch": 0.08, "learning_rate": 1.8907563025210083e-07, "logits/chosen": -2.3346948623657227, "logits/rejected": -1.4270800352096558, "logps/chosen": -363.33868408203125, "logps/rejected": -202.8612060546875, "loss": 0.6329, "rewards/accuracies": 0.5, "rewards/chosen": -0.11517754197120667, "rewards/margins": -0.01095886155962944, "rewards/rejected": -0.10421867668628693, "step": 45 }, { "epoch": 0.08, "learning_rate": 1.9327731092436976e-07, "logits/chosen": -1.5132286548614502, "logits/rejected": -0.9802812933921814, "logps/chosen": -463.6932067871094, "logps/rejected": -284.0732421875, "loss": 0.6186, "rewards/accuracies": 1.0, "rewards/chosen": -0.11024780571460724, "rewards/margins": 0.5847091674804688, "rewards/rejected": -0.6949569582939148, "step": 46 }, { "epoch": 0.08, "learning_rate": 1.9747899159663864e-07, "logits/chosen": -1.640755295753479, "logits/rejected": -2.064528465270996, "logps/chosen": -141.10398864746094, "logps/rejected": -74.3631362915039, "loss": 0.6133, "rewards/accuracies": 1.0, "rewards/chosen": 0.003640557639300823, "rewards/margins": 0.031121447682380676, "rewards/rejected": -0.02748088911175728, "step": 47 }, { "epoch": 0.08, "learning_rate": 2.0168067226890757e-07, "logits/chosen": -1.3043317794799805, "logits/rejected": -0.7944495677947998, "logps/chosen": -236.87294006347656, "logps/rejected": -140.55502319335938, "loss": 0.6065, "rewards/accuracies": 1.0, "rewards/chosen": 0.0874466672539711, "rewards/margins": 0.6055868864059448, "rewards/rejected": -0.5181402564048767, "step": 48 }, { "epoch": 0.08, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -2.2910568714141846, "logits/rejected": -1.5469049215316772, "logps/chosen": -50.14934539794922, "logps/rejected": -230.92745971679688, "loss": 0.6062, "rewards/accuracies": 1.0, "rewards/chosen": -0.012336349114775658, "rewards/margins": 0.3008907437324524, "rewards/rejected": -0.3132270872592926, "step": 49 }, { "epoch": 0.08, "learning_rate": 2.1008403361344538e-07, "logits/chosen": -1.6005315780639648, "logits/rejected": -2.0069663524627686, "logps/chosen": -300.32855224609375, "logps/rejected": -70.41799926757812, "loss": 0.5983, "rewards/accuracies": 0.5, "rewards/chosen": -0.16448670625686646, "rewards/margins": -0.07991065829992294, "rewards/rejected": -0.08457604050636292, "step": 50 }, { "epoch": 0.09, "learning_rate": 2.1428571428571426e-07, "logits/chosen": -1.6963545083999634, "logits/rejected": -1.8104299306869507, "logps/chosen": -224.7620391845703, "logps/rejected": -106.64592742919922, "loss": 0.5888, "rewards/accuracies": 0.5, "rewards/chosen": -0.040007784962654114, "rewards/margins": 0.2745014429092407, "rewards/rejected": -0.31450921297073364, "step": 51 }, { "epoch": 0.09, "learning_rate": 2.184873949579832e-07, "logits/chosen": -1.519837737083435, "logits/rejected": -1.6336404085159302, "logps/chosen": -302.5843505859375, "logps/rejected": -260.89599609375, "loss": 0.5738, "rewards/accuracies": 0.0, "rewards/chosen": -0.00217361468821764, "rewards/margins": -0.08065643161535263, "rewards/rejected": 0.07848282158374786, "step": 52 }, { "epoch": 0.09, "learning_rate": 2.226890756302521e-07, "logits/chosen": -0.7233390212059021, "logits/rejected": -0.5498945116996765, "logps/chosen": -314.1108093261719, "logps/rejected": -156.276611328125, "loss": 0.5757, "rewards/accuracies": 1.0, "rewards/chosen": -0.03915100172162056, "rewards/margins": 0.5292686223983765, "rewards/rejected": -0.5684196352958679, "step": 53 }, { "epoch": 0.09, "learning_rate": 2.26890756302521e-07, "logits/chosen": -2.079944610595703, "logits/rejected": -2.1164326667785645, "logps/chosen": -363.44049072265625, "logps/rejected": -204.23228454589844, "loss": 0.5763, "rewards/accuracies": 1.0, "rewards/chosen": 0.03126373142004013, "rewards/margins": 0.17057648301124573, "rewards/rejected": -0.139312744140625, "step": 54 }, { "epoch": 0.09, "learning_rate": 2.3109243697478993e-07, "logits/chosen": -0.8852956891059875, "logits/rejected": -1.17733633518219, "logps/chosen": -229.51646423339844, "logps/rejected": -123.25852966308594, "loss": 0.5658, "rewards/accuracies": 1.0, "rewards/chosen": 0.012698173522949219, "rewards/margins": 0.3284967541694641, "rewards/rejected": -0.3157985806465149, "step": 55 }, { "epoch": 0.09, "learning_rate": 2.352941176470588e-07, "logits/chosen": -1.2436057329177856, "logits/rejected": -1.3107479810714722, "logps/chosen": -507.35784912109375, "logps/rejected": -224.55007934570312, "loss": 0.5549, "rewards/accuracies": 0.5, "rewards/chosen": -0.12623444199562073, "rewards/margins": 0.4832092523574829, "rewards/rejected": -0.6094436645507812, "step": 56 }, { "epoch": 0.1, "learning_rate": 2.394957983193277e-07, "logits/chosen": -1.765979290008545, "logits/rejected": -2.5764899253845215, "logps/chosen": -305.29486083984375, "logps/rejected": -71.30033874511719, "loss": 0.5634, "rewards/accuracies": 0.5, "rewards/chosen": 0.14705120027065277, "rewards/margins": 0.1404399424791336, "rewards/rejected": 0.00661125173792243, "step": 57 }, { "epoch": 0.1, "learning_rate": 2.4369747899159664e-07, "logits/chosen": -0.8349874019622803, "logits/rejected": -0.4467710256576538, "logps/chosen": -396.75067138671875, "logps/rejected": -238.7788848876953, "loss": 0.5588, "rewards/accuracies": 1.0, "rewards/chosen": -0.2361343502998352, "rewards/margins": 0.7512519955635071, "rewards/rejected": -0.9873863458633423, "step": 58 }, { "epoch": 0.1, "learning_rate": 2.478991596638655e-07, "logits/chosen": -1.1839135885238647, "logits/rejected": -1.342013955116272, "logps/chosen": -506.8759460449219, "logps/rejected": -188.46533203125, "loss": 0.5478, "rewards/accuracies": 1.0, "rewards/chosen": -0.19339600205421448, "rewards/margins": 0.9577789306640625, "rewards/rejected": -1.1511750221252441, "step": 59 }, { "epoch": 0.1, "learning_rate": 2.5210084033613445e-07, "logits/chosen": -1.238139271736145, "logits/rejected": -1.305624008178711, "logps/chosen": -354.024169921875, "logps/rejected": -150.709228515625, "loss": 0.5474, "rewards/accuracies": 1.0, "rewards/chosen": -0.03719634935259819, "rewards/margins": 0.4611190855503082, "rewards/rejected": -0.4983154535293579, "step": 60 }, { "epoch": 0.1, "learning_rate": 2.5630252100840333e-07, "logits/chosen": -1.0698193311691284, "logits/rejected": -1.1871693134307861, "logps/chosen": -533.863525390625, "logps/rejected": -207.14418029785156, "loss": 0.5121, "rewards/accuracies": 1.0, "rewards/chosen": -0.06107788532972336, "rewards/margins": 1.0981537103652954, "rewards/rejected": -1.1592315435409546, "step": 61 }, { "epoch": 0.1, "learning_rate": 2.6050420168067226e-07, "logits/chosen": -1.856410026550293, "logits/rejected": -1.372816801071167, "logps/chosen": -205.7808074951172, "logps/rejected": -189.2294464111328, "loss": 0.5077, "rewards/accuracies": 1.0, "rewards/chosen": -0.07348620891571045, "rewards/margins": 0.7908002138137817, "rewards/rejected": -0.8642864227294922, "step": 62 }, { "epoch": 0.11, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -1.718478798866272, "logits/rejected": -1.663999080657959, "logps/chosen": -604.8311767578125, "logps/rejected": -742.4992065429688, "loss": 0.518, "rewards/accuracies": 0.5, "rewards/chosen": -0.17543946206569672, "rewards/margins": -0.012347415089607239, "rewards/rejected": -0.16309204697608948, "step": 63 }, { "epoch": 0.11, "learning_rate": 2.689075630252101e-07, "logits/chosen": -2.534972906112671, "logits/rejected": -2.4803988933563232, "logps/chosen": -27.763654708862305, "logps/rejected": -65.91168975830078, "loss": 0.4941, "rewards/accuracies": 1.0, "rewards/chosen": 0.041327860206365585, "rewards/margins": 0.3402096629142761, "rewards/rejected": -0.2988818287849426, "step": 64 }, { "epoch": 0.11, "learning_rate": 2.7310924369747895e-07, "logits/chosen": -1.866020679473877, "logits/rejected": -1.5979124307632446, "logps/chosen": -348.8237609863281, "logps/rejected": -415.2561950683594, "loss": 0.4814, "rewards/accuracies": 0.5, "rewards/chosen": -0.09458465874195099, "rewards/margins": 0.18892823159694672, "rewards/rejected": -0.2835128903388977, "step": 65 }, { "epoch": 0.11, "learning_rate": 2.773109243697479e-07, "logits/chosen": -1.7812227010726929, "logits/rejected": -1.2362346649169922, "logps/chosen": -210.58091735839844, "logps/rejected": -221.3896026611328, "loss": 0.4545, "rewards/accuracies": 1.0, "rewards/chosen": -0.13596276938915253, "rewards/margins": 1.656264305114746, "rewards/rejected": -1.7922271490097046, "step": 66 }, { "epoch": 0.11, "learning_rate": 2.815126050420168e-07, "logits/chosen": -1.5426255464553833, "logits/rejected": -1.5356191396713257, "logps/chosen": -36.993186950683594, "logps/rejected": -77.3095474243164, "loss": 0.4601, "rewards/accuracies": 1.0, "rewards/chosen": 0.021850014105439186, "rewards/margins": 0.5636359453201294, "rewards/rejected": -0.5417859554290771, "step": 67 }, { "epoch": 0.11, "learning_rate": 2.857142857142857e-07, "logits/chosen": -1.814134955406189, "logits/rejected": -1.493807077407837, "logps/chosen": -214.3492431640625, "logps/rejected": -331.2470397949219, "loss": 0.466, "rewards/accuracies": 1.0, "rewards/chosen": 0.041826628148555756, "rewards/margins": 0.37629854679107666, "rewards/rejected": -0.3344719111919403, "step": 68 }, { "epoch": 0.12, "learning_rate": 2.899159663865546e-07, "logits/chosen": -1.104387640953064, "logits/rejected": -1.4887744188308716, "logps/chosen": -790.798583984375, "logps/rejected": -575.5951538085938, "loss": 0.4556, "rewards/accuracies": 1.0, "rewards/chosen": 0.17354126274585724, "rewards/margins": 0.449990838766098, "rewards/rejected": -0.276449590921402, "step": 69 }, { "epoch": 0.12, "learning_rate": 2.941176470588235e-07, "logits/chosen": -1.1406750679016113, "logits/rejected": -1.7379848957061768, "logps/chosen": -677.4329833984375, "logps/rejected": -125.09095001220703, "loss": 0.4216, "rewards/accuracies": 1.0, "rewards/chosen": 0.29751741886138916, "rewards/margins": 1.025307536125183, "rewards/rejected": -0.727790117263794, "step": 70 }, { "epoch": 0.12, "learning_rate": 2.9831932773109244e-07, "logits/chosen": -1.548018455505371, "logits/rejected": -1.9434715509414673, "logps/chosen": -60.39828109741211, "logps/rejected": -38.36094665527344, "loss": 0.435, "rewards/accuracies": 1.0, "rewards/chosen": -0.07226741313934326, "rewards/margins": 0.6180351972579956, "rewards/rejected": -0.6903026103973389, "step": 71 }, { "epoch": 0.12, "learning_rate": 3.025210084033613e-07, "logits/chosen": -1.4888752698898315, "logits/rejected": -1.3269966840744019, "logps/chosen": -333.23468017578125, "logps/rejected": -181.92431640625, "loss": 0.4167, "rewards/accuracies": 1.0, "rewards/chosen": -0.046831514686346054, "rewards/margins": 2.105104923248291, "rewards/rejected": -2.1519362926483154, "step": 72 }, { "epoch": 0.12, "learning_rate": 3.0672268907563024e-07, "logits/chosen": -1.5784661769866943, "logits/rejected": -1.5319206714630127, "logps/chosen": -37.48617172241211, "logps/rejected": -38.057193756103516, "loss": 0.4306, "rewards/accuracies": 0.5, "rewards/chosen": -0.07858496159315109, "rewards/margins": 0.3954930901527405, "rewards/rejected": -0.47407805919647217, "step": 73 }, { "epoch": 0.12, "learning_rate": 3.109243697478992e-07, "logits/chosen": -2.385610580444336, "logits/rejected": -2.365718364715576, "logps/chosen": -31.291919708251953, "logps/rejected": -60.941036224365234, "loss": 0.3708, "rewards/accuracies": 1.0, "rewards/chosen": 0.0072297099977731705, "rewards/margins": 0.6486601829528809, "rewards/rejected": -0.6414304971694946, "step": 74 }, { "epoch": 0.13, "learning_rate": 3.1512605042016805e-07, "logits/chosen": -1.8049649000167847, "logits/rejected": -1.474593162536621, "logps/chosen": -145.8429412841797, "logps/rejected": -114.68021392822266, "loss": 0.3998, "rewards/accuracies": 1.0, "rewards/chosen": -0.15717864036560059, "rewards/margins": 0.8213388919830322, "rewards/rejected": -0.9785175919532776, "step": 75 }, { "epoch": 0.13, "learning_rate": 3.1932773109243693e-07, "logits/chosen": -1.4387171268463135, "logits/rejected": -1.3081284761428833, "logps/chosen": -72.28972625732422, "logps/rejected": -99.7930679321289, "loss": 0.4351, "rewards/accuracies": 1.0, "rewards/chosen": 0.03587455675005913, "rewards/margins": 0.7939237356185913, "rewards/rejected": -0.7580491900444031, "step": 76 }, { "epoch": 0.13, "learning_rate": 3.2352941176470586e-07, "logits/chosen": -1.4761199951171875, "logits/rejected": -2.109046459197998, "logps/chosen": -155.40235900878906, "logps/rejected": -155.92733764648438, "loss": 0.395, "rewards/accuracies": 0.5, "rewards/chosen": -0.06405086815357208, "rewards/margins": 0.329689621925354, "rewards/rejected": -0.3937404751777649, "step": 77 }, { "epoch": 0.13, "learning_rate": 3.277310924369748e-07, "logits/chosen": -2.038198232650757, "logits/rejected": -2.2060189247131348, "logps/chosen": -148.52001953125, "logps/rejected": -190.42556762695312, "loss": 0.3962, "rewards/accuracies": 0.5, "rewards/chosen": -0.5522751212120056, "rewards/margins": 0.17331847548484802, "rewards/rejected": -0.725593626499176, "step": 78 }, { "epoch": 0.13, "learning_rate": 3.319327731092437e-07, "logits/chosen": -1.1491724252700806, "logits/rejected": -0.9036651849746704, "logps/chosen": -343.11529541015625, "logps/rejected": -174.84776306152344, "loss": 0.4158, "rewards/accuracies": 1.0, "rewards/chosen": -0.14253844320774078, "rewards/margins": 2.43456768989563, "rewards/rejected": -2.577106237411499, "step": 79 }, { "epoch": 0.13, "learning_rate": 3.361344537815126e-07, "logits/chosen": -1.5111039876937866, "logits/rejected": -1.9596521854400635, "logps/chosen": -572.5203857421875, "logps/rejected": -325.9815368652344, "loss": 0.373, "rewards/accuracies": 1.0, "rewards/chosen": 0.3365722894668579, "rewards/margins": 2.104917287826538, "rewards/rejected": -1.7683449983596802, "step": 80 }, { "epoch": 0.14, "learning_rate": 3.403361344537815e-07, "logits/chosen": -1.8409173488616943, "logits/rejected": -1.9764267206192017, "logps/chosen": -352.3498840332031, "logps/rejected": -321.4953308105469, "loss": 0.3865, "rewards/accuracies": 1.0, "rewards/chosen": 0.22799532115459442, "rewards/margins": 0.7164055109024048, "rewards/rejected": -0.48841017484664917, "step": 81 }, { "epoch": 0.14, "learning_rate": 3.445378151260504e-07, "logits/chosen": -1.8073253631591797, "logits/rejected": -2.74298357963562, "logps/chosen": -222.5423583984375, "logps/rejected": -52.44717788696289, "loss": 0.3857, "rewards/accuracies": 0.5, "rewards/chosen": -0.19195251166820526, "rewards/margins": 0.3153046667575836, "rewards/rejected": -0.5072571635246277, "step": 82 }, { "epoch": 0.14, "learning_rate": 3.487394957983193e-07, "logits/chosen": -2.154622793197632, "logits/rejected": -1.6847541332244873, "logps/chosen": -208.72132873535156, "logps/rejected": -175.53054809570312, "loss": 0.3637, "rewards/accuracies": 0.5, "rewards/chosen": -0.23619385063648224, "rewards/margins": 1.846364140510559, "rewards/rejected": -2.0825579166412354, "step": 83 }, { "epoch": 0.14, "learning_rate": 3.529411764705882e-07, "logits/chosen": -1.876042127609253, "logits/rejected": -1.3837803602218628, "logps/chosen": -53.90886306762695, "logps/rejected": -144.5306396484375, "loss": 0.364, "rewards/accuracies": 1.0, "rewards/chosen": -0.01596364937722683, "rewards/margins": 0.7440950274467468, "rewards/rejected": -0.7600586414337158, "step": 84 }, { "epoch": 0.14, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -1.4703798294067383, "logits/rejected": -1.7445605993270874, "logps/chosen": -269.6485900878906, "logps/rejected": -158.86940002441406, "loss": 0.3672, "rewards/accuracies": 1.0, "rewards/chosen": 0.10937881469726562, "rewards/margins": 2.3677597045898438, "rewards/rejected": -2.258380889892578, "step": 85 }, { "epoch": 0.15, "learning_rate": 3.6134453781512604e-07, "logits/chosen": -1.6020888090133667, "logits/rejected": -1.6218175888061523, "logps/chosen": -152.8944091796875, "logps/rejected": -168.00990295410156, "loss": 0.3729, "rewards/accuracies": 0.5, "rewards/chosen": -0.5375404357910156, "rewards/margins": 0.09828647971153259, "rewards/rejected": -0.6358269453048706, "step": 86 }, { "epoch": 0.15, "learning_rate": 3.655462184873949e-07, "logits/chosen": -1.5672448873519897, "logits/rejected": -1.6510668992996216, "logps/chosen": -447.45806884765625, "logps/rejected": -239.46371459960938, "loss": 0.3704, "rewards/accuracies": 1.0, "rewards/chosen": -0.2388494461774826, "rewards/margins": 3.1040165424346924, "rewards/rejected": -3.3428659439086914, "step": 87 }, { "epoch": 0.15, "learning_rate": 3.6974789915966385e-07, "logits/chosen": -0.6570608615875244, "logits/rejected": -0.8279274702072144, "logps/chosen": -395.8023376464844, "logps/rejected": -155.96295166015625, "loss": 0.3316, "rewards/accuracies": 1.0, "rewards/chosen": 0.0513305589556694, "rewards/margins": 3.0571579933166504, "rewards/rejected": -3.0058274269104004, "step": 88 }, { "epoch": 0.15, "learning_rate": 3.739495798319328e-07, "logits/chosen": -1.7863593101501465, "logits/rejected": -2.064410924911499, "logps/chosen": -333.698486328125, "logps/rejected": -122.79313659667969, "loss": 0.3579, "rewards/accuracies": 1.0, "rewards/chosen": -0.06812648475170135, "rewards/margins": 0.5757344365119934, "rewards/rejected": -0.643860936164856, "step": 89 }, { "epoch": 0.15, "learning_rate": 3.7815126050420166e-07, "logits/chosen": -1.7475690841674805, "logits/rejected": -2.228104591369629, "logps/chosen": -287.8011169433594, "logps/rejected": -31.779052734375, "loss": 0.3574, "rewards/accuracies": 1.0, "rewards/chosen": 0.38655588030815125, "rewards/margins": 0.9471727609634399, "rewards/rejected": -0.5606168508529663, "step": 90 }, { "epoch": 0.15, "learning_rate": 3.8235294117647053e-07, "logits/chosen": -1.7612462043762207, "logits/rejected": -1.4105805158615112, "logps/chosen": -122.49490356445312, "logps/rejected": -187.72357177734375, "loss": 0.3251, "rewards/accuracies": 1.0, "rewards/chosen": -0.22412091493606567, "rewards/margins": 2.9434642791748047, "rewards/rejected": -3.1675851345062256, "step": 91 }, { "epoch": 0.16, "learning_rate": 3.865546218487395e-07, "logits/chosen": -1.4474364519119263, "logits/rejected": -1.5902643203735352, "logps/chosen": -239.24310302734375, "logps/rejected": -135.27609252929688, "loss": 0.3662, "rewards/accuracies": 1.0, "rewards/chosen": -0.21128883957862854, "rewards/margins": 1.6379708051681519, "rewards/rejected": -1.849259614944458, "step": 92 }, { "epoch": 0.16, "learning_rate": 3.907563025210084e-07, "logits/chosen": -1.1318254470825195, "logits/rejected": -1.4229687452316284, "logps/chosen": -260.1365966796875, "logps/rejected": -74.00960540771484, "loss": 0.3516, "rewards/accuracies": 1.0, "rewards/chosen": 0.2649814486503601, "rewards/margins": 1.8917981386184692, "rewards/rejected": -1.6268166303634644, "step": 93 }, { "epoch": 0.16, "learning_rate": 3.949579831932773e-07, "logits/chosen": -1.8826991319656372, "logits/rejected": -2.357274055480957, "logps/chosen": -311.9720458984375, "logps/rejected": -158.02745056152344, "loss": 0.3471, "rewards/accuracies": 1.0, "rewards/chosen": 0.05377950519323349, "rewards/margins": 1.7474021911621094, "rewards/rejected": -1.6936227083206177, "step": 94 }, { "epoch": 0.16, "learning_rate": 3.991596638655462e-07, "logits/chosen": -1.2974653244018555, "logits/rejected": -1.5636136531829834, "logps/chosen": -478.4207763671875, "logps/rejected": -169.5061492919922, "loss": 0.2943, "rewards/accuracies": 1.0, "rewards/chosen": 0.32736513018608093, "rewards/margins": 3.4351882934570312, "rewards/rejected": -3.107823133468628, "step": 95 }, { "epoch": 0.16, "learning_rate": 4.0336134453781514e-07, "logits/chosen": -1.6629210710525513, "logits/rejected": -1.3563766479492188, "logps/chosen": -302.283447265625, "logps/rejected": -185.8943328857422, "loss": 0.3216, "rewards/accuracies": 1.0, "rewards/chosen": 0.08565587550401688, "rewards/margins": 2.939385414123535, "rewards/rejected": -2.853729486465454, "step": 96 }, { "epoch": 0.16, "learning_rate": 4.07563025210084e-07, "logits/chosen": -1.579493761062622, "logits/rejected": -1.9858088493347168, "logps/chosen": -166.11209106445312, "logps/rejected": -93.21321868896484, "loss": 0.2999, "rewards/accuracies": 0.5, "rewards/chosen": -0.39952126145362854, "rewards/margins": 0.800835371017456, "rewards/rejected": -1.2003566026687622, "step": 97 }, { "epoch": 0.17, "learning_rate": 4.117647058823529e-07, "logits/chosen": -1.9741665124893188, "logits/rejected": -1.3600385189056396, "logps/chosen": -258.8650817871094, "logps/rejected": -185.20352172851562, "loss": 0.3346, "rewards/accuracies": 1.0, "rewards/chosen": -0.09575500339269638, "rewards/margins": 2.4661808013916016, "rewards/rejected": -2.5619359016418457, "step": 98 }, { "epoch": 0.17, "learning_rate": 4.159663865546218e-07, "logits/chosen": -1.7311229705810547, "logits/rejected": -2.162808418273926, "logps/chosen": -423.9176940917969, "logps/rejected": -175.74667358398438, "loss": 0.2905, "rewards/accuracies": 0.0, "rewards/chosen": -1.4093338251113892, "rewards/margins": -0.9930892586708069, "rewards/rejected": -0.4162445068359375, "step": 99 }, { "epoch": 0.17, "learning_rate": 4.2016806722689076e-07, "logits/chosen": -1.7871568202972412, "logits/rejected": -1.6674413681030273, "logps/chosen": -133.9112548828125, "logps/rejected": -180.4711456298828, "loss": 0.2967, "rewards/accuracies": 0.5, "rewards/chosen": -0.17781352996826172, "rewards/margins": 1.3768540620803833, "rewards/rejected": -1.554667592048645, "step": 100 }, { "epoch": 0.17, "learning_rate": 4.2436974789915964e-07, "logits/chosen": -1.9930833578109741, "logits/rejected": -2.3848719596862793, "logps/chosen": -248.69406127929688, "logps/rejected": -160.950927734375, "loss": 0.2864, "rewards/accuracies": 0.5, "rewards/chosen": -0.7147164344787598, "rewards/margins": 1.7294585704803467, "rewards/rejected": -2.4441750049591064, "step": 101 }, { "epoch": 0.17, "learning_rate": 4.285714285714285e-07, "logits/chosen": -1.812693476676941, "logits/rejected": -1.6050926446914673, "logps/chosen": -277.7933654785156, "logps/rejected": -483.4625549316406, "loss": 0.3317, "rewards/accuracies": 0.5, "rewards/chosen": -0.6971309781074524, "rewards/margins": -0.4464415907859802, "rewards/rejected": -0.2506893575191498, "step": 102 }, { "epoch": 0.17, "learning_rate": 4.327731092436975e-07, "logits/chosen": -1.461750864982605, "logits/rejected": -1.7464590072631836, "logps/chosen": -173.95265197753906, "logps/rejected": -135.8446807861328, "loss": 0.2714, "rewards/accuracies": 1.0, "rewards/chosen": -0.12809354066848755, "rewards/margins": 3.298431396484375, "rewards/rejected": -3.4265246391296387, "step": 103 }, { "epoch": 0.18, "learning_rate": 4.369747899159664e-07, "logits/chosen": -1.5056589841842651, "logits/rejected": -1.9312864542007446, "logps/chosen": -201.46270751953125, "logps/rejected": -267.52276611328125, "loss": 0.3103, "rewards/accuracies": 1.0, "rewards/chosen": 0.1706157773733139, "rewards/margins": 0.6676197052001953, "rewards/rejected": -0.4970039427280426, "step": 104 }, { "epoch": 0.18, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -2.0907585620880127, "logits/rejected": -1.849104642868042, "logps/chosen": -21.733829498291016, "logps/rejected": -76.71643829345703, "loss": 0.2871, "rewards/accuracies": 1.0, "rewards/chosen": -0.0892881453037262, "rewards/margins": 1.409407138824463, "rewards/rejected": -1.4986952543258667, "step": 105 }, { "epoch": 0.18, "learning_rate": 4.453781512605042e-07, "logits/chosen": -0.9986115097999573, "logits/rejected": -0.6594001054763794, "logps/chosen": -368.1170654296875, "logps/rejected": -226.82830810546875, "loss": 0.3077, "rewards/accuracies": 1.0, "rewards/chosen": -0.4677414000034332, "rewards/margins": 4.347979545593262, "rewards/rejected": -4.815721035003662, "step": 106 }, { "epoch": 0.18, "learning_rate": 4.495798319327731e-07, "logits/chosen": -1.8972766399383545, "logits/rejected": -2.1591522693634033, "logps/chosen": -341.3930358886719, "logps/rejected": -195.02679443359375, "loss": 0.28, "rewards/accuracies": 0.5, "rewards/chosen": -0.6874268054962158, "rewards/margins": 2.0455260276794434, "rewards/rejected": -2.732952833175659, "step": 107 }, { "epoch": 0.18, "learning_rate": 4.53781512605042e-07, "logits/chosen": -1.1454424858093262, "logits/rejected": -1.2674638032913208, "logps/chosen": -141.8154754638672, "logps/rejected": -19.07546043395996, "loss": 0.3023, "rewards/accuracies": 1.0, "rewards/chosen": 0.4041498303413391, "rewards/margins": 1.1306045055389404, "rewards/rejected": -0.7264547348022461, "step": 108 }, { "epoch": 0.18, "learning_rate": 4.579831932773109e-07, "logits/chosen": -2.07033109664917, "logits/rejected": -2.380950450897217, "logps/chosen": -80.41409301757812, "logps/rejected": -98.79011535644531, "loss": 0.2933, "rewards/accuracies": 0.0, "rewards/chosen": -0.43403300642967224, "rewards/margins": -0.3926330506801605, "rewards/rejected": -0.04139995574951172, "step": 109 }, { "epoch": 0.19, "learning_rate": 4.6218487394957986e-07, "logits/chosen": -2.5270440578460693, "logits/rejected": -2.435595750808716, "logps/chosen": -24.47152328491211, "logps/rejected": -148.6935577392578, "loss": 0.3021, "rewards/accuracies": 1.0, "rewards/chosen": -0.11162052303552628, "rewards/margins": 3.680636405944824, "rewards/rejected": -3.7922568321228027, "step": 110 }, { "epoch": 0.19, "learning_rate": 4.6638655462184874e-07, "logits/chosen": -1.4159996509552002, "logits/rejected": -1.1079641580581665, "logps/chosen": -213.67677307128906, "logps/rejected": -146.0835418701172, "loss": 0.2737, "rewards/accuracies": 1.0, "rewards/chosen": -0.6399146914482117, "rewards/margins": 2.671748161315918, "rewards/rejected": -3.3116626739501953, "step": 111 }, { "epoch": 0.19, "learning_rate": 4.705882352941176e-07, "logits/chosen": -1.859910488128662, "logits/rejected": -2.269141435623169, "logps/chosen": -270.6617126464844, "logps/rejected": -149.7200164794922, "loss": 0.2958, "rewards/accuracies": 1.0, "rewards/chosen": -0.3810228407382965, "rewards/margins": 2.3572652339935303, "rewards/rejected": -2.738288164138794, "step": 112 }, { "epoch": 0.19, "learning_rate": 4.747899159663865e-07, "logits/chosen": -1.615531325340271, "logits/rejected": -2.3673205375671387, "logps/chosen": -264.86578369140625, "logps/rejected": -242.48330688476562, "loss": 0.292, "rewards/accuracies": 0.5, "rewards/chosen": -0.4372093081474304, "rewards/margins": 2.0082688331604004, "rewards/rejected": -2.4454782009124756, "step": 113 }, { "epoch": 0.19, "learning_rate": 4.789915966386554e-07, "logits/chosen": -1.3283716440200806, "logits/rejected": -1.4000985622406006, "logps/chosen": -751.4788818359375, "logps/rejected": -504.08892822265625, "loss": 0.2725, "rewards/accuracies": 1.0, "rewards/chosen": 0.6655944585800171, "rewards/margins": 4.143014907836914, "rewards/rejected": -3.4774200916290283, "step": 114 }, { "epoch": 0.19, "learning_rate": 4.831932773109244e-07, "logits/chosen": -1.942575216293335, "logits/rejected": -1.5764302015304565, "logps/chosen": -49.1660270690918, "logps/rejected": -56.93123245239258, "loss": 0.2784, "rewards/accuracies": 1.0, "rewards/chosen": -0.0690576359629631, "rewards/margins": 1.2642771005630493, "rewards/rejected": -1.3333348035812378, "step": 115 }, { "epoch": 0.2, "learning_rate": 4.873949579831933e-07, "logits/chosen": -2.2477641105651855, "logits/rejected": -2.076430559158325, "logps/chosen": -29.411571502685547, "logps/rejected": -121.75186920166016, "loss": 0.3125, "rewards/accuracies": 1.0, "rewards/chosen": -0.06398458778858185, "rewards/margins": 3.124544143676758, "rewards/rejected": -3.1885287761688232, "step": 116 }, { "epoch": 0.2, "learning_rate": 4.915966386554621e-07, "logits/chosen": -2.0187594890594482, "logits/rejected": -1.1486027240753174, "logps/chosen": -329.5198974609375, "logps/rejected": -279.2951354980469, "loss": 0.2825, "rewards/accuracies": 1.0, "rewards/chosen": 0.12247312068939209, "rewards/margins": 6.082241058349609, "rewards/rejected": -5.959768295288086, "step": 117 }, { "epoch": 0.2, "learning_rate": 4.95798319327731e-07, "logits/chosen": -1.4514484405517578, "logits/rejected": -2.005096435546875, "logps/chosen": -317.6128845214844, "logps/rejected": -207.64822387695312, "loss": 0.2737, "rewards/accuracies": 1.0, "rewards/chosen": 0.06410064548254013, "rewards/margins": 2.704306125640869, "rewards/rejected": -2.6402053833007812, "step": 118 }, { "epoch": 0.2, "learning_rate": 5e-07, "logits/chosen": -0.627937376499176, "logits/rejected": -0.6839653253555298, "logps/chosen": -91.22163391113281, "logps/rejected": -55.82908630371094, "loss": 0.2844, "rewards/accuracies": 1.0, "rewards/chosen": 0.14227086305618286, "rewards/margins": 2.2958028316497803, "rewards/rejected": -2.153531789779663, "step": 119 }, { "epoch": 0.2, "learning_rate": 5.042016806722689e-07, "logits/chosen": -2.05309796333313, "logits/rejected": -1.3187203407287598, "logps/chosen": -375.8322448730469, "logps/rejected": -427.10931396484375, "loss": 0.2755, "rewards/accuracies": 1.0, "rewards/chosen": 0.019181065261363983, "rewards/margins": 2.3469948768615723, "rewards/rejected": -2.3278136253356934, "step": 120 }, { "epoch": 0.2, "learning_rate": 5.084033613445377e-07, "logits/chosen": -1.6020848751068115, "logits/rejected": -1.890777826309204, "logps/chosen": -389.67840576171875, "logps/rejected": -156.53250122070312, "loss": 0.2691, "rewards/accuracies": 1.0, "rewards/chosen": -0.7378628253936768, "rewards/margins": 2.929511308670044, "rewards/rejected": -3.6673741340637207, "step": 121 }, { "epoch": 0.21, "learning_rate": 5.126050420168067e-07, "logits/chosen": -1.5457667112350464, "logits/rejected": -0.9591537714004517, "logps/chosen": -232.89480590820312, "logps/rejected": -236.02783203125, "loss": 0.2301, "rewards/accuracies": 1.0, "rewards/chosen": -0.7060562372207642, "rewards/margins": 4.284882545471191, "rewards/rejected": -4.990938663482666, "step": 122 }, { "epoch": 0.21, "learning_rate": 5.168067226890757e-07, "logits/chosen": -1.6671092510223389, "logits/rejected": -1.6428896188735962, "logps/chosen": -63.35133361816406, "logps/rejected": -109.79218292236328, "loss": 0.2499, "rewards/accuracies": 1.0, "rewards/chosen": -0.2081298828125, "rewards/margins": 2.169139862060547, "rewards/rejected": -2.3772695064544678, "step": 123 }, { "epoch": 0.21, "learning_rate": 5.210084033613445e-07, "logits/chosen": -1.5807249546051025, "logits/rejected": -1.7926443815231323, "logps/chosen": -74.63159942626953, "logps/rejected": -134.3237762451172, "loss": 0.2469, "rewards/accuracies": 1.0, "rewards/chosen": -0.030628204345703125, "rewards/margins": 1.1621196269989014, "rewards/rejected": -1.1927478313446045, "step": 124 }, { "epoch": 0.21, "learning_rate": 5.252100840336135e-07, "logits/chosen": -1.318616509437561, "logits/rejected": -1.6264910697937012, "logps/chosen": -527.2734375, "logps/rejected": -174.54412841796875, "loss": 0.2468, "rewards/accuracies": 1.0, "rewards/chosen": -0.34924012422561646, "rewards/margins": 3.5974647998809814, "rewards/rejected": -3.946704864501953, "step": 125 }, { "epoch": 0.21, "learning_rate": 5.294117647058823e-07, "logits/chosen": -1.1573264598846436, "logits/rejected": -0.8850076198577881, "logps/chosen": -82.6595458984375, "logps/rejected": -150.1305389404297, "loss": 0.2477, "rewards/accuracies": 1.0, "rewards/chosen": -0.006247520446777344, "rewards/margins": 2.6717464923858643, "rewards/rejected": -2.6779940128326416, "step": 126 }, { "epoch": 0.21, "learning_rate": 5.336134453781512e-07, "logits/chosen": -1.5155869722366333, "logits/rejected": -1.3886268138885498, "logps/chosen": -13.588849067687988, "logps/rejected": -60.8436279296875, "loss": 0.2575, "rewards/accuracies": 1.0, "rewards/chosen": -0.14050476253032684, "rewards/margins": 1.282322883605957, "rewards/rejected": -1.4228277206420898, "step": 127 }, { "epoch": 0.22, "learning_rate": 5.378151260504201e-07, "logits/chosen": -2.142261505126953, "logits/rejected": -2.022604465484619, "logps/chosen": -36.83582305908203, "logps/rejected": -84.54644775390625, "loss": 0.2476, "rewards/accuracies": 1.0, "rewards/chosen": -0.06873762607574463, "rewards/margins": 1.3124231100082397, "rewards/rejected": -1.3811607360839844, "step": 128 }, { "epoch": 0.22, "learning_rate": 5.42016806722689e-07, "logits/chosen": -2.0003206729888916, "logits/rejected": -2.7086338996887207, "logps/chosen": -297.49810791015625, "logps/rejected": -121.93182373046875, "loss": 0.2778, "rewards/accuracies": 1.0, "rewards/chosen": 0.6414843797683716, "rewards/margins": 2.057905673980713, "rewards/rejected": -1.4164212942123413, "step": 129 }, { "epoch": 0.22, "learning_rate": 5.462184873949579e-07, "logits/chosen": -2.3803634643554688, "logits/rejected": -1.4992303848266602, "logps/chosen": -57.217220306396484, "logps/rejected": -203.0693817138672, "loss": 0.2675, "rewards/accuracies": 1.0, "rewards/chosen": -0.25870370864868164, "rewards/margins": 2.8911209106445312, "rewards/rejected": -3.149824619293213, "step": 130 }, { "epoch": 0.22, "learning_rate": 5.504201680672269e-07, "logits/chosen": -1.2806719541549683, "logits/rejected": -2.390531063079834, "logps/chosen": -454.8437194824219, "logps/rejected": -94.0916748046875, "loss": 0.2616, "rewards/accuracies": 1.0, "rewards/chosen": 0.13427734375, "rewards/margins": 3.2612316608428955, "rewards/rejected": -3.1269543170928955, "step": 131 }, { "epoch": 0.22, "learning_rate": 5.546218487394958e-07, "logits/chosen": -1.9892646074295044, "logits/rejected": -1.4233769178390503, "logps/chosen": -108.95271301269531, "logps/rejected": -160.64715576171875, "loss": 0.2424, "rewards/accuracies": 1.0, "rewards/chosen": -0.15306320786476135, "rewards/margins": 3.4583911895751953, "rewards/rejected": -3.611454486846924, "step": 132 }, { "epoch": 0.22, "learning_rate": 5.588235294117647e-07, "logits/chosen": -1.2769445180892944, "logits/rejected": -1.524395227432251, "logps/chosen": -52.44013214111328, "logps/rejected": -91.19084167480469, "loss": 0.2597, "rewards/accuracies": 1.0, "rewards/chosen": -0.10794153809547424, "rewards/margins": 0.9474404454231262, "rewards/rejected": -1.0553820133209229, "step": 133 }, { "epoch": 0.23, "learning_rate": 5.630252100840336e-07, "logits/chosen": -2.26499080657959, "logits/rejected": -1.1651140451431274, "logps/chosen": -140.05178833007812, "logps/rejected": -229.3851318359375, "loss": 0.2595, "rewards/accuracies": 1.0, "rewards/chosen": -0.13012734055519104, "rewards/margins": 3.768279790878296, "rewards/rejected": -3.898406982421875, "step": 134 }, { "epoch": 0.23, "learning_rate": 5.672268907563025e-07, "logits/chosen": -1.6903553009033203, "logits/rejected": -1.663693904876709, "logps/chosen": -70.8027572631836, "logps/rejected": -179.33489990234375, "loss": 0.2193, "rewards/accuracies": 1.0, "rewards/chosen": 0.4184946119785309, "rewards/margins": 0.8598314523696899, "rewards/rejected": -0.44133681058883667, "step": 135 }, { "epoch": 0.23, "learning_rate": 5.714285714285714e-07, "logits/chosen": -1.894440770149231, "logits/rejected": -2.020301342010498, "logps/chosen": -215.9483642578125, "logps/rejected": -198.383544921875, "loss": 0.2796, "rewards/accuracies": 0.5, "rewards/chosen": -0.6141928434371948, "rewards/margins": 0.986687183380127, "rewards/rejected": -1.6008800268173218, "step": 136 }, { "epoch": 0.23, "learning_rate": 5.756302521008402e-07, "logits/chosen": -2.016982316970825, "logits/rejected": -1.272426962852478, "logps/chosen": -51.863426208496094, "logps/rejected": -149.52340698242188, "loss": 0.252, "rewards/accuracies": 1.0, "rewards/chosen": -0.017465591430664062, "rewards/margins": 2.352057695388794, "rewards/rejected": -2.369523286819458, "step": 137 }, { "epoch": 0.23, "learning_rate": 5.798319327731093e-07, "logits/chosen": -0.8563526272773743, "logits/rejected": -0.9021680355072021, "logps/chosen": -510.8074951171875, "logps/rejected": -266.77423095703125, "loss": 0.2882, "rewards/accuracies": 1.0, "rewards/chosen": -0.9366821050643921, "rewards/margins": 5.248723030090332, "rewards/rejected": -6.1854047775268555, "step": 138 }, { "epoch": 0.23, "learning_rate": 5.840336134453782e-07, "logits/chosen": -1.8311724662780762, "logits/rejected": -1.8810319900512695, "logps/chosen": -106.62870788574219, "logps/rejected": -123.30711364746094, "loss": 0.2667, "rewards/accuracies": 1.0, "rewards/chosen": -0.28059542179107666, "rewards/margins": 3.491508960723877, "rewards/rejected": -3.772104263305664, "step": 139 }, { "epoch": 0.24, "learning_rate": 5.88235294117647e-07, "logits/chosen": -2.032766103744507, "logits/rejected": -1.9613983631134033, "logps/chosen": -239.09063720703125, "logps/rejected": -421.301513671875, "loss": 0.267, "rewards/accuracies": 0.5, "rewards/chosen": -0.27995407581329346, "rewards/margins": 5.965301990509033, "rewards/rejected": -6.245256423950195, "step": 140 }, { "epoch": 0.24, "learning_rate": 5.924369747899159e-07, "logits/chosen": -0.8632844090461731, "logits/rejected": -1.520354151725769, "logps/chosen": -345.11065673828125, "logps/rejected": -139.1497039794922, "loss": 0.2711, "rewards/accuracies": 1.0, "rewards/chosen": 0.5095192193984985, "rewards/margins": 2.5539865493774414, "rewards/rejected": -2.0444672107696533, "step": 141 }, { "epoch": 0.24, "learning_rate": 5.966386554621849e-07, "logits/chosen": -1.3275768756866455, "logits/rejected": -2.037048816680908, "logps/chosen": -461.4246826171875, "logps/rejected": -711.2941284179688, "loss": 0.2599, "rewards/accuracies": 0.5, "rewards/chosen": -1.077467441558838, "rewards/margins": 0.06723290681838989, "rewards/rejected": -1.144700288772583, "step": 142 }, { "epoch": 0.24, "learning_rate": 6.008403361344537e-07, "logits/chosen": -2.1924614906311035, "logits/rejected": -2.12215256690979, "logps/chosen": -35.62915802001953, "logps/rejected": -112.1571273803711, "loss": 0.2408, "rewards/accuracies": 1.0, "rewards/chosen": -0.3445836305618286, "rewards/margins": 3.033930540084839, "rewards/rejected": -3.378514289855957, "step": 143 }, { "epoch": 0.24, "learning_rate": 6.050420168067226e-07, "logits/chosen": -1.556571125984192, "logits/rejected": -1.026197910308838, "logps/chosen": -352.4817810058594, "logps/rejected": -479.5821838378906, "loss": 0.2467, "rewards/accuracies": 0.5, "rewards/chosen": -0.581763505935669, "rewards/margins": 3.661806583404541, "rewards/rejected": -4.243570327758789, "step": 144 }, { "epoch": 0.24, "learning_rate": 6.092436974789916e-07, "logits/chosen": -1.3160314559936523, "logits/rejected": -1.45841646194458, "logps/chosen": -36.064300537109375, "logps/rejected": -15.627889633178711, "loss": 0.2421, "rewards/accuracies": 0.5, "rewards/chosen": -0.07876625657081604, "rewards/margins": 0.7412266731262207, "rewards/rejected": -0.8199928998947144, "step": 145 }, { "epoch": 0.25, "learning_rate": 6.134453781512605e-07, "logits/chosen": -1.2557013034820557, "logits/rejected": -1.1368392705917358, "logps/chosen": -427.95294189453125, "logps/rejected": -396.66290283203125, "loss": 0.2784, "rewards/accuracies": 0.5, "rewards/chosen": -1.8088486194610596, "rewards/margins": 1.1350128650665283, "rewards/rejected": -2.943861484527588, "step": 146 }, { "epoch": 0.25, "learning_rate": 6.176470588235294e-07, "logits/chosen": -2.3703060150146484, "logits/rejected": -1.596415638923645, "logps/chosen": -37.992149353027344, "logps/rejected": -99.8422622680664, "loss": 0.2201, "rewards/accuracies": 1.0, "rewards/chosen": -0.14648981392383575, "rewards/margins": 0.39361295104026794, "rewards/rejected": -0.5401027798652649, "step": 147 }, { "epoch": 0.25, "learning_rate": 6.218487394957984e-07, "logits/chosen": -2.017554759979248, "logits/rejected": -1.885864019393921, "logps/chosen": -31.028438568115234, "logps/rejected": -165.63868713378906, "loss": 0.2205, "rewards/accuracies": 1.0, "rewards/chosen": -0.28271618485450745, "rewards/margins": 5.170090675354004, "rewards/rejected": -5.4528069496154785, "step": 148 }, { "epoch": 0.25, "learning_rate": 6.260504201680672e-07, "logits/chosen": -1.8670669794082642, "logits/rejected": -1.4380172491073608, "logps/chosen": -174.096435546875, "logps/rejected": -223.5670166015625, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": -1.1435142755508423, "rewards/margins": 5.261943817138672, "rewards/rejected": -6.405458450317383, "step": 149 }, { "epoch": 0.25, "learning_rate": 6.302521008403361e-07, "logits/chosen": -2.312112808227539, "logits/rejected": -1.3749383687973022, "logps/chosen": -48.91535949707031, "logps/rejected": -204.19570922851562, "loss": 0.205, "rewards/accuracies": 0.5, "rewards/chosen": -0.1308881789445877, "rewards/margins": 1.4819360971450806, "rewards/rejected": -1.6128243207931519, "step": 150 }, { "epoch": 0.25, "learning_rate": 6.344537815126049e-07, "logits/chosen": -2.0252039432525635, "logits/rejected": -2.22589111328125, "logps/chosen": -223.1410369873047, "logps/rejected": -64.30872344970703, "loss": 0.2547, "rewards/accuracies": 1.0, "rewards/chosen": 0.0106048583984375, "rewards/margins": 0.5256061553955078, "rewards/rejected": -0.5150012969970703, "step": 151 }, { "epoch": 0.26, "learning_rate": 6.386554621848739e-07, "logits/chosen": -1.5374224185943604, "logits/rejected": -1.8345128297805786, "logps/chosen": -300.93792724609375, "logps/rejected": -145.31222534179688, "loss": 0.2391, "rewards/accuracies": 1.0, "rewards/chosen": -0.8454994559288025, "rewards/margins": 2.8806023597717285, "rewards/rejected": -3.726101875305176, "step": 152 }, { "epoch": 0.26, "learning_rate": 6.428571428571429e-07, "logits/chosen": -1.6726059913635254, "logits/rejected": -1.3613877296447754, "logps/chosen": -693.0863037109375, "logps/rejected": -1098.9571533203125, "loss": 0.2422, "rewards/accuracies": 0.0, "rewards/chosen": -0.5227920413017273, "rewards/margins": -0.413046270608902, "rewards/rejected": -0.10974578559398651, "step": 153 }, { "epoch": 0.26, "learning_rate": 6.470588235294117e-07, "logits/chosen": -2.3774003982543945, "logits/rejected": -1.5442438125610352, "logps/chosen": -41.62322998046875, "logps/rejected": -237.86154174804688, "loss": 0.237, "rewards/accuracies": 1.0, "rewards/chosen": -0.06168098375201225, "rewards/margins": 5.083809852600098, "rewards/rejected": -5.145491123199463, "step": 154 }, { "epoch": 0.26, "learning_rate": 6.512605042016807e-07, "logits/chosen": -2.0312511920928955, "logits/rejected": -1.431885004043579, "logps/chosen": -149.83917236328125, "logps/rejected": -129.43667602539062, "loss": 0.2291, "rewards/accuracies": 1.0, "rewards/chosen": -0.30399322509765625, "rewards/margins": 2.70412540435791, "rewards/rejected": -3.0081186294555664, "step": 155 }, { "epoch": 0.26, "learning_rate": 6.554621848739496e-07, "logits/chosen": -0.6298438310623169, "logits/rejected": -0.9561706781387329, "logps/chosen": -303.7615051269531, "logps/rejected": -206.84622192382812, "loss": 0.215, "rewards/accuracies": 1.0, "rewards/chosen": -1.0302143096923828, "rewards/margins": 2.9986958503723145, "rewards/rejected": -4.028910160064697, "step": 156 }, { "epoch": 0.26, "learning_rate": 6.596638655462184e-07, "logits/chosen": -1.0504183769226074, "logits/rejected": -2.596862316131592, "logps/chosen": -604.466552734375, "logps/rejected": -95.1014633178711, "loss": 0.2044, "rewards/accuracies": 1.0, "rewards/chosen": 0.6258544921875, "rewards/margins": 3.6576082706451416, "rewards/rejected": -3.0317537784576416, "step": 157 }, { "epoch": 0.27, "learning_rate": 6.638655462184873e-07, "logits/chosen": -1.4746941328048706, "logits/rejected": -2.3198976516723633, "logps/chosen": -510.818115234375, "logps/rejected": -121.24671936035156, "loss": 0.2315, "rewards/accuracies": 1.0, "rewards/chosen": 0.42745667695999146, "rewards/margins": 2.9992709159851074, "rewards/rejected": -2.5718140602111816, "step": 158 }, { "epoch": 0.27, "learning_rate": 6.680672268907563e-07, "logits/chosen": -1.1884299516677856, "logits/rejected": -0.7777690291404724, "logps/chosen": -96.32203674316406, "logps/rejected": -142.47738647460938, "loss": 0.2078, "rewards/accuracies": 1.0, "rewards/chosen": -0.33403682708740234, "rewards/margins": 1.1259114742279053, "rewards/rejected": -1.4599483013153076, "step": 159 }, { "epoch": 0.27, "learning_rate": 6.722689075630252e-07, "logits/chosen": -1.337453007698059, "logits/rejected": -1.9868876934051514, "logps/chosen": -307.2751159667969, "logps/rejected": -128.35006713867188, "loss": 0.2637, "rewards/accuracies": 1.0, "rewards/chosen": 0.154842808842659, "rewards/margins": 0.5152485370635986, "rewards/rejected": -0.36040574312210083, "step": 160 }, { "epoch": 0.27, "learning_rate": 6.764705882352941e-07, "logits/chosen": -1.0279209613800049, "logits/rejected": -2.354536533355713, "logps/chosen": -310.1678466796875, "logps/rejected": -82.68232727050781, "loss": 0.2187, "rewards/accuracies": 1.0, "rewards/chosen": 0.6246879696846008, "rewards/margins": 4.095552921295166, "rewards/rejected": -3.470865249633789, "step": 161 }, { "epoch": 0.27, "learning_rate": 6.80672268907563e-07, "logits/chosen": -2.222411870956421, "logits/rejected": -2.0266168117523193, "logps/chosen": -31.795406341552734, "logps/rejected": -155.34320068359375, "loss": 0.1976, "rewards/accuracies": 1.0, "rewards/chosen": -0.33064308762550354, "rewards/margins": 5.187854766845703, "rewards/rejected": -5.518497943878174, "step": 162 }, { "epoch": 0.27, "learning_rate": 6.848739495798319e-07, "logits/chosen": -1.2088196277618408, "logits/rejected": -2.6081087589263916, "logps/chosen": -340.25689697265625, "logps/rejected": -129.95571899414062, "loss": 0.2151, "rewards/accuracies": 1.0, "rewards/chosen": 0.44170382618904114, "rewards/margins": 1.4245266914367676, "rewards/rejected": -0.982822835445404, "step": 163 }, { "epoch": 0.28, "learning_rate": 6.890756302521008e-07, "logits/chosen": -1.2510286569595337, "logits/rejected": -1.489122748374939, "logps/chosen": -358.14013671875, "logps/rejected": -482.1717529296875, "loss": 0.2434, "rewards/accuracies": 0.5, "rewards/chosen": -0.6463592648506165, "rewards/margins": -0.21972429752349854, "rewards/rejected": -0.4266350269317627, "step": 164 }, { "epoch": 0.28, "learning_rate": 6.932773109243697e-07, "logits/chosen": -1.6255576610565186, "logits/rejected": -2.694222927093506, "logps/chosen": -328.84320068359375, "logps/rejected": -102.6138687133789, "loss": 0.2171, "rewards/accuracies": 1.0, "rewards/chosen": 0.14793242514133453, "rewards/margins": 2.839181661605835, "rewards/rejected": -2.691249132156372, "step": 165 }, { "epoch": 0.28, "learning_rate": 6.974789915966386e-07, "logits/chosen": -1.9384331703186035, "logits/rejected": -1.580994725227356, "logps/chosen": -276.12921142578125, "logps/rejected": -316.6767578125, "loss": 0.2836, "rewards/accuracies": 1.0, "rewards/chosen": -0.9446266889572144, "rewards/margins": 9.762857437133789, "rewards/rejected": -10.707484245300293, "step": 166 }, { "epoch": 0.28, "learning_rate": 7.016806722689075e-07, "logits/chosen": -1.6407663822174072, "logits/rejected": -1.5907646417617798, "logps/chosen": -514.5374145507812, "logps/rejected": -433.6203308105469, "loss": 0.2494, "rewards/accuracies": 1.0, "rewards/chosen": 0.24427911639213562, "rewards/margins": 2.523643970489502, "rewards/rejected": -2.279364824295044, "step": 167 }, { "epoch": 0.28, "learning_rate": 7.058823529411765e-07, "logits/chosen": -1.674712061882019, "logits/rejected": -1.6826207637786865, "logps/chosen": -47.03229522705078, "logps/rejected": -77.42449951171875, "loss": 0.2308, "rewards/accuracies": 1.0, "rewards/chosen": 0.0016428008675575256, "rewards/margins": 1.911266565322876, "rewards/rejected": -1.9096237421035767, "step": 168 }, { "epoch": 0.28, "learning_rate": 7.100840336134454e-07, "logits/chosen": -1.5853824615478516, "logits/rejected": -1.8151202201843262, "logps/chosen": -343.2923889160156, "logps/rejected": -209.4898681640625, "loss": 0.2194, "rewards/accuracies": 1.0, "rewards/chosen": -0.006474226713180542, "rewards/margins": 7.342705726623535, "rewards/rejected": -7.349180221557617, "step": 169 }, { "epoch": 0.29, "learning_rate": 7.142857142857143e-07, "logits/chosen": -1.7506383657455444, "logits/rejected": -1.3358906507492065, "logps/chosen": -263.41241455078125, "logps/rejected": -275.55706787109375, "loss": 0.2147, "rewards/accuracies": 1.0, "rewards/chosen": -0.8642104864120483, "rewards/margins": 5.5388641357421875, "rewards/rejected": -6.403074741363525, "step": 170 }, { "epoch": 0.29, "learning_rate": 7.184873949579831e-07, "logits/chosen": -1.4059754610061646, "logits/rejected": -1.2317900657653809, "logps/chosen": -191.6216278076172, "logps/rejected": -363.2991027832031, "loss": 0.2269, "rewards/accuracies": 0.5, "rewards/chosen": -0.5392516851425171, "rewards/margins": 1.2023521661758423, "rewards/rejected": -1.7416038513183594, "step": 171 }, { "epoch": 0.29, "learning_rate": 7.226890756302521e-07, "logits/chosen": -1.632811188697815, "logits/rejected": -1.3805952072143555, "logps/chosen": -133.94142150878906, "logps/rejected": -269.5859680175781, "loss": 0.2391, "rewards/accuracies": 0.5, "rewards/chosen": -0.4022440016269684, "rewards/margins": 4.070891857147217, "rewards/rejected": -4.473135948181152, "step": 172 }, { "epoch": 0.29, "learning_rate": 7.268907563025209e-07, "logits/chosen": -1.6576902866363525, "logits/rejected": -1.4377576112747192, "logps/chosen": -43.066829681396484, "logps/rejected": -67.72735595703125, "loss": 0.2507, "rewards/accuracies": 0.5, "rewards/chosen": -0.6717521548271179, "rewards/margins": 0.4190084934234619, "rewards/rejected": -1.0907607078552246, "step": 173 }, { "epoch": 0.29, "learning_rate": 7.310924369747898e-07, "logits/chosen": -1.5274075269699097, "logits/rejected": -2.076913833618164, "logps/chosen": -260.2308349609375, "logps/rejected": -114.42774963378906, "loss": 0.1975, "rewards/accuracies": 1.0, "rewards/chosen": 1.2202541828155518, "rewards/margins": 3.1372241973876953, "rewards/rejected": -1.916969895362854, "step": 174 }, { "epoch": 0.3, "learning_rate": 7.352941176470589e-07, "logits/chosen": -1.3671880960464478, "logits/rejected": -1.7833813428878784, "logps/chosen": -409.7170715332031, "logps/rejected": -275.7892150878906, "loss": 0.2064, "rewards/accuracies": 1.0, "rewards/chosen": -0.522778332233429, "rewards/margins": 5.268821716308594, "rewards/rejected": -5.791600227355957, "step": 175 }, { "epoch": 0.3, "learning_rate": 7.394957983193277e-07, "logits/chosen": -2.0044264793395996, "logits/rejected": -2.814589023590088, "logps/chosen": -286.0587158203125, "logps/rejected": -112.9316177368164, "loss": 0.2077, "rewards/accuracies": 1.0, "rewards/chosen": 0.33617284893989563, "rewards/margins": 1.9382390975952148, "rewards/rejected": -1.6020662784576416, "step": 176 }, { "epoch": 0.3, "learning_rate": 7.436974789915966e-07, "logits/chosen": -2.030181646347046, "logits/rejected": -1.3561756610870361, "logps/chosen": -192.53549194335938, "logps/rejected": -259.68597412109375, "loss": 0.2328, "rewards/accuracies": 1.0, "rewards/chosen": -0.3000236749649048, "rewards/margins": 1.729806900024414, "rewards/rejected": -2.0298304557800293, "step": 177 }, { "epoch": 0.3, "learning_rate": 7.478991596638656e-07, "logits/chosen": -1.314343810081482, "logits/rejected": -0.948784351348877, "logps/chosen": -419.88946533203125, "logps/rejected": -249.33352661132812, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": -0.001434326171875, "rewards/margins": 8.745747566223145, "rewards/rejected": -8.74718189239502, "step": 178 }, { "epoch": 0.3, "learning_rate": 7.521008403361344e-07, "logits/chosen": -1.3746141195297241, "logits/rejected": -1.2344566583633423, "logps/chosen": -33.07182693481445, "logps/rejected": -67.85247802734375, "loss": 0.2025, "rewards/accuracies": 1.0, "rewards/chosen": -0.7276212573051453, "rewards/margins": 2.376377820968628, "rewards/rejected": -3.103999137878418, "step": 179 }, { "epoch": 0.3, "learning_rate": 7.563025210084033e-07, "logits/chosen": -1.0103384256362915, "logits/rejected": -2.038599967956543, "logps/chosen": -425.00360107421875, "logps/rejected": -136.74205017089844, "loss": 0.2085, "rewards/accuracies": 1.0, "rewards/chosen": 0.21104279160499573, "rewards/margins": 5.775498390197754, "rewards/rejected": -5.564455986022949, "step": 180 }, { "epoch": 0.31, "learning_rate": 7.605042016806722e-07, "logits/chosen": -1.7550321817398071, "logits/rejected": -2.278670310974121, "logps/chosen": -90.49088287353516, "logps/rejected": -64.57603454589844, "loss": 0.2221, "rewards/accuracies": 1.0, "rewards/chosen": -0.6065499782562256, "rewards/margins": 2.4836509227752686, "rewards/rejected": -3.090200901031494, "step": 181 }, { "epoch": 0.31, "learning_rate": 7.647058823529411e-07, "logits/chosen": -1.554534912109375, "logits/rejected": -2.033510684967041, "logps/chosen": -275.0732116699219, "logps/rejected": -270.5262145996094, "loss": 0.2019, "rewards/accuracies": 1.0, "rewards/chosen": 0.631757378578186, "rewards/margins": 5.426800727844238, "rewards/rejected": -4.795043468475342, "step": 182 }, { "epoch": 0.31, "learning_rate": 7.689075630252101e-07, "logits/chosen": -1.0876420736312866, "logits/rejected": -1.9512230157852173, "logps/chosen": -462.33001708984375, "logps/rejected": -397.541259765625, "loss": 0.2168, "rewards/accuracies": 1.0, "rewards/chosen": 0.3109893798828125, "rewards/margins": 2.645329475402832, "rewards/rejected": -2.3343400955200195, "step": 183 }, { "epoch": 0.31, "learning_rate": 7.73109243697479e-07, "logits/chosen": -2.234156847000122, "logits/rejected": -2.153066873550415, "logps/chosen": -59.26795196533203, "logps/rejected": -123.72718811035156, "loss": 0.2057, "rewards/accuracies": 0.5, "rewards/chosen": -0.2919696867465973, "rewards/margins": 0.5463926196098328, "rewards/rejected": -0.8383622765541077, "step": 184 }, { "epoch": 0.31, "learning_rate": 7.773109243697479e-07, "logits/chosen": -1.892674446105957, "logits/rejected": -0.7566059827804565, "logps/chosen": -128.3382110595703, "logps/rejected": -352.1267395019531, "loss": 0.2409, "rewards/accuracies": 1.0, "rewards/chosen": -1.1412032842636108, "rewards/margins": 3.8913497924804688, "rewards/rejected": -5.032553195953369, "step": 185 }, { "epoch": 0.31, "learning_rate": 7.815126050420168e-07, "logits/chosen": -1.3167263269424438, "logits/rejected": -1.3082215785980225, "logps/chosen": -188.31219482421875, "logps/rejected": -194.53448486328125, "loss": 0.2037, "rewards/accuracies": 1.0, "rewards/chosen": -0.14720916748046875, "rewards/margins": 4.053531169891357, "rewards/rejected": -4.200739860534668, "step": 186 }, { "epoch": 0.32, "learning_rate": 7.857142857142856e-07, "logits/chosen": -1.758721113204956, "logits/rejected": -1.5897212028503418, "logps/chosen": -490.67822265625, "logps/rejected": -506.51971435546875, "loss": 0.1959, "rewards/accuracies": 1.0, "rewards/chosen": -0.8040527701377869, "rewards/margins": 1.7363312244415283, "rewards/rejected": -2.540383815765381, "step": 187 }, { "epoch": 0.32, "learning_rate": 7.899159663865545e-07, "logits/chosen": -1.2402632236480713, "logits/rejected": -2.1518731117248535, "logps/chosen": -440.912353515625, "logps/rejected": -190.54396057128906, "loss": 0.201, "rewards/accuracies": 1.0, "rewards/chosen": 0.4817703366279602, "rewards/margins": 2.1288094520568848, "rewards/rejected": -1.6470390558242798, "step": 188 }, { "epoch": 0.32, "learning_rate": 7.941176470588235e-07, "logits/chosen": -1.8251270055770874, "logits/rejected": -1.9880740642547607, "logps/chosen": -525.2092895507812, "logps/rejected": -373.4764404296875, "loss": 0.2224, "rewards/accuracies": 1.0, "rewards/chosen": 0.25654488801956177, "rewards/margins": 3.9607901573181152, "rewards/rejected": -3.704245090484619, "step": 189 }, { "epoch": 0.32, "learning_rate": 7.983193277310924e-07, "logits/chosen": -0.7393491864204407, "logits/rejected": -1.706693172454834, "logps/chosen": -540.9542236328125, "logps/rejected": -206.2239990234375, "loss": 0.1797, "rewards/accuracies": 1.0, "rewards/chosen": 0.6728149652481079, "rewards/margins": 7.774383544921875, "rewards/rejected": -7.101568222045898, "step": 190 }, { "epoch": 0.32, "learning_rate": 8.025210084033613e-07, "logits/chosen": -1.3957598209381104, "logits/rejected": -1.6105530261993408, "logps/chosen": -451.7230224609375, "logps/rejected": -323.118408203125, "loss": 0.1768, "rewards/accuracies": 1.0, "rewards/chosen": -0.5888229608535767, "rewards/margins": 6.918619155883789, "rewards/rejected": -7.507441997528076, "step": 191 }, { "epoch": 0.32, "learning_rate": 8.067226890756303e-07, "logits/chosen": -2.012829065322876, "logits/rejected": -1.7003165483474731, "logps/chosen": -231.9615936279297, "logps/rejected": -416.9687805175781, "loss": 0.2369, "rewards/accuracies": 1.0, "rewards/chosen": -0.45652008056640625, "rewards/margins": 4.130596160888672, "rewards/rejected": -4.587116241455078, "step": 192 }, { "epoch": 0.33, "learning_rate": 8.109243697478991e-07, "logits/chosen": -1.829034447669983, "logits/rejected": -1.623055338859558, "logps/chosen": -405.52508544921875, "logps/rejected": -334.392578125, "loss": 0.2274, "rewards/accuracies": 1.0, "rewards/chosen": 0.20874272286891937, "rewards/margins": 1.027557611465454, "rewards/rejected": -0.8188148736953735, "step": 193 }, { "epoch": 0.33, "learning_rate": 8.15126050420168e-07, "logits/chosen": -0.5686680674552917, "logits/rejected": -0.6403495669364929, "logps/chosen": -654.5108642578125, "logps/rejected": -387.836669921875, "loss": 0.2104, "rewards/accuracies": 1.0, "rewards/chosen": -0.738049328327179, "rewards/margins": 5.746264457702637, "rewards/rejected": -6.48431396484375, "step": 194 }, { "epoch": 0.33, "learning_rate": 8.19327731092437e-07, "logits/chosen": -1.5858798027038574, "logits/rejected": -1.6420996189117432, "logps/chosen": -116.9679946899414, "logps/rejected": -202.39669799804688, "loss": 0.1847, "rewards/accuracies": 1.0, "rewards/chosen": -0.6328601837158203, "rewards/margins": 3.9840362071990967, "rewards/rejected": -4.616896629333496, "step": 195 }, { "epoch": 0.33, "learning_rate": 8.235294117647058e-07, "logits/chosen": -1.6367160081863403, "logits/rejected": -1.6892379522323608, "logps/chosen": -175.56951904296875, "logps/rejected": -154.72865295410156, "loss": 0.1826, "rewards/accuracies": 1.0, "rewards/chosen": -0.052147675305604935, "rewards/margins": 6.1080498695373535, "rewards/rejected": -6.1601972579956055, "step": 196 }, { "epoch": 0.33, "learning_rate": 8.277310924369747e-07, "logits/chosen": -1.7774677276611328, "logits/rejected": -2.791229248046875, "logps/chosen": -339.4785461425781, "logps/rejected": -127.16014099121094, "loss": 0.2728, "rewards/accuracies": 1.0, "rewards/chosen": -0.5950691103935242, "rewards/margins": 5.04873514175415, "rewards/rejected": -5.64380407333374, "step": 197 }, { "epoch": 0.33, "learning_rate": 8.319327731092437e-07, "logits/chosen": -1.5082173347473145, "logits/rejected": -1.6745432615280151, "logps/chosen": -78.62190246582031, "logps/rejected": -35.42887496948242, "loss": 0.2192, "rewards/accuracies": 0.5, "rewards/chosen": -0.4798523187637329, "rewards/margins": 1.3703386783599854, "rewards/rejected": -1.8501909971237183, "step": 198 }, { "epoch": 0.34, "learning_rate": 8.361344537815126e-07, "logits/chosen": -1.651850938796997, "logits/rejected": -2.2229156494140625, "logps/chosen": -417.2205810546875, "logps/rejected": -179.9000244140625, "loss": 0.1873, "rewards/accuracies": 1.0, "rewards/chosen": -0.32609206438064575, "rewards/margins": 2.4329936504364014, "rewards/rejected": -2.7590856552124023, "step": 199 }, { "epoch": 0.34, "learning_rate": 8.403361344537815e-07, "logits/chosen": -1.3835643529891968, "logits/rejected": -1.4247071743011475, "logps/chosen": -68.0342025756836, "logps/rejected": -78.13129425048828, "loss": 0.2554, "rewards/accuracies": 0.5, "rewards/chosen": -0.7295857667922974, "rewards/margins": 2.134474277496338, "rewards/rejected": -2.8640599250793457, "step": 200 }, { "epoch": 0.34, "learning_rate": 8.445378151260503e-07, "logits/chosen": -1.367997169494629, "logits/rejected": -1.0557571649551392, "logps/chosen": -353.5372314453125, "logps/rejected": -270.9771728515625, "loss": 0.1961, "rewards/accuracies": 1.0, "rewards/chosen": -0.3901122808456421, "rewards/margins": 8.15034294128418, "rewards/rejected": -8.540454864501953, "step": 201 }, { "epoch": 0.34, "learning_rate": 8.487394957983193e-07, "logits/chosen": -1.7191728353500366, "logits/rejected": -1.4392447471618652, "logps/chosen": -22.586790084838867, "logps/rejected": -131.69461059570312, "loss": 0.1911, "rewards/accuracies": 1.0, "rewards/chosen": -0.18322506546974182, "rewards/margins": 4.684540748596191, "rewards/rejected": -4.867766380310059, "step": 202 }, { "epoch": 0.34, "learning_rate": 8.529411764705882e-07, "logits/chosen": -1.5832895040512085, "logits/rejected": -1.645804524421692, "logps/chosen": -67.92455291748047, "logps/rejected": -73.83755493164062, "loss": 0.2242, "rewards/accuracies": 1.0, "rewards/chosen": -0.22629156708717346, "rewards/margins": 1.8930432796478271, "rewards/rejected": -2.1193346977233887, "step": 203 }, { "epoch": 0.34, "learning_rate": 8.57142857142857e-07, "logits/chosen": -1.3173104524612427, "logits/rejected": -1.7601145505905151, "logps/chosen": -541.1436767578125, "logps/rejected": -157.87823486328125, "loss": 0.2101, "rewards/accuracies": 0.5, "rewards/chosen": -1.8486511707305908, "rewards/margins": 1.4881985187530518, "rewards/rejected": -3.3368496894836426, "step": 204 }, { "epoch": 0.35, "learning_rate": 8.613445378151261e-07, "logits/chosen": -0.26975634694099426, "logits/rejected": -0.21657763421535492, "logps/chosen": -448.43878173828125, "logps/rejected": -328.8587646484375, "loss": 0.1599, "rewards/accuracies": 1.0, "rewards/chosen": -2.2071120738983154, "rewards/margins": 9.672096252441406, "rewards/rejected": -11.879209518432617, "step": 205 }, { "epoch": 0.35, "learning_rate": 8.65546218487395e-07, "logits/chosen": -1.3258733749389648, "logits/rejected": -1.7010501623153687, "logps/chosen": -280.8769836425781, "logps/rejected": -82.33454132080078, "loss": 0.2117, "rewards/accuracies": 0.5, "rewards/chosen": -0.4332069754600525, "rewards/margins": 1.0438206195831299, "rewards/rejected": -1.4770275354385376, "step": 206 }, { "epoch": 0.35, "learning_rate": 8.697478991596638e-07, "logits/chosen": -1.833431601524353, "logits/rejected": -1.4919263124465942, "logps/chosen": -550.7933959960938, "logps/rejected": -366.83819580078125, "loss": 0.2718, "rewards/accuracies": 1.0, "rewards/chosen": -0.9451126456260681, "rewards/margins": 9.382735252380371, "rewards/rejected": -10.327848434448242, "step": 207 }, { "epoch": 0.35, "learning_rate": 8.739495798319328e-07, "logits/chosen": -1.6695424318313599, "logits/rejected": -2.533496618270874, "logps/chosen": -193.40089416503906, "logps/rejected": -144.17640686035156, "loss": 0.1894, "rewards/accuracies": 1.0, "rewards/chosen": -0.4194018244743347, "rewards/margins": 3.9117088317871094, "rewards/rejected": -4.33111047744751, "step": 208 }, { "epoch": 0.35, "learning_rate": 8.781512605042016e-07, "logits/chosen": -1.0884276628494263, "logits/rejected": -1.7498663663864136, "logps/chosen": -236.72494506835938, "logps/rejected": -124.72262573242188, "loss": 0.1856, "rewards/accuracies": 1.0, "rewards/chosen": -0.06562767922878265, "rewards/margins": 4.05698823928833, "rewards/rejected": -4.122615814208984, "step": 209 }, { "epoch": 0.35, "learning_rate": 8.823529411764705e-07, "logits/chosen": -1.2553131580352783, "logits/rejected": -1.654006838798523, "logps/chosen": -549.10400390625, "logps/rejected": -261.51226806640625, "loss": 0.1827, "rewards/accuracies": 1.0, "rewards/chosen": -0.3053421378135681, "rewards/margins": 7.806386947631836, "rewards/rejected": -8.11172866821289, "step": 210 }, { "epoch": 0.36, "learning_rate": 8.865546218487394e-07, "logits/chosen": -1.1624391078948975, "logits/rejected": -1.2052842378616333, "logps/chosen": -47.10140609741211, "logps/rejected": -77.38719177246094, "loss": 0.1972, "rewards/accuracies": 1.0, "rewards/chosen": -0.18456250429153442, "rewards/margins": 4.566642761230469, "rewards/rejected": -4.7512054443359375, "step": 211 }, { "epoch": 0.36, "learning_rate": 8.907563025210084e-07, "logits/chosen": -2.755701780319214, "logits/rejected": -1.7558363676071167, "logps/chosen": -249.60147094726562, "logps/rejected": -118.43754577636719, "loss": 0.1763, "rewards/accuracies": 1.0, "rewards/chosen": -0.39346620440483093, "rewards/margins": 4.152596473693848, "rewards/rejected": -4.546062469482422, "step": 212 }, { "epoch": 0.36, "learning_rate": 8.949579831932773e-07, "logits/chosen": -1.4844564199447632, "logits/rejected": -2.2958929538726807, "logps/chosen": -372.4450988769531, "logps/rejected": -301.98016357421875, "loss": 0.215, "rewards/accuracies": 1.0, "rewards/chosen": -0.4042709469795227, "rewards/margins": 3.7409682273864746, "rewards/rejected": -4.145238876342773, "step": 213 }, { "epoch": 0.36, "learning_rate": 8.991596638655462e-07, "logits/chosen": -1.6278800964355469, "logits/rejected": -2.122774600982666, "logps/chosen": -185.77044677734375, "logps/rejected": -173.4629364013672, "loss": 0.2481, "rewards/accuracies": 1.0, "rewards/chosen": 0.15187864005565643, "rewards/margins": 5.104752540588379, "rewards/rejected": -4.952874183654785, "step": 214 }, { "epoch": 0.36, "learning_rate": 9.033613445378151e-07, "logits/chosen": -1.2420963048934937, "logits/rejected": -1.2279390096664429, "logps/chosen": -56.64626693725586, "logps/rejected": -95.80750274658203, "loss": 0.2378, "rewards/accuracies": 1.0, "rewards/chosen": 0.10336628556251526, "rewards/margins": 4.724967956542969, "rewards/rejected": -4.6216020584106445, "step": 215 }, { "epoch": 0.36, "learning_rate": 9.07563025210084e-07, "logits/chosen": -0.6830325126647949, "logits/rejected": -0.7307128310203552, "logps/chosen": -509.64990234375, "logps/rejected": -261.376220703125, "loss": 0.2494, "rewards/accuracies": 1.0, "rewards/chosen": -1.0829544067382812, "rewards/margins": 7.50957727432251, "rewards/rejected": -8.592531204223633, "step": 216 }, { "epoch": 0.37, "learning_rate": 9.117647058823529e-07, "logits/chosen": -1.6173702478408813, "logits/rejected": -1.7928811311721802, "logps/chosen": -288.9893798828125, "logps/rejected": -148.32977294921875, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": -0.27213987708091736, "rewards/margins": 6.079885482788086, "rewards/rejected": -6.352025032043457, "step": 217 }, { "epoch": 0.37, "learning_rate": 9.159663865546218e-07, "logits/chosen": -1.3861663341522217, "logits/rejected": -1.4028596878051758, "logps/chosen": -72.13826751708984, "logps/rejected": -156.20257568359375, "loss": 0.2067, "rewards/accuracies": 1.0, "rewards/chosen": -0.7177965044975281, "rewards/margins": 5.9151411056518555, "rewards/rejected": -6.632937908172607, "step": 218 }, { "epoch": 0.37, "learning_rate": 9.201680672268907e-07, "logits/chosen": -1.943166971206665, "logits/rejected": -1.703479290008545, "logps/chosen": -254.5187530517578, "logps/rejected": -391.06610107421875, "loss": 0.198, "rewards/accuracies": 1.0, "rewards/chosen": -0.8562338948249817, "rewards/margins": 6.539711952209473, "rewards/rejected": -7.395946025848389, "step": 219 }, { "epoch": 0.37, "learning_rate": 9.243697478991597e-07, "logits/chosen": -1.8535621166229248, "logits/rejected": -2.3986668586730957, "logps/chosen": -221.98687744140625, "logps/rejected": -164.90716552734375, "loss": 0.2149, "rewards/accuracies": 1.0, "rewards/chosen": -0.19046229124069214, "rewards/margins": 3.5672414302825928, "rewards/rejected": -3.7577037811279297, "step": 220 }, { "epoch": 0.37, "learning_rate": 9.285714285714285e-07, "logits/chosen": -1.5400798320770264, "logits/rejected": -0.977436900138855, "logps/chosen": -38.834285736083984, "logps/rejected": -214.7268524169922, "loss": 0.2612, "rewards/accuracies": 1.0, "rewards/chosen": -0.6614927649497986, "rewards/margins": 6.089022636413574, "rewards/rejected": -6.750515460968018, "step": 221 }, { "epoch": 0.37, "learning_rate": 9.327731092436975e-07, "logits/chosen": -1.2816520929336548, "logits/rejected": -1.7735843658447266, "logps/chosen": -653.9671630859375, "logps/rejected": -604.3455810546875, "loss": 0.2203, "rewards/accuracies": 0.5, "rewards/chosen": -0.04913024976849556, "rewards/margins": 0.8042449355125427, "rewards/rejected": -0.8533751964569092, "step": 222 }, { "epoch": 0.38, "learning_rate": 9.369747899159663e-07, "logits/chosen": -0.8977038264274597, "logits/rejected": -1.0373615026474, "logps/chosen": -559.4537963867188, "logps/rejected": -400.3348388671875, "loss": 0.2109, "rewards/accuracies": 1.0, "rewards/chosen": -0.4869690239429474, "rewards/margins": 4.854780673980713, "rewards/rejected": -5.341749668121338, "step": 223 }, { "epoch": 0.38, "learning_rate": 9.411764705882352e-07, "logits/chosen": -1.5731453895568848, "logits/rejected": -1.3540092706680298, "logps/chosen": -362.89971923828125, "logps/rejected": -213.01864624023438, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": -0.5867255330085754, "rewards/margins": 7.263546466827393, "rewards/rejected": -7.850271701812744, "step": 224 }, { "epoch": 0.38, "learning_rate": 9.453781512605042e-07, "logits/chosen": -0.8027037978172302, "logits/rejected": -1.3039056062698364, "logps/chosen": -191.79257202148438, "logps/rejected": -130.3656768798828, "loss": 0.2174, "rewards/accuracies": 1.0, "rewards/chosen": 0.5622925162315369, "rewards/margins": 2.7290549278259277, "rewards/rejected": -2.166762590408325, "step": 225 }, { "epoch": 0.38, "learning_rate": 9.49579831932773e-07, "logits/chosen": -1.7100781202316284, "logits/rejected": -1.5751913785934448, "logps/chosen": -51.673133850097656, "logps/rejected": -87.66356658935547, "loss": 0.1912, "rewards/accuracies": 1.0, "rewards/chosen": -0.03124428540468216, "rewards/margins": 3.016190528869629, "rewards/rejected": -3.0474348068237305, "step": 226 }, { "epoch": 0.38, "learning_rate": 9.53781512605042e-07, "logits/chosen": -1.7360178232192993, "logits/rejected": -1.5387942790985107, "logps/chosen": -623.2257690429688, "logps/rejected": -446.02789306640625, "loss": 0.2341, "rewards/accuracies": 0.5, "rewards/chosen": -0.8850250244140625, "rewards/margins": 0.6927383542060852, "rewards/rejected": -1.577763319015503, "step": 227 }, { "epoch": 0.38, "learning_rate": 9.579831932773109e-07, "logits/chosen": -1.3128571510314941, "logits/rejected": -1.2455374002456665, "logps/chosen": -491.56341552734375, "logps/rejected": -306.54974365234375, "loss": 0.2017, "rewards/accuracies": 1.0, "rewards/chosen": -0.8349395990371704, "rewards/margins": 7.316784381866455, "rewards/rejected": -8.151723861694336, "step": 228 }, { "epoch": 0.39, "learning_rate": 9.621848739495798e-07, "logits/chosen": -0.9198746681213379, "logits/rejected": -1.0064659118652344, "logps/chosen": -170.07423400878906, "logps/rejected": -151.16380310058594, "loss": 0.2215, "rewards/accuracies": 1.0, "rewards/chosen": -0.42428553104400635, "rewards/margins": 4.414795398712158, "rewards/rejected": -4.839080810546875, "step": 229 }, { "epoch": 0.39, "learning_rate": 9.663865546218487e-07, "logits/chosen": -0.19439470767974854, "logits/rejected": -0.1546761840581894, "logps/chosen": -443.8742980957031, "logps/rejected": -319.8472595214844, "loss": 0.2317, "rewards/accuracies": 1.0, "rewards/chosen": -1.7987457513809204, "rewards/margins": 9.688015937805176, "rewards/rejected": -11.486761093139648, "step": 230 }, { "epoch": 0.39, "learning_rate": 9.705882352941176e-07, "logits/chosen": -0.9490076303482056, "logits/rejected": -0.48910650610923767, "logps/chosen": -513.2789306640625, "logps/rejected": -340.21929931640625, "loss": 0.2053, "rewards/accuracies": 1.0, "rewards/chosen": -1.3714752197265625, "rewards/margins": 9.171676635742188, "rewards/rejected": -10.54315185546875, "step": 231 }, { "epoch": 0.39, "learning_rate": 9.747899159663866e-07, "logits/chosen": -1.1433497667312622, "logits/rejected": -1.3219791650772095, "logps/chosen": -505.0430908203125, "logps/rejected": -255.84214782714844, "loss": 0.2021, "rewards/accuracies": 1.0, "rewards/chosen": -0.42404481768608093, "rewards/margins": 8.78639030456543, "rewards/rejected": -9.21043586730957, "step": 232 }, { "epoch": 0.39, "learning_rate": 9.789915966386553e-07, "logits/chosen": -1.0191929340362549, "logits/rejected": -1.4857556819915771, "logps/chosen": -178.77206420898438, "logps/rejected": -129.4377899169922, "loss": 0.2355, "rewards/accuracies": 1.0, "rewards/chosen": -0.5842880606651306, "rewards/margins": 3.4605538845062256, "rewards/rejected": -4.044841766357422, "step": 233 }, { "epoch": 0.39, "learning_rate": 9.831932773109242e-07, "logits/chosen": -1.518091082572937, "logits/rejected": -1.1380579471588135, "logps/chosen": -144.8524627685547, "logps/rejected": -124.58405303955078, "loss": 0.1858, "rewards/accuracies": 1.0, "rewards/chosen": -0.12543268501758575, "rewards/margins": 5.242918014526367, "rewards/rejected": -5.368350982666016, "step": 234 }, { "epoch": 0.4, "learning_rate": 9.873949579831934e-07, "logits/chosen": -1.8797619342803955, "logits/rejected": -1.4644118547439575, "logps/chosen": -70.80252838134766, "logps/rejected": -90.79109191894531, "loss": 0.2319, "rewards/accuracies": 0.5, "rewards/chosen": -1.3334128856658936, "rewards/margins": 1.8264957666397095, "rewards/rejected": -3.1599087715148926, "step": 235 }, { "epoch": 0.4, "learning_rate": 9.91596638655462e-07, "logits/chosen": -0.9166591763496399, "logits/rejected": -0.93157559633255, "logps/chosen": -12.770793914794922, "logps/rejected": -55.15324783325195, "loss": 0.1993, "rewards/accuracies": 1.0, "rewards/chosen": -0.37832537293434143, "rewards/margins": 3.1305532455444336, "rewards/rejected": -3.508878707885742, "step": 236 }, { "epoch": 0.4, "learning_rate": 9.95798319327731e-07, "logits/chosen": -1.6806684732437134, "logits/rejected": -2.278653144836426, "logps/chosen": -293.66644287109375, "logps/rejected": -160.80221557617188, "loss": 0.1972, "rewards/accuracies": 1.0, "rewards/chosen": 0.09649372100830078, "rewards/margins": 8.160371780395508, "rewards/rejected": -8.063878059387207, "step": 237 }, { "epoch": 0.4, "learning_rate": 1e-06, "logits/chosen": -0.28713756799697876, "logits/rejected": -0.2611769437789917, "logps/chosen": -21.57666778564453, "logps/rejected": -82.406982421875, "loss": 0.2085, "rewards/accuracies": 1.0, "rewards/chosen": -0.08337266743183136, "rewards/margins": 4.99355936050415, "rewards/rejected": -5.076931953430176, "step": 238 }, { "epoch": 0.4, "learning_rate": 9.99999458185223e-07, "logits/chosen": -2.185724973678589, "logits/rejected": -2.134838104248047, "logps/chosen": -37.73735427856445, "logps/rejected": -141.9013671875, "loss": 0.1863, "rewards/accuracies": 0.5, "rewards/chosen": -1.4596128463745117, "rewards/margins": 5.266613960266113, "rewards/rejected": -6.726226806640625, "step": 239 }, { "epoch": 0.4, "learning_rate": 9.999978327420662e-07, "logits/chosen": -1.203190565109253, "logits/rejected": -2.1939120292663574, "logps/chosen": -514.1233520507812, "logps/rejected": -75.5627670288086, "loss": 0.2014, "rewards/accuracies": 0.5, "rewards/chosen": 2.2211365699768066, "rewards/margins": 3.9060745239257812, "rewards/rejected": -1.684937834739685, "step": 240 }, { "epoch": 0.41, "learning_rate": 9.999951236740525e-07, "logits/chosen": -1.3303395509719849, "logits/rejected": -1.934448480606079, "logps/chosen": -135.79515075683594, "logps/rejected": -107.17416381835938, "loss": 0.1887, "rewards/accuracies": 1.0, "rewards/chosen": 0.06375274807214737, "rewards/margins": 2.7002902030944824, "rewards/rejected": -2.636537551879883, "step": 241 }, { "epoch": 0.41, "learning_rate": 9.999913309870528e-07, "logits/chosen": -1.44986891746521, "logits/rejected": -1.9651787281036377, "logps/chosen": -200.12448120117188, "logps/rejected": -163.63800048828125, "loss": 0.1916, "rewards/accuracies": 1.0, "rewards/chosen": 0.016937255859375, "rewards/margins": 2.1956772804260254, "rewards/rejected": -2.1787400245666504, "step": 242 }, { "epoch": 0.41, "learning_rate": 9.999864546892874e-07, "logits/chosen": -1.2548986673355103, "logits/rejected": -1.1517094373703003, "logps/chosen": -21.825084686279297, "logps/rejected": -100.38935852050781, "loss": 0.2183, "rewards/accuracies": 0.5, "rewards/chosen": -1.235842227935791, "rewards/margins": 4.932180404663086, "rewards/rejected": -6.168022155761719, "step": 243 }, { "epoch": 0.41, "learning_rate": 9.99980494791324e-07, "logits/chosen": -1.3461451530456543, "logits/rejected": -2.2621426582336426, "logps/chosen": -529.697998046875, "logps/rejected": -53.855751037597656, "loss": 0.2415, "rewards/accuracies": 1.0, "rewards/chosen": -0.11737975478172302, "rewards/margins": 2.4394328594207764, "rewards/rejected": -2.5568125247955322, "step": 244 }, { "epoch": 0.41, "learning_rate": 9.999734513060793e-07, "logits/chosen": -1.6569421291351318, "logits/rejected": -1.4833012819290161, "logps/chosen": -21.329483032226562, "logps/rejected": -213.5611572265625, "loss": 0.1858, "rewards/accuracies": 1.0, "rewards/chosen": -0.2380591332912445, "rewards/margins": 10.061112403869629, "rewards/rejected": -10.299171447753906, "step": 245 }, { "epoch": 0.41, "learning_rate": 9.999653242488186e-07, "logits/chosen": -1.3467878103256226, "logits/rejected": -1.1173731088638306, "logps/chosen": -388.6378173828125, "logps/rejected": -294.2225341796875, "loss": 0.1943, "rewards/accuracies": 1.0, "rewards/chosen": -0.8519317507743835, "rewards/margins": 9.176179885864258, "rewards/rejected": -10.028111457824707, "step": 246 }, { "epoch": 0.42, "learning_rate": 9.999561136371554e-07, "logits/chosen": -1.8872058391571045, "logits/rejected": -1.6369519233703613, "logps/chosen": -214.09727478027344, "logps/rejected": -376.61334228515625, "loss": 0.1797, "rewards/accuracies": 0.0, "rewards/chosen": -0.4823310971260071, "rewards/margins": -0.6333345770835876, "rewards/rejected": 0.15100345015525818, "step": 247 }, { "epoch": 0.42, "learning_rate": 9.99945819491051e-07, "logits/chosen": -1.6549506187438965, "logits/rejected": -1.558489441871643, "logps/chosen": -263.10980224609375, "logps/rejected": -266.67388916015625, "loss": 0.1845, "rewards/accuracies": 1.0, "rewards/chosen": -1.2903716564178467, "rewards/margins": 5.641432285308838, "rewards/rejected": -6.931804180145264, "step": 248 }, { "epoch": 0.42, "learning_rate": 9.99934441832816e-07, "logits/chosen": -1.7296139001846313, "logits/rejected": -1.958460807800293, "logps/chosen": -219.89266967773438, "logps/rejected": -296.84906005859375, "loss": 0.1885, "rewards/accuracies": 1.0, "rewards/chosen": -0.706303060054779, "rewards/margins": 5.47901725769043, "rewards/rejected": -6.1853203773498535, "step": 249 }, { "epoch": 0.42, "learning_rate": 9.999219806871085e-07, "logits/chosen": -1.1240818500518799, "logits/rejected": -1.4093561172485352, "logps/chosen": -481.0801086425781, "logps/rejected": -516.7035522460938, "loss": 0.2172, "rewards/accuracies": 1.0, "rewards/chosen": 0.33460843563079834, "rewards/margins": 6.402099609375, "rewards/rejected": -6.067491054534912, "step": 250 }, { "epoch": 0.42, "learning_rate": 9.99908436080935e-07, "logits/chosen": -0.8841766119003296, "logits/rejected": -1.7438700199127197, "logps/chosen": -250.23077392578125, "logps/rejected": -161.5799560546875, "loss": 0.2042, "rewards/accuracies": 1.0, "rewards/chosen": 0.4004615843296051, "rewards/margins": 6.907259941101074, "rewards/rejected": -6.50679874420166, "step": 251 }, { "epoch": 0.42, "learning_rate": 9.998938080436503e-07, "logits/chosen": -1.61567223072052, "logits/rejected": -2.236949920654297, "logps/chosen": -63.57852554321289, "logps/rejected": -129.1857452392578, "loss": 0.2339, "rewards/accuracies": 1.0, "rewards/chosen": -1.2066521644592285, "rewards/margins": 4.170039176940918, "rewards/rejected": -5.3766913414001465, "step": 252 }, { "epoch": 0.43, "learning_rate": 9.998780966069568e-07, "logits/chosen": -1.5731325149536133, "logits/rejected": -1.5780061483383179, "logps/chosen": -24.21834373474121, "logps/rejected": -83.41648864746094, "loss": 0.1516, "rewards/accuracies": 1.0, "rewards/chosen": -0.8808462023735046, "rewards/margins": 3.9870688915252686, "rewards/rejected": -4.867915153503418, "step": 253 }, { "epoch": 0.43, "learning_rate": 9.998613018049058e-07, "logits/chosen": -1.2458865642547607, "logits/rejected": -1.5161921977996826, "logps/chosen": -137.61996459960938, "logps/rejected": -198.40377807617188, "loss": 0.1805, "rewards/accuracies": 0.5, "rewards/chosen": -1.1367048025131226, "rewards/margins": 5.306751251220703, "rewards/rejected": -6.443456172943115, "step": 254 }, { "epoch": 0.43, "learning_rate": 9.998434236738956e-07, "logits/chosen": -1.3364038467407227, "logits/rejected": -1.633857011795044, "logps/chosen": -427.1192626953125, "logps/rejected": -423.90826416015625, "loss": 0.1867, "rewards/accuracies": 1.0, "rewards/chosen": -0.897076427936554, "rewards/margins": 5.636436462402344, "rewards/rejected": -6.533513069152832, "step": 255 }, { "epoch": 0.43, "learning_rate": 9.998244622526728e-07, "logits/chosen": -1.3373851776123047, "logits/rejected": -1.0651671886444092, "logps/chosen": -123.55149841308594, "logps/rejected": -188.85494995117188, "loss": 0.1508, "rewards/accuracies": 1.0, "rewards/chosen": -0.2550186216831207, "rewards/margins": 4.371315002441406, "rewards/rejected": -4.626333713531494, "step": 256 }, { "epoch": 0.43, "learning_rate": 9.99804417582332e-07, "logits/chosen": -1.8455286026000977, "logits/rejected": -1.6597044467926025, "logps/chosen": -37.02387619018555, "logps/rejected": -157.0921630859375, "loss": 0.1598, "rewards/accuracies": 1.0, "rewards/chosen": -0.6415446400642395, "rewards/margins": 7.187655448913574, "rewards/rejected": -7.829200267791748, "step": 257 }, { "epoch": 0.44, "learning_rate": 9.997832897063147e-07, "logits/chosen": -1.3792277574539185, "logits/rejected": -1.3170826435089111, "logps/chosen": -268.0335388183594, "logps/rejected": -193.27032470703125, "loss": 0.2239, "rewards/accuracies": 1.0, "rewards/chosen": -0.12644805014133453, "rewards/margins": 7.42259407043457, "rewards/rejected": -7.549041748046875, "step": 258 }, { "epoch": 0.44, "learning_rate": 9.99761078670411e-07, "logits/chosen": -2.141418218612671, "logits/rejected": -2.0980770587921143, "logps/chosen": -36.05842590332031, "logps/rejected": -130.27944946289062, "loss": 0.1816, "rewards/accuracies": 1.0, "rewards/chosen": -0.2582319378852844, "rewards/margins": 4.223054885864258, "rewards/rejected": -4.481287002563477, "step": 259 }, { "epoch": 0.44, "learning_rate": 9.997377845227574e-07, "logits/chosen": -1.0465216636657715, "logits/rejected": -0.9531089663505554, "logps/chosen": -123.76052856445312, "logps/rejected": -127.10284423828125, "loss": 0.1821, "rewards/accuracies": 1.0, "rewards/chosen": -0.5870941281318665, "rewards/margins": 1.3501648902893066, "rewards/rejected": -1.9372591972351074, "step": 260 }, { "epoch": 0.44, "learning_rate": 9.997134073138388e-07, "logits/chosen": -2.0905027389526367, "logits/rejected": -2.080312490463257, "logps/chosen": -161.46063232421875, "logps/rejected": -82.06962585449219, "loss": 0.1953, "rewards/accuracies": 0.0, "rewards/chosen": -2.4451663494110107, "rewards/margins": -1.771939754486084, "rewards/rejected": -0.6732265949249268, "step": 261 }, { "epoch": 0.44, "learning_rate": 9.996879470964867e-07, "logits/chosen": -1.8112472295761108, "logits/rejected": -2.4045462608337402, "logps/chosen": -167.5275421142578, "logps/rejected": -174.94483947753906, "loss": 0.2188, "rewards/accuracies": 1.0, "rewards/chosen": -0.22896921634674072, "rewards/margins": 5.413414478302002, "rewards/rejected": -5.642383575439453, "step": 262 }, { "epoch": 0.44, "learning_rate": 9.996614039258803e-07, "logits/chosen": -0.665377676486969, "logits/rejected": -0.4620656967163086, "logps/chosen": -148.4291229248047, "logps/rejected": -186.41522216796875, "loss": 0.2179, "rewards/accuracies": 1.0, "rewards/chosen": -0.5123512744903564, "rewards/margins": 7.561125755310059, "rewards/rejected": -8.073476791381836, "step": 263 }, { "epoch": 0.45, "learning_rate": 9.996337778595453e-07, "logits/chosen": -1.246445894241333, "logits/rejected": -1.9943310022354126, "logps/chosen": -311.7137451171875, "logps/rejected": -163.29208374023438, "loss": 0.1784, "rewards/accuracies": 1.0, "rewards/chosen": 0.3149963617324829, "rewards/margins": 7.973663330078125, "rewards/rejected": -7.658667087554932, "step": 264 }, { "epoch": 0.45, "learning_rate": 9.996050689573542e-07, "logits/chosen": -0.8231534361839294, "logits/rejected": -1.4796454906463623, "logps/chosen": -392.57366943359375, "logps/rejected": -215.98666381835938, "loss": 0.1828, "rewards/accuracies": 1.0, "rewards/chosen": -0.50887531042099, "rewards/margins": 4.215329647064209, "rewards/rejected": -4.724205017089844, "step": 265 }, { "epoch": 0.45, "learning_rate": 9.995752772815274e-07, "logits/chosen": -1.0359983444213867, "logits/rejected": -1.4785257577896118, "logps/chosen": -98.88191223144531, "logps/rejected": -250.07452392578125, "loss": 0.1721, "rewards/accuracies": 1.0, "rewards/chosen": -0.7754969000816345, "rewards/margins": 5.748414516448975, "rewards/rejected": -6.523911476135254, "step": 266 }, { "epoch": 0.45, "learning_rate": 9.995444028966306e-07, "logits/chosen": -1.2062805891036987, "logits/rejected": -0.6490817666053772, "logps/chosen": -66.56135559082031, "logps/rejected": -173.89833068847656, "loss": 0.2002, "rewards/accuracies": 0.5, "rewards/chosen": -0.7681030631065369, "rewards/margins": 3.30619478225708, "rewards/rejected": -4.074297904968262, "step": 267 }, { "epoch": 0.45, "learning_rate": 9.995124458695768e-07, "logits/chosen": -1.8022315502166748, "logits/rejected": -1.7330073118209839, "logps/chosen": -119.78292846679688, "logps/rejected": -185.3957061767578, "loss": 0.1853, "rewards/accuracies": 1.0, "rewards/chosen": -0.318962961435318, "rewards/margins": 3.6459078788757324, "rewards/rejected": -3.9648709297180176, "step": 268 }, { "epoch": 0.45, "learning_rate": 9.99479406269625e-07, "logits/chosen": -0.64534592628479, "logits/rejected": -1.2948215007781982, "logps/chosen": -386.3092956542969, "logps/rejected": -148.838134765625, "loss": 0.2189, "rewards/accuracies": 1.0, "rewards/chosen": -0.32144472002983093, "rewards/margins": 4.571913242340088, "rewards/rejected": -4.89335823059082, "step": 269 }, { "epoch": 0.46, "learning_rate": 9.994452841683807e-07, "logits/chosen": -1.8895020484924316, "logits/rejected": -1.8781219720840454, "logps/chosen": -109.0746841430664, "logps/rejected": -128.44271850585938, "loss": 0.1739, "rewards/accuracies": 0.5, "rewards/chosen": -0.8171417713165283, "rewards/margins": -0.010423451662063599, "rewards/rejected": -0.8067182898521423, "step": 270 }, { "epoch": 0.46, "learning_rate": 9.994100796397953e-07, "logits/chosen": -2.2890195846557617, "logits/rejected": -2.143787145614624, "logps/chosen": -67.44159698486328, "logps/rejected": -293.24310302734375, "loss": 0.2057, "rewards/accuracies": 1.0, "rewards/chosen": -0.6767939329147339, "rewards/margins": 12.486234664916992, "rewards/rejected": -13.163028717041016, "step": 271 }, { "epoch": 0.46, "learning_rate": 9.993737927601663e-07, "logits/chosen": -1.2766467332839966, "logits/rejected": -0.8370835781097412, "logps/chosen": -126.51022338867188, "logps/rejected": -241.4452362060547, "loss": 0.1861, "rewards/accuracies": 1.0, "rewards/chosen": -0.14905127882957458, "rewards/margins": 8.128352165222168, "rewards/rejected": -8.277403831481934, "step": 272 }, { "epoch": 0.46, "learning_rate": 9.993364236081366e-07, "logits/chosen": -2.052964925765991, "logits/rejected": -1.607141375541687, "logps/chosen": -103.60296630859375, "logps/rejected": -158.26458740234375, "loss": 0.154, "rewards/accuracies": 0.5, "rewards/chosen": -1.312964916229248, "rewards/margins": 1.991333246231079, "rewards/rejected": -3.304298162460327, "step": 273 }, { "epoch": 0.46, "learning_rate": 9.992979722646948e-07, "logits/chosen": -2.079749584197998, "logits/rejected": -2.0677154064178467, "logps/chosen": -41.83177947998047, "logps/rejected": -55.48749542236328, "loss": 0.1965, "rewards/accuracies": 0.5, "rewards/chosen": -0.5781872272491455, "rewards/margins": 1.497841477394104, "rewards/rejected": -2.076028823852539, "step": 274 }, { "epoch": 0.46, "learning_rate": 9.992584388131748e-07, "logits/chosen": -2.3774337768554688, "logits/rejected": -1.4596751928329468, "logps/chosen": -47.4880256652832, "logps/rejected": -253.26637268066406, "loss": 0.1938, "rewards/accuracies": 1.0, "rewards/chosen": -0.3620060980319977, "rewards/margins": 8.875134468078613, "rewards/rejected": -9.237140655517578, "step": 275 }, { "epoch": 0.47, "learning_rate": 9.992178233392562e-07, "logits/chosen": -1.6642234325408936, "logits/rejected": -1.6687958240509033, "logps/chosen": -307.078369140625, "logps/rejected": -324.7652587890625, "loss": 0.2655, "rewards/accuracies": 1.0, "rewards/chosen": 0.5620359182357788, "rewards/margins": 5.345144748687744, "rewards/rejected": -4.783108711242676, "step": 276 }, { "epoch": 0.47, "learning_rate": 9.991761259309633e-07, "logits/chosen": -1.3904876708984375, "logits/rejected": -1.6669962406158447, "logps/chosen": -400.28887939453125, "logps/rejected": -245.69497680664062, "loss": 0.181, "rewards/accuracies": 1.0, "rewards/chosen": -0.673797070980072, "rewards/margins": 7.228074550628662, "rewards/rejected": -7.901871681213379, "step": 277 }, { "epoch": 0.47, "learning_rate": 9.991333466786648e-07, "logits/chosen": -1.216492772102356, "logits/rejected": -1.1512091159820557, "logps/chosen": -16.977386474609375, "logps/rejected": -128.8122100830078, "loss": 0.1701, "rewards/accuracies": 1.0, "rewards/chosen": -0.0717000961303711, "rewards/margins": 7.391216278076172, "rewards/rejected": -7.462916374206543, "step": 278 }, { "epoch": 0.47, "learning_rate": 9.990894856750744e-07, "logits/chosen": -0.9838640689849854, "logits/rejected": -2.3961727619171143, "logps/chosen": -650.3472900390625, "logps/rejected": -48.605430603027344, "loss": 0.1742, "rewards/accuracies": 1.0, "rewards/chosen": 0.338510125875473, "rewards/margins": 1.6768007278442383, "rewards/rejected": -1.3382906913757324, "step": 279 }, { "epoch": 0.47, "learning_rate": 9.990445430152506e-07, "logits/chosen": -0.4278978407382965, "logits/rejected": -0.6988143920898438, "logps/chosen": -533.7452392578125, "logps/rejected": -292.7690734863281, "loss": 0.2038, "rewards/accuracies": 1.0, "rewards/chosen": -0.9284515380859375, "rewards/margins": 10.271871566772461, "rewards/rejected": -11.200323104858398, "step": 280 }, { "epoch": 0.47, "learning_rate": 9.989985187965955e-07, "logits/chosen": -1.6768834590911865, "logits/rejected": -1.5711687803268433, "logps/chosen": -439.8897705078125, "logps/rejected": -265.1025695800781, "loss": 0.2195, "rewards/accuracies": 1.0, "rewards/chosen": -1.3489418029785156, "rewards/margins": 7.014950752258301, "rewards/rejected": -8.363892555236816, "step": 281 }, { "epoch": 0.48, "learning_rate": 9.989514131188558e-07, "logits/chosen": -1.4727314710617065, "logits/rejected": -1.679413914680481, "logps/chosen": -47.99615478515625, "logps/rejected": -121.50358581542969, "loss": 0.1682, "rewards/accuracies": 0.5, "rewards/chosen": -0.8484709858894348, "rewards/margins": 4.306775093078613, "rewards/rejected": -5.155246257781982, "step": 282 }, { "epoch": 0.48, "learning_rate": 9.989032260841215e-07, "logits/chosen": -1.3292348384857178, "logits/rejected": -1.4934347867965698, "logps/chosen": -443.295654296875, "logps/rejected": -279.7898254394531, "loss": 0.176, "rewards/accuracies": 1.0, "rewards/chosen": -0.13922274112701416, "rewards/margins": 7.711603164672852, "rewards/rejected": -7.850825786590576, "step": 283 }, { "epoch": 0.48, "learning_rate": 9.988539577968264e-07, "logits/chosen": -1.694291591644287, "logits/rejected": -1.0821882486343384, "logps/chosen": -46.26044845581055, "logps/rejected": -100.78984069824219, "loss": 0.199, "rewards/accuracies": 1.0, "rewards/chosen": -0.683262825012207, "rewards/margins": 1.5582889318466187, "rewards/rejected": -2.2415518760681152, "step": 284 }, { "epoch": 0.48, "learning_rate": 9.988036083637477e-07, "logits/chosen": -1.4071030616760254, "logits/rejected": -1.7171040773391724, "logps/chosen": -215.1892852783203, "logps/rejected": -162.13401794433594, "loss": 0.2222, "rewards/accuracies": 1.0, "rewards/chosen": -0.22356048226356506, "rewards/margins": 8.27348518371582, "rewards/rejected": -8.497045516967773, "step": 285 }, { "epoch": 0.48, "learning_rate": 9.987521778940057e-07, "logits/chosen": -1.219684362411499, "logits/rejected": -1.615993618965149, "logps/chosen": -529.4940185546875, "logps/rejected": -260.98486328125, "loss": 0.187, "rewards/accuracies": 1.0, "rewards/chosen": -0.5555427670478821, "rewards/margins": 7.262081623077393, "rewards/rejected": -7.817624092102051, "step": 286 }, { "epoch": 0.48, "learning_rate": 9.986996664990635e-07, "logits/chosen": -1.1856704950332642, "logits/rejected": -1.439422607421875, "logps/chosen": -128.1553192138672, "logps/rejected": -44.11671447753906, "loss": 0.1936, "rewards/accuracies": 0.5, "rewards/chosen": -1.0148143768310547, "rewards/margins": -0.19398105144500732, "rewards/rejected": -0.8208333253860474, "step": 287 }, { "epoch": 0.49, "learning_rate": 9.986460742927269e-07, "logits/chosen": -0.8404701948165894, "logits/rejected": -0.4110315442085266, "logps/chosen": -421.8138122558594, "logps/rejected": -330.5446472167969, "loss": 0.225, "rewards/accuracies": 1.0, "rewards/chosen": -1.2713227272033691, "rewards/margins": 11.016674041748047, "rewards/rejected": -12.287996292114258, "step": 288 }, { "epoch": 0.49, "learning_rate": 9.985914013911442e-07, "logits/chosen": -1.3006629943847656, "logits/rejected": -1.1097196340560913, "logps/chosen": -624.8807983398438, "logps/rejected": -328.3757019042969, "loss": 0.1815, "rewards/accuracies": 0.5, "rewards/chosen": -1.4635086059570312, "rewards/margins": 3.9749832153320312, "rewards/rejected": -5.4384918212890625, "step": 289 }, { "epoch": 0.49, "learning_rate": 9.985356479128056e-07, "logits/chosen": -0.30210334062576294, "logits/rejected": -0.34102773666381836, "logps/chosen": -203.22467041015625, "logps/rejected": -202.3996124267578, "loss": 0.2037, "rewards/accuracies": 1.0, "rewards/chosen": -0.25408071279525757, "rewards/margins": 9.476703643798828, "rewards/rejected": -9.73078441619873, "step": 290 }, { "epoch": 0.49, "learning_rate": 9.984788139785432e-07, "logits/chosen": -1.5046318769454956, "logits/rejected": -1.4954370260238647, "logps/chosen": -128.044189453125, "logps/rejected": -212.79881286621094, "loss": 0.1563, "rewards/accuracies": 1.0, "rewards/chosen": 0.5831112265586853, "rewards/margins": 4.677426338195801, "rewards/rejected": -4.094315052032471, "step": 291 }, { "epoch": 0.49, "learning_rate": 9.984208997115311e-07, "logits/chosen": -1.3097823858261108, "logits/rejected": -0.7972382307052612, "logps/chosen": -61.33431625366211, "logps/rejected": -224.305908203125, "loss": 0.1748, "rewards/accuracies": 1.0, "rewards/chosen": -0.20676542818546295, "rewards/margins": 7.611514091491699, "rewards/rejected": -7.81827974319458, "step": 292 }, { "epoch": 0.49, "learning_rate": 9.983619052372847e-07, "logits/chosen": -1.4994029998779297, "logits/rejected": -0.9353764653205872, "logps/chosen": -454.6905517578125, "logps/rejected": -354.83721923828125, "loss": 0.1724, "rewards/accuracies": 1.0, "rewards/chosen": -0.6514847278594971, "rewards/margins": 12.14840030670166, "rewards/rejected": -12.799884796142578, "step": 293 }, { "epoch": 0.5, "learning_rate": 9.983018306836599e-07, "logits/chosen": -1.3215970993041992, "logits/rejected": -1.484683632850647, "logps/chosen": -519.1898803710938, "logps/rejected": -226.5242919921875, "loss": 0.1852, "rewards/accuracies": 1.0, "rewards/chosen": 0.4192039668560028, "rewards/margins": 8.631196975708008, "rewards/rejected": -8.211993217468262, "step": 294 }, { "epoch": 0.5, "learning_rate": 9.98240676180854e-07, "logits/chosen": -2.4572155475616455, "logits/rejected": -2.3966965675354004, "logps/chosen": -40.4495849609375, "logps/rejected": -193.80703735351562, "loss": 0.1768, "rewards/accuracies": 1.0, "rewards/chosen": -0.9751855134963989, "rewards/margins": 9.661815643310547, "rewards/rejected": -10.637001037597656, "step": 295 }, { "epoch": 0.5, "learning_rate": 9.981784418614046e-07, "logits/chosen": -1.1194053888320923, "logits/rejected": -1.3947285413742065, "logps/chosen": -282.65728759765625, "logps/rejected": -197.75747680664062, "loss": 0.1737, "rewards/accuracies": 1.0, "rewards/chosen": -0.5431690216064453, "rewards/margins": 4.192901134490967, "rewards/rejected": -4.736070156097412, "step": 296 }, { "epoch": 0.5, "learning_rate": 9.981151278601899e-07, "logits/chosen": -1.8385300636291504, "logits/rejected": -0.9504812359809875, "logps/chosen": -119.73028564453125, "logps/rejected": -262.2828674316406, "loss": 0.1592, "rewards/accuracies": 1.0, "rewards/chosen": -1.2197556495666504, "rewards/margins": 6.548429489135742, "rewards/rejected": -7.768185615539551, "step": 297 }, { "epoch": 0.5, "learning_rate": 9.980507343144271e-07, "logits/chosen": -0.959502637386322, "logits/rejected": -0.6492790579795837, "logps/chosen": -598.463623046875, "logps/rejected": -408.7165222167969, "loss": 0.2004, "rewards/accuracies": 1.0, "rewards/chosen": -1.5735183954238892, "rewards/margins": 13.148619651794434, "rewards/rejected": -14.722137451171875, "step": 298 }, { "epoch": 0.5, "learning_rate": 9.979852613636743e-07, "logits/chosen": -1.758123755455017, "logits/rejected": -2.5972981452941895, "logps/chosen": -210.6790771484375, "logps/rejected": -183.95455932617188, "loss": 0.2032, "rewards/accuracies": 1.0, "rewards/chosen": -0.4298916161060333, "rewards/margins": 4.640817642211914, "rewards/rejected": -5.070709228515625, "step": 299 }, { "epoch": 0.51, "learning_rate": 9.979187091498283e-07, "logits/chosen": -1.9287923574447632, "logits/rejected": -2.3071420192718506, "logps/chosen": -224.43728637695312, "logps/rejected": -252.1556396484375, "loss": 0.1592, "rewards/accuracies": 1.0, "rewards/chosen": -0.8604675531387329, "rewards/margins": 4.382769584655762, "rewards/rejected": -5.243237018585205, "step": 300 }, { "epoch": 0.51, "learning_rate": 9.978510778171245e-07, "logits/chosen": -1.4522650241851807, "logits/rejected": -0.9249738454818726, "logps/chosen": -410.66632080078125, "logps/rejected": -477.5390625, "loss": 0.1934, "rewards/accuracies": 0.5, "rewards/chosen": -0.16993102431297302, "rewards/margins": 6.357221603393555, "rewards/rejected": -6.5271525382995605, "step": 301 }, { "epoch": 0.51, "learning_rate": 9.977823675121382e-07, "logits/chosen": -1.6546478271484375, "logits/rejected": -2.3277413845062256, "logps/chosen": -207.47552490234375, "logps/rejected": -181.27479553222656, "loss": 0.1846, "rewards/accuracies": 1.0, "rewards/chosen": -0.2778858244419098, "rewards/margins": 6.3818769454956055, "rewards/rejected": -6.659762382507324, "step": 302 }, { "epoch": 0.51, "learning_rate": 9.977125783837818e-07, "logits/chosen": -1.469759464263916, "logits/rejected": -2.041001319885254, "logps/chosen": -286.0638427734375, "logps/rejected": -141.27078247070312, "loss": 0.1763, "rewards/accuracies": 1.0, "rewards/chosen": 0.4046391546726227, "rewards/margins": 7.348243713378906, "rewards/rejected": -6.943604469299316, "step": 303 }, { "epoch": 0.51, "learning_rate": 9.97641710583307e-07, "logits/chosen": -1.4234567880630493, "logits/rejected": -1.6683655977249146, "logps/chosen": -362.9468078613281, "logps/rejected": -260.8007507324219, "loss": 0.1771, "rewards/accuracies": 1.0, "rewards/chosen": -1.304555892944336, "rewards/margins": 6.251507759094238, "rewards/rejected": -7.556063652038574, "step": 304 }, { "epoch": 0.51, "learning_rate": 9.975697642643022e-07, "logits/chosen": -1.9733335971832275, "logits/rejected": -1.0282042026519775, "logps/chosen": -142.15017700195312, "logps/rejected": -305.75054931640625, "loss": 0.1982, "rewards/accuracies": 1.0, "rewards/chosen": -0.6309067010879517, "rewards/margins": 5.164991855621338, "rewards/rejected": -5.7958984375, "step": 305 }, { "epoch": 0.52, "learning_rate": 9.97496739582694e-07, "logits/chosen": -0.8187223076820374, "logits/rejected": -1.733450174331665, "logps/chosen": -683.83056640625, "logps/rejected": -257.8089599609375, "loss": 0.2229, "rewards/accuracies": 1.0, "rewards/chosen": -0.4965972900390625, "rewards/margins": 11.01584243774414, "rewards/rejected": -11.512439727783203, "step": 306 }, { "epoch": 0.52, "learning_rate": 9.974226366967457e-07, "logits/chosen": -1.2671740055084229, "logits/rejected": -1.1369811296463013, "logps/chosen": -495.198974609375, "logps/rejected": -387.591064453125, "loss": 0.2071, "rewards/accuracies": 1.0, "rewards/chosen": -0.4078201353549957, "rewards/margins": 7.06597900390625, "rewards/rejected": -7.473799228668213, "step": 307 }, { "epoch": 0.52, "learning_rate": 9.973474557670574e-07, "logits/chosen": -1.4290441274642944, "logits/rejected": -2.188762903213501, "logps/chosen": -75.417724609375, "logps/rejected": -70.07405853271484, "loss": 0.176, "rewards/accuracies": 1.0, "rewards/chosen": -0.36977919936180115, "rewards/margins": 4.8193769454956055, "rewards/rejected": -5.1891560554504395, "step": 308 }, { "epoch": 0.52, "learning_rate": 9.972711969565658e-07, "logits/chosen": -0.476540207862854, "logits/rejected": -1.4027069807052612, "logps/chosen": -1183.3697509765625, "logps/rejected": -414.2918701171875, "loss": 0.1684, "rewards/accuracies": 1.0, "rewards/chosen": 0.6394989490509033, "rewards/margins": 6.497945308685303, "rewards/rejected": -5.8584465980529785, "step": 309 }, { "epoch": 0.52, "learning_rate": 9.971938604305434e-07, "logits/chosen": -1.002270221710205, "logits/rejected": -0.6929762363433838, "logps/chosen": -175.58682250976562, "logps/rejected": -347.9329833984375, "loss": 0.1936, "rewards/accuracies": 0.5, "rewards/chosen": 0.08384094387292862, "rewards/margins": 1.912644863128662, "rewards/rejected": -1.8288038969039917, "step": 310 }, { "epoch": 0.52, "learning_rate": 9.971154463565984e-07, "logits/chosen": -0.7919758558273315, "logits/rejected": -1.6887415647506714, "logps/chosen": -293.739990234375, "logps/rejected": -134.64825439453125, "loss": 0.163, "rewards/accuracies": 1.0, "rewards/chosen": -0.24404069781303406, "rewards/margins": 7.300686836242676, "rewards/rejected": -7.544727325439453, "step": 311 }, { "epoch": 0.53, "learning_rate": 9.97035954904675e-07, "logits/chosen": -1.3530901670455933, "logits/rejected": -1.727453589439392, "logps/chosen": -433.71282958984375, "logps/rejected": -299.36175537109375, "loss": 0.1831, "rewards/accuracies": 1.0, "rewards/chosen": -1.0365535020828247, "rewards/margins": 6.750691890716553, "rewards/rejected": -7.787245273590088, "step": 312 }, { "epoch": 0.53, "learning_rate": 9.969553862470508e-07, "logits/chosen": -0.9584515690803528, "logits/rejected": -0.9624962210655212, "logps/chosen": -32.61267852783203, "logps/rejected": -126.69062042236328, "loss": 0.1925, "rewards/accuracies": 1.0, "rewards/chosen": -0.5699091553688049, "rewards/margins": 6.840811729431152, "rewards/rejected": -7.4107208251953125, "step": 313 }, { "epoch": 0.53, "learning_rate": 9.968737405583395e-07, "logits/chosen": -2.1836318969726562, "logits/rejected": -1.6158103942871094, "logps/chosen": -131.1636962890625, "logps/rejected": -316.4326171875, "loss": 0.1549, "rewards/accuracies": 1.0, "rewards/chosen": -0.3803081512451172, "rewards/margins": 7.161952972412109, "rewards/rejected": -7.542261600494385, "step": 314 }, { "epoch": 0.53, "learning_rate": 9.967910180154888e-07, "logits/chosen": -0.774591326713562, "logits/rejected": -1.062556505203247, "logps/chosen": -555.9920043945312, "logps/rejected": -338.88055419921875, "loss": 0.2006, "rewards/accuracies": 1.0, "rewards/chosen": -0.0430755615234375, "rewards/margins": 7.494574069976807, "rewards/rejected": -7.537649631500244, "step": 315 }, { "epoch": 0.53, "learning_rate": 9.967072187977793e-07, "logits/chosen": -1.3581414222717285, "logits/rejected": -1.6791399717330933, "logps/chosen": -354.8208312988281, "logps/rejected": -287.4419860839844, "loss": 0.1415, "rewards/accuracies": 1.0, "rewards/chosen": 0.2476089596748352, "rewards/margins": 4.651924133300781, "rewards/rejected": -4.404314994812012, "step": 316 }, { "epoch": 0.53, "learning_rate": 9.96622343086826e-07, "logits/chosen": -1.4797134399414062, "logits/rejected": -1.6615569591522217, "logps/chosen": -264.7172546386719, "logps/rejected": -219.28221130371094, "loss": 0.1911, "rewards/accuracies": 1.0, "rewards/chosen": 0.6093296408653259, "rewards/margins": 10.906174659729004, "rewards/rejected": -10.296845436096191, "step": 317 }, { "epoch": 0.54, "learning_rate": 9.96536391066576e-07, "logits/chosen": -1.7635356187820435, "logits/rejected": -1.4284019470214844, "logps/chosen": -262.9644775390625, "logps/rejected": -328.27642822265625, "loss": 0.1773, "rewards/accuracies": 1.0, "rewards/chosen": 0.7439472675323486, "rewards/margins": 5.8976640701293945, "rewards/rejected": -5.153717041015625, "step": 318 }, { "epoch": 0.54, "learning_rate": 9.964493629233104e-07, "logits/chosen": -1.0009286403656006, "logits/rejected": -0.9932736158370972, "logps/chosen": -289.8218994140625, "logps/rejected": -227.1639404296875, "loss": 0.144, "rewards/accuracies": 1.0, "rewards/chosen": -1.4354541301727295, "rewards/margins": 8.911469459533691, "rewards/rejected": -10.346923828125, "step": 319 }, { "epoch": 0.54, "learning_rate": 9.963612588456412e-07, "logits/chosen": -2.250784397125244, "logits/rejected": -1.9884320497512817, "logps/chosen": -46.93716049194336, "logps/rejected": -284.6944274902344, "loss": 0.1901, "rewards/accuracies": 1.0, "rewards/chosen": -0.18621844053268433, "rewards/margins": 10.04560661315918, "rewards/rejected": -10.231825828552246, "step": 320 }, { "epoch": 0.54, "learning_rate": 9.962720790245126e-07, "logits/chosen": -1.6217896938323975, "logits/rejected": -1.0232822895050049, "logps/chosen": -193.26486206054688, "logps/rejected": -346.9532470703125, "loss": 0.1927, "rewards/accuracies": 1.0, "rewards/chosen": -0.14425279200077057, "rewards/margins": 2.2471230030059814, "rewards/rejected": -2.391375780105591, "step": 321 }, { "epoch": 0.54, "learning_rate": 9.96181823653201e-07, "logits/chosen": -1.9973036050796509, "logits/rejected": -2.215181827545166, "logps/chosen": -267.81097412109375, "logps/rejected": -295.62298583984375, "loss": 0.1495, "rewards/accuracies": 1.0, "rewards/chosen": -0.1725495457649231, "rewards/margins": 7.604098320007324, "rewards/rejected": -7.776648044586182, "step": 322 }, { "epoch": 0.54, "learning_rate": 9.96090492927313e-07, "logits/chosen": -1.6204432249069214, "logits/rejected": -1.2365188598632812, "logps/chosen": -46.48271942138672, "logps/rejected": -140.6979217529297, "loss": 0.1883, "rewards/accuracies": 1.0, "rewards/chosen": -0.3501051366329193, "rewards/margins": 7.698967456817627, "rewards/rejected": -8.049072265625, "step": 323 }, { "epoch": 0.55, "learning_rate": 9.959980870447852e-07, "logits/chosen": -0.840675950050354, "logits/rejected": -0.4675593972206116, "logps/chosen": -480.7996826171875, "logps/rejected": -337.56854248046875, "loss": 0.1888, "rewards/accuracies": 1.0, "rewards/chosen": -1.6151520013809204, "rewards/margins": 12.177379608154297, "rewards/rejected": -13.792531967163086, "step": 324 }, { "epoch": 0.55, "learning_rate": 9.959046062058862e-07, "logits/chosen": -1.4950480461120605, "logits/rejected": -1.5916494131088257, "logps/chosen": -64.40080261230469, "logps/rejected": -71.2299575805664, "loss": 0.2506, "rewards/accuracies": 1.0, "rewards/chosen": -0.16438627243041992, "rewards/margins": 4.567226886749268, "rewards/rejected": -4.7316131591796875, "step": 325 }, { "epoch": 0.55, "learning_rate": 9.958100506132126e-07, "logits/chosen": -0.7526825666427612, "logits/rejected": -1.9301047325134277, "logps/chosen": -560.9881591796875, "logps/rejected": -72.82947540283203, "loss": 0.1853, "rewards/accuracies": 1.0, "rewards/chosen": 0.4727003276348114, "rewards/margins": 5.001248359680176, "rewards/rejected": -4.528548240661621, "step": 326 }, { "epoch": 0.55, "learning_rate": 9.957144204716907e-07, "logits/chosen": -0.9634856581687927, "logits/rejected": -1.3579816818237305, "logps/chosen": -301.60650634765625, "logps/rejected": -186.40699768066406, "loss": 0.1644, "rewards/accuracies": 1.0, "rewards/chosen": -0.7037521600723267, "rewards/margins": 7.330875873565674, "rewards/rejected": -8.034627914428711, "step": 327 }, { "epoch": 0.55, "learning_rate": 9.956177159885764e-07, "logits/chosen": -1.8041788339614868, "logits/rejected": -2.096233367919922, "logps/chosen": -149.9138946533203, "logps/rejected": -199.65846252441406, "loss": 0.1742, "rewards/accuracies": 1.0, "rewards/chosen": 0.1912795603275299, "rewards/margins": 6.335195541381836, "rewards/rejected": -6.143916130065918, "step": 328 }, { "epoch": 0.55, "learning_rate": 9.955199373734528e-07, "logits/chosen": -2.210163116455078, "logits/rejected": -1.5051549673080444, "logps/chosen": -43.63890838623047, "logps/rejected": -231.96681213378906, "loss": 0.1834, "rewards/accuracies": 1.0, "rewards/chosen": -0.6588637828826904, "rewards/margins": 4.09450101852417, "rewards/rejected": -4.753364562988281, "step": 329 }, { "epoch": 0.56, "learning_rate": 9.954210848382317e-07, "logits/chosen": -1.917798399925232, "logits/rejected": -1.8297691345214844, "logps/chosen": -28.441646575927734, "logps/rejected": -159.19583129882812, "loss": 0.1769, "rewards/accuracies": 0.5, "rewards/chosen": -0.4995257258415222, "rewards/margins": 7.739633560180664, "rewards/rejected": -8.23915958404541, "step": 330 }, { "epoch": 0.56, "learning_rate": 9.953211585971522e-07, "logits/chosen": -1.6009342670440674, "logits/rejected": -1.1614232063293457, "logps/chosen": -143.366455078125, "logps/rejected": -187.1925048828125, "loss": 0.1742, "rewards/accuracies": 0.5, "rewards/chosen": -0.8375290036201477, "rewards/margins": 0.36481326818466187, "rewards/rejected": -1.2023422718048096, "step": 331 }, { "epoch": 0.56, "learning_rate": 9.952201588667803e-07, "logits/chosen": -1.2768915891647339, "logits/rejected": -2.192704916000366, "logps/chosen": -347.0742492675781, "logps/rejected": -107.85566711425781, "loss": 0.2237, "rewards/accuracies": 1.0, "rewards/chosen": -0.9857234954833984, "rewards/margins": 4.846242904663086, "rewards/rejected": -5.831965923309326, "step": 332 }, { "epoch": 0.56, "learning_rate": 9.951180858660089e-07, "logits/chosen": -1.8204140663146973, "logits/rejected": -1.5513927936553955, "logps/chosen": -409.3429260253906, "logps/rejected": -370.9957275390625, "loss": 0.2061, "rewards/accuracies": 0.5, "rewards/chosen": -0.23614689707756042, "rewards/margins": 0.9891689419746399, "rewards/rejected": -1.225315809249878, "step": 333 }, { "epoch": 0.56, "learning_rate": 9.95014939816056e-07, "logits/chosen": -0.7975092530250549, "logits/rejected": -1.1003104448318481, "logps/chosen": -433.71661376953125, "logps/rejected": -278.63995361328125, "loss": 0.1544, "rewards/accuracies": 1.0, "rewards/chosen": -1.7607848644256592, "rewards/margins": 9.997198104858398, "rewards/rejected": -11.75798225402832, "step": 334 }, { "epoch": 0.56, "learning_rate": 9.949107209404663e-07, "logits/chosen": -2.2883524894714355, "logits/rejected": -2.550262212753296, "logps/chosen": -122.60182189941406, "logps/rejected": -206.03915405273438, "loss": 0.2065, "rewards/accuracies": 0.5, "rewards/chosen": -1.2694628238677979, "rewards/margins": 6.3892903327941895, "rewards/rejected": -7.658753395080566, "step": 335 }, { "epoch": 0.57, "learning_rate": 9.948054294651088e-07, "logits/chosen": -1.41136634349823, "logits/rejected": -2.039381265640259, "logps/chosen": -242.87060546875, "logps/rejected": -118.76246643066406, "loss": 0.1629, "rewards/accuracies": 1.0, "rewards/chosen": -0.8472713232040405, "rewards/margins": 0.7751063704490662, "rewards/rejected": -1.6223777532577515, "step": 336 }, { "epoch": 0.57, "learning_rate": 9.946990656181779e-07, "logits/chosen": -1.0350347757339478, "logits/rejected": -0.8846843838691711, "logps/chosen": -52.651214599609375, "logps/rejected": -150.87210083007812, "loss": 0.2258, "rewards/accuracies": 0.5, "rewards/chosen": -1.9379146099090576, "rewards/margins": 7.0258870124816895, "rewards/rejected": -8.963801383972168, "step": 337 }, { "epoch": 0.57, "learning_rate": 9.945916296301912e-07, "logits/chosen": -2.120410442352295, "logits/rejected": -1.4654765129089355, "logps/chosen": -81.31796264648438, "logps/rejected": -198.11753845214844, "loss": 0.1973, "rewards/accuracies": 1.0, "rewards/chosen": -0.5237701535224915, "rewards/margins": 3.80403470993042, "rewards/rejected": -4.3278045654296875, "step": 338 }, { "epoch": 0.57, "learning_rate": 9.944831217339903e-07, "logits/chosen": -1.9489809274673462, "logits/rejected": -1.5405217409133911, "logps/chosen": -335.79010009765625, "logps/rejected": -635.9193115234375, "loss": 0.1915, "rewards/accuracies": 0.5, "rewards/chosen": -1.2987374067306519, "rewards/margins": 1.2891318798065186, "rewards/rejected": -2.587869167327881, "step": 339 }, { "epoch": 0.57, "learning_rate": 9.943735421647404e-07, "logits/chosen": -1.1065800189971924, "logits/rejected": -1.074052095413208, "logps/chosen": -33.51839828491211, "logps/rejected": -160.16390991210938, "loss": 0.2031, "rewards/accuracies": 1.0, "rewards/chosen": -1.1291143894195557, "rewards/margins": 8.39620304107666, "rewards/rejected": -9.525317192077637, "step": 340 }, { "epoch": 0.58, "learning_rate": 9.94262891159928e-07, "logits/chosen": -1.4195408821105957, "logits/rejected": -1.2427754402160645, "logps/chosen": -230.10208129882812, "logps/rejected": -262.679931640625, "loss": 0.2108, "rewards/accuracies": 1.0, "rewards/chosen": -1.9072086811065674, "rewards/margins": 10.54747200012207, "rewards/rejected": -12.454681396484375, "step": 341 }, { "epoch": 0.58, "learning_rate": 9.941511689593633e-07, "logits/chosen": -1.1351226568222046, "logits/rejected": -1.9858900308609009, "logps/chosen": -298.932861328125, "logps/rejected": -142.6973876953125, "loss": 0.1527, "rewards/accuracies": 1.0, "rewards/chosen": 1.044623613357544, "rewards/margins": 9.147809982299805, "rewards/rejected": -8.10318660736084, "step": 342 }, { "epoch": 0.58, "learning_rate": 9.940383758051767e-07, "logits/chosen": -1.8453896045684814, "logits/rejected": -1.5388239622116089, "logps/chosen": -197.114990234375, "logps/rejected": -312.2078552246094, "loss": 0.2152, "rewards/accuracies": 1.0, "rewards/chosen": -0.8707555532455444, "rewards/margins": 12.464103698730469, "rewards/rejected": -13.334858894348145, "step": 343 }, { "epoch": 0.58, "learning_rate": 9.939245119418206e-07, "logits/chosen": -1.4721529483795166, "logits/rejected": -1.445346713066101, "logps/chosen": -178.68853759765625, "logps/rejected": -172.24676513671875, "loss": 0.1945, "rewards/accuracies": 0.5, "rewards/chosen": -1.9977798461914062, "rewards/margins": 2.5249760150909424, "rewards/rejected": -4.522756099700928, "step": 344 }, { "epoch": 0.58, "learning_rate": 9.938095776160674e-07, "logits/chosen": -0.6039644479751587, "logits/rejected": -0.8809584975242615, "logps/chosen": -337.0422058105469, "logps/rejected": -204.4918212890625, "loss": 0.1941, "rewards/accuracies": 1.0, "rewards/chosen": -0.2001136839389801, "rewards/margins": 9.865687370300293, "rewards/rejected": -10.065800666809082, "step": 345 }, { "epoch": 0.58, "learning_rate": 9.936935730770093e-07, "logits/chosen": -1.719544768333435, "logits/rejected": -1.678938865661621, "logps/chosen": -548.88916015625, "logps/rejected": -351.3948669433594, "loss": 0.166, "rewards/accuracies": 0.5, "rewards/chosen": -0.9289765357971191, "rewards/margins": 1.596639633178711, "rewards/rejected": -2.52561616897583, "step": 346 }, { "epoch": 0.59, "learning_rate": 9.935764985760582e-07, "logits/chosen": -1.567973256111145, "logits/rejected": -1.7810003757476807, "logps/chosen": -64.80793762207031, "logps/rejected": -145.93380737304688, "loss": 0.1616, "rewards/accuracies": 1.0, "rewards/chosen": -0.15498466789722443, "rewards/margins": 4.86065149307251, "rewards/rejected": -5.015635967254639, "step": 347 }, { "epoch": 0.59, "learning_rate": 9.934583543669453e-07, "logits/chosen": -2.2390925884246826, "logits/rejected": -1.3963329792022705, "logps/chosen": -86.66363525390625, "logps/rejected": -205.68148803710938, "loss": 0.1828, "rewards/accuracies": 0.5, "rewards/chosen": -0.016691789031028748, "rewards/margins": 5.741617202758789, "rewards/rejected": -5.7583088874816895, "step": 348 }, { "epoch": 0.59, "learning_rate": 9.933391407057195e-07, "logits/chosen": -1.3134796619415283, "logits/rejected": -1.861419439315796, "logps/chosen": -224.83663940429688, "logps/rejected": -265.0384521484375, "loss": 0.1408, "rewards/accuracies": 1.0, "rewards/chosen": 0.34677210450172424, "rewards/margins": 13.738977432250977, "rewards/rejected": -13.392204284667969, "step": 349 }, { "epoch": 0.59, "learning_rate": 9.932188578507474e-07, "logits/chosen": -1.9301025867462158, "logits/rejected": -1.5889010429382324, "logps/chosen": -50.467098236083984, "logps/rejected": -331.0249938964844, "loss": 0.1679, "rewards/accuracies": 1.0, "rewards/chosen": -0.6283357739448547, "rewards/margins": 14.762405395507812, "rewards/rejected": -15.390741348266602, "step": 350 }, { "epoch": 0.59, "learning_rate": 9.930975060627136e-07, "logits/chosen": -2.602487325668335, "logits/rejected": -1.6359155178070068, "logps/chosen": -654.8057861328125, "logps/rejected": -385.56781005859375, "loss": 0.2003, "rewards/accuracies": 1.0, "rewards/chosen": -0.9707611203193665, "rewards/margins": 4.78594970703125, "rewards/rejected": -5.756711006164551, "step": 351 }, { "epoch": 0.59, "learning_rate": 9.929750856046187e-07, "logits/chosen": -0.7864140868186951, "logits/rejected": -1.5045098066329956, "logps/chosen": -400.8957214355469, "logps/rejected": -302.7469787597656, "loss": 0.1987, "rewards/accuracies": 0.5, "rewards/chosen": 0.44732972979545593, "rewards/margins": 1.3191795349121094, "rewards/rejected": -0.8718498945236206, "step": 352 }, { "epoch": 0.6, "learning_rate": 9.928515967417792e-07, "logits/chosen": -0.948026716709137, "logits/rejected": -1.3471962213516235, "logps/chosen": -238.0382843017578, "logps/rejected": -111.223876953125, "loss": 0.1799, "rewards/accuracies": 1.0, "rewards/chosen": 0.48494261503219604, "rewards/margins": 7.656557083129883, "rewards/rejected": -7.171614170074463, "step": 353 }, { "epoch": 0.6, "learning_rate": 9.927270397418278e-07, "logits/chosen": -1.5953532457351685, "logits/rejected": -1.2294915914535522, "logps/chosen": -99.09436798095703, "logps/rejected": -213.32522583007812, "loss": 0.1767, "rewards/accuracies": 0.5, "rewards/chosen": -0.6239818930625916, "rewards/margins": 4.957241058349609, "rewards/rejected": -5.581223011016846, "step": 354 }, { "epoch": 0.6, "learning_rate": 9.92601414874712e-07, "logits/chosen": -0.9315654635429382, "logits/rejected": -1.4781970977783203, "logps/chosen": -360.9663391113281, "logps/rejected": -193.1742401123047, "loss": 0.1714, "rewards/accuracies": 0.5, "rewards/chosen": -0.39375075697898865, "rewards/margins": 6.650259971618652, "rewards/rejected": -7.044010639190674, "step": 355 }, { "epoch": 0.6, "learning_rate": 9.924747224126931e-07, "logits/chosen": -1.1415091753005981, "logits/rejected": -1.5295865535736084, "logps/chosen": -136.76544189453125, "logps/rejected": -136.62179565429688, "loss": 0.1877, "rewards/accuracies": 1.0, "rewards/chosen": -0.1295880377292633, "rewards/margins": 4.632694244384766, "rewards/rejected": -4.762282371520996, "step": 356 }, { "epoch": 0.6, "learning_rate": 9.923469626303464e-07, "logits/chosen": -2.146008014678955, "logits/rejected": -1.7241287231445312, "logps/chosen": -200.16453552246094, "logps/rejected": -284.6964416503906, "loss": 0.1901, "rewards/accuracies": 1.0, "rewards/chosen": -0.12157487869262695, "rewards/margins": 12.299015045166016, "rewards/rejected": -12.420589447021484, "step": 357 }, { "epoch": 0.6, "learning_rate": 9.922181358045606e-07, "logits/chosen": -2.4690566062927246, "logits/rejected": -2.171621084213257, "logps/chosen": -12.845178604125977, "logps/rejected": -186.6353302001953, "loss": 0.1612, "rewards/accuracies": 1.0, "rewards/chosen": -0.3692449927330017, "rewards/margins": 6.611391544342041, "rewards/rejected": -6.980636119842529, "step": 358 }, { "epoch": 0.61, "learning_rate": 9.92088242214537e-07, "logits/chosen": -1.0186376571655273, "logits/rejected": -1.3315547704696655, "logps/chosen": -394.67236328125, "logps/rejected": -266.37738037109375, "loss": 0.1621, "rewards/accuracies": 1.0, "rewards/chosen": 0.5413925647735596, "rewards/margins": 7.737855434417725, "rewards/rejected": -7.196463108062744, "step": 359 }, { "epoch": 0.61, "learning_rate": 9.919572821417885e-07, "logits/chosen": -0.9612762928009033, "logits/rejected": -1.0460320711135864, "logps/chosen": -69.64960479736328, "logps/rejected": -178.71676635742188, "loss": 0.1798, "rewards/accuracies": 1.0, "rewards/chosen": -0.11741314828395844, "rewards/margins": 11.80695629119873, "rewards/rejected": -11.924369812011719, "step": 360 }, { "epoch": 0.61, "learning_rate": 9.918252558701396e-07, "logits/chosen": -1.4163517951965332, "logits/rejected": -1.2571159601211548, "logps/chosen": -48.93925857543945, "logps/rejected": -165.11422729492188, "loss": 0.1627, "rewards/accuracies": 1.0, "rewards/chosen": -0.40845078229904175, "rewards/margins": 8.386886596679688, "rewards/rejected": -8.795337677001953, "step": 361 }, { "epoch": 0.61, "learning_rate": 9.91692163685725e-07, "logits/chosen": -1.6064996719360352, "logits/rejected": -1.1707127094268799, "logps/chosen": -13.273357391357422, "logps/rejected": -70.61612701416016, "loss": 0.1802, "rewards/accuracies": 1.0, "rewards/chosen": 0.01451602578163147, "rewards/margins": 3.4827561378479004, "rewards/rejected": -3.468240261077881, "step": 362 }, { "epoch": 0.61, "learning_rate": 9.915580058769908e-07, "logits/chosen": -1.8302160501480103, "logits/rejected": -1.5408596992492676, "logps/chosen": -49.54063034057617, "logps/rejected": -165.20664978027344, "loss": 0.1484, "rewards/accuracies": 1.0, "rewards/chosen": -0.45899465680122375, "rewards/margins": 0.5188831686973572, "rewards/rejected": -0.9778778553009033, "step": 363 }, { "epoch": 0.61, "learning_rate": 9.914227827346908e-07, "logits/chosen": -1.4065834283828735, "logits/rejected": -1.661447525024414, "logps/chosen": -73.02536010742188, "logps/rejected": -106.48794555664062, "loss": 0.1977, "rewards/accuracies": 1.0, "rewards/chosen": -0.30404195189476013, "rewards/margins": 2.008908987045288, "rewards/rejected": -2.31295108795166, "step": 364 }, { "epoch": 0.62, "learning_rate": 9.912864945518893e-07, "logits/chosen": -1.741304636001587, "logits/rejected": -2.134251356124878, "logps/chosen": -274.7901916503906, "logps/rejected": -197.7181396484375, "loss": 0.2159, "rewards/accuracies": 1.0, "rewards/chosen": -0.4863889813423157, "rewards/margins": 1.1557424068450928, "rewards/rejected": -1.6421314477920532, "step": 365 }, { "epoch": 0.62, "learning_rate": 9.911491416239577e-07, "logits/chosen": -0.297260582447052, "logits/rejected": -0.43519172072410583, "logps/chosen": -465.78167724609375, "logps/rejected": -288.87237548828125, "loss": 0.1746, "rewards/accuracies": 1.0, "rewards/chosen": -1.9665650129318237, "rewards/margins": 12.498331069946289, "rewards/rejected": -14.464896202087402, "step": 366 }, { "epoch": 0.62, "learning_rate": 9.910107242485756e-07, "logits/chosen": -2.1462013721466064, "logits/rejected": -1.7071665525436401, "logps/chosen": -158.352783203125, "logps/rejected": -259.8519287109375, "loss": 0.1644, "rewards/accuracies": 1.0, "rewards/chosen": -0.8491896390914917, "rewards/margins": 12.058333396911621, "rewards/rejected": -12.907523155212402, "step": 367 }, { "epoch": 0.62, "learning_rate": 9.908712427257291e-07, "logits/chosen": -1.6921484470367432, "logits/rejected": -1.673211932182312, "logps/chosen": -38.84759521484375, "logps/rejected": -308.70318603515625, "loss": 0.1658, "rewards/accuracies": 1.0, "rewards/chosen": -0.669171929359436, "rewards/margins": 10.507699966430664, "rewards/rejected": -11.176872253417969, "step": 368 }, { "epoch": 0.62, "learning_rate": 9.907306973577109e-07, "logits/chosen": -1.1072365045547485, "logits/rejected": -1.6484124660491943, "logps/chosen": -140.98245239257812, "logps/rejected": -191.68093872070312, "loss": 0.1682, "rewards/accuracies": 1.0, "rewards/chosen": -0.5839079022407532, "rewards/margins": 9.572250366210938, "rewards/rejected": -10.156158447265625, "step": 369 }, { "epoch": 0.62, "learning_rate": 9.905890884491194e-07, "logits/chosen": -2.0668201446533203, "logits/rejected": -1.8850473165512085, "logps/chosen": -19.376293182373047, "logps/rejected": -101.66358184814453, "loss": 0.1931, "rewards/accuracies": 1.0, "rewards/chosen": -0.12364569306373596, "rewards/margins": 4.169041633605957, "rewards/rejected": -4.29268741607666, "step": 370 }, { "epoch": 0.63, "learning_rate": 9.904464163068577e-07, "logits/chosen": -1.7173949480056763, "logits/rejected": -2.278022289276123, "logps/chosen": -288.80584716796875, "logps/rejected": -257.0757141113281, "loss": 0.2352, "rewards/accuracies": 1.0, "rewards/chosen": -0.6069202423095703, "rewards/margins": 8.09630298614502, "rewards/rejected": -8.70322322845459, "step": 371 }, { "epoch": 0.63, "learning_rate": 9.903026812401332e-07, "logits/chosen": -1.8909093141555786, "logits/rejected": -1.652140498161316, "logps/chosen": -170.0011444091797, "logps/rejected": -244.4969482421875, "loss": 0.172, "rewards/accuracies": 1.0, "rewards/chosen": -0.49996358156204224, "rewards/margins": 10.814790725708008, "rewards/rejected": -11.314754486083984, "step": 372 }, { "epoch": 0.63, "learning_rate": 9.90157883560457e-07, "logits/chosen": -0.7032025456428528, "logits/rejected": -0.7244059443473816, "logps/chosen": -17.019268035888672, "logps/rejected": -98.73320770263672, "loss": 0.2018, "rewards/accuracies": 1.0, "rewards/chosen": 0.16112473607063293, "rewards/margins": 7.4662017822265625, "rewards/rejected": -7.305077075958252, "step": 373 }, { "epoch": 0.63, "learning_rate": 9.900120235816433e-07, "logits/chosen": -1.4089502096176147, "logits/rejected": -1.385013461112976, "logps/chosen": -298.27459716796875, "logps/rejected": -263.1510314941406, "loss": 0.1913, "rewards/accuracies": 1.0, "rewards/chosen": -1.2739990949630737, "rewards/margins": 6.35389518737793, "rewards/rejected": -7.627894401550293, "step": 374 }, { "epoch": 0.63, "learning_rate": 9.898651016198085e-07, "logits/chosen": -1.6036759614944458, "logits/rejected": -2.3823535442352295, "logps/chosen": -85.63179016113281, "logps/rejected": -93.74755859375, "loss": 0.1946, "rewards/accuracies": 1.0, "rewards/chosen": 0.04974517598748207, "rewards/margins": 4.797137260437012, "rewards/rejected": -4.747392177581787, "step": 375 }, { "epoch": 0.63, "learning_rate": 9.897171179933706e-07, "logits/chosen": -1.0535942316055298, "logits/rejected": -2.1571121215820312, "logps/chosen": -717.8565673828125, "logps/rejected": -162.77194213867188, "loss": 0.2117, "rewards/accuracies": 0.5, "rewards/chosen": -0.7600006461143494, "rewards/margins": 0.8016586899757385, "rewards/rejected": -1.561659336090088, "step": 376 }, { "epoch": 0.64, "learning_rate": 9.895680730230483e-07, "logits/chosen": -1.6556205749511719, "logits/rejected": -1.703450083732605, "logps/chosen": -159.5177764892578, "logps/rejected": -124.55549621582031, "loss": 0.2291, "rewards/accuracies": 1.0, "rewards/chosen": -0.6513794660568237, "rewards/margins": 5.128384113311768, "rewards/rejected": -5.779763221740723, "step": 377 }, { "epoch": 0.64, "learning_rate": 9.894179670318606e-07, "logits/chosen": -1.3000423908233643, "logits/rejected": -2.175767183303833, "logps/chosen": -206.19947814941406, "logps/rejected": -79.72200012207031, "loss": 0.1892, "rewards/accuracies": 1.0, "rewards/chosen": -0.14822006225585938, "rewards/margins": 1.46049165725708, "rewards/rejected": -1.608711838722229, "step": 378 }, { "epoch": 0.64, "learning_rate": 9.892668003451264e-07, "logits/chosen": -1.3806241750717163, "logits/rejected": -2.067390203475952, "logps/chosen": -281.26068115234375, "logps/rejected": -130.72952270507812, "loss": 0.1605, "rewards/accuracies": 1.0, "rewards/chosen": 0.3341960906982422, "rewards/margins": 7.966523170471191, "rewards/rejected": -7.632327079772949, "step": 379 }, { "epoch": 0.64, "learning_rate": 9.891145732904626e-07, "logits/chosen": -2.4080684185028076, "logits/rejected": -2.4052534103393555, "logps/chosen": -23.60405731201172, "logps/rejected": -156.3341522216797, "loss": 0.1842, "rewards/accuracies": 1.0, "rewards/chosen": -0.3208608627319336, "rewards/margins": 9.061665534973145, "rewards/rejected": -9.382526397705078, "step": 380 }, { "epoch": 0.64, "learning_rate": 9.889612861977853e-07, "logits/chosen": -1.899614691734314, "logits/rejected": -1.3512424230575562, "logps/chosen": -52.9254035949707, "logps/rejected": -142.5703125, "loss": 0.195, "rewards/accuracies": 1.0, "rewards/chosen": 0.25636160373687744, "rewards/margins": 4.048177242279053, "rewards/rejected": -3.791815757751465, "step": 381 }, { "epoch": 0.64, "learning_rate": 9.888069393993068e-07, "logits/chosen": -1.6391324996948242, "logits/rejected": -2.6511809825897217, "logps/chosen": -357.7758483886719, "logps/rejected": -136.27767944335938, "loss": 0.1648, "rewards/accuracies": 1.0, "rewards/chosen": 0.7596569061279297, "rewards/margins": 3.358733892440796, "rewards/rejected": -2.599076986312866, "step": 382 }, { "epoch": 0.65, "learning_rate": 9.886515332295368e-07, "logits/chosen": -1.600348711013794, "logits/rejected": -2.2982192039489746, "logps/chosen": -228.30441284179688, "logps/rejected": -234.28158569335938, "loss": 0.1545, "rewards/accuracies": 1.0, "rewards/chosen": 0.35866472125053406, "rewards/margins": 5.956897735595703, "rewards/rejected": -5.598233222961426, "step": 383 }, { "epoch": 0.65, "learning_rate": 9.88495068025281e-07, "logits/chosen": -1.3065029382705688, "logits/rejected": -1.0549356937408447, "logps/chosen": -226.62106323242188, "logps/rejected": -404.517822265625, "loss": 0.2052, "rewards/accuracies": 1.0, "rewards/chosen": -1.2328369617462158, "rewards/margins": 17.27766227722168, "rewards/rejected": -18.510498046875, "step": 384 }, { "epoch": 0.65, "learning_rate": 9.883375441256397e-07, "logits/chosen": -2.497364044189453, "logits/rejected": -2.2807140350341797, "logps/chosen": -79.43330383300781, "logps/rejected": -1763.1298828125, "loss": 0.2064, "rewards/accuracies": 1.0, "rewards/chosen": -0.6368292570114136, "rewards/margins": 4.40083646774292, "rewards/rejected": -5.037665843963623, "step": 385 }, { "epoch": 0.65, "learning_rate": 9.88178961872008e-07, "logits/chosen": -2.6438419818878174, "logits/rejected": -1.569690227508545, "logps/chosen": -492.58660888671875, "logps/rejected": -267.66607666015625, "loss": 0.1912, "rewards/accuracies": 1.0, "rewards/chosen": -1.091853380203247, "rewards/margins": 10.429098129272461, "rewards/rejected": -11.520952224731445, "step": 386 }, { "epoch": 0.65, "learning_rate": 9.880193216080748e-07, "logits/chosen": -1.5667064189910889, "logits/rejected": -0.8056033253669739, "logps/chosen": -272.8934020996094, "logps/rejected": -404.6851806640625, "loss": 0.1471, "rewards/accuracies": 1.0, "rewards/chosen": -0.8850148916244507, "rewards/margins": 8.984339714050293, "rewards/rejected": -9.869355201721191, "step": 387 }, { "epoch": 0.65, "learning_rate": 9.878586236798221e-07, "logits/chosen": -1.8617156744003296, "logits/rejected": -1.3088810443878174, "logps/chosen": -240.18605041503906, "logps/rejected": -271.2889099121094, "loss": 0.1661, "rewards/accuracies": 1.0, "rewards/chosen": -0.44331783056259155, "rewards/margins": 11.63255500793457, "rewards/rejected": -12.075872421264648, "step": 388 }, { "epoch": 0.66, "learning_rate": 9.876968684355238e-07, "logits/chosen": -0.60749751329422, "logits/rejected": -0.9327036738395691, "logps/chosen": -248.83444213867188, "logps/rejected": -199.20504760742188, "loss": 0.179, "rewards/accuracies": 1.0, "rewards/chosen": -0.9339897632598877, "rewards/margins": 9.368557929992676, "rewards/rejected": -10.302547454833984, "step": 389 }, { "epoch": 0.66, "learning_rate": 9.875340562257452e-07, "logits/chosen": -1.6634929180145264, "logits/rejected": -1.8427734375, "logps/chosen": -225.20516967773438, "logps/rejected": -224.0525665283203, "loss": 0.159, "rewards/accuracies": 1.0, "rewards/chosen": 0.1540244221687317, "rewards/margins": 4.2535014152526855, "rewards/rejected": -4.099477291107178, "step": 390 }, { "epoch": 0.66, "learning_rate": 9.87370187403343e-07, "logits/chosen": -1.5186893939971924, "logits/rejected": -1.5003349781036377, "logps/chosen": -355.7366638183594, "logps/rejected": -268.9139404296875, "loss": 0.1805, "rewards/accuracies": 1.0, "rewards/chosen": -1.4280953407287598, "rewards/margins": 10.353462219238281, "rewards/rejected": -11.781557083129883, "step": 391 }, { "epoch": 0.66, "learning_rate": 9.872052623234631e-07, "logits/chosen": -1.2197469472885132, "logits/rejected": -2.3034727573394775, "logps/chosen": -305.0716552734375, "logps/rejected": -119.30422973632812, "loss": 0.1617, "rewards/accuracies": 1.0, "rewards/chosen": -0.9030243158340454, "rewards/margins": 6.021801471710205, "rewards/rejected": -6.924825668334961, "step": 392 }, { "epoch": 0.66, "learning_rate": 9.870392813435408e-07, "logits/chosen": -1.6853370666503906, "logits/rejected": -1.8906601667404175, "logps/chosen": -54.065086364746094, "logps/rejected": -109.49345397949219, "loss": 0.183, "rewards/accuracies": 1.0, "rewards/chosen": -1.0523476600646973, "rewards/margins": 4.095767021179199, "rewards/rejected": -5.1481146812438965, "step": 393 }, { "epoch": 0.66, "learning_rate": 9.868722448233003e-07, "logits/chosen": -1.3388574123382568, "logits/rejected": -1.6647964715957642, "logps/chosen": -239.39515686035156, "logps/rejected": -119.59913635253906, "loss": 0.1716, "rewards/accuracies": 1.0, "rewards/chosen": -1.1848201751708984, "rewards/margins": 4.421016693115234, "rewards/rejected": -5.605837345123291, "step": 394 }, { "epoch": 0.67, "learning_rate": 9.867041531247524e-07, "logits/chosen": -1.7558951377868652, "logits/rejected": -1.7084178924560547, "logps/chosen": -49.69265365600586, "logps/rejected": -83.90230560302734, "loss": 0.2227, "rewards/accuracies": 0.5, "rewards/chosen": -0.1897386610507965, "rewards/margins": 3.7224905490875244, "rewards/rejected": -3.912229061126709, "step": 395 }, { "epoch": 0.67, "learning_rate": 9.86535006612196e-07, "logits/chosen": -1.0474004745483398, "logits/rejected": -0.7698359489440918, "logps/chosen": -255.73220825195312, "logps/rejected": -224.53810119628906, "loss": 0.129, "rewards/accuracies": 0.5, "rewards/chosen": -3.990817070007324, "rewards/margins": 3.028542995452881, "rewards/rejected": -7.019360065460205, "step": 396 }, { "epoch": 0.67, "learning_rate": 9.86364805652215e-07, "logits/chosen": -1.4001511335372925, "logits/rejected": -0.2056565284729004, "logps/chosen": -338.36224365234375, "logps/rejected": -943.743408203125, "loss": 0.1818, "rewards/accuracies": 1.0, "rewards/chosen": -0.8232328295707703, "rewards/margins": 8.953399658203125, "rewards/rejected": -9.776632308959961, "step": 397 }, { "epoch": 0.67, "learning_rate": 9.861935506136793e-07, "logits/chosen": -2.1997809410095215, "logits/rejected": -2.1212539672851562, "logps/chosen": -115.73619079589844, "logps/rejected": -155.65628051757812, "loss": 0.1635, "rewards/accuracies": 1.0, "rewards/chosen": -1.3197250366210938, "rewards/margins": 5.0713791847229, "rewards/rejected": -6.391103744506836, "step": 398 }, { "epoch": 0.67, "learning_rate": 9.860212418677425e-07, "logits/chosen": -1.3245618343353271, "logits/rejected": -1.5838465690612793, "logps/chosen": -192.42564392089844, "logps/rejected": -121.29484558105469, "loss": 0.1951, "rewards/accuracies": 1.0, "rewards/chosen": 0.7826123237609863, "rewards/margins": 6.154439926147461, "rewards/rejected": -5.371828079223633, "step": 399 }, { "epoch": 0.67, "learning_rate": 9.858478797878428e-07, "logits/chosen": -1.1259046792984009, "logits/rejected": -1.549862265586853, "logps/chosen": -599.236328125, "logps/rejected": -300.1288757324219, "loss": 0.1937, "rewards/accuracies": 1.0, "rewards/chosen": 0.30740663409233093, "rewards/margins": 7.251522064208984, "rewards/rejected": -6.94411563873291, "step": 400 }, { "epoch": 0.68, "learning_rate": 9.856734647497004e-07, "logits/chosen": -1.8423173427581787, "logits/rejected": -1.7820255756378174, "logps/chosen": -338.8928527832031, "logps/rejected": -427.98590087890625, "loss": 0.1951, "rewards/accuracies": 0.5, "rewards/chosen": -0.7935646176338196, "rewards/margins": 1.3270835876464844, "rewards/rejected": -2.120648145675659, "step": 401 }, { "epoch": 0.68, "learning_rate": 9.854979971313182e-07, "logits/chosen": -1.5276292562484741, "logits/rejected": -1.362746000289917, "logps/chosen": -153.15682983398438, "logps/rejected": -178.7244873046875, "loss": 0.1921, "rewards/accuracies": 1.0, "rewards/chosen": -1.658955693244934, "rewards/margins": 6.521320343017578, "rewards/rejected": -8.180275917053223, "step": 402 }, { "epoch": 0.68, "learning_rate": 9.853214773129795e-07, "logits/chosen": -1.574330449104309, "logits/rejected": -1.6447815895080566, "logps/chosen": -203.70518493652344, "logps/rejected": -186.30174255371094, "loss": 0.1799, "rewards/accuracies": 1.0, "rewards/chosen": -0.5951843857765198, "rewards/margins": 10.695902824401855, "rewards/rejected": -11.29108715057373, "step": 403 }, { "epoch": 0.68, "learning_rate": 9.851439056772488e-07, "logits/chosen": -2.2148728370666504, "logits/rejected": -0.5473410487174988, "logps/chosen": -114.89361572265625, "logps/rejected": -270.749755859375, "loss": 0.1814, "rewards/accuracies": 1.0, "rewards/chosen": -0.34365734457969666, "rewards/margins": 10.512922286987305, "rewards/rejected": -10.856579780578613, "step": 404 }, { "epoch": 0.68, "learning_rate": 9.8496528260897e-07, "logits/chosen": -1.523105502128601, "logits/rejected": -0.9805685877799988, "logps/chosen": -263.91705322265625, "logps/rejected": -362.019287109375, "loss": 0.1232, "rewards/accuracies": 1.0, "rewards/chosen": -1.9856665134429932, "rewards/margins": 13.921882629394531, "rewards/rejected": -15.907548904418945, "step": 405 }, { "epoch": 0.68, "learning_rate": 9.847856084952652e-07, "logits/chosen": -1.4170777797698975, "logits/rejected": -1.4437755346298218, "logps/chosen": -71.64923858642578, "logps/rejected": -111.52011108398438, "loss": 0.1337, "rewards/accuracies": 1.0, "rewards/chosen": -0.24141913652420044, "rewards/margins": 1.8372013568878174, "rewards/rejected": -2.078620433807373, "step": 406 }, { "epoch": 0.69, "learning_rate": 9.846048837255353e-07, "logits/chosen": -1.8761231899261475, "logits/rejected": -1.4860190153121948, "logps/chosen": -321.606689453125, "logps/rejected": -248.3543701171875, "loss": 0.1918, "rewards/accuracies": 1.0, "rewards/chosen": -1.1618309020996094, "rewards/margins": 8.921380996704102, "rewards/rejected": -10.083211898803711, "step": 407 }, { "epoch": 0.69, "learning_rate": 9.84423108691457e-07, "logits/chosen": -1.91605544090271, "logits/rejected": -1.6970082521438599, "logps/chosen": -59.132293701171875, "logps/rejected": -167.10081481933594, "loss": 0.2158, "rewards/accuracies": 1.0, "rewards/chosen": -1.300628423690796, "rewards/margins": 9.486973762512207, "rewards/rejected": -10.787602424621582, "step": 408 }, { "epoch": 0.69, "learning_rate": 9.842402837869842e-07, "logits/chosen": -0.6720188856124878, "logits/rejected": -1.0757673978805542, "logps/chosen": -532.5537109375, "logps/rejected": -276.9455261230469, "loss": 0.1982, "rewards/accuracies": 1.0, "rewards/chosen": -0.42199867963790894, "rewards/margins": 12.428057670593262, "rewards/rejected": -12.850056648254395, "step": 409 }, { "epoch": 0.69, "learning_rate": 9.84056409408346e-07, "logits/chosen": -1.7832905054092407, "logits/rejected": -2.1222946643829346, "logps/chosen": -311.7086181640625, "logps/rejected": -320.260009765625, "loss": 0.2174, "rewards/accuracies": 1.0, "rewards/chosen": -1.5798829793930054, "rewards/margins": 5.759591102600098, "rewards/rejected": -7.339474201202393, "step": 410 }, { "epoch": 0.69, "learning_rate": 9.838714859540458e-07, "logits/chosen": -1.6252554655075073, "logits/rejected": -2.4704833030700684, "logps/chosen": -273.0162048339844, "logps/rejected": -206.291015625, "loss": 0.1782, "rewards/accuracies": 1.0, "rewards/chosen": -0.6414878964424133, "rewards/margins": 7.542596817016602, "rewards/rejected": -8.18408489227295, "step": 411 }, { "epoch": 0.69, "learning_rate": 9.836855138248602e-07, "logits/chosen": -1.5945271253585815, "logits/rejected": -1.039263129234314, "logps/chosen": -213.08566284179688, "logps/rejected": -276.2701416015625, "loss": 0.2015, "rewards/accuracies": 0.5, "rewards/chosen": -1.5801552534103394, "rewards/margins": 4.361068248748779, "rewards/rejected": -5.941223621368408, "step": 412 }, { "epoch": 0.7, "learning_rate": 9.834984934238397e-07, "logits/chosen": -2.0274336338043213, "logits/rejected": -1.1639341115951538, "logps/chosen": -224.14974975585938, "logps/rejected": -492.0933532714844, "loss": 0.1541, "rewards/accuracies": 0.0, "rewards/chosen": -1.5109009742736816, "rewards/margins": -0.4450409412384033, "rewards/rejected": -1.0658600330352783, "step": 413 }, { "epoch": 0.7, "learning_rate": 9.833104251563055e-07, "logits/chosen": -1.1577857732772827, "logits/rejected": -1.7103009223937988, "logps/chosen": -294.07470703125, "logps/rejected": -189.33216857910156, "loss": 0.1796, "rewards/accuracies": 1.0, "rewards/chosen": -1.6231157779693604, "rewards/margins": 8.794317245483398, "rewards/rejected": -10.41743278503418, "step": 414 }, { "epoch": 0.7, "learning_rate": 9.831213094298504e-07, "logits/chosen": -2.4059667587280273, "logits/rejected": -2.134012222290039, "logps/chosen": -89.21797180175781, "logps/rejected": -205.1034393310547, "loss": 0.2001, "rewards/accuracies": 1.0, "rewards/chosen": -0.15212784707546234, "rewards/margins": 1.837868571281433, "rewards/rejected": -1.9899964332580566, "step": 415 }, { "epoch": 0.7, "learning_rate": 9.829311466543372e-07, "logits/chosen": -0.7545611262321472, "logits/rejected": -2.312044382095337, "logps/chosen": -298.3918151855469, "logps/rejected": -100.84245300292969, "loss": 0.1934, "rewards/accuracies": 1.0, "rewards/chosen": -1.2649837732315063, "rewards/margins": 4.35391902923584, "rewards/rejected": -5.618902683258057, "step": 416 }, { "epoch": 0.7, "learning_rate": 9.827399372418978e-07, "logits/chosen": -1.689001202583313, "logits/rejected": -1.7924022674560547, "logps/chosen": -490.21630859375, "logps/rejected": -572.5187377929688, "loss": 0.1589, "rewards/accuracies": 0.5, "rewards/chosen": -0.4802818298339844, "rewards/margins": 1.4142119884490967, "rewards/rejected": -1.894493818283081, "step": 417 }, { "epoch": 0.7, "learning_rate": 9.825476816069325e-07, "logits/chosen": -1.687720537185669, "logits/rejected": -2.245692014694214, "logps/chosen": -252.95947265625, "logps/rejected": -272.66729736328125, "loss": 0.1749, "rewards/accuracies": 1.0, "rewards/chosen": -1.0029993057250977, "rewards/margins": 6.877737522125244, "rewards/rejected": -7.880736827850342, "step": 418 }, { "epoch": 0.71, "learning_rate": 9.823543801661093e-07, "logits/chosen": -1.327947735786438, "logits/rejected": -2.4989848136901855, "logps/chosen": -171.38914489746094, "logps/rejected": -114.95437622070312, "loss": 0.1847, "rewards/accuracies": 1.0, "rewards/chosen": -1.1186927556991577, "rewards/margins": 3.9151296615600586, "rewards/rejected": -5.033822536468506, "step": 419 }, { "epoch": 0.71, "learning_rate": 9.821600333383624e-07, "logits/chosen": -1.2999866008758545, "logits/rejected": -1.940646767616272, "logps/chosen": -404.37384033203125, "logps/rejected": -225.85601806640625, "loss": 0.18, "rewards/accuracies": 1.0, "rewards/chosen": -1.6930325031280518, "rewards/margins": 9.95132827758789, "rewards/rejected": -11.64436149597168, "step": 420 }, { "epoch": 0.71, "learning_rate": 9.819646415448917e-07, "logits/chosen": -0.8139424324035645, "logits/rejected": -1.0725926160812378, "logps/chosen": -449.7269287109375, "logps/rejected": -254.24598693847656, "loss": 0.1662, "rewards/accuracies": 1.0, "rewards/chosen": -1.6415162086486816, "rewards/margins": 9.845947265625, "rewards/rejected": -11.487462997436523, "step": 421 }, { "epoch": 0.71, "learning_rate": 9.817682052091617e-07, "logits/chosen": -1.257497787475586, "logits/rejected": -1.384131908416748, "logps/chosen": -306.87060546875, "logps/rejected": -237.9322509765625, "loss": 0.165, "rewards/accuracies": 1.0, "rewards/chosen": 0.43299102783203125, "rewards/margins": 6.2627716064453125, "rewards/rejected": -5.829780578613281, "step": 422 }, { "epoch": 0.71, "learning_rate": 9.815707247569012e-07, "logits/chosen": -1.3054028749465942, "logits/rejected": -1.9904706478118896, "logps/chosen": -199.9625244140625, "logps/rejected": -174.94766235351562, "loss": 0.2305, "rewards/accuracies": 1.0, "rewards/chosen": -1.0188324451446533, "rewards/margins": 9.714458465576172, "rewards/rejected": -10.733290672302246, "step": 423 }, { "epoch": 0.72, "learning_rate": 9.81372200616101e-07, "logits/chosen": -1.711554765701294, "logits/rejected": -1.9116215705871582, "logps/chosen": -244.9205780029297, "logps/rejected": -235.5703125, "loss": 0.1702, "rewards/accuracies": 1.0, "rewards/chosen": -0.49955570697784424, "rewards/margins": 6.994556427001953, "rewards/rejected": -7.494112014770508, "step": 424 }, { "epoch": 0.72, "learning_rate": 9.81172633217015e-07, "logits/chosen": -1.4319877624511719, "logits/rejected": -1.4931282997131348, "logps/chosen": -227.3060760498047, "logps/rejected": -265.25616455078125, "loss": 0.2099, "rewards/accuracies": 1.0, "rewards/chosen": -1.4414575099945068, "rewards/margins": 11.510025024414062, "rewards/rejected": -12.951482772827148, "step": 425 }, { "epoch": 0.72, "learning_rate": 9.809720229921572e-07, "logits/chosen": -1.3125286102294922, "logits/rejected": -1.061862587928772, "logps/chosen": -175.32794189453125, "logps/rejected": -224.84396362304688, "loss": 0.1759, "rewards/accuracies": 1.0, "rewards/chosen": -0.008734322153031826, "rewards/margins": 4.605374336242676, "rewards/rejected": -4.614108562469482, "step": 426 }, { "epoch": 0.72, "learning_rate": 9.807703703763015e-07, "logits/chosen": -1.8269639015197754, "logits/rejected": -1.7482253313064575, "logps/chosen": -33.005889892578125, "logps/rejected": -88.10383605957031, "loss": 0.1665, "rewards/accuracies": 1.0, "rewards/chosen": -0.19627046585083008, "rewards/margins": 3.915902614593506, "rewards/rejected": -4.112173080444336, "step": 427 }, { "epoch": 0.72, "learning_rate": 9.80567675806482e-07, "logits/chosen": -1.4262571334838867, "logits/rejected": -1.5540329217910767, "logps/chosen": -73.27861785888672, "logps/rejected": -102.44043731689453, "loss": 0.165, "rewards/accuracies": 0.5, "rewards/chosen": -1.6107984781265259, "rewards/margins": 0.42136502265930176, "rewards/rejected": -2.032163619995117, "step": 428 }, { "epoch": 0.72, "learning_rate": 9.8036393972199e-07, "logits/chosen": -2.002448320388794, "logits/rejected": -2.299851655960083, "logps/chosen": -87.68385314941406, "logps/rejected": -85.51641845703125, "loss": 0.1533, "rewards/accuracies": 1.0, "rewards/chosen": -1.3282948732376099, "rewards/margins": 2.7130818367004395, "rewards/rejected": -4.04137659072876, "step": 429 }, { "epoch": 0.73, "learning_rate": 9.801591625643743e-07, "logits/chosen": -0.7014894485473633, "logits/rejected": -1.4072990417480469, "logps/chosen": -695.2222900390625, "logps/rejected": -328.1853942871094, "loss": 0.1957, "rewards/accuracies": 1.0, "rewards/chosen": -1.7655518054962158, "rewards/margins": 12.59501838684082, "rewards/rejected": -14.360569953918457, "step": 430 }, { "epoch": 0.73, "learning_rate": 9.799533447774404e-07, "logits/chosen": -1.8441977500915527, "logits/rejected": -2.370758533477783, "logps/chosen": -488.61370849609375, "logps/rejected": -225.34573364257812, "loss": 0.1664, "rewards/accuracies": 1.0, "rewards/chosen": -0.5413681268692017, "rewards/margins": 6.072797775268555, "rewards/rejected": -6.614165782928467, "step": 431 }, { "epoch": 0.73, "learning_rate": 9.797464868072486e-07, "logits/chosen": -1.492543339729309, "logits/rejected": -1.383135437965393, "logps/chosen": -128.7510223388672, "logps/rejected": -252.252685546875, "loss": 0.1783, "rewards/accuracies": 1.0, "rewards/chosen": -0.5648261904716492, "rewards/margins": 2.220522880554199, "rewards/rejected": -2.7853493690490723, "step": 432 }, { "epoch": 0.73, "learning_rate": 9.795385891021136e-07, "logits/chosen": -1.3558893203735352, "logits/rejected": -2.4246795177459717, "logps/chosen": -88.99250793457031, "logps/rejected": -69.09440612792969, "loss": 0.137, "rewards/accuracies": 1.0, "rewards/chosen": -0.3191506564617157, "rewards/margins": 4.714672565460205, "rewards/rejected": -5.033823013305664, "step": 433 }, { "epoch": 0.73, "learning_rate": 9.79329652112604e-07, "logits/chosen": -2.187133312225342, "logits/rejected": -2.0523133277893066, "logps/chosen": -237.15127563476562, "logps/rejected": -257.36614990234375, "loss": 0.2559, "rewards/accuracies": 1.0, "rewards/chosen": -0.08025474846363068, "rewards/margins": 6.414705276489258, "rewards/rejected": -6.494959831237793, "step": 434 }, { "epoch": 0.73, "learning_rate": 9.7911967629154e-07, "logits/chosen": -0.5673585534095764, "logits/rejected": -0.7198299169540405, "logps/chosen": -272.7919921875, "logps/rejected": -180.43826293945312, "loss": 0.1932, "rewards/accuracies": 1.0, "rewards/chosen": 0.3921341001987457, "rewards/margins": 10.290606498718262, "rewards/rejected": -9.89847183227539, "step": 435 }, { "epoch": 0.74, "learning_rate": 9.789086620939935e-07, "logits/chosen": -1.5646958351135254, "logits/rejected": -1.3017683029174805, "logps/chosen": -246.85928344726562, "logps/rejected": -270.7761535644531, "loss": 0.2036, "rewards/accuracies": 1.0, "rewards/chosen": -0.7477285265922546, "rewards/margins": 11.448813438415527, "rewards/rejected": -12.196542739868164, "step": 436 }, { "epoch": 0.74, "learning_rate": 9.786966099772873e-07, "logits/chosen": -1.1668096780776978, "logits/rejected": -1.150396466255188, "logps/chosen": -214.652099609375, "logps/rejected": -179.91836547851562, "loss": 0.1923, "rewards/accuracies": 1.0, "rewards/chosen": -1.1188222169876099, "rewards/margins": 8.251716613769531, "rewards/rejected": -9.370538711547852, "step": 437 }, { "epoch": 0.74, "learning_rate": 9.784835204009932e-07, "logits/chosen": -0.9191622734069824, "logits/rejected": -0.8720898032188416, "logps/chosen": -184.92555236816406, "logps/rejected": -170.7421112060547, "loss": 0.1958, "rewards/accuracies": 1.0, "rewards/chosen": -1.486758828163147, "rewards/margins": 5.754380702972412, "rewards/rejected": -7.2411394119262695, "step": 438 }, { "epoch": 0.74, "learning_rate": 9.782693938269312e-07, "logits/chosen": -1.8319122791290283, "logits/rejected": -1.2807798385620117, "logps/chosen": -209.416259765625, "logps/rejected": -269.36041259765625, "loss": 0.1844, "rewards/accuracies": 1.0, "rewards/chosen": -1.37221360206604, "rewards/margins": 13.217639923095703, "rewards/rejected": -14.589853286743164, "step": 439 }, { "epoch": 0.74, "learning_rate": 9.780542307191697e-07, "logits/chosen": -1.5643036365509033, "logits/rejected": -1.2528153657913208, "logps/chosen": -608.1365356445312, "logps/rejected": -491.44134521484375, "loss": 0.1582, "rewards/accuracies": 1.0, "rewards/chosen": -0.6126511096954346, "rewards/margins": 10.805754661560059, "rewards/rejected": -11.418405532836914, "step": 440 }, { "epoch": 0.74, "learning_rate": 9.778380315440223e-07, "logits/chosen": -1.5918906927108765, "logits/rejected": -1.0028258562088013, "logps/chosen": -284.98736572265625, "logps/rejected": -302.61846923828125, "loss": 0.187, "rewards/accuracies": 1.0, "rewards/chosen": -0.8316802978515625, "rewards/margins": 8.248760223388672, "rewards/rejected": -9.080440521240234, "step": 441 }, { "epoch": 0.75, "learning_rate": 9.776207967700489e-07, "logits/chosen": -2.523466110229492, "logits/rejected": -1.5323551893234253, "logps/chosen": -88.10340881347656, "logps/rejected": -281.467041015625, "loss": 0.1714, "rewards/accuracies": 1.0, "rewards/chosen": -0.7970980405807495, "rewards/margins": 10.560657501220703, "rewards/rejected": -11.357755661010742, "step": 442 }, { "epoch": 0.75, "learning_rate": 9.774025268680538e-07, "logits/chosen": -1.2854011058807373, "logits/rejected": -1.61070716381073, "logps/chosen": -567.8692016601562, "logps/rejected": -314.9006042480469, "loss": 0.1934, "rewards/accuracies": 1.0, "rewards/chosen": -1.4001374244689941, "rewards/margins": 9.382172584533691, "rewards/rejected": -10.782309532165527, "step": 443 }, { "epoch": 0.75, "learning_rate": 9.77183222311084e-07, "logits/chosen": -1.8872003555297852, "logits/rejected": -1.3685364723205566, "logps/chosen": -155.8271942138672, "logps/rejected": -398.8939208984375, "loss": 0.1818, "rewards/accuracies": 0.5, "rewards/chosen": -0.7372512817382812, "rewards/margins": 0.28054046630859375, "rewards/rejected": -1.017791748046875, "step": 444 }, { "epoch": 0.75, "learning_rate": 9.769628835744292e-07, "logits/chosen": -2.153257131576538, "logits/rejected": -2.538587808609009, "logps/chosen": -219.20547485351562, "logps/rejected": -285.2276611328125, "loss": 0.1725, "rewards/accuracies": 1.0, "rewards/chosen": -1.1647541522979736, "rewards/margins": 6.780641555786133, "rewards/rejected": -7.945395469665527, "step": 445 }, { "epoch": 0.75, "learning_rate": 9.767415111356208e-07, "logits/chosen": -1.6822693347930908, "logits/rejected": -1.125065565109253, "logps/chosen": -213.1407928466797, "logps/rejected": -248.02450561523438, "loss": 0.1815, "rewards/accuracies": 1.0, "rewards/chosen": -1.2097599506378174, "rewards/margins": 7.717904090881348, "rewards/rejected": -8.927663803100586, "step": 446 }, { "epoch": 0.75, "learning_rate": 9.765191054744304e-07, "logits/chosen": -0.9882490038871765, "logits/rejected": -1.6791247129440308, "logps/chosen": -272.322998046875, "logps/rejected": -146.1826171875, "loss": 0.1619, "rewards/accuracies": 1.0, "rewards/chosen": -0.7433132529258728, "rewards/margins": 5.633927345275879, "rewards/rejected": -6.377241134643555, "step": 447 }, { "epoch": 0.76, "learning_rate": 9.762956670728683e-07, "logits/chosen": -1.6359366178512573, "logits/rejected": -1.28961181640625, "logps/chosen": -62.76860046386719, "logps/rejected": -229.94815063476562, "loss": 0.1326, "rewards/accuracies": 1.0, "rewards/chosen": -0.22195053100585938, "rewards/margins": 12.753116607666016, "rewards/rejected": -12.975067138671875, "step": 448 }, { "epoch": 0.76, "learning_rate": 9.76071196415184e-07, "logits/chosen": -1.9846500158309937, "logits/rejected": -1.8391332626342773, "logps/chosen": -223.3437042236328, "logps/rejected": -206.13235473632812, "loss": 0.1572, "rewards/accuracies": 1.0, "rewards/chosen": -0.34529823064804077, "rewards/margins": 8.844955444335938, "rewards/rejected": -9.190253257751465, "step": 449 }, { "epoch": 0.76, "learning_rate": 9.758456939878629e-07, "logits/chosen": -0.6676144003868103, "logits/rejected": -0.5327748656272888, "logps/chosen": -334.7253723144531, "logps/rejected": -288.2399597167969, "loss": 0.1739, "rewards/accuracies": 1.0, "rewards/chosen": -0.5323821902275085, "rewards/margins": 7.189955234527588, "rewards/rejected": -7.72233772277832, "step": 450 }, { "epoch": 0.76, "learning_rate": 9.756191602796275e-07, "logits/chosen": -0.7316077947616577, "logits/rejected": -1.6688975095748901, "logps/chosen": -526.944580078125, "logps/rejected": -288.75018310546875, "loss": 0.1576, "rewards/accuracies": 1.0, "rewards/chosen": -0.8874114751815796, "rewards/margins": 8.355914115905762, "rewards/rejected": -9.243326187133789, "step": 451 }, { "epoch": 0.76, "learning_rate": 9.753915957814352e-07, "logits/chosen": -1.136466145515442, "logits/rejected": -1.8164993524551392, "logps/chosen": -503.8302307128906, "logps/rejected": -425.62176513671875, "loss": 0.1423, "rewards/accuracies": 1.0, "rewards/chosen": 0.10046082735061646, "rewards/margins": 1.3869491815567017, "rewards/rejected": -1.2864882946014404, "step": 452 }, { "epoch": 0.76, "learning_rate": 9.751630009864768e-07, "logits/chosen": -0.9611995816230774, "logits/rejected": -1.4471514225006104, "logps/chosen": -530.7092895507812, "logps/rejected": -166.03952026367188, "loss": 0.2416, "rewards/accuracies": 1.0, "rewards/chosen": -0.7246857285499573, "rewards/margins": 6.8713698387146, "rewards/rejected": -7.596055507659912, "step": 453 }, { "epoch": 0.77, "learning_rate": 9.74933376390177e-07, "logits/chosen": -1.8776335716247559, "logits/rejected": -2.2231943607330322, "logps/chosen": -299.9205627441406, "logps/rejected": -328.975341796875, "loss": 0.1723, "rewards/accuracies": 1.0, "rewards/chosen": -0.1313549131155014, "rewards/margins": 5.5653581619262695, "rewards/rejected": -5.696713447570801, "step": 454 }, { "epoch": 0.77, "learning_rate": 9.747027224901912e-07, "logits/chosen": -1.4517310857772827, "logits/rejected": -1.5876500606536865, "logps/chosen": -20.67554473876953, "logps/rejected": -94.16972351074219, "loss": 0.2211, "rewards/accuracies": 1.0, "rewards/chosen": -0.09321515262126923, "rewards/margins": 4.643195629119873, "rewards/rejected": -4.736410617828369, "step": 455 }, { "epoch": 0.77, "learning_rate": 9.744710397864066e-07, "logits/chosen": -2.7332839965820312, "logits/rejected": -1.3261853456497192, "logps/chosen": -23.798072814941406, "logps/rejected": -210.81732177734375, "loss": 0.1521, "rewards/accuracies": 1.0, "rewards/chosen": -0.3627491295337677, "rewards/margins": 9.340432167053223, "rewards/rejected": -9.703181266784668, "step": 456 }, { "epoch": 0.77, "learning_rate": 9.742383287809396e-07, "logits/chosen": -1.964377999305725, "logits/rejected": -1.4948861598968506, "logps/chosen": -281.6620788574219, "logps/rejected": -280.5074157714844, "loss": 0.2045, "rewards/accuracies": 1.0, "rewards/chosen": -0.4957534670829773, "rewards/margins": 11.540742874145508, "rewards/rejected": -12.036495208740234, "step": 457 }, { "epoch": 0.77, "learning_rate": 9.740045899781352e-07, "logits/chosen": -0.5110257863998413, "logits/rejected": -0.6007272601127625, "logps/chosen": -226.26109313964844, "logps/rejected": -171.11524963378906, "loss": 0.168, "rewards/accuracies": 0.0, "rewards/chosen": -2.9013428688049316, "rewards/margins": -1.4838972091674805, "rewards/rejected": -1.4174456596374512, "step": 458 }, { "epoch": 0.77, "learning_rate": 9.737698238845658e-07, "logits/chosen": -1.7447842359542847, "logits/rejected": -2.5760953426361084, "logps/chosen": -257.5279846191406, "logps/rejected": -195.7313995361328, "loss": 0.1909, "rewards/accuracies": 1.0, "rewards/chosen": -0.032274626195430756, "rewards/margins": 7.554751396179199, "rewards/rejected": -7.5870256423950195, "step": 459 }, { "epoch": 0.78, "learning_rate": 9.735340310090306e-07, "logits/chosen": -1.7694307565689087, "logits/rejected": -2.6311557292938232, "logps/chosen": -307.8857116699219, "logps/rejected": -171.5406494140625, "loss": 0.1577, "rewards/accuracies": 1.0, "rewards/chosen": -0.29933395981788635, "rewards/margins": 8.812219619750977, "rewards/rejected": -9.111554145812988, "step": 460 }, { "epoch": 0.78, "learning_rate": 9.732972118625536e-07, "logits/chosen": -1.408591866493225, "logits/rejected": -1.5779300928115845, "logps/chosen": -169.06919860839844, "logps/rejected": -177.49819946289062, "loss": 0.1833, "rewards/accuracies": 1.0, "rewards/chosen": -0.2286117523908615, "rewards/margins": 0.6607635617256165, "rewards/rejected": -0.8893753290176392, "step": 461 }, { "epoch": 0.78, "learning_rate": 9.730593669583835e-07, "logits/chosen": -1.7068212032318115, "logits/rejected": -1.9184765815734863, "logps/chosen": -70.21788024902344, "logps/rejected": -134.25051879882812, "loss": 0.1484, "rewards/accuracies": 1.0, "rewards/chosen": 0.1888481080532074, "rewards/margins": 5.6896843910217285, "rewards/rejected": -5.500836372375488, "step": 462 }, { "epoch": 0.78, "learning_rate": 9.728204968119915e-07, "logits/chosen": -1.1536794900894165, "logits/rejected": -1.827487826347351, "logps/chosen": -354.2060546875, "logps/rejected": -361.4664001464844, "loss": 0.1828, "rewards/accuracies": 0.5, "rewards/chosen": -2.5781145095825195, "rewards/margins": 4.080938339233398, "rewards/rejected": -6.659052848815918, "step": 463 }, { "epoch": 0.78, "learning_rate": 9.725806019410717e-07, "logits/chosen": -0.7136868834495544, "logits/rejected": -0.650567352771759, "logps/chosen": -158.14785766601562, "logps/rejected": -164.43289184570312, "loss": 0.1672, "rewards/accuracies": 0.0, "rewards/chosen": -2.082798719406128, "rewards/margins": -0.6656165719032288, "rewards/rejected": -1.417182207107544, "step": 464 }, { "epoch": 0.78, "learning_rate": 9.723396828655376e-07, "logits/chosen": -1.1137374639511108, "logits/rejected": -0.8390330076217651, "logps/chosen": -389.09088134765625, "logps/rejected": -321.860107421875, "loss": 0.1468, "rewards/accuracies": 1.0, "rewards/chosen": -0.09591063857078552, "rewards/margins": 12.648015022277832, "rewards/rejected": -12.743925094604492, "step": 465 }, { "epoch": 0.79, "learning_rate": 9.72097740107524e-07, "logits/chosen": -1.2167657613754272, "logits/rejected": -1.5186375379562378, "logps/chosen": -69.07684326171875, "logps/rejected": -136.32656860351562, "loss": 0.1958, "rewards/accuracies": 1.0, "rewards/chosen": 0.024441910907626152, "rewards/margins": 7.298024654388428, "rewards/rejected": -7.273582458496094, "step": 466 }, { "epoch": 0.79, "learning_rate": 9.718547741913833e-07, "logits/chosen": -1.3299391269683838, "logits/rejected": -1.1457806825637817, "logps/chosen": -497.37103271484375, "logps/rejected": -386.53143310546875, "loss": 0.186, "rewards/accuracies": 1.0, "rewards/chosen": 1.3952667713165283, "rewards/margins": 8.335859298706055, "rewards/rejected": -6.940592288970947, "step": 467 }, { "epoch": 0.79, "learning_rate": 9.716107856436855e-07, "logits/chosen": -1.1360148191452026, "logits/rejected": -0.6608507037162781, "logps/chosen": -247.17117309570312, "logps/rejected": -258.4505920410156, "loss": 0.2208, "rewards/accuracies": 1.0, "rewards/chosen": -1.1492900848388672, "rewards/margins": 6.689465045928955, "rewards/rejected": -7.838755130767822, "step": 468 }, { "epoch": 0.79, "learning_rate": 9.713657749932171e-07, "logits/chosen": -0.8903838396072388, "logits/rejected": -1.1991342306137085, "logps/chosen": -475.362548828125, "logps/rejected": -285.0642395019531, "loss": 0.1696, "rewards/accuracies": 0.5, "rewards/chosen": -2.755786180496216, "rewards/margins": 7.545994281768799, "rewards/rejected": -10.301780700683594, "step": 469 }, { "epoch": 0.79, "learning_rate": 9.711197427709795e-07, "logits/chosen": -0.6181639432907104, "logits/rejected": -0.9941724538803101, "logps/chosen": -130.269775390625, "logps/rejected": -202.236083984375, "loss": 0.1921, "rewards/accuracies": 1.0, "rewards/chosen": -0.045879364013671875, "rewards/margins": 6.135862827301025, "rewards/rejected": -6.181741714477539, "step": 470 }, { "epoch": 0.79, "learning_rate": 9.708726895101885e-07, "logits/chosen": -0.6759887337684631, "logits/rejected": -1.0811114311218262, "logps/chosen": -349.7430725097656, "logps/rejected": -202.4884490966797, "loss": 0.1463, "rewards/accuracies": 1.0, "rewards/chosen": -0.6895374059677124, "rewards/margins": 10.514432907104492, "rewards/rejected": -11.203970909118652, "step": 471 }, { "epoch": 0.8, "learning_rate": 9.706246157462726e-07, "logits/chosen": -1.6232417821884155, "logits/rejected": -1.6543527841567993, "logps/chosen": -154.53765869140625, "logps/rejected": -209.14459228515625, "loss": 0.2433, "rewards/accuracies": 1.0, "rewards/chosen": -1.1862388849258423, "rewards/margins": 2.6417124271392822, "rewards/rejected": -3.827951431274414, "step": 472 }, { "epoch": 0.8, "learning_rate": 9.703755220168714e-07, "logits/chosen": -2.7903904914855957, "logits/rejected": -1.3351569175720215, "logps/chosen": -257.2842712402344, "logps/rejected": -224.76751708984375, "loss": 0.1928, "rewards/accuracies": 1.0, "rewards/chosen": 0.061550140380859375, "rewards/margins": 2.485539197921753, "rewards/rejected": -2.4239890575408936, "step": 473 }, { "epoch": 0.8, "learning_rate": 9.701254088618362e-07, "logits/chosen": -0.9130764007568359, "logits/rejected": -1.3383204936981201, "logps/chosen": -381.96124267578125, "logps/rejected": -193.50051879882812, "loss": 0.1515, "rewards/accuracies": 1.0, "rewards/chosen": -0.1674940139055252, "rewards/margins": 8.00719928741455, "rewards/rejected": -8.17469310760498, "step": 474 }, { "epoch": 0.8, "learning_rate": 9.698742768232265e-07, "logits/chosen": -0.8350385427474976, "logits/rejected": -0.8087922930717468, "logps/chosen": -260.8985900878906, "logps/rejected": -207.9868927001953, "loss": 0.1985, "rewards/accuracies": 1.0, "rewards/chosen": -2.3379669189453125, "rewards/margins": 6.639181137084961, "rewards/rejected": -8.977148056030273, "step": 475 }, { "epoch": 0.8, "learning_rate": 9.696221264453108e-07, "logits/chosen": -1.8277158737182617, "logits/rejected": -1.8809436559677124, "logps/chosen": -102.64655303955078, "logps/rejected": -81.0972671508789, "loss": 0.1846, "rewards/accuracies": 0.5, "rewards/chosen": -1.0536823272705078, "rewards/margins": -0.04416823387145996, "rewards/rejected": -1.0095140933990479, "step": 476 }, { "epoch": 0.8, "learning_rate": 9.693689582745643e-07, "logits/chosen": -2.200521230697632, "logits/rejected": -2.2031776905059814, "logps/chosen": -78.0069808959961, "logps/rejected": -148.51446533203125, "loss": 0.213, "rewards/accuracies": 1.0, "rewards/chosen": -0.6362529397010803, "rewards/margins": 3.200962781906128, "rewards/rejected": -3.8372156620025635, "step": 477 }, { "epoch": 0.81, "learning_rate": 9.691147728596681e-07, "logits/chosen": -0.9372101426124573, "logits/rejected": -0.8971385359764099, "logps/chosen": -158.72555541992188, "logps/rejected": -178.3800506591797, "loss": 0.1383, "rewards/accuracies": 1.0, "rewards/chosen": -0.3784370422363281, "rewards/margins": 5.4618988037109375, "rewards/rejected": -5.840336322784424, "step": 478 }, { "epoch": 0.81, "learning_rate": 9.688595707515076e-07, "logits/chosen": -2.139923572540283, "logits/rejected": -1.6465723514556885, "logps/chosen": -119.42794799804688, "logps/rejected": -210.23606872558594, "loss": 0.2018, "rewards/accuracies": 0.5, "rewards/chosen": -2.1089653968811035, "rewards/margins": 4.937000274658203, "rewards/rejected": -7.045965671539307, "step": 479 }, { "epoch": 0.81, "learning_rate": 9.686033525031719e-07, "logits/chosen": -1.4923748970031738, "logits/rejected": -1.2859959602355957, "logps/chosen": -226.31365966796875, "logps/rejected": -233.12567138671875, "loss": 0.1772, "rewards/accuracies": 1.0, "rewards/chosen": -1.537055492401123, "rewards/margins": 6.823518753051758, "rewards/rejected": -8.360574722290039, "step": 480 }, { "epoch": 0.81, "learning_rate": 9.683461186699524e-07, "logits/chosen": -0.9652807116508484, "logits/rejected": -1.830547571182251, "logps/chosen": -774.6458129882812, "logps/rejected": -302.51702880859375, "loss": 0.185, "rewards/accuracies": 0.5, "rewards/chosen": -2.865536689758301, "rewards/margins": 6.375579833984375, "rewards/rejected": -9.241116523742676, "step": 481 }, { "epoch": 0.81, "learning_rate": 9.680878698093415e-07, "logits/chosen": -0.6591046452522278, "logits/rejected": -1.157971739768982, "logps/chosen": -304.5384216308594, "logps/rejected": -185.4188232421875, "loss": 0.1882, "rewards/accuracies": 1.0, "rewards/chosen": -0.8371291756629944, "rewards/margins": 6.621988296508789, "rewards/rejected": -7.4591169357299805, "step": 482 }, { "epoch": 0.81, "learning_rate": 9.678286064810316e-07, "logits/chosen": -1.630854845046997, "logits/rejected": -1.1944377422332764, "logps/chosen": -63.372398376464844, "logps/rejected": -118.29740905761719, "loss": 0.2208, "rewards/accuracies": 0.5, "rewards/chosen": -0.9332462549209595, "rewards/margins": -0.14584654569625854, "rewards/rejected": -0.7873997092247009, "step": 483 }, { "epoch": 0.82, "learning_rate": 9.67568329246913e-07, "logits/chosen": -1.5651147365570068, "logits/rejected": -2.2136974334716797, "logps/chosen": -356.625244140625, "logps/rejected": -257.2489013671875, "loss": 0.1678, "rewards/accuracies": 1.0, "rewards/chosen": -2.220522880554199, "rewards/margins": 11.928407669067383, "rewards/rejected": -14.148930549621582, "step": 484 }, { "epoch": 0.82, "learning_rate": 9.673070386710745e-07, "logits/chosen": -0.5516372323036194, "logits/rejected": -0.5296367406845093, "logps/chosen": -12.047179222106934, "logps/rejected": -111.52848052978516, "loss": 0.1797, "rewards/accuracies": 1.0, "rewards/chosen": -0.2791111469268799, "rewards/margins": 8.235075950622559, "rewards/rejected": -8.51418685913086, "step": 485 }, { "epoch": 0.82, "learning_rate": 9.670447353197998e-07, "logits/chosen": -1.5218621492385864, "logits/rejected": -1.7300869226455688, "logps/chosen": -215.33021545410156, "logps/rejected": -353.5644226074219, "loss": 0.1917, "rewards/accuracies": 1.0, "rewards/chosen": -1.7259873151779175, "rewards/margins": 16.542556762695312, "rewards/rejected": -18.268545150756836, "step": 486 }, { "epoch": 0.82, "learning_rate": 9.66781419761569e-07, "logits/chosen": -1.3317276239395142, "logits/rejected": -1.3344420194625854, "logps/chosen": -116.13327026367188, "logps/rejected": -97.93529510498047, "loss": 0.1949, "rewards/accuracies": 0.5, "rewards/chosen": -3.2147207260131836, "rewards/margins": -0.9069676399230957, "rewards/rejected": -2.307753086090088, "step": 487 }, { "epoch": 0.82, "learning_rate": 9.665170925670546e-07, "logits/chosen": -1.387438416481018, "logits/rejected": -1.5614135265350342, "logps/chosen": -156.07591247558594, "logps/rejected": -155.57327270507812, "loss": 0.1721, "rewards/accuracies": 0.5, "rewards/chosen": -1.0742095708847046, "rewards/margins": 3.273390293121338, "rewards/rejected": -4.347599983215332, "step": 488 }, { "epoch": 0.82, "learning_rate": 9.662517543091224e-07, "logits/chosen": -0.29915913939476013, "logits/rejected": -1.622786521911621, "logps/chosen": -332.6976013183594, "logps/rejected": -172.04876708984375, "loss": 0.1802, "rewards/accuracies": 1.0, "rewards/chosen": 2.1141693592071533, "rewards/margins": 10.140617370605469, "rewards/rejected": -8.026447296142578, "step": 489 }, { "epoch": 0.83, "learning_rate": 9.659854055628289e-07, "logits/chosen": -2.113335371017456, "logits/rejected": -1.7780506610870361, "logps/chosen": -116.96385192871094, "logps/rejected": -280.6581115722656, "loss": 0.1313, "rewards/accuracies": 1.0, "rewards/chosen": -0.7822120785713196, "rewards/margins": 15.220739364624023, "rewards/rejected": -16.00295066833496, "step": 490 }, { "epoch": 0.83, "learning_rate": 9.657180469054212e-07, "logits/chosen": -1.7113397121429443, "logits/rejected": -1.1981744766235352, "logps/chosen": -214.93051147460938, "logps/rejected": -303.4566650390625, "loss": 0.1392, "rewards/accuracies": 1.0, "rewards/chosen": -1.7917399406433105, "rewards/margins": 11.92519760131836, "rewards/rejected": -13.716938018798828, "step": 491 }, { "epoch": 0.83, "learning_rate": 9.654496789163343e-07, "logits/chosen": -1.5774062871932983, "logits/rejected": -1.3951590061187744, "logps/chosen": -136.52076721191406, "logps/rejected": -191.61000061035156, "loss": 0.1411, "rewards/accuracies": 1.0, "rewards/chosen": -0.8900161981582642, "rewards/margins": 10.306234359741211, "rewards/rejected": -11.196249961853027, "step": 492 }, { "epoch": 0.83, "learning_rate": 9.651803021771917e-07, "logits/chosen": -2.1784653663635254, "logits/rejected": -1.2229230403900146, "logps/chosen": -72.89522552490234, "logps/rejected": -309.3623046875, "loss": 0.1811, "rewards/accuracies": 1.0, "rewards/chosen": -0.6526200771331787, "rewards/margins": 7.078956604003906, "rewards/rejected": -7.731576919555664, "step": 493 }, { "epoch": 0.83, "learning_rate": 9.64909917271802e-07, "logits/chosen": -0.9013615250587463, "logits/rejected": -1.0959047079086304, "logps/chosen": -426.6611022949219, "logps/rejected": -216.3023681640625, "loss": 0.1746, "rewards/accuracies": 1.0, "rewards/chosen": -0.8864956498146057, "rewards/margins": 9.015869140625, "rewards/rejected": -9.902364730834961, "step": 494 }, { "epoch": 0.83, "learning_rate": 9.6463852478616e-07, "logits/chosen": -1.6150933504104614, "logits/rejected": -2.258601427078247, "logps/chosen": -96.07403564453125, "logps/rejected": -62.08420181274414, "loss": 0.1906, "rewards/accuracies": 0.5, "rewards/chosen": -1.1480413675308228, "rewards/margins": 1.0792427062988281, "rewards/rejected": -2.2272841930389404, "step": 495 }, { "epoch": 0.84, "learning_rate": 9.643661253084429e-07, "logits/chosen": -0.35942748188972473, "logits/rejected": -1.3206900358200073, "logps/chosen": -472.96929931640625, "logps/rejected": -234.10690307617188, "loss": 0.1597, "rewards/accuracies": 1.0, "rewards/chosen": 0.2673812806606293, "rewards/margins": 13.789386749267578, "rewards/rejected": -13.522006034851074, "step": 496 }, { "epoch": 0.84, "learning_rate": 9.640927194290116e-07, "logits/chosen": -1.594843864440918, "logits/rejected": -1.8921819925308228, "logps/chosen": -260.1965026855469, "logps/rejected": -257.98199462890625, "loss": 0.1985, "rewards/accuracies": 1.0, "rewards/chosen": -0.7323309183120728, "rewards/margins": 6.5982561111450195, "rewards/rejected": -7.330586910247803, "step": 497 }, { "epoch": 0.84, "learning_rate": 9.638183077404068e-07, "logits/chosen": -2.052978754043579, "logits/rejected": -1.9758281707763672, "logps/chosen": -19.00727081298828, "logps/rejected": -73.09623718261719, "loss": 0.1491, "rewards/accuracies": 1.0, "rewards/chosen": -0.739374041557312, "rewards/margins": 2.3671977519989014, "rewards/rejected": -3.106571674346924, "step": 498 }, { "epoch": 0.84, "learning_rate": 9.635428908373502e-07, "logits/chosen": -1.1580097675323486, "logits/rejected": -1.896628975868225, "logps/chosen": -570.9942626953125, "logps/rejected": -265.4372863769531, "loss": 0.1812, "rewards/accuracies": 1.0, "rewards/chosen": -1.3030930757522583, "rewards/margins": 11.787357330322266, "rewards/rejected": -13.090450286865234, "step": 499 }, { "epoch": 0.84, "learning_rate": 9.632664693167416e-07, "logits/chosen": -1.720935583114624, "logits/rejected": -1.7201050519943237, "logps/chosen": -355.47833251953125, "logps/rejected": -418.269775390625, "loss": 0.1801, "rewards/accuracies": 1.0, "rewards/chosen": -1.297267198562622, "rewards/margins": 9.594414710998535, "rewards/rejected": -10.891682624816895, "step": 500 }, { "epoch": 0.84, "learning_rate": 9.629890437776579e-07, "logits/chosen": -1.9035245180130005, "logits/rejected": -1.9480910301208496, "logps/chosen": -131.1505889892578, "logps/rejected": -121.52857208251953, "loss": 0.1662, "rewards/accuracies": 1.0, "rewards/chosen": 0.5765511989593506, "rewards/margins": 5.626952171325684, "rewards/rejected": -5.050400733947754, "step": 501 }, { "epoch": 0.85, "learning_rate": 9.62710614821352e-07, "logits/chosen": -0.8639505505561829, "logits/rejected": -0.9645960927009583, "logps/chosen": -76.16290283203125, "logps/rejected": -115.89288330078125, "loss": 0.2082, "rewards/accuracies": 0.5, "rewards/chosen": -2.1743850708007812, "rewards/margins": 3.6736721992492676, "rewards/rejected": -5.848057270050049, "step": 502 }, { "epoch": 0.85, "learning_rate": 9.624311830512519e-07, "logits/chosen": -1.9524139165878296, "logits/rejected": -2.4563539028167725, "logps/chosen": -120.10502624511719, "logps/rejected": -73.08750915527344, "loss": 0.194, "rewards/accuracies": 0.5, "rewards/chosen": -0.409701943397522, "rewards/margins": 1.4130560159683228, "rewards/rejected": -1.8227579593658447, "step": 503 }, { "epoch": 0.85, "learning_rate": 9.621507490729584e-07, "logits/chosen": -1.4694656133651733, "logits/rejected": -1.635840892791748, "logps/chosen": -73.20475769042969, "logps/rejected": -231.72525024414062, "loss": 0.1675, "rewards/accuracies": 1.0, "rewards/chosen": -0.7220495343208313, "rewards/margins": 7.296611309051514, "rewards/rejected": -8.018661499023438, "step": 504 }, { "epoch": 0.85, "learning_rate": 9.618693134942448e-07, "logits/chosen": -0.5826085805892944, "logits/rejected": -1.8275094032287598, "logps/chosen": -112.27894592285156, "logps/rejected": -76.41281127929688, "loss": 0.197, "rewards/accuracies": 1.0, "rewards/chosen": -0.1780475676059723, "rewards/margins": 6.233481407165527, "rewards/rejected": -6.411529064178467, "step": 505 }, { "epoch": 0.85, "learning_rate": 9.615868769250545e-07, "logits/chosen": -1.1828463077545166, "logits/rejected": -1.4431803226470947, "logps/chosen": -299.8600158691406, "logps/rejected": -209.68878173828125, "loss": 0.1821, "rewards/accuracies": 1.0, "rewards/chosen": -1.3483529090881348, "rewards/margins": 10.797613143920898, "rewards/rejected": -12.145965576171875, "step": 506 }, { "epoch": 0.85, "learning_rate": 9.613034399775013e-07, "logits/chosen": -1.7455992698669434, "logits/rejected": -1.877217173576355, "logps/chosen": -80.41563415527344, "logps/rejected": -180.38148498535156, "loss": 0.1738, "rewards/accuracies": 1.0, "rewards/chosen": -1.0337793827056885, "rewards/margins": 5.948246955871582, "rewards/rejected": -6.982026100158691, "step": 507 }, { "epoch": 0.86, "learning_rate": 9.610190032658663e-07, "logits/chosen": -1.9526875019073486, "logits/rejected": -1.7264868021011353, "logps/chosen": -117.76008605957031, "logps/rejected": -175.2601318359375, "loss": 0.1698, "rewards/accuracies": 0.5, "rewards/chosen": -1.3269020318984985, "rewards/margins": 6.980993747711182, "rewards/rejected": -8.30789566040039, "step": 508 }, { "epoch": 0.86, "learning_rate": 9.607335674065975e-07, "logits/chosen": -0.6679888963699341, "logits/rejected": -0.14870330691337585, "logps/chosen": -339.99151611328125, "logps/rejected": -369.12860107421875, "loss": 0.1847, "rewards/accuracies": 1.0, "rewards/chosen": -1.8299164772033691, "rewards/margins": 15.75944709777832, "rewards/rejected": -17.58936309814453, "step": 509 }, { "epoch": 0.86, "learning_rate": 9.604471330183081e-07, "logits/chosen": -1.387742042541504, "logits/rejected": -1.4050238132476807, "logps/chosen": -387.7100524902344, "logps/rejected": -354.6739807128906, "loss": 0.1736, "rewards/accuracies": 1.0, "rewards/chosen": -1.4351615905761719, "rewards/margins": 15.635486602783203, "rewards/rejected": -17.070648193359375, "step": 510 }, { "epoch": 0.86, "learning_rate": 9.601597007217761e-07, "logits/chosen": -1.01943039894104, "logits/rejected": -1.701319932937622, "logps/chosen": -526.007568359375, "logps/rejected": -369.4110107421875, "loss": 0.2055, "rewards/accuracies": 1.0, "rewards/chosen": -0.7693207263946533, "rewards/margins": 2.1190032958984375, "rewards/rejected": -2.888324022293091, "step": 511 }, { "epoch": 0.86, "learning_rate": 9.598712711399415e-07, "logits/chosen": -1.298659086227417, "logits/rejected": -1.1038153171539307, "logps/chosen": -530.2131958007812, "logps/rejected": -275.9097595214844, "loss": 0.2014, "rewards/accuracies": 1.0, "rewards/chosen": 0.26557284593582153, "rewards/margins": 12.243782997131348, "rewards/rejected": -11.97821044921875, "step": 512 }, { "epoch": 0.87, "learning_rate": 9.59581844897906e-07, "logits/chosen": -1.3148828744888306, "logits/rejected": -1.9683585166931152, "logps/chosen": -369.1336364746094, "logps/rejected": -188.69871520996094, "loss": 0.1784, "rewards/accuracies": 0.5, "rewards/chosen": -1.694780945777893, "rewards/margins": 6.680124759674072, "rewards/rejected": -8.374905586242676, "step": 513 }, { "epoch": 0.87, "learning_rate": 9.592914226229314e-07, "logits/chosen": -1.4996843338012695, "logits/rejected": -1.4723166227340698, "logps/chosen": -149.41529846191406, "logps/rejected": -236.1160888671875, "loss": 0.1857, "rewards/accuracies": 1.0, "rewards/chosen": -0.2980178892612457, "rewards/margins": 12.551896095275879, "rewards/rejected": -12.849913597106934, "step": 514 }, { "epoch": 0.87, "learning_rate": 9.590000049444376e-07, "logits/chosen": -1.6375008821487427, "logits/rejected": -2.431309461593628, "logps/chosen": -514.4171752929688, "logps/rejected": -307.718017578125, "loss": 0.1961, "rewards/accuracies": 1.0, "rewards/chosen": -1.7196776866912842, "rewards/margins": 6.7644124031066895, "rewards/rejected": -8.484090805053711, "step": 515 }, { "epoch": 0.87, "learning_rate": 9.587075924940028e-07, "logits/chosen": -1.0333702564239502, "logits/rejected": -1.2659507989883423, "logps/chosen": -454.99566650390625, "logps/rejected": -293.19573974609375, "loss": 0.1689, "rewards/accuracies": 1.0, "rewards/chosen": -0.113037109375, "rewards/margins": 9.598712921142578, "rewards/rejected": -9.711750030517578, "step": 516 }, { "epoch": 0.87, "learning_rate": 9.5841418590536e-07, "logits/chosen": -1.6277176141738892, "logits/rejected": -2.497152090072632, "logps/chosen": -325.90850830078125, "logps/rejected": -145.48583984375, "loss": 0.2187, "rewards/accuracies": 0.5, "rewards/chosen": -1.105952501296997, "rewards/margins": 0.280051052570343, "rewards/rejected": -1.3860034942626953, "step": 517 }, { "epoch": 0.87, "learning_rate": 9.581197858143977e-07, "logits/chosen": -2.2153377532958984, "logits/rejected": -1.4255385398864746, "logps/chosen": -80.20429229736328, "logps/rejected": -332.54937744140625, "loss": 0.1985, "rewards/accuracies": 1.0, "rewards/chosen": -1.3072952032089233, "rewards/margins": 13.6825532913208, "rewards/rejected": -14.989848136901855, "step": 518 }, { "epoch": 0.88, "learning_rate": 9.578243928591569e-07, "logits/chosen": -1.0782465934753418, "logits/rejected": -1.0369610786437988, "logps/chosen": -295.9336853027344, "logps/rejected": -218.00802612304688, "loss": 0.1549, "rewards/accuracies": 1.0, "rewards/chosen": 0.03710126876831055, "rewards/margins": 10.719480514526367, "rewards/rejected": -10.682379722595215, "step": 519 }, { "epoch": 0.88, "learning_rate": 9.57528007679831e-07, "logits/chosen": -1.1398615837097168, "logits/rejected": -1.8708375692367554, "logps/chosen": -540.9119873046875, "logps/rejected": -327.2794189453125, "loss": 0.2091, "rewards/accuracies": 1.0, "rewards/chosen": 0.22789306938648224, "rewards/margins": 5.93986701965332, "rewards/rejected": -5.711973667144775, "step": 520 }, { "epoch": 0.88, "learning_rate": 9.57230630918763e-07, "logits/chosen": -2.1814510822296143, "logits/rejected": -2.2879159450531006, "logps/chosen": -105.28890991210938, "logps/rejected": -185.7814178466797, "loss": 0.1897, "rewards/accuracies": 0.5, "rewards/chosen": -1.7397704124450684, "rewards/margins": 6.833057403564453, "rewards/rejected": -8.572827339172363, "step": 521 }, { "epoch": 0.88, "learning_rate": 9.569322632204458e-07, "logits/chosen": -2.073073148727417, "logits/rejected": -1.4473570585250854, "logps/chosen": -314.480712890625, "logps/rejected": -325.2982482910156, "loss": 0.21, "rewards/accuracies": 1.0, "rewards/chosen": -0.3166259825229645, "rewards/margins": 14.919228553771973, "rewards/rejected": -15.235854148864746, "step": 522 }, { "epoch": 0.88, "learning_rate": 9.566329052315194e-07, "logits/chosen": -1.8998284339904785, "logits/rejected": -1.8589746952056885, "logps/chosen": -63.7332763671875, "logps/rejected": -274.74609375, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": -1.378976821899414, "rewards/margins": 12.973987579345703, "rewards/rejected": -14.352964401245117, "step": 523 }, { "epoch": 0.88, "learning_rate": 9.5633255760077e-07, "logits/chosen": -2.28558611869812, "logits/rejected": -2.275768280029297, "logps/chosen": -40.30331039428711, "logps/rejected": -180.61302185058594, "loss": 0.1904, "rewards/accuracies": 1.0, "rewards/chosen": -0.8475544452667236, "rewards/margins": 9.063840866088867, "rewards/rejected": -9.911395072937012, "step": 524 }, { "epoch": 0.89, "learning_rate": 9.56031220979129e-07, "logits/chosen": -0.7669763565063477, "logits/rejected": -1.784839391708374, "logps/chosen": -198.1832275390625, "logps/rejected": -123.54266357421875, "loss": 0.1687, "rewards/accuracies": 1.0, "rewards/chosen": -0.18603363633155823, "rewards/margins": 5.508733749389648, "rewards/rejected": -5.694766998291016, "step": 525 }, { "epoch": 0.89, "learning_rate": 9.557288960196707e-07, "logits/chosen": -1.1866172552108765, "logits/rejected": -1.2076904773712158, "logps/chosen": -36.310211181640625, "logps/rejected": -181.77130126953125, "loss": 0.1558, "rewards/accuracies": 1.0, "rewards/chosen": -0.9400992393493652, "rewards/margins": 10.733392715454102, "rewards/rejected": -11.673492431640625, "step": 526 }, { "epoch": 0.89, "learning_rate": 9.554255833776117e-07, "logits/chosen": -2.55730938911438, "logits/rejected": -1.6672946214675903, "logps/chosen": -80.0410385131836, "logps/rejected": -193.53387451171875, "loss": 0.2318, "rewards/accuracies": 0.5, "rewards/chosen": -1.1338022947311401, "rewards/margins": 2.3030526638031006, "rewards/rejected": -3.436854839324951, "step": 527 }, { "epoch": 0.89, "learning_rate": 9.551212837103091e-07, "logits/chosen": -2.760472059249878, "logits/rejected": -2.06662917137146, "logps/chosen": -160.0579071044922, "logps/rejected": -215.00347900390625, "loss": 0.1786, "rewards/accuracies": 1.0, "rewards/chosen": -0.42213478684425354, "rewards/margins": 5.234758377075195, "rewards/rejected": -5.656893253326416, "step": 528 }, { "epoch": 0.89, "learning_rate": 9.548159976772592e-07, "logits/chosen": -2.156421661376953, "logits/rejected": -1.3467743396759033, "logps/chosen": -69.24724578857422, "logps/rejected": -327.8988037109375, "loss": 0.192, "rewards/accuracies": 0.5, "rewards/chosen": -1.53495192527771, "rewards/margins": 7.171606540679932, "rewards/rejected": -8.706559181213379, "step": 529 }, { "epoch": 0.89, "learning_rate": 9.545097259400958e-07, "logits/chosen": -1.4128961563110352, "logits/rejected": -0.9324668645858765, "logps/chosen": -280.76806640625, "logps/rejected": -290.4639892578125, "loss": 0.1677, "rewards/accuracies": 0.5, "rewards/chosen": -1.6329319477081299, "rewards/margins": 7.9638519287109375, "rewards/rejected": -9.596783638000488, "step": 530 }, { "epoch": 0.9, "learning_rate": 9.54202469162589e-07, "logits/chosen": -2.057682991027832, "logits/rejected": -1.9787952899932861, "logps/chosen": -197.595703125, "logps/rejected": -351.2636413574219, "loss": 0.2056, "rewards/accuracies": 1.0, "rewards/chosen": -2.0103564262390137, "rewards/margins": 4.732174396514893, "rewards/rejected": -6.742530822753906, "step": 531 }, { "epoch": 0.9, "learning_rate": 9.538942280106441e-07, "logits/chosen": -0.3838121294975281, "logits/rejected": -0.7845942378044128, "logps/chosen": -481.9730224609375, "logps/rejected": -301.693115234375, "loss": 0.1884, "rewards/accuracies": 1.0, "rewards/chosen": -1.7501007318496704, "rewards/margins": 12.653074264526367, "rewards/rejected": -14.403175354003906, "step": 532 }, { "epoch": 0.9, "learning_rate": 9.535850031522996e-07, "logits/chosen": -2.2089288234710693, "logits/rejected": -2.154285430908203, "logps/chosen": -34.082950592041016, "logps/rejected": -219.37469482421875, "loss": 0.1833, "rewards/accuracies": 1.0, "rewards/chosen": -0.9477675557136536, "rewards/margins": 11.680444717407227, "rewards/rejected": -12.628212928771973, "step": 533 }, { "epoch": 0.9, "learning_rate": 9.532747952577259e-07, "logits/chosen": -1.166057825088501, "logits/rejected": -1.6293365955352783, "logps/chosen": -674.6851806640625, "logps/rejected": -405.37255859375, "loss": 0.1572, "rewards/accuracies": 1.0, "rewards/chosen": -0.8522399663925171, "rewards/margins": 10.60324478149414, "rewards/rejected": -11.455485343933105, "step": 534 }, { "epoch": 0.9, "learning_rate": 9.529636049992233e-07, "logits/chosen": -2.216061592102051, "logits/rejected": -1.3257899284362793, "logps/chosen": -267.9684143066406, "logps/rejected": -269.8423156738281, "loss": 0.1965, "rewards/accuracies": 1.0, "rewards/chosen": -0.39237213134765625, "rewards/margins": 8.582952499389648, "rewards/rejected": -8.975324630737305, "step": 535 }, { "epoch": 0.9, "learning_rate": 9.526514330512224e-07, "logits/chosen": -1.805991291999817, "logits/rejected": -2.062978744506836, "logps/chosen": -129.35073852539062, "logps/rejected": -154.8126220703125, "loss": 0.2185, "rewards/accuracies": 0.5, "rewards/chosen": -1.0079419612884521, "rewards/margins": 2.9576416015625, "rewards/rejected": -3.965583562850952, "step": 536 }, { "epoch": 0.91, "learning_rate": 9.523382800902804e-07, "logits/chosen": -0.9184517860412598, "logits/rejected": -1.1293007135391235, "logps/chosen": -497.22930908203125, "logps/rejected": -318.4179382324219, "loss": 0.1635, "rewards/accuracies": 1.0, "rewards/chosen": -2.249415636062622, "rewards/margins": 13.987029075622559, "rewards/rejected": -16.2364444732666, "step": 537 }, { "epoch": 0.91, "learning_rate": 9.52024146795081e-07, "logits/chosen": -1.5992757081985474, "logits/rejected": -1.759086012840271, "logps/chosen": -143.24261474609375, "logps/rejected": -166.47059631347656, "loss": 0.1576, "rewards/accuracies": 1.0, "rewards/chosen": -0.6290268898010254, "rewards/margins": 6.777127742767334, "rewards/rejected": -7.406154632568359, "step": 538 }, { "epoch": 0.91, "learning_rate": 9.517090338464324e-07, "logits/chosen": -1.8976322412490845, "logits/rejected": -1.2034521102905273, "logps/chosen": -247.3936004638672, "logps/rejected": -549.493408203125, "loss": 0.1994, "rewards/accuracies": 0.0, "rewards/chosen": -0.5727836489677429, "rewards/margins": -0.548413872718811, "rewards/rejected": -0.024369820952415466, "step": 539 }, { "epoch": 0.91, "learning_rate": 9.51392941927266e-07, "logits/chosen": -1.9126521348953247, "logits/rejected": -1.6490702629089355, "logps/chosen": -350.37139892578125, "logps/rejected": -361.5531005859375, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": -0.25605812668800354, "rewards/margins": 1.3470127582550049, "rewards/rejected": -1.603070855140686, "step": 540 }, { "epoch": 0.91, "learning_rate": 9.510758717226351e-07, "logits/chosen": -1.297040581703186, "logits/rejected": -2.394585132598877, "logps/chosen": -318.63446044921875, "logps/rejected": -72.8540267944336, "loss": 0.1844, "rewards/accuracies": 0.5, "rewards/chosen": -0.1780809909105301, "rewards/margins": 1.2357361316680908, "rewards/rejected": -1.413817048072815, "step": 541 }, { "epoch": 0.91, "learning_rate": 9.507578239197125e-07, "logits/chosen": -1.683724284172058, "logits/rejected": -2.4566798210144043, "logps/chosen": -198.73486328125, "logps/rejected": -102.812255859375, "loss": 0.1964, "rewards/accuracies": 0.5, "rewards/chosen": -1.182813048362732, "rewards/margins": 3.7211146354675293, "rewards/rejected": -4.903927803039551, "step": 542 }, { "epoch": 0.92, "learning_rate": 9.504387992077906e-07, "logits/chosen": -1.7871900796890259, "logits/rejected": -2.684126377105713, "logps/chosen": -830.5599365234375, "logps/rejected": -110.56852722167969, "loss": 0.1876, "rewards/accuracies": 0.5, "rewards/chosen": -0.6030117273330688, "rewards/margins": 1.5069835186004639, "rewards/rejected": -2.1099953651428223, "step": 543 }, { "epoch": 0.92, "learning_rate": 9.501187982782784e-07, "logits/chosen": -1.393845558166504, "logits/rejected": -2.000667095184326, "logps/chosen": -319.4396667480469, "logps/rejected": -176.52003479003906, "loss": 0.1713, "rewards/accuracies": 1.0, "rewards/chosen": -0.090196892619133, "rewards/margins": 11.154555320739746, "rewards/rejected": -11.244751930236816, "step": 544 }, { "epoch": 0.92, "learning_rate": 9.497978218247012e-07, "logits/chosen": -1.0957725048065186, "logits/rejected": -1.6756318807601929, "logps/chosen": -336.61602783203125, "logps/rejected": -199.86419677734375, "loss": 0.1575, "rewards/accuracies": 1.0, "rewards/chosen": -0.11384735256433487, "rewards/margins": 7.248149871826172, "rewards/rejected": -7.361997127532959, "step": 545 }, { "epoch": 0.92, "learning_rate": 9.494758705426976e-07, "logits/chosen": -1.4453366994857788, "logits/rejected": -2.0633018016815186, "logps/chosen": -223.99911499023438, "logps/rejected": -254.41709899902344, "loss": 0.2214, "rewards/accuracies": 1.0, "rewards/chosen": -0.6794521808624268, "rewards/margins": 8.287046432495117, "rewards/rejected": -8.966497421264648, "step": 546 }, { "epoch": 0.92, "learning_rate": 9.491529451300199e-07, "logits/chosen": -0.6172839999198914, "logits/rejected": -0.6064785718917847, "logps/chosen": -255.4965362548828, "logps/rejected": -213.30917358398438, "loss": 0.2023, "rewards/accuracies": 1.0, "rewards/chosen": -1.845526099205017, "rewards/margins": 9.538248062133789, "rewards/rejected": -11.383773803710938, "step": 547 }, { "epoch": 0.92, "learning_rate": 9.48829046286531e-07, "logits/chosen": -1.9111276865005493, "logits/rejected": -2.0904080867767334, "logps/chosen": -132.1181640625, "logps/rejected": -127.7751235961914, "loss": 0.148, "rewards/accuracies": 1.0, "rewards/chosen": 0.23484620451927185, "rewards/margins": 7.176847457885742, "rewards/rejected": -6.9420013427734375, "step": 548 }, { "epoch": 0.93, "learning_rate": 9.485041747142033e-07, "logits/chosen": -1.5261189937591553, "logits/rejected": -1.6879130601882935, "logps/chosen": -272.7811279296875, "logps/rejected": -296.8751525878906, "loss": 0.1803, "rewards/accuracies": 1.0, "rewards/chosen": 0.28870850801467896, "rewards/margins": 5.03480339050293, "rewards/rejected": -4.746094703674316, "step": 549 }, { "epoch": 0.93, "learning_rate": 9.481783311171182e-07, "logits/chosen": -0.9267941117286682, "logits/rejected": -0.5881129503250122, "logps/chosen": -127.84021759033203, "logps/rejected": -182.4854736328125, "loss": 0.2082, "rewards/accuracies": 1.0, "rewards/chosen": 0.14871902763843536, "rewards/margins": 10.788543701171875, "rewards/rejected": -10.639824867248535, "step": 550 }, { "epoch": 0.93, "learning_rate": 9.478515162014628e-07, "logits/chosen": -0.44782042503356934, "logits/rejected": -0.7973653078079224, "logps/chosen": -449.32598876953125, "logps/rejected": -296.8812255859375, "loss": 0.2314, "rewards/accuracies": 1.0, "rewards/chosen": -0.5391403436660767, "rewards/margins": 8.781317710876465, "rewards/rejected": -9.32045841217041, "step": 551 }, { "epoch": 0.93, "learning_rate": 9.475237306755302e-07, "logits/chosen": -2.074190855026245, "logits/rejected": -1.8858578205108643, "logps/chosen": -194.01026916503906, "logps/rejected": -407.46392822265625, "loss": 0.1754, "rewards/accuracies": 1.0, "rewards/chosen": -0.1868438720703125, "rewards/margins": 1.9931914806365967, "rewards/rejected": -2.180035352706909, "step": 552 }, { "epoch": 0.93, "learning_rate": 9.471949752497159e-07, "logits/chosen": -1.0973830223083496, "logits/rejected": -1.0407088994979858, "logps/chosen": -387.7185363769531, "logps/rejected": -300.52764892578125, "loss": 0.2544, "rewards/accuracies": 1.0, "rewards/chosen": -1.558343529701233, "rewards/margins": 11.005720138549805, "rewards/rejected": -12.564064025878906, "step": 553 }, { "epoch": 0.93, "learning_rate": 9.468652506365186e-07, "logits/chosen": -0.8757210373878479, "logits/rejected": -0.9368937611579895, "logps/chosen": -13.649900436401367, "logps/rejected": -88.50205993652344, "loss": 0.1295, "rewards/accuracies": 1.0, "rewards/chosen": -0.2053590714931488, "rewards/margins": 5.989328384399414, "rewards/rejected": -6.1946868896484375, "step": 554 }, { "epoch": 0.94, "learning_rate": 9.465345575505365e-07, "logits/chosen": -1.1155973672866821, "logits/rejected": -1.2391748428344727, "logps/chosen": -156.2330322265625, "logps/rejected": -168.20909118652344, "loss": 0.1828, "rewards/accuracies": 1.0, "rewards/chosen": -0.7171291708946228, "rewards/margins": 8.67363452911377, "rewards/rejected": -9.390764236450195, "step": 555 }, { "epoch": 0.94, "learning_rate": 9.462028967084678e-07, "logits/chosen": -2.5009000301361084, "logits/rejected": -1.359316110610962, "logps/chosen": -88.84703063964844, "logps/rejected": -606.462158203125, "loss": 0.178, "rewards/accuracies": 1.0, "rewards/chosen": -1.0266014337539673, "rewards/margins": 5.369706153869629, "rewards/rejected": -6.396307468414307, "step": 556 }, { "epoch": 0.94, "learning_rate": 9.458702688291071e-07, "logits/chosen": -1.570953369140625, "logits/rejected": -1.3593248128890991, "logps/chosen": -759.3133544921875, "logps/rejected": -677.512939453125, "loss": 0.217, "rewards/accuracies": 1.0, "rewards/chosen": -0.607739269733429, "rewards/margins": 1.023413062095642, "rewards/rejected": -1.6311523914337158, "step": 557 }, { "epoch": 0.94, "learning_rate": 9.455366746333453e-07, "logits/chosen": -1.4975221157073975, "logits/rejected": -1.9799575805664062, "logps/chosen": -238.45115661621094, "logps/rejected": -126.26002502441406, "loss": 0.1842, "rewards/accuracies": 1.0, "rewards/chosen": -0.838726818561554, "rewards/margins": 0.9599689841270447, "rewards/rejected": -1.7986958026885986, "step": 558 }, { "epoch": 0.94, "learning_rate": 9.452021148441674e-07, "logits/chosen": -2.3967480659484863, "logits/rejected": -1.919357180595398, "logps/chosen": -183.41903686523438, "logps/rejected": -302.5267028808594, "loss": 0.1945, "rewards/accuracies": 0.5, "rewards/chosen": -0.8161871433258057, "rewards/margins": 1.833760142326355, "rewards/rejected": -2.649947166442871, "step": 559 }, { "epoch": 0.94, "learning_rate": 9.448665901866513e-07, "logits/chosen": -1.308738112449646, "logits/rejected": -1.4025413990020752, "logps/chosen": -420.19940185546875, "logps/rejected": -352.2750244140625, "loss": 0.2113, "rewards/accuracies": 1.0, "rewards/chosen": -1.1311897039413452, "rewards/margins": 6.021875858306885, "rewards/rejected": -7.1530656814575195, "step": 560 }, { "epoch": 0.95, "learning_rate": 9.445301013879656e-07, "logits/chosen": -1.9166899919509888, "logits/rejected": -1.5457202196121216, "logps/chosen": -236.6947479248047, "logps/rejected": -277.1458435058594, "loss": 0.1837, "rewards/accuracies": 1.0, "rewards/chosen": -1.5052580833435059, "rewards/margins": 12.255561828613281, "rewards/rejected": -13.760820388793945, "step": 561 }, { "epoch": 0.95, "learning_rate": 9.441926491773689e-07, "logits/chosen": -1.066957712173462, "logits/rejected": -1.658841848373413, "logps/chosen": -255.17095947265625, "logps/rejected": -266.31304931640625, "loss": 0.188, "rewards/accuracies": 0.5, "rewards/chosen": -0.5748581290245056, "rewards/margins": 0.31342390179634094, "rewards/rejected": -0.888282060623169, "step": 562 }, { "epoch": 0.95, "learning_rate": 9.438542342862075e-07, "logits/chosen": -1.350300908088684, "logits/rejected": -2.001142740249634, "logps/chosen": -323.86114501953125, "logps/rejected": -88.59988403320312, "loss": 0.1661, "rewards/accuracies": 1.0, "rewards/chosen": -0.39666903018951416, "rewards/margins": 5.692585468292236, "rewards/rejected": -6.089254856109619, "step": 563 }, { "epoch": 0.95, "learning_rate": 9.435148574479144e-07, "logits/chosen": -1.2990124225616455, "logits/rejected": -1.5610456466674805, "logps/chosen": -318.2091979980469, "logps/rejected": -176.14430236816406, "loss": 0.1185, "rewards/accuracies": 1.0, "rewards/chosen": -0.6469208002090454, "rewards/margins": 9.246925354003906, "rewards/rejected": -9.893845558166504, "step": 564 }, { "epoch": 0.95, "learning_rate": 9.431745193980068e-07, "logits/chosen": -1.8961067199707031, "logits/rejected": -1.6770416498184204, "logps/chosen": -308.45989990234375, "logps/rejected": -212.64981079101562, "loss": 0.1747, "rewards/accuracies": 0.5, "rewards/chosen": -2.435619831085205, "rewards/margins": 1.2442288398742676, "rewards/rejected": -3.6798486709594727, "step": 565 }, { "epoch": 0.95, "learning_rate": 9.428332208740857e-07, "logits/chosen": -0.562650740146637, "logits/rejected": -1.6526992321014404, "logps/chosen": -225.20419311523438, "logps/rejected": -123.10130310058594, "loss": 0.1821, "rewards/accuracies": 1.0, "rewards/chosen": -0.8313095569610596, "rewards/margins": 5.300149917602539, "rewards/rejected": -6.1314592361450195, "step": 566 }, { "epoch": 0.96, "learning_rate": 9.424909626158332e-07, "logits/chosen": -1.3460386991500854, "logits/rejected": -1.8868428468704224, "logps/chosen": -541.9500122070312, "logps/rejected": -282.341552734375, "loss": 0.1658, "rewards/accuracies": 1.0, "rewards/chosen": -2.592463970184326, "rewards/margins": 13.076055526733398, "rewards/rejected": -15.668519973754883, "step": 567 }, { "epoch": 0.96, "learning_rate": 9.421477453650117e-07, "logits/chosen": -1.3395308256149292, "logits/rejected": -2.100599527359009, "logps/chosen": -186.7269287109375, "logps/rejected": -117.01820373535156, "loss": 0.1475, "rewards/accuracies": 1.0, "rewards/chosen": -1.069182276725769, "rewards/margins": 6.430233955383301, "rewards/rejected": -7.499416351318359, "step": 568 }, { "epoch": 0.96, "learning_rate": 9.41803569865462e-07, "logits/chosen": -0.560535192489624, "logits/rejected": -0.5541735291481018, "logps/chosen": -407.7998962402344, "logps/rejected": -330.052001953125, "loss": 0.1814, "rewards/accuracies": 1.0, "rewards/chosen": -1.46812903881073, "rewards/margins": 9.190093994140625, "rewards/rejected": -10.658222198486328, "step": 569 }, { "epoch": 0.96, "learning_rate": 9.414584368631018e-07, "logits/chosen": -0.2689858376979828, "logits/rejected": -0.6253905892372131, "logps/chosen": -302.06964111328125, "logps/rejected": -183.74813842773438, "loss": 0.1558, "rewards/accuracies": 1.0, "rewards/chosen": -0.3590884506702423, "rewards/margins": 10.315533638000488, "rewards/rejected": -10.67462158203125, "step": 570 }, { "epoch": 0.96, "learning_rate": 9.411123471059232e-07, "logits/chosen": -1.1581571102142334, "logits/rejected": -1.2208583354949951, "logps/chosen": -386.98779296875, "logps/rejected": -328.5216064453125, "loss": 0.1739, "rewards/accuracies": 0.5, "rewards/chosen": -0.2733596861362457, "rewards/margins": 0.4076034724712372, "rewards/rejected": -0.6809631586074829, "step": 571 }, { "epoch": 0.96, "learning_rate": 9.407653013439927e-07, "logits/chosen": -1.914874792098999, "logits/rejected": -1.633007526397705, "logps/chosen": -196.557373046875, "logps/rejected": -189.84039306640625, "loss": 0.1601, "rewards/accuracies": 0.5, "rewards/chosen": -1.1366727352142334, "rewards/margins": 0.7186151742935181, "rewards/rejected": -1.8552879095077515, "step": 572 }, { "epoch": 0.97, "learning_rate": 9.404173003294485e-07, "logits/chosen": -1.0885298252105713, "logits/rejected": -1.620314359664917, "logps/chosen": -390.4488525390625, "logps/rejected": -219.35308837890625, "loss": 0.1959, "rewards/accuracies": 1.0, "rewards/chosen": -0.2608696222305298, "rewards/margins": 8.13377857208252, "rewards/rejected": -8.394648551940918, "step": 573 }, { "epoch": 0.97, "learning_rate": 9.400683448164986e-07, "logits/chosen": -1.303099274635315, "logits/rejected": -1.4152745008468628, "logps/chosen": -225.566650390625, "logps/rejected": -119.57087707519531, "loss": 0.1869, "rewards/accuracies": 1.0, "rewards/chosen": -1.179052710533142, "rewards/margins": 2.7054755687713623, "rewards/rejected": -3.884528160095215, "step": 574 }, { "epoch": 0.97, "learning_rate": 9.397184355614205e-07, "logits/chosen": -1.2585117816925049, "logits/rejected": -1.849805474281311, "logps/chosen": -247.82049560546875, "logps/rejected": -223.6149139404297, "loss": 0.1884, "rewards/accuracies": 0.5, "rewards/chosen": -0.9630714654922485, "rewards/margins": -0.37028050422668457, "rewards/rejected": -0.592790961265564, "step": 575 }, { "epoch": 0.97, "learning_rate": 9.393675733225576e-07, "logits/chosen": -2.14628005027771, "logits/rejected": -1.4466516971588135, "logps/chosen": -36.28599548339844, "logps/rejected": -156.9398651123047, "loss": 0.2136, "rewards/accuracies": 1.0, "rewards/chosen": -1.0615413188934326, "rewards/margins": 4.366576194763184, "rewards/rejected": -5.428117752075195, "step": 576 }, { "epoch": 0.97, "learning_rate": 9.390157588603201e-07, "logits/chosen": -1.2252075672149658, "logits/rejected": -1.802139401435852, "logps/chosen": -250.9557647705078, "logps/rejected": -223.22103881835938, "loss": 0.1422, "rewards/accuracies": 1.0, "rewards/chosen": -0.9214485287666321, "rewards/margins": 1.491241455078125, "rewards/rejected": -2.4126901626586914, "step": 577 }, { "epoch": 0.97, "learning_rate": 9.386629929371804e-07, "logits/chosen": -1.6203854084014893, "logits/rejected": -1.964041829109192, "logps/chosen": -233.42408752441406, "logps/rejected": -223.70074462890625, "loss": 0.1747, "rewards/accuracies": 1.0, "rewards/chosen": -0.3521421551704407, "rewards/margins": 12.760381698608398, "rewards/rejected": -13.112524032592773, "step": 578 }, { "epoch": 0.98, "learning_rate": 9.383092763176738e-07, "logits/chosen": -1.3921282291412354, "logits/rejected": -1.1160563230514526, "logps/chosen": -242.66835021972656, "logps/rejected": -384.66790771484375, "loss": 0.196, "rewards/accuracies": 1.0, "rewards/chosen": -1.593625783920288, "rewards/margins": 17.484588623046875, "rewards/rejected": -19.078216552734375, "step": 579 }, { "epoch": 0.98, "learning_rate": 9.379546097683962e-07, "logits/chosen": -1.1282007694244385, "logits/rejected": -1.4131447076797485, "logps/chosen": -137.69473266601562, "logps/rejected": -102.01011657714844, "loss": 0.1704, "rewards/accuracies": 1.0, "rewards/chosen": -0.27938154339790344, "rewards/margins": 0.8095016479492188, "rewards/rejected": -1.0888831615447998, "step": 580 }, { "epoch": 0.98, "learning_rate": 9.375989940580014e-07, "logits/chosen": -1.014125943183899, "logits/rejected": -0.8712400197982788, "logps/chosen": -333.4993896484375, "logps/rejected": -282.827880859375, "loss": 0.1759, "rewards/accuracies": 1.0, "rewards/chosen": -0.6404533386230469, "rewards/margins": 12.329509735107422, "rewards/rejected": -12.969963073730469, "step": 581 }, { "epoch": 0.98, "learning_rate": 9.372424299572013e-07, "logits/chosen": -1.137288212776184, "logits/rejected": -1.7328457832336426, "logps/chosen": -630.9047241210938, "logps/rejected": -265.96551513671875, "loss": 0.181, "rewards/accuracies": 1.0, "rewards/chosen": -0.6024795770645142, "rewards/margins": 13.53583812713623, "rewards/rejected": -14.138317108154297, "step": 582 }, { "epoch": 0.98, "learning_rate": 9.368849182387624e-07, "logits/chosen": -1.1567295789718628, "logits/rejected": -1.5108009576797485, "logps/chosen": -423.7227478027344, "logps/rejected": -177.82542419433594, "loss": 0.1898, "rewards/accuracies": 1.0, "rewards/chosen": -1.013415813446045, "rewards/margins": 5.7657599449157715, "rewards/rejected": -6.779175758361816, "step": 583 }, { "epoch": 0.98, "learning_rate": 9.365264596775051e-07, "logits/chosen": -1.427332878112793, "logits/rejected": -1.6145710945129395, "logps/chosen": -719.35400390625, "logps/rejected": -739.6787109375, "loss": 0.1735, "rewards/accuracies": 1.0, "rewards/chosen": -0.9353516101837158, "rewards/margins": 8.291926383972168, "rewards/rejected": -9.227277755737305, "step": 584 }, { "epoch": 0.99, "learning_rate": 9.361670550503024e-07, "logits/chosen": -0.7215293049812317, "logits/rejected": -1.509445071220398, "logps/chosen": -111.42332458496094, "logps/rejected": -59.27985763549805, "loss": 0.1395, "rewards/accuracies": 0.5, "rewards/chosen": -0.639510452747345, "rewards/margins": 3.702336549758911, "rewards/rejected": -4.341846942901611, "step": 585 }, { "epoch": 0.99, "learning_rate": 9.35806705136077e-07, "logits/chosen": -1.5403887033462524, "logits/rejected": -1.7206377983093262, "logps/chosen": -47.813472747802734, "logps/rejected": -116.00591278076172, "loss": 0.1867, "rewards/accuracies": 1.0, "rewards/chosen": -1.1005914211273193, "rewards/margins": 2.5221846103668213, "rewards/rejected": -3.6227760314941406, "step": 586 }, { "epoch": 0.99, "learning_rate": 9.354454107158003e-07, "logits/chosen": -1.4039901494979858, "logits/rejected": -1.2692376375198364, "logps/chosen": -35.25469207763672, "logps/rejected": -236.85321044921875, "loss": 0.1741, "rewards/accuracies": 1.0, "rewards/chosen": -0.9473680853843689, "rewards/margins": 11.905098915100098, "rewards/rejected": -12.852466583251953, "step": 587 }, { "epoch": 0.99, "learning_rate": 9.350831725724915e-07, "logits/chosen": -1.3343538045883179, "logits/rejected": -1.4938243627548218, "logps/chosen": -328.1713562011719, "logps/rejected": -222.6204376220703, "loss": 0.2049, "rewards/accuracies": 1.0, "rewards/chosen": -1.934698462486267, "rewards/margins": 7.677361011505127, "rewards/rejected": -9.612059593200684, "step": 588 }, { "epoch": 0.99, "learning_rate": 9.347199914912139e-07, "logits/chosen": -1.4835619926452637, "logits/rejected": -1.2078652381896973, "logps/chosen": -425.0323791503906, "logps/rejected": -336.0191955566406, "loss": 0.1452, "rewards/accuracies": 1.0, "rewards/chosen": -1.6457017660140991, "rewards/margins": 12.339007377624512, "rewards/rejected": -13.984708786010742, "step": 589 }, { "epoch": 0.99, "learning_rate": 9.343558682590755e-07, "logits/chosen": -1.440277338027954, "logits/rejected": -1.2249367237091064, "logps/chosen": -213.87991333007812, "logps/rejected": -248.79725646972656, "loss": 0.1673, "rewards/accuracies": 1.0, "rewards/chosen": -1.332812786102295, "rewards/margins": 9.270461082458496, "rewards/rejected": -10.60327434539795, "step": 590 }, { "epoch": 1.0, "learning_rate": 9.339908036652254e-07, "logits/chosen": -1.4295828342437744, "logits/rejected": -0.9476127624511719, "logps/chosen": -167.48570251464844, "logps/rejected": -194.9608154296875, "loss": 0.1506, "rewards/accuracies": 1.0, "rewards/chosen": 0.5273482799530029, "rewards/margins": 11.93600845336914, "rewards/rejected": -11.408660888671875, "step": 591 }, { "epoch": 1.0, "learning_rate": 9.336247985008533e-07, "logits/chosen": -1.3463596105575562, "logits/rejected": -1.6185994148254395, "logps/chosen": -462.23626708984375, "logps/rejected": -235.42453002929688, "loss": 0.1343, "rewards/accuracies": 1.0, "rewards/chosen": -0.8855430483818054, "rewards/margins": 5.946411609649658, "rewards/rejected": -6.831954479217529, "step": 592 }, { "epoch": 1.0, "learning_rate": 9.33257853559187e-07, "logits/chosen": -1.8037328720092773, "logits/rejected": -1.4643278121948242, "logps/chosen": -556.474609375, "logps/rejected": -393.30126953125, "loss": 0.2315, "rewards/accuracies": 1.0, "rewards/chosen": 2.759026527404785, "rewards/margins": 9.786253929138184, "rewards/rejected": -7.02722692489624, "step": 593 } ], "logging_steps": 1.0, "max_steps": 2372, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }