{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 2000, "global_step": 5733, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005232862375719519, "grad_norm": 94.09188061885796, "learning_rate": 8.710801393728223e-10, "logits/chosen": -3.5152194499969482, "logits/rejected": -3.4632656574249268, "logps/chosen": -481.34503173828125, "logps/rejected": -587.6341552734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0052328623757195184, "grad_norm": 91.74388222886799, "learning_rate": 8.710801393728223e-09, "logits/chosen": -3.083644390106201, "logits/rejected": -3.0897269248962402, "logps/chosen": -324.0055847167969, "logps/rejected": -271.4593505859375, "loss": 0.6907, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.008788698352873325, "rewards/margins": -0.0033310672733932734, "rewards/rejected": 0.012119765393435955, "step": 10 }, { "epoch": 0.010465724751439037, "grad_norm": 83.43221759552014, "learning_rate": 1.7421602787456446e-08, "logits/chosen": -3.082770347595215, "logits/rejected": -3.082531690597534, "logps/chosen": -218.58328247070312, "logps/rejected": -207.92660522460938, "loss": 0.6918, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0014115041121840477, "rewards/margins": -0.0020067021250724792, "rewards/rejected": 0.0005951978964731097, "step": 20 }, { "epoch": 0.015698587127158554, "grad_norm": 84.2644965690558, "learning_rate": 2.6132404181184667e-08, "logits/chosen": -2.9581072330474854, "logits/rejected": -2.9381983280181885, "logps/chosen": -292.78509521484375, "logps/rejected": -271.8923034667969, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0032533351331949234, "rewards/margins": 0.008476363494992256, "rewards/rejected": -0.01172969862818718, "step": 30 }, { "epoch": 0.020931449502878074, "grad_norm": 90.5584684043173, "learning_rate": 3.484320557491289e-08, "logits/chosen": -3.170558214187622, "logits/rejected": -3.0824408531188965, "logps/chosen": -311.3786315917969, "logps/rejected": -300.3707580566406, "loss": 0.6949, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0032117608934640884, "rewards/margins": -0.00805128924548626, "rewards/rejected": 0.004839526955038309, "step": 40 }, { "epoch": 0.026164311878597593, "grad_norm": 94.41404839002357, "learning_rate": 4.355400696864111e-08, "logits/chosen": -3.1887145042419434, "logits/rejected": -3.0733883380889893, "logps/chosen": -290.10211181640625, "logps/rejected": -265.37530517578125, "loss": 0.6921, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.008720803074538708, "rewards/margins": 0.014362648129463196, "rewards/rejected": -0.005641847383230925, "step": 50 }, { "epoch": 0.03139717425431711, "grad_norm": 90.28684772947435, "learning_rate": 5.2264808362369334e-08, "logits/chosen": -3.0607616901397705, "logits/rejected": -3.0361111164093018, "logps/chosen": -264.2647705078125, "logps/rejected": -259.57171630859375, "loss": 0.6904, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0014693100238218904, "rewards/margins": 0.004146987106651068, "rewards/rejected": -0.00267767789773643, "step": 60 }, { "epoch": 0.03663003663003663, "grad_norm": 99.26055552405298, "learning_rate": 6.097560975609756e-08, "logits/chosen": -3.1867499351501465, "logits/rejected": -3.1289267539978027, "logps/chosen": -321.8197021484375, "logps/rejected": -259.8748474121094, "loss": 0.6953, "rewards/accuracies": 0.5, "rewards/chosen": 0.004617846105247736, "rewards/margins": -0.003396450076252222, "rewards/rejected": 0.008014296181499958, "step": 70 }, { "epoch": 0.04186289900575615, "grad_norm": 90.14672153405446, "learning_rate": 6.968641114982578e-08, "logits/chosen": -3.1180217266082764, "logits/rejected": -3.09037446975708, "logps/chosen": -297.14007568359375, "logps/rejected": -296.79644775390625, "loss": 0.6945, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.010294931009411812, "rewards/margins": -0.023982787504792213, "rewards/rejected": 0.013687856495380402, "step": 80 }, { "epoch": 0.04709576138147567, "grad_norm": 88.35453849925601, "learning_rate": 7.8397212543554e-08, "logits/chosen": -3.1341259479522705, "logits/rejected": -3.09688663482666, "logps/chosen": -238.9769287109375, "logps/rejected": -209.0316162109375, "loss": 0.6918, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005319604650139809, "rewards/margins": 0.021374234929680824, "rewards/rejected": -0.016054632142186165, "step": 90 }, { "epoch": 0.052328623757195186, "grad_norm": 84.63637366938829, "learning_rate": 8.710801393728223e-08, "logits/chosen": -3.140672206878662, "logits/rejected": -3.0360054969787598, "logps/chosen": -216.46337890625, "logps/rejected": -193.71432495117188, "loss": 0.691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.009584425017237663, "rewards/margins": 0.005080898758023977, "rewards/rejected": 0.004503525793552399, "step": 100 }, { "epoch": 0.0575614861329147, "grad_norm": 84.72034956399011, "learning_rate": 9.581881533101045e-08, "logits/chosen": -3.0480360984802246, "logits/rejected": -2.956514358520508, "logps/chosen": -251.59280395507812, "logps/rejected": -191.42855834960938, "loss": 0.6898, "rewards/accuracies": 0.5, "rewards/chosen": -0.003254679497331381, "rewards/margins": -0.0065119327045977116, "rewards/rejected": 0.0032572527416050434, "step": 110 }, { "epoch": 0.06279434850863422, "grad_norm": 84.55670670425404, "learning_rate": 1.0452961672473867e-07, "logits/chosen": -3.188284158706665, "logits/rejected": -3.1968116760253906, "logps/chosen": -360.14764404296875, "logps/rejected": -331.3255920410156, "loss": 0.6902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.014872437343001366, "rewards/margins": 0.010815374553203583, "rewards/rejected": 0.00405706325545907, "step": 120 }, { "epoch": 0.06802721088435375, "grad_norm": 82.78170005116014, "learning_rate": 1.132404181184669e-07, "logits/chosen": -3.0354068279266357, "logits/rejected": -3.0372838973999023, "logps/chosen": -216.93643188476562, "logps/rejected": -230.98828125, "loss": 0.6851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.011127235367894173, "rewards/margins": 0.01821243017911911, "rewards/rejected": -0.007085192948579788, "step": 130 }, { "epoch": 0.07326007326007326, "grad_norm": 84.12201046654916, "learning_rate": 1.219512195121951e-07, "logits/chosen": -2.9695236682891846, "logits/rejected": -3.0021088123321533, "logps/chosen": -290.0289306640625, "logps/rejected": -298.0795593261719, "loss": 0.6796, "rewards/accuracies": 0.75, "rewards/chosen": 0.02694712020456791, "rewards/margins": 0.05211643502116203, "rewards/rejected": -0.025169318541884422, "step": 140 }, { "epoch": 0.07849293563579278, "grad_norm": 94.75694685913368, "learning_rate": 1.3066202090592334e-07, "logits/chosen": -3.1449203491210938, "logits/rejected": -3.051241397857666, "logps/chosen": -253.33447265625, "logps/rejected": -242.75143432617188, "loss": 0.6781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.015554627403616905, "rewards/margins": 0.026046495884656906, "rewards/rejected": -0.0104918722063303, "step": 150 }, { "epoch": 0.0837257980115123, "grad_norm": 88.35087856966929, "learning_rate": 1.3937282229965157e-07, "logits/chosen": -3.1425533294677734, "logits/rejected": -3.0324244499206543, "logps/chosen": -253.63150024414062, "logps/rejected": -229.47561645507812, "loss": 0.6782, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.021895864978432655, "rewards/margins": 0.05007987096905708, "rewards/rejected": -0.028184005990624428, "step": 160 }, { "epoch": 0.08895866038723181, "grad_norm": 95.40930863178984, "learning_rate": 1.480836236933798e-07, "logits/chosen": -3.1074182987213135, "logits/rejected": -3.1117656230926514, "logps/chosen": -227.1891326904297, "logps/rejected": -255.71444702148438, "loss": 0.6685, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.020874744281172752, "rewards/margins": 0.02632497251033783, "rewards/rejected": -0.005450229160487652, "step": 170 }, { "epoch": 0.09419152276295134, "grad_norm": 73.02435421234925, "learning_rate": 1.56794425087108e-07, "logits/chosen": -3.1123299598693848, "logits/rejected": -3.011928081512451, "logps/chosen": -285.3841552734375, "logps/rejected": -259.44586181640625, "loss": 0.6745, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005156415048986673, "rewards/margins": 0.047526873648166656, "rewards/rejected": -0.05268328636884689, "step": 180 }, { "epoch": 0.09942438513867086, "grad_norm": 93.21320242113306, "learning_rate": 1.6550522648083622e-07, "logits/chosen": -3.1230103969573975, "logits/rejected": -3.0207977294921875, "logps/chosen": -296.14117431640625, "logps/rejected": -251.43862915039062, "loss": 0.6568, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03524628281593323, "rewards/margins": 0.0699087381362915, "rewards/rejected": -0.034662459045648575, "step": 190 }, { "epoch": 0.10465724751439037, "grad_norm": 80.61959405986865, "learning_rate": 1.7421602787456445e-07, "logits/chosen": -3.0215725898742676, "logits/rejected": -2.9358713626861572, "logps/chosen": -265.2518005371094, "logps/rejected": -288.84405517578125, "loss": 0.65, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03169848769903183, "rewards/margins": 0.08158738166093826, "rewards/rejected": -0.04988889396190643, "step": 200 }, { "epoch": 0.10989010989010989, "grad_norm": 84.16399451644304, "learning_rate": 1.8292682926829268e-07, "logits/chosen": -3.0894827842712402, "logits/rejected": -3.0165505409240723, "logps/chosen": -206.22940063476562, "logps/rejected": -192.75497436523438, "loss": 0.6688, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.038443055003881454, "rewards/margins": 0.039960093796253204, "rewards/rejected": -0.0015170406550168991, "step": 210 }, { "epoch": 0.1151229722658294, "grad_norm": 76.55057412551906, "learning_rate": 1.916376306620209e-07, "logits/chosen": -3.1259095668792725, "logits/rejected": -3.079519271850586, "logps/chosen": -271.9474182128906, "logps/rejected": -240.37857055664062, "loss": 0.6542, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.18100936710834503, "rewards/margins": 0.21680958569049835, "rewards/rejected": -0.03580021113157272, "step": 220 }, { "epoch": 0.12035583464154893, "grad_norm": 75.06928660370438, "learning_rate": 2.003484320557491e-07, "logits/chosen": -3.12839937210083, "logits/rejected": -3.11124324798584, "logps/chosen": -239.37850952148438, "logps/rejected": -314.4399108886719, "loss": 0.6274, "rewards/accuracies": 0.75, "rewards/chosen": 0.06692560017108917, "rewards/margins": 0.1379128396511078, "rewards/rejected": -0.07098724693059921, "step": 230 }, { "epoch": 0.12558869701726844, "grad_norm": 83.96424767502477, "learning_rate": 2.0905923344947734e-07, "logits/chosen": -3.0928332805633545, "logits/rejected": -3.010830879211426, "logps/chosen": -222.9319305419922, "logps/rejected": -210.9583282470703, "loss": 0.6317, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.011095757596194744, "rewards/margins": 0.11776226758956909, "rewards/rejected": -0.10666650533676147, "step": 240 }, { "epoch": 0.13082155939298795, "grad_norm": 88.45904287007801, "learning_rate": 2.1777003484320556e-07, "logits/chosen": -3.022921085357666, "logits/rejected": -3.0192711353302, "logps/chosen": -283.39776611328125, "logps/rejected": -244.5492401123047, "loss": 0.6222, "rewards/accuracies": 0.625, "rewards/chosen": 0.0603029727935791, "rewards/margins": 0.20555946230888367, "rewards/rejected": -0.14525647461414337, "step": 250 }, { "epoch": 0.1360544217687075, "grad_norm": 89.31449552707612, "learning_rate": 2.264808362369338e-07, "logits/chosen": -3.120241165161133, "logits/rejected": -3.0511136054992676, "logps/chosen": -283.9732360839844, "logps/rejected": -263.3446350097656, "loss": 0.6379, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16783495247364044, "rewards/margins": 0.35026875138282776, "rewards/rejected": -0.1824338138103485, "step": 260 }, { "epoch": 0.141287284144427, "grad_norm": 68.26954277799912, "learning_rate": 2.3519163763066202e-07, "logits/chosen": -3.0991485118865967, "logits/rejected": -3.0397980213165283, "logps/chosen": -261.33587646484375, "logps/rejected": -265.8334655761719, "loss": 0.5996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04029225930571556, "rewards/margins": 0.1272924840450287, "rewards/rejected": -0.16758474707603455, "step": 270 }, { "epoch": 0.14652014652014653, "grad_norm": 90.51202320254713, "learning_rate": 2.439024390243902e-07, "logits/chosen": -3.043231248855591, "logits/rejected": -3.0575814247131348, "logps/chosen": -298.69793701171875, "logps/rejected": -281.7175598144531, "loss": 0.6436, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0799073800444603, "rewards/margins": 0.23833930492401123, "rewards/rejected": -0.15843191742897034, "step": 280 }, { "epoch": 0.15175300889586604, "grad_norm": 80.33390044279902, "learning_rate": 2.526132404181184e-07, "logits/chosen": -3.153942346572876, "logits/rejected": -3.0358548164367676, "logps/chosen": -312.6462707519531, "logps/rejected": -233.54074096679688, "loss": 0.6468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.003987653646618128, "rewards/margins": 0.18192686140537262, "rewards/rejected": -0.1859145164489746, "step": 290 }, { "epoch": 0.15698587127158556, "grad_norm": 78.31582974506166, "learning_rate": 2.613240418118467e-07, "logits/chosen": -3.138707160949707, "logits/rejected": -3.0932140350341797, "logps/chosen": -303.4789123535156, "logps/rejected": -249.81118774414062, "loss": 0.6224, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.019873926416039467, "rewards/margins": 0.21286074817180634, "rewards/rejected": -0.23273468017578125, "step": 300 }, { "epoch": 0.16221873364730507, "grad_norm": 73.73680319477415, "learning_rate": 2.700348432055749e-07, "logits/chosen": -3.1789348125457764, "logits/rejected": -3.066558361053467, "logps/chosen": -310.24810791015625, "logps/rejected": -288.8026123046875, "loss": 0.6273, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.20877079665660858, "rewards/margins": 0.382536381483078, "rewards/rejected": -0.17376558482646942, "step": 310 }, { "epoch": 0.1674515960230246, "grad_norm": 85.31591109907856, "learning_rate": 2.7874564459930313e-07, "logits/chosen": -3.182602882385254, "logits/rejected": -3.025256633758545, "logps/chosen": -321.3443298339844, "logps/rejected": -219.53018188476562, "loss": 0.6212, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.06027396395802498, "rewards/margins": 0.22985586524009705, "rewards/rejected": -0.2901298403739929, "step": 320 }, { "epoch": 0.1726844583987441, "grad_norm": 95.60825580213312, "learning_rate": 2.874564459930314e-07, "logits/chosen": -3.0407824516296387, "logits/rejected": -2.8936054706573486, "logps/chosen": -246.9007110595703, "logps/rejected": -193.61361694335938, "loss": 0.632, "rewards/accuracies": 0.625, "rewards/chosen": -0.05899695307016373, "rewards/margins": 0.19634512066841125, "rewards/rejected": -0.2553420662879944, "step": 330 }, { "epoch": 0.17791732077446362, "grad_norm": 93.2439319036819, "learning_rate": 2.961672473867596e-07, "logits/chosen": -3.131028890609741, "logits/rejected": -3.1440048217773438, "logps/chosen": -269.9167785644531, "logps/rejected": -297.08892822265625, "loss": 0.5946, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.027722906321287155, "rewards/margins": 0.2504483759403229, "rewards/rejected": -0.22272543609142303, "step": 340 }, { "epoch": 0.18315018315018314, "grad_norm": 83.11644785454808, "learning_rate": 3.048780487804878e-07, "logits/chosen": -3.0741794109344482, "logits/rejected": -3.142446756362915, "logps/chosen": -177.91314697265625, "logps/rejected": -214.43783569335938, "loss": 0.598, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.014001446776092052, "rewards/margins": 0.4443834722042084, "rewards/rejected": -0.43038201332092285, "step": 350 }, { "epoch": 0.18838304552590268, "grad_norm": 75.07825178789516, "learning_rate": 3.13588850174216e-07, "logits/chosen": -3.145319938659668, "logits/rejected": -3.0705819129943848, "logps/chosen": -254.92349243164062, "logps/rejected": -267.6507568359375, "loss": 0.5908, "rewards/accuracies": 0.625, "rewards/chosen": 0.01960715278983116, "rewards/margins": 0.33955109119415283, "rewards/rejected": -0.31994396448135376, "step": 360 }, { "epoch": 0.1936159079016222, "grad_norm": 84.54758581111935, "learning_rate": 3.2229965156794425e-07, "logits/chosen": -3.0986087322235107, "logits/rejected": -3.04323673248291, "logps/chosen": -286.7508850097656, "logps/rejected": -261.7186584472656, "loss": 0.5915, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16775548458099365, "rewards/margins": 0.3590541183948517, "rewards/rejected": -0.526809573173523, "step": 370 }, { "epoch": 0.1988487702773417, "grad_norm": 75.63626487688099, "learning_rate": 3.3101045296167245e-07, "logits/chosen": -2.993973731994629, "logits/rejected": -3.0626816749572754, "logps/chosen": -195.5089874267578, "logps/rejected": -320.49505615234375, "loss": 0.5938, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.04936353117227554, "rewards/margins": 0.42661842703819275, "rewards/rejected": -0.4759819507598877, "step": 380 }, { "epoch": 0.20408163265306123, "grad_norm": 74.150465826655, "learning_rate": 3.3972125435540065e-07, "logits/chosen": -3.115739345550537, "logits/rejected": -3.011378049850464, "logps/chosen": -377.07635498046875, "logps/rejected": -261.1158447265625, "loss": 0.5984, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.11010949313640594, "rewards/margins": 0.2554206848144531, "rewards/rejected": -0.14531120657920837, "step": 390 }, { "epoch": 0.20931449502878074, "grad_norm": 73.77557301101339, "learning_rate": 3.484320557491289e-07, "logits/chosen": -3.190718412399292, "logits/rejected": -3.083627700805664, "logps/chosen": -282.1058044433594, "logps/rejected": -242.8349609375, "loss": 0.5839, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.003936466760933399, "rewards/margins": 0.46391019225120544, "rewards/rejected": -0.46784669160842896, "step": 400 }, { "epoch": 0.21454735740450026, "grad_norm": 73.8062445245953, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -3.089789628982544, "logits/rejected": -3.0682833194732666, "logps/chosen": -272.4922790527344, "logps/rejected": -304.0185852050781, "loss": 0.6172, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0009228348499163985, "rewards/margins": 0.4525611400604248, "rewards/rejected": -0.45163828134536743, "step": 410 }, { "epoch": 0.21978021978021978, "grad_norm": 74.83558616876446, "learning_rate": 3.6585365853658536e-07, "logits/chosen": -3.1500418186187744, "logits/rejected": -3.1297078132629395, "logps/chosen": -270.31439208984375, "logps/rejected": -285.99566650390625, "loss": 0.5816, "rewards/accuracies": 0.625, "rewards/chosen": -0.17573900520801544, "rewards/margins": 0.23008713126182556, "rewards/rejected": -0.4058261513710022, "step": 420 }, { "epoch": 0.2250130821559393, "grad_norm": 84.39383647334913, "learning_rate": 3.7456445993031356e-07, "logits/chosen": -3.238851547241211, "logits/rejected": -3.1210849285125732, "logps/chosen": -299.7376403808594, "logps/rejected": -243.89425659179688, "loss": 0.586, "rewards/accuracies": 0.75, "rewards/chosen": 0.025929700583219528, "rewards/margins": 0.6629034876823425, "rewards/rejected": -0.6369737386703491, "step": 430 }, { "epoch": 0.2302459445316588, "grad_norm": 95.01347401286293, "learning_rate": 3.832752613240418e-07, "logits/chosen": -3.1209588050842285, "logits/rejected": -3.0972118377685547, "logps/chosen": -265.3486022949219, "logps/rejected": -244.28085327148438, "loss": 0.5989, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.165512353181839, "rewards/margins": 0.2939915657043457, "rewards/rejected": -0.4595039486885071, "step": 440 }, { "epoch": 0.23547880690737832, "grad_norm": 77.18736854674945, "learning_rate": 3.9198606271777e-07, "logits/chosen": -3.0002026557922363, "logits/rejected": -3.0352981090545654, "logps/chosen": -226.3055419921875, "logps/rejected": -235.90682983398438, "loss": 0.5476, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2797011733055115, "rewards/margins": 0.28589215874671936, "rewards/rejected": -0.5655933022499084, "step": 450 }, { "epoch": 0.24071166928309787, "grad_norm": 79.25310513642812, "learning_rate": 4.006968641114982e-07, "logits/chosen": -3.1194281578063965, "logits/rejected": -3.0288190841674805, "logps/chosen": -277.7894592285156, "logps/rejected": -257.56829833984375, "loss": 0.5857, "rewards/accuracies": 0.75, "rewards/chosen": -0.038360826671123505, "rewards/margins": 0.44273337721824646, "rewards/rejected": -0.4810941815376282, "step": 460 }, { "epoch": 0.24594453165881738, "grad_norm": 102.7869776024924, "learning_rate": 4.0940766550522647e-07, "logits/chosen": -3.1143012046813965, "logits/rejected": -3.095999240875244, "logps/chosen": -278.0996398925781, "logps/rejected": -281.44744873046875, "loss": 0.5557, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.20771941542625427, "rewards/margins": 0.40010419487953186, "rewards/rejected": -0.6078236699104309, "step": 470 }, { "epoch": 0.25117739403453687, "grad_norm": 91.2064519756057, "learning_rate": 4.1811846689895467e-07, "logits/chosen": -3.12644624710083, "logits/rejected": -3.1345064640045166, "logps/chosen": -234.6929168701172, "logps/rejected": -231.0890350341797, "loss": 0.5798, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0074354312382638454, "rewards/margins": 0.6263734698295593, "rewards/rejected": -0.6189380288124084, "step": 480 }, { "epoch": 0.2564102564102564, "grad_norm": 74.90399686002044, "learning_rate": 4.268292682926829e-07, "logits/chosen": -3.1197152137756348, "logits/rejected": -3.1123099327087402, "logps/chosen": -282.6683654785156, "logps/rejected": -250.9263153076172, "loss": 0.5737, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.027071375399827957, "rewards/margins": 0.4689862132072449, "rewards/rejected": -0.49605756998062134, "step": 490 }, { "epoch": 0.2616431187859759, "grad_norm": 83.83371174908929, "learning_rate": 4.3554006968641113e-07, "logits/chosen": -3.095078229904175, "logits/rejected": -3.1061666011810303, "logps/chosen": -249.3391571044922, "logps/rejected": -266.2730712890625, "loss": 0.5424, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.06339094787836075, "rewards/margins": 0.7088472247123718, "rewards/rejected": -0.7722381353378296, "step": 500 }, { "epoch": 0.2668759811616955, "grad_norm": 100.36296016013735, "learning_rate": 4.442508710801394e-07, "logits/chosen": -3.1972177028656006, "logits/rejected": -3.0943140983581543, "logps/chosen": -276.8199462890625, "logps/rejected": -247.34414672851562, "loss": 0.5761, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12653465569019318, "rewards/margins": 0.39958837628364563, "rewards/rejected": -0.5261229872703552, "step": 510 }, { "epoch": 0.272108843537415, "grad_norm": 81.94285577912619, "learning_rate": 4.529616724738676e-07, "logits/chosen": -3.141080617904663, "logits/rejected": -3.128713369369507, "logps/chosen": -259.6761474609375, "logps/rejected": -280.2798767089844, "loss": 0.5693, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.2839200496673584, "rewards/margins": 0.2823529541492462, "rewards/rejected": -0.5662729740142822, "step": 520 }, { "epoch": 0.2773417059131345, "grad_norm": 72.20052209070532, "learning_rate": 4.616724738675958e-07, "logits/chosen": -3.0837650299072266, "logits/rejected": -3.0768468379974365, "logps/chosen": -323.7898864746094, "logps/rejected": -244.2240753173828, "loss": 0.56, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.007369181606918573, "rewards/margins": 0.6912291646003723, "rewards/rejected": -0.6838599443435669, "step": 530 }, { "epoch": 0.282574568288854, "grad_norm": 96.12148617089743, "learning_rate": 4.7038327526132404e-07, "logits/chosen": -3.0696797370910645, "logits/rejected": -3.0386407375335693, "logps/chosen": -291.853759765625, "logps/rejected": -273.1394348144531, "loss": 0.5381, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3618549704551697, "rewards/margins": 0.5237946510314941, "rewards/rejected": -0.8856496810913086, "step": 540 }, { "epoch": 0.28780743066457354, "grad_norm": 86.38759011309443, "learning_rate": 4.790940766550523e-07, "logits/chosen": -3.186180830001831, "logits/rejected": -3.0945451259613037, "logps/chosen": -324.6902770996094, "logps/rejected": -275.7088928222656, "loss": 0.6049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20764949917793274, "rewards/margins": 0.571395218372345, "rewards/rejected": -0.7790447473526001, "step": 550 }, { "epoch": 0.29304029304029305, "grad_norm": 64.90968452127568, "learning_rate": 4.878048780487804e-07, "logits/chosen": -3.187990427017212, "logits/rejected": -3.145301580429077, "logps/chosen": -280.2078552246094, "logps/rejected": -266.88690185546875, "loss": 0.5398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04117468744516373, "rewards/margins": 0.7495439648628235, "rewards/rejected": -0.7907186150550842, "step": 560 }, { "epoch": 0.29827315541601257, "grad_norm": 77.69080183524072, "learning_rate": 4.965156794425087e-07, "logits/chosen": -3.243943452835083, "logits/rejected": -3.134255886077881, "logps/chosen": -273.4155578613281, "logps/rejected": -252.31005859375, "loss": 0.5557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13792729377746582, "rewards/margins": 0.6078609228134155, "rewards/rejected": -0.7457882165908813, "step": 570 }, { "epoch": 0.3035060177917321, "grad_norm": 76.4997152946134, "learning_rate": 4.999983312905696e-07, "logits/chosen": -3.1896326541900635, "logits/rejected": -3.114757537841797, "logps/chosen": -308.5732421875, "logps/rejected": -217.47811889648438, "loss": 0.5587, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.29867416620254517, "rewards/margins": 0.4343906342983246, "rewards/rejected": -0.7330647706985474, "step": 580 }, { "epoch": 0.3087388801674516, "grad_norm": 72.32572882520442, "learning_rate": 4.999881337025014e-07, "logits/chosen": -3.0399203300476074, "logits/rejected": -3.050323963165283, "logps/chosen": -214.6151580810547, "logps/rejected": -218.504638671875, "loss": 0.5723, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3614969849586487, "rewards/margins": 0.31036117672920227, "rewards/rejected": -0.6718581914901733, "step": 590 }, { "epoch": 0.3139717425431711, "grad_norm": 74.26464040105907, "learning_rate": 4.999686659648518e-07, "logits/chosen": -3.0948026180267334, "logits/rejected": -3.0892751216888428, "logps/chosen": -274.90972900390625, "logps/rejected": -267.99945068359375, "loss": 0.5872, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1589413434267044, "rewards/margins": 0.40015944838523865, "rewards/rejected": -0.5591008067131042, "step": 600 }, { "epoch": 0.31920460491889063, "grad_norm": 78.19427329028585, "learning_rate": 4.999399287995302e-07, "logits/chosen": -3.148561716079712, "logits/rejected": -3.0622525215148926, "logps/chosen": -197.9312286376953, "logps/rejected": -218.32327270507812, "loss": 0.5267, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17100146412849426, "rewards/margins": 0.5416156053543091, "rewards/rejected": -0.712617039680481, "step": 610 }, { "epoch": 0.32443746729461015, "grad_norm": 76.66402123868217, "learning_rate": 4.999019232721791e-07, "logits/chosen": -3.204472780227661, "logits/rejected": -3.0419440269470215, "logps/chosen": -347.6180419921875, "logps/rejected": -214.8529815673828, "loss": 0.5698, "rewards/accuracies": 0.75, "rewards/chosen": 0.022726038470864296, "rewards/margins": 0.819332480430603, "rewards/rejected": -0.7966063618659973, "step": 620 }, { "epoch": 0.32967032967032966, "grad_norm": 68.7033322801817, "learning_rate": 4.998546507921325e-07, "logits/chosen": -3.0578935146331787, "logits/rejected": -3.0714313983917236, "logps/chosen": -220.4261932373047, "logps/rejected": -270.59967041015625, "loss": 0.5939, "rewards/accuracies": 0.75, "rewards/chosen": -0.3333097994327545, "rewards/margins": 0.6263474822044373, "rewards/rejected": -0.9596571922302246, "step": 630 }, { "epoch": 0.3349031920460492, "grad_norm": 81.37928076282607, "learning_rate": 4.997981131123656e-07, "logits/chosen": -3.1487958431243896, "logits/rejected": -3.0738918781280518, "logps/chosen": -278.1891174316406, "logps/rejected": -295.7309265136719, "loss": 0.5479, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1152644008398056, "rewards/margins": 0.8578460812568665, "rewards/rejected": -0.973110556602478, "step": 640 }, { "epoch": 0.3401360544217687, "grad_norm": 70.73272063848962, "learning_rate": 4.997323123294291e-07, "logits/chosen": -3.1260294914245605, "logits/rejected": -3.0860161781311035, "logps/chosen": -259.814208984375, "logps/rejected": -247.1504669189453, "loss": 0.5491, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1367429792881012, "rewards/margins": 0.7705110907554626, "rewards/rejected": -0.9072540402412415, "step": 650 }, { "epoch": 0.3453689167974882, "grad_norm": 77.96162350079605, "learning_rate": 4.99657250883371e-07, "logits/chosen": -3.1087183952331543, "logits/rejected": -3.0831079483032227, "logps/chosen": -232.91452026367188, "logps/rejected": -226.20156860351562, "loss": 0.5577, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.31011492013931274, "rewards/margins": 0.43065014481544495, "rewards/rejected": -0.7407650351524353, "step": 660 }, { "epoch": 0.35060177917320773, "grad_norm": 90.00421072830814, "learning_rate": 4.995729315576468e-07, "logits/chosen": -3.025679111480713, "logits/rejected": -2.999988079071045, "logps/chosen": -260.24688720703125, "logps/rejected": -235.12960815429688, "loss": 0.5598, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.29311051964759827, "rewards/margins": 0.5130165219306946, "rewards/rejected": -0.8061269521713257, "step": 670 }, { "epoch": 0.35583464154892724, "grad_norm": 68.9312023558885, "learning_rate": 4.99479357479016e-07, "logits/chosen": -2.976226329803467, "logits/rejected": -2.946054220199585, "logps/chosen": -240.30581665039062, "logps/rejected": -217.4703369140625, "loss": 0.551, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4448089003562927, "rewards/margins": 0.5245271921157837, "rewards/rejected": -0.9693361520767212, "step": 680 }, { "epoch": 0.36106750392464676, "grad_norm": 67.7443101141294, "learning_rate": 4.993765321174261e-07, "logits/chosen": -3.121796131134033, "logits/rejected": -3.045621633529663, "logps/chosen": -237.3647918701172, "logps/rejected": -228.0232391357422, "loss": 0.5157, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.09849713742733002, "rewards/margins": 0.6997938752174377, "rewards/rejected": -0.7982910871505737, "step": 690 }, { "epoch": 0.3663003663003663, "grad_norm": 75.25657103526551, "learning_rate": 4.992644592858842e-07, "logits/chosen": -3.0317349433898926, "logits/rejected": -3.018582344055176, "logps/chosen": -250.8370361328125, "logps/rejected": -234.2800750732422, "loss": 0.5621, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.37291091680526733, "rewards/margins": 0.4760715961456299, "rewards/rejected": -0.8489826321601868, "step": 700 }, { "epoch": 0.3715332286760858, "grad_norm": 81.21522266445194, "learning_rate": 4.991431431403148e-07, "logits/chosen": -3.0781257152557373, "logits/rejected": -3.0017409324645996, "logps/chosen": -324.48162841796875, "logps/rejected": -275.28948974609375, "loss": 0.5044, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2106948345899582, "rewards/margins": 0.7955836057662964, "rewards/rejected": -1.006278395652771, "step": 710 }, { "epoch": 0.37676609105180536, "grad_norm": 80.06562430408891, "learning_rate": 4.99012588179407e-07, "logits/chosen": -2.9837193489074707, "logits/rejected": -3.0294525623321533, "logps/chosen": -212.5444793701172, "logps/rejected": -228.88558959960938, "loss": 0.5278, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.20186667144298553, "rewards/margins": 0.7094992399215698, "rewards/rejected": -0.9113659858703613, "step": 720 }, { "epoch": 0.3819989534275249, "grad_norm": 85.78497349511593, "learning_rate": 4.988727992444467e-07, "logits/chosen": -3.043053388595581, "logits/rejected": -3.067479372024536, "logps/chosen": -260.4826965332031, "logps/rejected": -297.13818359375, "loss": 0.5579, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4206086993217468, "rewards/margins": 0.8961846232414246, "rewards/rejected": -1.316793441772461, "step": 730 }, { "epoch": 0.3872318158032444, "grad_norm": 84.87229477123647, "learning_rate": 4.98723781519137e-07, "logits/chosen": -3.0174295902252197, "logits/rejected": -3.047327995300293, "logps/chosen": -235.64041137695312, "logps/rejected": -218.7727813720703, "loss": 0.5214, "rewards/accuracies": 0.625, "rewards/chosen": -0.36899125576019287, "rewards/margins": 0.4752073884010315, "rewards/rejected": -0.8441985249519348, "step": 740 }, { "epoch": 0.3924646781789639, "grad_norm": 89.93943077074505, "learning_rate": 4.98565540529407e-07, "logits/chosen": -2.968883752822876, "logits/rejected": -2.917454957962036, "logps/chosen": -273.8594055175781, "logps/rejected": -295.1705017089844, "loss": 0.5121, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.20952562987804413, "rewards/margins": 0.5124503374099731, "rewards/rejected": -0.7219759225845337, "step": 750 }, { "epoch": 0.3976975405546834, "grad_norm": 79.23809197029273, "learning_rate": 4.983980821432054e-07, "logits/chosen": -2.989825963973999, "logits/rejected": -2.9519991874694824, "logps/chosen": -221.87802124023438, "logps/rejected": -195.59841918945312, "loss": 0.5761, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.29332882165908813, "rewards/margins": 0.5967585444450378, "rewards/rejected": -0.8900874853134155, "step": 760 }, { "epoch": 0.40293040293040294, "grad_norm": 67.58874201985374, "learning_rate": 4.982214125702845e-07, "logits/chosen": -2.992891788482666, "logits/rejected": -2.971160411834717, "logps/chosen": -237.1896209716797, "logps/rejected": -279.85540771484375, "loss": 0.5946, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.507228434085846, "rewards/margins": 0.7127504944801331, "rewards/rejected": -1.2199790477752686, "step": 770 }, { "epoch": 0.40816326530612246, "grad_norm": 97.12007893070077, "learning_rate": 4.980355383619684e-07, "logits/chosen": -2.982445001602173, "logits/rejected": -2.9643616676330566, "logps/chosen": -235.02371215820312, "logps/rejected": -204.9355926513672, "loss": 0.5384, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4040898382663727, "rewards/margins": 0.788711428642273, "rewards/rejected": -1.1928012371063232, "step": 780 }, { "epoch": 0.413396127681842, "grad_norm": 88.27200271596408, "learning_rate": 4.978404664109113e-07, "logits/chosen": -2.9764885902404785, "logits/rejected": -2.9915101528167725, "logps/chosen": -219.7651824951172, "logps/rejected": -287.2875061035156, "loss": 0.525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5371644496917725, "rewards/margins": 0.45865750312805176, "rewards/rejected": -0.9958218336105347, "step": 790 }, { "epoch": 0.4186289900575615, "grad_norm": 96.3472634457525, "learning_rate": 4.97636203950841e-07, "logits/chosen": -2.9858834743499756, "logits/rejected": -2.9834558963775635, "logps/chosen": -291.88592529296875, "logps/rejected": -300.13739013671875, "loss": 0.6064, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6125101447105408, "rewards/margins": 0.6153772473335266, "rewards/rejected": -1.2278873920440674, "step": 800 }, { "epoch": 0.423861852433281, "grad_norm": 83.05896827107863, "learning_rate": 4.974227585562916e-07, "logits/chosen": -2.978752613067627, "logits/rejected": -2.9068539142608643, "logps/chosen": -298.07830810546875, "logps/rejected": -265.3194580078125, "loss": 0.5551, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6034780740737915, "rewards/margins": 0.6695935130119324, "rewards/rejected": -1.2730716466903687, "step": 810 }, { "epoch": 0.4290947148090005, "grad_norm": 77.69593080451958, "learning_rate": 4.972001381423214e-07, "logits/chosen": -3.0350236892700195, "logits/rejected": -2.9831128120422363, "logps/chosen": -280.2365417480469, "logps/rejected": -236.3062286376953, "loss": 0.5573, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6114312410354614, "rewards/margins": 0.8157623410224915, "rewards/rejected": -1.4271936416625977, "step": 820 }, { "epoch": 0.43432757718472004, "grad_norm": 85.74996087035125, "learning_rate": 4.969683509642206e-07, "logits/chosen": -3.1068029403686523, "logits/rejected": -3.043793201446533, "logps/chosen": -230.5146026611328, "logps/rejected": -232.333984375, "loss": 0.6085, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4987451434135437, "rewards/margins": 0.6120332479476929, "rewards/rejected": -1.1107784509658813, "step": 830 }, { "epoch": 0.43956043956043955, "grad_norm": 79.95354979497696, "learning_rate": 4.967274056172044e-07, "logits/chosen": -3.126960277557373, "logits/rejected": -2.9165122509002686, "logps/chosen": -390.7674560546875, "logps/rejected": -286.17449951171875, "loss": 0.5517, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.43036970496177673, "rewards/margins": 0.874958872795105, "rewards/rejected": -1.305328607559204, "step": 840 }, { "epoch": 0.44479330193615907, "grad_norm": 90.80812680642141, "learning_rate": 4.964773110360944e-07, "logits/chosen": -3.0085933208465576, "logits/rejected": -2.8877360820770264, "logps/chosen": -246.94723510742188, "logps/rejected": -219.91226196289062, "loss": 0.583, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5112579464912415, "rewards/margins": 0.5499164462089539, "rewards/rejected": -1.0611745119094849, "step": 850 }, { "epoch": 0.4500261643118786, "grad_norm": 84.075944520367, "learning_rate": 4.962180764949876e-07, "logits/chosen": -3.0289928913116455, "logits/rejected": -3.0150904655456543, "logps/chosen": -180.94595336914062, "logps/rejected": -255.3822784423828, "loss": 0.5456, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12146220356225967, "rewards/margins": 1.0188446044921875, "rewards/rejected": -1.1403067111968994, "step": 860 }, { "epoch": 0.4552590266875981, "grad_norm": 64.54535660894318, "learning_rate": 4.959497116069122e-07, "logits/chosen": -2.7720274925231934, "logits/rejected": -2.821364164352417, "logps/chosen": -217.51809692382812, "logps/rejected": -224.5338897705078, "loss": 0.5592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4087623655796051, "rewards/margins": 0.6672945022583008, "rewards/rejected": -1.0760568380355835, "step": 870 }, { "epoch": 0.4604918890633176, "grad_norm": 79.02185341275873, "learning_rate": 4.956722263234711e-07, "logits/chosen": -3.0200018882751465, "logits/rejected": -2.999293804168701, "logps/chosen": -263.44854736328125, "logps/rejected": -247.01296997070312, "loss": 0.5074, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5074370503425598, "rewards/margins": 0.5145549774169922, "rewards/rejected": -1.0219919681549072, "step": 880 }, { "epoch": 0.46572475143903713, "grad_norm": 83.58258828958799, "learning_rate": 4.95385630934473e-07, "logits/chosen": -3.0649561882019043, "logits/rejected": -3.024348020553589, "logps/chosen": -300.5052795410156, "logps/rejected": -234.9149932861328, "loss": 0.5411, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2754116654396057, "rewards/margins": 0.6721813678741455, "rewards/rejected": -0.9475929141044617, "step": 890 }, { "epoch": 0.47095761381475665, "grad_norm": 100.19185940775421, "learning_rate": 4.950899360675511e-07, "logits/chosen": -2.9367594718933105, "logits/rejected": -2.907555103302002, "logps/chosen": -236.75027465820312, "logps/rejected": -292.83905029296875, "loss": 0.5266, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4334315359592438, "rewards/margins": 1.182762861251831, "rewards/rejected": -1.6161943674087524, "step": 900 }, { "epoch": 0.47619047619047616, "grad_norm": 102.33562117504174, "learning_rate": 4.947851526877681e-07, "logits/chosen": -2.9709973335266113, "logits/rejected": -2.9239044189453125, "logps/chosen": -171.56973266601562, "logps/rejected": -190.5438995361328, "loss": 0.5559, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.47417712211608887, "rewards/margins": 1.0355875492095947, "rewards/rejected": -1.509764552116394, "step": 910 }, { "epoch": 0.48142333856619574, "grad_norm": 81.34927981598014, "learning_rate": 4.944712920972108e-07, "logits/chosen": -3.076129198074341, "logits/rejected": -2.9686129093170166, "logps/chosen": -299.8473815917969, "logps/rejected": -247.28634643554688, "loss": 0.5559, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5494756102561951, "rewards/margins": 0.6717077493667603, "rewards/rejected": -1.2211834192276, "step": 920 }, { "epoch": 0.48665620094191525, "grad_norm": 87.61105183585724, "learning_rate": 4.9414836593457e-07, "logits/chosen": -2.972496509552002, "logits/rejected": -2.967925548553467, "logps/chosen": -262.71697998046875, "logps/rejected": -261.80926513671875, "loss": 0.534, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7075499296188354, "rewards/margins": 0.6287915110588074, "rewards/rejected": -1.3363415002822876, "step": 930 }, { "epoch": 0.49188906331763477, "grad_norm": 70.63692038737939, "learning_rate": 4.938163861747094e-07, "logits/chosen": -3.0296452045440674, "logits/rejected": -2.960433006286621, "logps/chosen": -290.5197448730469, "logps/rejected": -246.6784210205078, "loss": 0.4768, "rewards/accuracies": 0.75, "rewards/chosen": -0.5259759426116943, "rewards/margins": 0.9665981531143188, "rewards/rejected": -1.4925740957260132, "step": 940 }, { "epoch": 0.4971219256933543, "grad_norm": 76.86103804599567, "learning_rate": 4.934753651282215e-07, "logits/chosen": -2.9564368724823, "logits/rejected": -2.855729579925537, "logps/chosen": -280.39862060546875, "logps/rejected": -273.531494140625, "loss": 0.5178, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.39186516404151917, "rewards/margins": 1.0682334899902344, "rewards/rejected": -1.4600986242294312, "step": 950 }, { "epoch": 0.5023547880690737, "grad_norm": 78.27457899984758, "learning_rate": 4.93125315440971e-07, "logits/chosen": -3.047217845916748, "logits/rejected": -2.9591550827026367, "logps/chosen": -275.50018310546875, "logps/rejected": -271.0816345214844, "loss": 0.539, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6008744239807129, "rewards/margins": 0.8788958787918091, "rewards/rejected": -1.4797704219818115, "step": 960 }, { "epoch": 0.5075876504447933, "grad_norm": 90.1368608733719, "learning_rate": 4.92766250093626e-07, "logits/chosen": -2.944821834564209, "logits/rejected": -2.87223482131958, "logps/chosen": -286.7122497558594, "logps/rejected": -255.8455810546875, "loss": 0.5428, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6097656488418579, "rewards/margins": 1.3089778423309326, "rewards/rejected": -1.9187434911727905, "step": 970 }, { "epoch": 0.5128205128205128, "grad_norm": 57.42441636178167, "learning_rate": 4.92398182401176e-07, "logits/chosen": -3.04980206489563, "logits/rejected": -2.8831355571746826, "logps/chosen": -300.86956787109375, "logps/rejected": -244.74783325195312, "loss": 0.5036, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19930225610733032, "rewards/margins": 1.2830896377563477, "rewards/rejected": -1.4823919534683228, "step": 980 }, { "epoch": 0.5180533751962323, "grad_norm": 75.79879823434918, "learning_rate": 4.920211260124395e-07, "logits/chosen": -2.957986354827881, "logits/rejected": -2.893465757369995, "logps/chosen": -242.7764892578125, "logps/rejected": -228.1251220703125, "loss": 0.5288, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5788222551345825, "rewards/margins": 0.959934413433075, "rewards/rejected": -1.5387566089630127, "step": 990 }, { "epoch": 0.5232862375719518, "grad_norm": 83.5444177319047, "learning_rate": 4.916350949095566e-07, "logits/chosen": -3.0023560523986816, "logits/rejected": -2.9412941932678223, "logps/chosen": -229.6995849609375, "logps/rejected": -229.082275390625, "loss": 0.5584, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.70775306224823, "rewards/margins": 0.9178392291069031, "rewards/rejected": -1.6255924701690674, "step": 1000 }, { "epoch": 0.5285190999476713, "grad_norm": 70.36883726966389, "learning_rate": 4.912401034074708e-07, "logits/chosen": -2.9685754776000977, "logits/rejected": -2.956085205078125, "logps/chosen": -232.73098754882812, "logps/rejected": -260.23516845703125, "loss": 0.5401, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6321336030960083, "rewards/margins": 0.8737136721611023, "rewards/rejected": -1.5058472156524658, "step": 1010 }, { "epoch": 0.533751962323391, "grad_norm": 84.61192539313546, "learning_rate": 4.908361661533989e-07, "logits/chosen": -3.0166802406311035, "logits/rejected": -2.995523452758789, "logps/chosen": -284.29241943359375, "logps/rejected": -255.83047485351562, "loss": 0.5056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4262539744377136, "rewards/margins": 1.263817548751831, "rewards/rejected": -1.6900714635849, "step": 1020 }, { "epoch": 0.5389848246991105, "grad_norm": 72.71679571956443, "learning_rate": 4.904232981262866e-07, "logits/chosen": -2.997488498687744, "logits/rejected": -2.9504899978637695, "logps/chosen": -254.54287719726562, "logps/rejected": -222.4877471923828, "loss": 0.5683, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.9320276975631714, "rewards/margins": 0.3925037086009979, "rewards/rejected": -1.3245315551757812, "step": 1030 }, { "epoch": 0.54421768707483, "grad_norm": 72.41150985465931, "learning_rate": 4.900015146362544e-07, "logits/chosen": -3.0341286659240723, "logits/rejected": -3.08656907081604, "logps/chosen": -249.39901733398438, "logps/rejected": -267.85601806640625, "loss": 0.5664, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8237807154655457, "rewards/margins": 0.7084304094314575, "rewards/rejected": -1.5322110652923584, "step": 1040 }, { "epoch": 0.5494505494505495, "grad_norm": 91.57337744398069, "learning_rate": 4.895708313240285e-07, "logits/chosen": -3.079549789428711, "logits/rejected": -2.9854531288146973, "logps/chosen": -319.2317810058594, "logps/rejected": -298.8382568359375, "loss": 0.5347, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.34583228826522827, "rewards/margins": 1.3173162937164307, "rewards/rejected": -1.6631486415863037, "step": 1050 }, { "epoch": 0.554683411826269, "grad_norm": 95.90576654679334, "learning_rate": 4.891312641603623e-07, "logits/chosen": -2.9752936363220215, "logits/rejected": -2.981757640838623, "logps/chosen": -256.8917541503906, "logps/rejected": -274.5128173828125, "loss": 0.544, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.25574904680252075, "rewards/margins": 1.2427527904510498, "rewards/rejected": -1.4985018968582153, "step": 1060 }, { "epoch": 0.5599162742019885, "grad_norm": 66.29249203294205, "learning_rate": 4.886828294454426e-07, "logits/chosen": -3.0031514167785645, "logits/rejected": -3.005155563354492, "logps/chosen": -319.611328125, "logps/rejected": -273.65997314453125, "loss": 0.524, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3722835183143616, "rewards/margins": 0.9366200566291809, "rewards/rejected": -1.3089035749435425, "step": 1070 }, { "epoch": 0.565149136577708, "grad_norm": 83.89236886108212, "learning_rate": 4.882255438082863e-07, "logits/chosen": -3.058509349822998, "logits/rejected": -2.992572546005249, "logps/chosen": -229.7340545654297, "logps/rejected": -230.9755859375, "loss": 0.5518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5026660561561584, "rewards/margins": 0.8336219787597656, "rewards/rejected": -1.3362879753112793, "step": 1080 }, { "epoch": 0.5703819989534276, "grad_norm": 117.67919796778382, "learning_rate": 4.877594242061233e-07, "logits/chosen": -3.017270565032959, "logits/rejected": -2.8980987071990967, "logps/chosen": -300.8147888183594, "logps/rejected": -195.97146606445312, "loss": 0.5742, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7242558598518372, "rewards/margins": 0.48976564407348633, "rewards/rejected": -1.2140214443206787, "step": 1090 }, { "epoch": 0.5756148613291471, "grad_norm": 77.27033275419339, "learning_rate": 4.87284487923768e-07, "logits/chosen": -2.960627317428589, "logits/rejected": -2.91349458694458, "logps/chosen": -256.8774719238281, "logps/rejected": -278.6012268066406, "loss": 0.4771, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.47361594438552856, "rewards/margins": 0.9880229234695435, "rewards/rejected": -1.4616389274597168, "step": 1100 }, { "epoch": 0.5808477237048666, "grad_norm": 88.42790393719109, "learning_rate": 4.868007525729775e-07, "logits/chosen": -2.7881951332092285, "logits/rejected": -2.7863636016845703, "logps/chosen": -171.53135681152344, "logps/rejected": -204.23678588867188, "loss": 0.5735, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.3550284802913666, "rewards/margins": 0.9956822395324707, "rewards/rejected": -1.3507106304168701, "step": 1110 }, { "epoch": 0.5860805860805861, "grad_norm": 91.18513645524655, "learning_rate": 4.863082360917998e-07, "logits/chosen": -2.953455924987793, "logits/rejected": -2.907076358795166, "logps/chosen": -269.57958984375, "logps/rejected": -256.61138916015625, "loss": 0.4961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.44610539078712463, "rewards/margins": 0.767912745475769, "rewards/rejected": -1.2140181064605713, "step": 1120 }, { "epoch": 0.5913134484563056, "grad_norm": 88.14815824423434, "learning_rate": 4.858069567439072e-07, "logits/chosen": -2.8839964866638184, "logits/rejected": -2.843053102493286, "logps/chosen": -219.7151336669922, "logps/rejected": -267.60614013671875, "loss": 0.5507, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8401961326599121, "rewards/margins": 0.5585467219352722, "rewards/rejected": -1.398742914199829, "step": 1130 }, { "epoch": 0.5965463108320251, "grad_norm": 85.19304391255665, "learning_rate": 4.852969331179206e-07, "logits/chosen": -3.124087333679199, "logits/rejected": -3.0854485034942627, "logps/chosen": -253.47720336914062, "logps/rejected": -274.27716064453125, "loss": 0.5073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4598918855190277, "rewards/margins": 0.9162165522575378, "rewards/rejected": -1.3761085271835327, "step": 1140 }, { "epoch": 0.6017791732077447, "grad_norm": 100.16684770922006, "learning_rate": 4.847781841267185e-07, "logits/chosen": -3.0712294578552246, "logits/rejected": -2.9157419204711914, "logps/chosen": -264.02935791015625, "logps/rejected": -238.4131622314453, "loss": 0.5347, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4971913695335388, "rewards/margins": 0.9195780754089355, "rewards/rejected": -1.4167696237564087, "step": 1150 }, { "epoch": 0.6070120355834642, "grad_norm": 88.54740950290132, "learning_rate": 4.842507290067374e-07, "logits/chosen": -2.8672521114349365, "logits/rejected": -2.877729892730713, "logps/chosen": -213.7853546142578, "logps/rejected": -187.8013153076172, "loss": 0.5511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6637629866600037, "rewards/margins": 0.4064168334007263, "rewards/rejected": -1.07017982006073, "step": 1160 }, { "epoch": 0.6122448979591837, "grad_norm": 83.20806710425786, "learning_rate": 4.837145873172567e-07, "logits/chosen": -2.975207567214966, "logits/rejected": -2.9137356281280518, "logps/chosen": -265.36907958984375, "logps/rejected": -283.68890380859375, "loss": 0.5539, "rewards/accuracies": 0.875, "rewards/chosen": -0.2634466886520386, "rewards/margins": 1.391940712928772, "rewards/rejected": -1.655387282371521, "step": 1170 }, { "epoch": 0.6174777603349032, "grad_norm": 65.06976433254769, "learning_rate": 4.83169778939675e-07, "logits/chosen": -3.0333054065704346, "logits/rejected": -2.977327585220337, "logps/chosen": -306.73089599609375, "logps/rejected": -270.4570007324219, "loss": 0.4779, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.39636844396591187, "rewards/margins": 0.710414469242096, "rewards/rejected": -1.1067829132080078, "step": 1180 }, { "epoch": 0.6227106227106227, "grad_norm": 78.50773728209101, "learning_rate": 4.826163240767716e-07, "logits/chosen": -3.0391085147857666, "logits/rejected": -2.9755520820617676, "logps/chosen": -355.873779296875, "logps/rejected": -274.9627685546875, "loss": 0.4816, "rewards/accuracies": 0.75, "rewards/chosen": -0.3288365602493286, "rewards/margins": 0.7189548015594482, "rewards/rejected": -1.0477913618087769, "step": 1190 }, { "epoch": 0.6279434850863422, "grad_norm": 69.05070378714194, "learning_rate": 4.820542432519584e-07, "logits/chosen": -2.8001251220703125, "logits/rejected": -2.7054355144500732, "logps/chosen": -296.8133544921875, "logps/rejected": -270.8907165527344, "loss": 0.5062, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.42598795890808105, "rewards/margins": 0.8950722813606262, "rewards/rejected": -1.321060299873352, "step": 1200 }, { "epoch": 0.6331763474620618, "grad_norm": 63.11370489954011, "learning_rate": 4.814835573085176e-07, "logits/chosen": -3.0644478797912598, "logits/rejected": -3.015381097793579, "logps/chosen": -287.351806640625, "logps/rejected": -261.4433898925781, "loss": 0.545, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4076271653175354, "rewards/margins": 1.0334349870681763, "rewards/rejected": -1.4410618543624878, "step": 1210 }, { "epoch": 0.6384092098377813, "grad_norm": 85.32332324564254, "learning_rate": 4.809042874088304e-07, "logits/chosen": -3.03351092338562, "logits/rejected": -3.0054540634155273, "logps/chosen": -313.1636657714844, "logps/rejected": -287.75836181640625, "loss": 0.5252, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6502276659011841, "rewards/margins": 1.1218125820159912, "rewards/rejected": -1.7720403671264648, "step": 1220 }, { "epoch": 0.6436420722135008, "grad_norm": 77.04848894355693, "learning_rate": 4.803164550335905e-07, "logits/chosen": -2.952907085418701, "logits/rejected": -2.8508353233337402, "logps/chosen": -338.08428955078125, "logps/rejected": -243.0955047607422, "loss": 0.5123, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6789973974227905, "rewards/margins": 1.2384763956069946, "rewards/rejected": -1.9174737930297852, "step": 1230 }, { "epoch": 0.6488749345892203, "grad_norm": 59.27275266351012, "learning_rate": 4.797200819810089e-07, "logits/chosen": -2.9722557067871094, "logits/rejected": -2.958375930786133, "logps/chosen": -237.11099243164062, "logps/rejected": -215.58950805664062, "loss": 0.5185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7369828820228577, "rewards/margins": 0.5474749207496643, "rewards/rejected": -1.284457802772522, "step": 1240 }, { "epoch": 0.6541077969649398, "grad_norm": 75.12543235283981, "learning_rate": 4.79115190366005e-07, "logits/chosen": -3.0331928730010986, "logits/rejected": -2.943148136138916, "logps/chosen": -265.9112243652344, "logps/rejected": -288.0257263183594, "loss": 0.5354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5052046179771423, "rewards/margins": 0.9338423609733582, "rewards/rejected": -1.439046859741211, "step": 1250 }, { "epoch": 0.6593406593406593, "grad_norm": 62.691477391969826, "learning_rate": 4.785018026193862e-07, "logits/chosen": -2.9795613288879395, "logits/rejected": -2.942622184753418, "logps/chosen": -258.8407287597656, "logps/rejected": -184.7825164794922, "loss": 0.4982, "rewards/accuracies": 0.75, "rewards/chosen": -0.5558956265449524, "rewards/margins": 1.0887887477874756, "rewards/rejected": -1.6446841955184937, "step": 1260 }, { "epoch": 0.6645735217163788, "grad_norm": 66.514213131995, "learning_rate": 4.77879941487017e-07, "logits/chosen": -2.9221765995025635, "logits/rejected": -2.868154764175415, "logps/chosen": -228.4252471923828, "logps/rejected": -218.2352752685547, "loss": 0.4786, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6031314134597778, "rewards/margins": 1.1121221780776978, "rewards/rejected": -1.7152538299560547, "step": 1270 }, { "epoch": 0.6698063840920984, "grad_norm": 81.62218584098261, "learning_rate": 4.772496300289748e-07, "logits/chosen": -2.9566588401794434, "logits/rejected": -2.854403257369995, "logps/chosen": -243.740234375, "logps/rejected": -224.7729949951172, "loss": 0.5158, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.957431435585022, "rewards/margins": 1.070603609085083, "rewards/rejected": -2.0280354022979736, "step": 1280 }, { "epoch": 0.6750392464678179, "grad_norm": 88.54496255837837, "learning_rate": 4.766108916186949e-07, "logits/chosen": -2.9128806591033936, "logits/rejected": -2.8795971870422363, "logps/chosen": -238.8448486328125, "logps/rejected": -282.5215759277344, "loss": 0.5554, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.41275936365127563, "rewards/margins": 1.2810719013214111, "rewards/rejected": -1.693831205368042, "step": 1290 }, { "epoch": 0.6802721088435374, "grad_norm": 89.68756008661323, "learning_rate": 4.759637499421042e-07, "logits/chosen": -2.9003686904907227, "logits/rejected": -2.9282007217407227, "logps/chosen": -267.2535095214844, "logps/rejected": -288.4033508300781, "loss": 0.4905, "rewards/accuracies": 0.75, "rewards/chosen": -0.7487326860427856, "rewards/margins": 0.9262698292732239, "rewards/rejected": -1.6750024557113647, "step": 1300 }, { "epoch": 0.6855049712192569, "grad_norm": 75.93623969578695, "learning_rate": 4.7530822899674207e-07, "logits/chosen": -3.082273006439209, "logits/rejected": -3.0540592670440674, "logps/chosen": -242.1746368408203, "logps/rejected": -216.32907104492188, "loss": 0.4728, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5144501328468323, "rewards/margins": 1.2238795757293701, "rewards/rejected": -1.7383296489715576, "step": 1310 }, { "epoch": 0.6907378335949764, "grad_norm": 96.97675116106844, "learning_rate": 4.7464435309087137e-07, "logits/chosen": -2.9758706092834473, "logits/rejected": -2.9694249629974365, "logps/chosen": -285.498046875, "logps/rejected": -297.6026306152344, "loss": 0.4841, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9340837597846985, "rewards/margins": 0.8559338450431824, "rewards/rejected": -1.7900174856185913, "step": 1320 }, { "epoch": 0.6959706959706959, "grad_norm": 78.5778301557016, "learning_rate": 4.739721468425763e-07, "logits/chosen": -2.9774537086486816, "logits/rejected": -2.9779419898986816, "logps/chosen": -263.31573486328125, "logps/rejected": -304.69683837890625, "loss": 0.4682, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5467711091041565, "rewards/margins": 1.3913160562515259, "rewards/rejected": -1.9380871057510376, "step": 1330 }, { "epoch": 0.7012035583464155, "grad_norm": 65.59399311000074, "learning_rate": 4.7329163517885e-07, "logits/chosen": -2.946503162384033, "logits/rejected": -2.8569929599761963, "logps/chosen": -260.58367919921875, "logps/rejected": -218.1254425048828, "loss": 0.4978, "rewards/accuracies": 0.75, "rewards/chosen": -0.766433596611023, "rewards/margins": 1.07541024684906, "rewards/rejected": -1.8418439626693726, "step": 1340 }, { "epoch": 0.706436420722135, "grad_norm": 85.46475968682417, "learning_rate": 4.7260284333466973e-07, "logits/chosen": -3.0457088947296143, "logits/rejected": -3.0220398902893066, "logps/chosen": -282.926513671875, "logps/rejected": -258.7223815917969, "loss": 0.5378, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.817114531993866, "rewards/margins": 1.0490025281906128, "rewards/rejected": -1.8661168813705444, "step": 1350 }, { "epoch": 0.7116692830978545, "grad_norm": 92.82738010386494, "learning_rate": 4.719057968520617e-07, "logits/chosen": -2.852339267730713, "logits/rejected": -2.8235440254211426, "logps/chosen": -330.9522705078125, "logps/rejected": -305.92132568359375, "loss": 0.5575, "rewards/accuracies": 0.75, "rewards/chosen": -0.9185736775398254, "rewards/margins": 0.9067566990852356, "rewards/rejected": -1.8253304958343506, "step": 1360 }, { "epoch": 0.716902145473574, "grad_norm": 85.9752498987174, "learning_rate": 4.7120052157915345e-07, "logits/chosen": -3.061681032180786, "logits/rejected": -2.887890338897705, "logps/chosen": -306.37750244140625, "logps/rejected": -219.4311065673828, "loss": 0.4408, "rewards/accuracies": 0.75, "rewards/chosen": -0.7019962072372437, "rewards/margins": 1.1164424419403076, "rewards/rejected": -1.8184385299682617, "step": 1370 }, { "epoch": 0.7221350078492935, "grad_norm": 79.70629727830975, "learning_rate": 4.7048704366921537e-07, "logits/chosen": -3.0268707275390625, "logits/rejected": -2.9550089836120605, "logps/chosen": -211.5172119140625, "logps/rejected": -250.2776641845703, "loss": 0.4883, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5869075655937195, "rewards/margins": 1.3238399028778076, "rewards/rejected": -1.9107472896575928, "step": 1380 }, { "epoch": 0.727367870225013, "grad_norm": 76.13173431946342, "learning_rate": 4.6976538957969114e-07, "logits/chosen": -2.9245352745056152, "logits/rejected": -2.833132743835449, "logps/chosen": -255.0247039794922, "logps/rejected": -227.4293212890625, "loss": 0.5025, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8037885427474976, "rewards/margins": 1.1720962524414062, "rewards/rejected": -1.975885033607483, "step": 1390 }, { "epoch": 0.7326007326007326, "grad_norm": 80.47476470129122, "learning_rate": 4.690355860712163e-07, "logits/chosen": -2.9190263748168945, "logits/rejected": -2.9157705307006836, "logps/chosen": -235.0019989013672, "logps/rejected": -256.9010314941406, "loss": 0.5379, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9700290560722351, "rewards/margins": 0.7937289476394653, "rewards/rejected": -1.7637580633163452, "step": 1400 }, { "epoch": 0.7378335949764521, "grad_norm": 92.43909818164913, "learning_rate": 4.682976602066262e-07, "logits/chosen": -2.832491159439087, "logits/rejected": -2.7899534702301025, "logps/chosen": -250.2271728515625, "logps/rejected": -251.36978149414062, "loss": 0.5435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9459089040756226, "rewards/margins": 1.147392988204956, "rewards/rejected": -2.093302011489868, "step": 1410 }, { "epoch": 0.7430664573521716, "grad_norm": 91.1009024528207, "learning_rate": 4.6755163934995224e-07, "logits/chosen": -2.9592061042785645, "logits/rejected": -2.8992438316345215, "logps/chosen": -303.87213134765625, "logps/rejected": -255.3300018310547, "loss": 0.5445, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6414457559585571, "rewards/margins": 1.0195133686065674, "rewards/rejected": -1.660959005355835, "step": 1420 }, { "epoch": 0.7482993197278912, "grad_norm": 63.31857198860101, "learning_rate": 4.667975511654072e-07, "logits/chosen": -2.9833927154541016, "logits/rejected": -2.8848397731781006, "logps/chosen": -286.5281677246094, "logps/rejected": -258.4193115234375, "loss": 0.4821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4777020812034607, "rewards/margins": 1.1378450393676758, "rewards/rejected": -1.6155469417572021, "step": 1430 }, { "epoch": 0.7535321821036107, "grad_norm": 95.23291217079216, "learning_rate": 4.660354236163595e-07, "logits/chosen": -3.0186591148376465, "logits/rejected": -2.924861431121826, "logps/chosen": -346.0341796875, "logps/rejected": -309.82562255859375, "loss": 0.5381, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.39197543263435364, "rewards/margins": 1.1676783561706543, "rewards/rejected": -1.5596539974212646, "step": 1440 }, { "epoch": 0.7587650444793302, "grad_norm": 84.05578541528062, "learning_rate": 4.6526528496429606e-07, "logits/chosen": -2.998558282852173, "logits/rejected": -2.9074602127075195, "logps/chosen": -290.22528076171875, "logps/rejected": -268.1587829589844, "loss": 0.5191, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9176603555679321, "rewards/margins": 1.0642603635787964, "rewards/rejected": -1.981920838356018, "step": 1450 }, { "epoch": 0.7639979068550498, "grad_norm": 64.96139822658712, "learning_rate": 4.644871637677745e-07, "logits/chosen": -2.905656337738037, "logits/rejected": -2.906144618988037, "logps/chosen": -206.65487670898438, "logps/rejected": -225.69546508789062, "loss": 0.5738, "rewards/accuracies": 0.625, "rewards/chosen": -0.6699077486991882, "rewards/margins": 0.7092582583427429, "rewards/rejected": -1.3791661262512207, "step": 1460 }, { "epoch": 0.7692307692307693, "grad_norm": 67.0033380779087, "learning_rate": 4.637010888813638e-07, "logits/chosen": -2.9822003841400146, "logits/rejected": -2.839690685272217, "logps/chosen": -323.13128662109375, "logps/rejected": -243.99472045898438, "loss": 0.4904, "rewards/accuracies": 0.875, "rewards/chosen": -0.6122270226478577, "rewards/margins": 1.028287649154663, "rewards/rejected": -1.6405147314071655, "step": 1470 }, { "epoch": 0.7744636316064888, "grad_norm": 61.00968857693698, "learning_rate": 4.6290708945457493e-07, "logits/chosen": -2.8971664905548096, "logits/rejected": -2.8889663219451904, "logps/chosen": -244.0663299560547, "logps/rejected": -231.54977416992188, "loss": 0.5389, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8593710064888, "rewards/margins": 0.80824214220047, "rewards/rejected": -1.6676132678985596, "step": 1480 }, { "epoch": 0.7796964939822083, "grad_norm": 88.66550618750362, "learning_rate": 4.6210519493077887e-07, "logits/chosen": -2.703312397003174, "logits/rejected": -2.716028928756714, "logps/chosen": -280.28106689453125, "logps/rejected": -277.0183410644531, "loss": 0.5111, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1523878574371338, "rewards/margins": 0.8042893409729004, "rewards/rejected": -1.9566770792007446, "step": 1490 }, { "epoch": 0.7849293563579278, "grad_norm": 75.36404263170701, "learning_rate": 4.6129543504611607e-07, "logits/chosen": -2.9114177227020264, "logits/rejected": -2.875338077545166, "logps/chosen": -212.26803588867188, "logps/rejected": -267.29302978515625, "loss": 0.4809, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0753793716430664, "rewards/margins": 1.2071021795272827, "rewards/rejected": -2.2824816703796387, "step": 1500 }, { "epoch": 0.7901622187336473, "grad_norm": 80.31311220888489, "learning_rate": 4.604778398283927e-07, "logits/chosen": -2.8635993003845215, "logits/rejected": -2.869093418121338, "logps/chosen": -261.09283447265625, "logps/rejected": -303.63677978515625, "loss": 0.5719, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4059689044952393, "rewards/margins": 0.9533067941665649, "rewards/rejected": -2.3592755794525146, "step": 1510 }, { "epoch": 0.7953950811093669, "grad_norm": 62.11055323834296, "learning_rate": 4.596524395959678e-07, "logits/chosen": -2.925779342651367, "logits/rejected": -2.8773224353790283, "logps/chosen": -217.9270782470703, "logps/rejected": -260.5126647949219, "loss": 0.5028, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6801798343658447, "rewards/margins": 1.533402919769287, "rewards/rejected": -2.2135825157165527, "step": 1520 }, { "epoch": 0.8006279434850864, "grad_norm": 73.0898016548836, "learning_rate": 4.588192649566285e-07, "logits/chosen": -3.048966407775879, "logits/rejected": -3.0150530338287354, "logps/chosen": -313.2316589355469, "logps/rejected": -376.1241760253906, "loss": 0.4744, "rewards/accuracies": 0.75, "rewards/chosen": -0.9245558977127075, "rewards/margins": 1.0850825309753418, "rewards/rejected": -2.0096383094787598, "step": 1530 }, { "epoch": 0.8058608058608059, "grad_norm": 59.17561841744251, "learning_rate": 4.5797834680645553e-07, "logits/chosen": -2.961118459701538, "logits/rejected": -2.968853235244751, "logps/chosen": -355.70977783203125, "logps/rejected": -316.1515808105469, "loss": 0.5477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8812974095344543, "rewards/margins": 0.7961042523384094, "rewards/rejected": -1.6774017810821533, "step": 1540 }, { "epoch": 0.8110936682365254, "grad_norm": 69.83865548386235, "learning_rate": 4.5712971632867715e-07, "logits/chosen": -2.9274020195007324, "logits/rejected": -2.82372784614563, "logps/chosen": -311.6867370605469, "logps/rejected": -227.6506805419922, "loss": 0.515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.39746323227882385, "rewards/margins": 1.1062740087509155, "rewards/rejected": -1.503737211227417, "step": 1550 }, { "epoch": 0.8163265306122449, "grad_norm": 73.00608900493768, "learning_rate": 4.562734049925129e-07, "logits/chosen": -2.9387753009796143, "logits/rejected": -2.8545188903808594, "logps/chosen": -342.23388671875, "logps/rejected": -291.25732421875, "loss": 0.5009, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7539585828781128, "rewards/margins": 0.7867677211761475, "rewards/rejected": -1.5407264232635498, "step": 1560 }, { "epoch": 0.8215593929879644, "grad_norm": 69.7676228542488, "learning_rate": 4.5540944455200663e-07, "logits/chosen": -2.93424654006958, "logits/rejected": -2.8800809383392334, "logps/chosen": -227.81954956054688, "logps/rejected": -251.3435821533203, "loss": 0.4882, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4879691004753113, "rewards/margins": 1.1892462968826294, "rewards/rejected": -1.6772152185440063, "step": 1570 }, { "epoch": 0.826792255363684, "grad_norm": 88.89077432831509, "learning_rate": 4.545378670448492e-07, "logits/chosen": -2.965099334716797, "logits/rejected": -2.8346457481384277, "logps/chosen": -281.7225036621094, "logps/rejected": -244.849609375, "loss": 0.5708, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.020928144454956, "rewards/margins": 0.9104280471801758, "rewards/rejected": -1.9313561916351318, "step": 1580 }, { "epoch": 0.8320251177394035, "grad_norm": 95.80208248980419, "learning_rate": 4.5365870479119014e-07, "logits/chosen": -2.844191312789917, "logits/rejected": -2.7497730255126953, "logps/chosen": -231.2622528076172, "logps/rejected": -218.54238891601562, "loss": 0.4924, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5696216821670532, "rewards/margins": 1.335473895072937, "rewards/rejected": -1.9050956964492798, "step": 1590 }, { "epoch": 0.837257980115123, "grad_norm": 79.89926739534515, "learning_rate": 4.5277199039243917e-07, "logits/chosen": -2.8526086807250977, "logits/rejected": -2.869624614715576, "logps/chosen": -249.06509399414062, "logps/rejected": -277.98016357421875, "loss": 0.5098, "rewards/accuracies": 0.75, "rewards/chosen": -0.907018780708313, "rewards/margins": 0.9708824157714844, "rewards/rejected": -1.8779014348983765, "step": 1600 }, { "epoch": 0.8424908424908425, "grad_norm": 86.25630920176074, "learning_rate": 4.5187775673005744e-07, "logits/chosen": -3.0155694484710693, "logits/rejected": -2.8981802463531494, "logps/chosen": -357.2152099609375, "logps/rejected": -325.990478515625, "loss": 0.5085, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6420009136199951, "rewards/margins": 1.1055619716644287, "rewards/rejected": -1.7475630044937134, "step": 1610 }, { "epoch": 0.847723704866562, "grad_norm": 55.46847910841205, "learning_rate": 4.509760369643384e-07, "logits/chosen": -2.9256107807159424, "logits/rejected": -2.8107352256774902, "logps/chosen": -277.77362060546875, "logps/rejected": -242.86172485351562, "loss": 0.5243, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8871413469314575, "rewards/margins": 1.0385167598724365, "rewards/rejected": -1.925657868385315, "step": 1620 }, { "epoch": 0.8529565672422815, "grad_norm": 80.79889712040811, "learning_rate": 4.5006686453317734e-07, "logits/chosen": -3.077967882156372, "logits/rejected": -3.082167148590088, "logps/chosen": -233.31558227539062, "logps/rejected": -251.12789916992188, "loss": 0.5198, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6661070585250854, "rewards/margins": 1.10641610622406, "rewards/rejected": -1.7725231647491455, "step": 1630 }, { "epoch": 0.858189429618001, "grad_norm": 71.15822312007893, "learning_rate": 4.4915027315083243e-07, "logits/chosen": -2.9664804935455322, "logits/rejected": -2.949410915374756, "logps/chosen": -299.4471435546875, "logps/rejected": -284.2243347167969, "loss": 0.5177, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6192963719367981, "rewards/margins": 0.9329144358634949, "rewards/rejected": -1.5522109270095825, "step": 1640 }, { "epoch": 0.8634222919937206, "grad_norm": 71.71977372253805, "learning_rate": 4.482262968066737e-07, "logits/chosen": -2.9455220699310303, "logits/rejected": -2.8994319438934326, "logps/chosen": -274.3739318847656, "logps/rejected": -278.1145935058594, "loss": 0.4743, "rewards/accuracies": 0.625, "rewards/chosen": -0.8218958973884583, "rewards/margins": 0.5913797616958618, "rewards/rejected": -1.4132755994796753, "step": 1650 }, { "epoch": 0.8686551543694401, "grad_norm": 75.56645135433682, "learning_rate": 4.4729496976392324e-07, "logits/chosen": -2.9459645748138428, "logits/rejected": -2.9064693450927734, "logps/chosen": -207.9593505859375, "logps/rejected": -248.1178436279297, "loss": 0.536, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7137929201126099, "rewards/margins": 1.04018235206604, "rewards/rejected": -1.75397527217865, "step": 1660 }, { "epoch": 0.8738880167451596, "grad_norm": 97.4139445430043, "learning_rate": 4.463563265583843e-07, "logits/chosen": -3.0816235542297363, "logits/rejected": -2.985708236694336, "logps/chosen": -260.05712890625, "logps/rejected": -264.0434265136719, "loss": 0.5033, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8484287261962891, "rewards/margins": 1.169203519821167, "rewards/rejected": -2.017632007598877, "step": 1670 }, { "epoch": 0.8791208791208791, "grad_norm": 71.19259118760309, "learning_rate": 4.4541040199716063e-07, "logits/chosen": -2.9066882133483887, "logits/rejected": -2.8966193199157715, "logps/chosen": -248.3551483154297, "logps/rejected": -276.22113037109375, "loss": 0.4657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7082773447036743, "rewards/margins": 1.1715316772460938, "rewards/rejected": -1.8798091411590576, "step": 1680 }, { "epoch": 0.8843537414965986, "grad_norm": 101.28406237828162, "learning_rate": 4.4445723115736587e-07, "logits/chosen": -2.895580768585205, "logits/rejected": -2.860962390899658, "logps/chosen": -250.845947265625, "logps/rejected": -239.2021484375, "loss": 0.468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7915253639221191, "rewards/margins": 1.3335636854171753, "rewards/rejected": -2.125089168548584, "step": 1690 }, { "epoch": 0.8895866038723181, "grad_norm": 75.46747322745003, "learning_rate": 4.434968493848228e-07, "logits/chosen": -2.9386839866638184, "logits/rejected": -2.881162405014038, "logps/chosen": -268.7374572753906, "logps/rejected": -264.47918701171875, "loss": 0.4922, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8749815225601196, "rewards/margins": 1.1021599769592285, "rewards/rejected": -1.9771416187286377, "step": 1700 }, { "epoch": 0.8948194662480377, "grad_norm": 85.32772096585312, "learning_rate": 4.425292922927525e-07, "logits/chosen": -2.956533908843994, "logits/rejected": -2.884119749069214, "logps/chosen": -323.5614318847656, "logps/rejected": -319.70489501953125, "loss": 0.4883, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7921448945999146, "rewards/margins": 0.8883172869682312, "rewards/rejected": -1.6804622411727905, "step": 1710 }, { "epoch": 0.9000523286237572, "grad_norm": 68.3409079162927, "learning_rate": 4.41554595760454e-07, "logits/chosen": -3.018251657485962, "logits/rejected": -2.8875749111175537, "logps/chosen": -285.4612121582031, "logps/rejected": -249.6368865966797, "loss": 0.5353, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0603779554367065, "rewards/margins": 0.5571672320365906, "rewards/rejected": -1.6175453662872314, "step": 1720 }, { "epoch": 0.9052851909994767, "grad_norm": 83.4405589341867, "learning_rate": 4.4057279593197326e-07, "logits/chosen": -2.981658935546875, "logits/rejected": -2.9193899631500244, "logps/chosen": -222.2957305908203, "logps/rejected": -191.22984313964844, "loss": 0.5012, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5767639875411987, "rewards/margins": 1.0086519718170166, "rewards/rejected": -1.5854159593582153, "step": 1730 }, { "epoch": 0.9105180533751962, "grad_norm": 94.52843782642458, "learning_rate": 4.395839292147637e-07, "logits/chosen": -3.0103816986083984, "logits/rejected": -2.8786423206329346, "logps/chosen": -245.9064483642578, "logps/rejected": -217.5230255126953, "loss": 0.5518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8081547021865845, "rewards/margins": 0.8482156991958618, "rewards/rejected": -1.6563704013824463, "step": 1740 }, { "epoch": 0.9157509157509157, "grad_norm": 88.34936638026802, "learning_rate": 4.3858803227833526e-07, "logits/chosen": -2.9911084175109863, "logits/rejected": -2.9616482257843018, "logps/chosen": -319.87847900390625, "logps/rejected": -290.2704772949219, "loss": 0.5563, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6435294151306152, "rewards/margins": 1.1032793521881104, "rewards/rejected": -1.746808648109436, "step": 1750 }, { "epoch": 0.9209837781266352, "grad_norm": 85.6891791776135, "learning_rate": 4.375851420528951e-07, "logits/chosen": -3.0104589462280273, "logits/rejected": -2.9877097606658936, "logps/chosen": -221.95361328125, "logps/rejected": -213.62319946289062, "loss": 0.4778, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.614212155342102, "rewards/margins": 0.7944315075874329, "rewards/rejected": -1.4086437225341797, "step": 1760 }, { "epoch": 0.9262166405023547, "grad_norm": 78.74476615176712, "learning_rate": 4.36575295727978e-07, "logits/chosen": -2.884420871734619, "logits/rejected": -2.8180770874023438, "logps/chosen": -285.18170166015625, "logps/rejected": -258.6389465332031, "loss": 0.4811, "rewards/accuracies": 0.75, "rewards/chosen": -0.8808829188346863, "rewards/margins": 1.1605877876281738, "rewards/rejected": -2.041470766067505, "step": 1770 }, { "epoch": 0.9314495028780743, "grad_norm": 103.56433150946246, "learning_rate": 4.355585307510675e-07, "logits/chosen": -2.8778350353240967, "logits/rejected": -2.860365390777588, "logps/chosen": -240.9153289794922, "logps/rejected": -215.19528198242188, "loss": 0.5465, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9615912437438965, "rewards/margins": 0.8098009824752808, "rewards/rejected": -1.7713922262191772, "step": 1780 }, { "epoch": 0.9366823652537938, "grad_norm": 63.70930140770908, "learning_rate": 4.345348848262068e-07, "logits/chosen": -2.944972276687622, "logits/rejected": -2.978013515472412, "logps/chosen": -320.50909423828125, "logps/rejected": -327.7473449707031, "loss": 0.4985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.46996793150901794, "rewards/margins": 1.082518219947815, "rewards/rejected": -1.5524863004684448, "step": 1790 }, { "epoch": 0.9419152276295133, "grad_norm": 93.82609743600084, "learning_rate": 4.33504395912601e-07, "logits/chosen": -2.808483600616455, "logits/rejected": -2.740899085998535, "logps/chosen": -232.63046264648438, "logps/rejected": -276.5832214355469, "loss": 0.5019, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.145923376083374, "rewards/margins": 1.5711814165115356, "rewards/rejected": -2.71710467338562, "step": 1800 }, { "epoch": 0.9471480900052328, "grad_norm": 87.46705536864567, "learning_rate": 4.324671022232095e-07, "logits/chosen": -2.977370023727417, "logits/rejected": -2.9073007106781006, "logps/chosen": -253.8762664794922, "logps/rejected": -233.0682373046875, "loss": 0.5077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8621772527694702, "rewards/margins": 1.1445598602294922, "rewards/rejected": -2.006737232208252, "step": 1810 }, { "epoch": 0.9523809523809523, "grad_norm": 73.52097016382926, "learning_rate": 4.314230422233286e-07, "logits/chosen": -2.8957488536834717, "logits/rejected": -2.82643723487854, "logps/chosen": -214.1259307861328, "logps/rejected": -189.720703125, "loss": 0.5265, "rewards/accuracies": 0.75, "rewards/chosen": -0.7376524209976196, "rewards/margins": 0.9930845499038696, "rewards/rejected": -1.7307370901107788, "step": 1820 }, { "epoch": 0.957613814756672, "grad_norm": 91.98931930574095, "learning_rate": 4.303722546291655e-07, "logits/chosen": -3.036027669906616, "logits/rejected": -2.99212908744812, "logps/chosen": -279.8523864746094, "logps/rejected": -249.5442657470703, "loss": 0.5486, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8860118985176086, "rewards/margins": 0.9745572209358215, "rewards/rejected": -1.8605692386627197, "step": 1830 }, { "epoch": 0.9628466771323915, "grad_norm": 99.43396723779242, "learning_rate": 4.2931477840640243e-07, "logits/chosen": -2.975102424621582, "logits/rejected": -2.8146276473999023, "logps/chosen": -325.7985534667969, "logps/rejected": -278.90582275390625, "loss": 0.5254, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.940270721912384, "rewards/margins": 1.1125400066375732, "rewards/rejected": -2.0528111457824707, "step": 1840 }, { "epoch": 0.968079539508111, "grad_norm": 80.29010358777224, "learning_rate": 4.282506527687517e-07, "logits/chosen": -2.8818559646606445, "logits/rejected": -2.8396973609924316, "logps/chosen": -362.3193664550781, "logps/rejected": -304.5287170410156, "loss": 0.4915, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3979787528514862, "rewards/margins": 1.2758557796478271, "rewards/rejected": -1.6738345623016357, "step": 1850 }, { "epoch": 0.9733124018838305, "grad_norm": 59.73665520432585, "learning_rate": 4.271799171765016e-07, "logits/chosen": -2.9014787673950195, "logits/rejected": -2.766084909439087, "logps/chosen": -319.53985595703125, "logps/rejected": -244.4556121826172, "loss": 0.4809, "rewards/accuracies": 0.75, "rewards/chosen": -0.9494949579238892, "rewards/margins": 1.0711262226104736, "rewards/rejected": -2.020620822906494, "step": 1860 }, { "epoch": 0.97854526425955, "grad_norm": 84.69208349810964, "learning_rate": 4.2610261133505323e-07, "logits/chosen": -2.947687864303589, "logits/rejected": -2.849827527999878, "logps/chosen": -240.40872192382812, "logps/rejected": -239.29385375976562, "loss": 0.5052, "rewards/accuracies": 0.625, "rewards/chosen": -0.6224541664123535, "rewards/margins": 1.0290005207061768, "rewards/rejected": -1.6514545679092407, "step": 1870 }, { "epoch": 0.9837781266352695, "grad_norm": 82.00303750133932, "learning_rate": 4.250187751934479e-07, "logits/chosen": -2.9904048442840576, "logits/rejected": -3.048518657684326, "logps/chosen": -248.32186889648438, "logps/rejected": -317.966796875, "loss": 0.4856, "rewards/accuracies": 0.75, "rewards/chosen": -0.6637740135192871, "rewards/margins": 1.0528675317764282, "rewards/rejected": -1.7166414260864258, "step": 1880 }, { "epoch": 0.989010989010989, "grad_norm": 78.89636516877677, "learning_rate": 4.2392844894288605e-07, "logits/chosen": -2.9030120372772217, "logits/rejected": -2.8590681552886963, "logps/chosen": -381.3399353027344, "logps/rejected": -334.9476013183594, "loss": 0.5012, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7761744856834412, "rewards/margins": 0.9569997787475586, "rewards/rejected": -1.7331740856170654, "step": 1890 }, { "epoch": 0.9942438513867086, "grad_norm": 73.9895058125802, "learning_rate": 4.2283167301523634e-07, "logits/chosen": -3.0315451622009277, "logits/rejected": -2.9190683364868164, "logps/chosen": -224.55868530273438, "logps/rejected": -230.12667846679688, "loss": 0.5227, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1358522176742554, "rewards/margins": 1.0464999675750732, "rewards/rejected": -2.182352066040039, "step": 1900 }, { "epoch": 0.9994767137624281, "grad_norm": 96.25577410747253, "learning_rate": 4.217284880815369e-07, "logits/chosen": -2.8946022987365723, "logits/rejected": -2.91059947013855, "logps/chosen": -327.3114929199219, "logps/rejected": -334.94891357421875, "loss": 0.5046, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.7329050302505493, "rewards/margins": 1.5198781490325928, "rewards/rejected": -2.2527832984924316, "step": 1910 }, { "epoch": 1.0047095761381475, "grad_norm": 46.022721964184576, "learning_rate": 4.2061893505048694e-07, "logits/chosen": -2.898128032684326, "logits/rejected": -2.881345272064209, "logps/chosen": -189.64544677734375, "logps/rejected": -262.56622314453125, "loss": 0.1535, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0961851254105568, "rewards/margins": 2.768672466278076, "rewards/rejected": -2.672487735748291, "step": 1920 }, { "epoch": 1.0099424385138671, "grad_norm": 21.47967793233117, "learning_rate": 4.1950305506692967e-07, "logits/chosen": -3.0447676181793213, "logits/rejected": -2.898925304412842, "logps/chosen": -300.3017272949219, "logps/rejected": -286.5074157714844, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 1.0494248867034912, "rewards/margins": 5.129704475402832, "rewards/rejected": -4.080279350280762, "step": 1930 }, { "epoch": 1.0151753008895865, "grad_norm": 23.900256038642652, "learning_rate": 4.1838088951032656e-07, "logits/chosen": -2.8007640838623047, "logits/rejected": -2.7669568061828613, "logps/chosen": -329.2915954589844, "logps/rejected": -312.6531066894531, "loss": 0.1089, "rewards/accuracies": 1.0, "rewards/chosen": 0.5750948786735535, "rewards/margins": 5.029973030090332, "rewards/rejected": -4.454878807067871, "step": 1940 }, { "epoch": 1.0204081632653061, "grad_norm": 15.971753588042969, "learning_rate": 4.172524799932231e-07, "logits/chosen": -2.9136364459991455, "logits/rejected": -2.871410846710205, "logps/chosen": -205.9805908203125, "logps/rejected": -274.6159973144531, "loss": 0.0876, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.26637163758277893, "rewards/margins": 3.6904170513153076, "rewards/rejected": -3.9567885398864746, "step": 1950 }, { "epoch": 1.0256410256410255, "grad_norm": 22.188622827319865, "learning_rate": 4.161178683597054e-07, "logits/chosen": -3.0572562217712402, "logits/rejected": -2.893533945083618, "logps/chosen": -247.4630889892578, "logps/rejected": -234.9210662841797, "loss": 0.0831, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.36797022819519043, "rewards/margins": 4.55133056640625, "rewards/rejected": -4.9193010330200195, "step": 1960 }, { "epoch": 1.0308738880167452, "grad_norm": 35.42823469735703, "learning_rate": 4.1497709668384885e-07, "logits/chosen": -2.9946725368499756, "logits/rejected": -2.926649332046509, "logps/chosen": -327.3368835449219, "logps/rejected": -316.7547912597656, "loss": 0.0873, "rewards/accuracies": 1.0, "rewards/chosen": 0.39913448691368103, "rewards/margins": 5.337223052978516, "rewards/rejected": -4.938088417053223, "step": 1970 }, { "epoch": 1.0361067503924646, "grad_norm": 43.28377590038816, "learning_rate": 4.1383020726815745e-07, "logits/chosen": -3.0193605422973633, "logits/rejected": -2.9150025844573975, "logps/chosen": -237.4845428466797, "logps/rejected": -274.40008544921875, "loss": 0.1073, "rewards/accuracies": 0.875, "rewards/chosen": -0.5532289147377014, "rewards/margins": 4.324273109436035, "rewards/rejected": -4.87750244140625, "step": 1980 }, { "epoch": 1.0413396127681842, "grad_norm": 28.692968077635122, "learning_rate": 4.126772426419959e-07, "logits/chosen": -2.906698703765869, "logits/rejected": -2.9146125316619873, "logps/chosen": -248.41421508789062, "logps/rejected": -292.7886657714844, "loss": 0.1418, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.4445223808288574, "rewards/margins": 3.622772693634033, "rewards/rejected": -4.067295074462891, "step": 1990 }, { "epoch": 1.0465724751439036, "grad_norm": 13.69875303026955, "learning_rate": 4.1151824556001145e-07, "logits/chosen": -2.9932796955108643, "logits/rejected": -2.936567783355713, "logps/chosen": -217.5187530517578, "logps/rejected": -281.3251953125, "loss": 0.1171, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.09808170795440674, "rewards/margins": 4.096837520599365, "rewards/rejected": -4.194918632507324, "step": 2000 }, { "epoch": 1.0465724751439036, "eval_logits/chosen": -2.93619704246521, "eval_logits/rejected": -2.8896522521972656, "eval_logps/chosen": -266.1375732421875, "eval_logps/rejected": -283.6535339355469, "eval_loss": 0.5328664779663086, "eval_rewards/accuracies": 0.7734375, "eval_rewards/chosen": -1.452146291732788, "eval_rewards/margins": 1.4750443696975708, "eval_rewards/rejected": -2.9271907806396484, "eval_runtime": 83.6424, "eval_samples_per_second": 23.911, "eval_steps_per_second": 0.383, "step": 2000 }, { "epoch": 1.0518053375196232, "grad_norm": 7.398063116741986, "learning_rate": 4.103532590005495e-07, "logits/chosen": -3.0467581748962402, "logits/rejected": -2.95546555519104, "logps/chosen": -257.2591857910156, "logps/rejected": -240.78158569335938, "loss": 0.0783, "rewards/accuracies": 1.0, "rewards/chosen": 0.3103707730770111, "rewards/margins": 4.979373931884766, "rewards/rejected": -4.669003486633301, "step": 2010 }, { "epoch": 1.0570381998953426, "grad_norm": 23.22966134683761, "learning_rate": 4.091823261640592e-07, "logits/chosen": -2.9991204738616943, "logits/rejected": -2.9148526191711426, "logps/chosen": -234.30935668945312, "logps/rejected": -246.5050811767578, "loss": 0.1048, "rewards/accuracies": 1.0, "rewards/chosen": 0.5768532156944275, "rewards/margins": 5.63602352142334, "rewards/rejected": -5.059170722961426, "step": 2020 }, { "epoch": 1.0622710622710623, "grad_norm": 23.101111936027397, "learning_rate": 4.080054904714917e-07, "logits/chosen": -2.9671566486358643, "logits/rejected": -2.911362648010254, "logps/chosen": -230.13046264648438, "logps/rejected": -271.76385498046875, "loss": 0.0864, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5850220918655396, "rewards/margins": 4.525626182556152, "rewards/rejected": -5.110648155212402, "step": 2030 }, { "epoch": 1.0675039246467817, "grad_norm": 7.307134085139132, "learning_rate": 4.0682279556268993e-07, "logits/chosen": -2.9680025577545166, "logits/rejected": -2.940239667892456, "logps/chosen": -307.3257751464844, "logps/rejected": -344.1888427734375, "loss": 0.0912, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.14602717757225037, "rewards/margins": 5.6945481300354, "rewards/rejected": -5.840575218200684, "step": 2040 }, { "epoch": 1.0727367870225013, "grad_norm": 59.84476321738945, "learning_rate": 4.056342852947706e-07, "logits/chosen": -3.1115972995758057, "logits/rejected": -2.934664249420166, "logps/chosen": -344.67535400390625, "logps/rejected": -331.46209716796875, "loss": 0.0963, "rewards/accuracies": 1.0, "rewards/chosen": -0.11524569988250732, "rewards/margins": 5.930139541625977, "rewards/rejected": -6.045385360717773, "step": 2050 }, { "epoch": 1.077969649398221, "grad_norm": 20.32729671227812, "learning_rate": 4.044400037404973e-07, "logits/chosen": -3.011934757232666, "logits/rejected": -2.9430861473083496, "logps/chosen": -201.46963500976562, "logps/rejected": -230.5752716064453, "loss": 0.0844, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.34829196333885193, "rewards/margins": 4.364696979522705, "rewards/rejected": -4.71298885345459, "step": 2060 }, { "epoch": 1.0832025117739403, "grad_norm": 23.54144587644266, "learning_rate": 4.032399951866468e-07, "logits/chosen": -2.886375904083252, "logits/rejected": -2.782416343688965, "logps/chosen": -207.6606903076172, "logps/rejected": -223.850830078125, "loss": 0.1164, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.32867947220802307, "rewards/margins": 4.25605583190918, "rewards/rejected": -4.58473539352417, "step": 2070 }, { "epoch": 1.08843537414966, "grad_norm": 9.730976366552161, "learning_rate": 4.0203430413236637e-07, "logits/chosen": -3.0306410789489746, "logits/rejected": -2.9684665203094482, "logps/chosen": -276.56341552734375, "logps/rejected": -331.5174865722656, "loss": 0.1154, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.24090953171253204, "rewards/margins": 5.034852027893066, "rewards/rejected": -5.275761604309082, "step": 2080 }, { "epoch": 1.0936682365253794, "grad_norm": 29.920040045828966, "learning_rate": 4.0082297528752407e-07, "logits/chosen": -2.9148123264312744, "logits/rejected": -2.8249359130859375, "logps/chosen": -180.6721954345703, "logps/rejected": -235.6604766845703, "loss": 0.1074, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.13160227239131927, "rewards/margins": 4.845431327819824, "rewards/rejected": -4.977033615112305, "step": 2090 }, { "epoch": 1.098901098901099, "grad_norm": 27.394773976428937, "learning_rate": 3.9960605357105e-07, "logits/chosen": -2.9857449531555176, "logits/rejected": -2.9158477783203125, "logps/chosen": -257.39434814453125, "logps/rejected": -284.5668029785156, "loss": 0.0968, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.3832382559776306, "rewards/margins": 4.940213680267334, "rewards/rejected": -5.323452472686768, "step": 2100 }, { "epoch": 1.1041339612768184, "grad_norm": 12.329216781198715, "learning_rate": 3.983835841092716e-07, "logits/chosen": -2.9735963344573975, "logits/rejected": -2.79585862159729, "logps/chosen": -287.66790771484375, "logps/rejected": -236.3841552734375, "loss": 0.1038, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03246309608221054, "rewards/margins": 4.720534324645996, "rewards/rejected": -4.688071250915527, "step": 2110 }, { "epoch": 1.109366823652538, "grad_norm": 31.86430068978165, "learning_rate": 3.971556122342398e-07, "logits/chosen": -2.981353759765625, "logits/rejected": -2.8921375274658203, "logps/chosen": -249.86093139648438, "logps/rejected": -246.5110321044922, "loss": 0.1083, "rewards/accuracies": 1.0, "rewards/chosen": -0.200478196144104, "rewards/margins": 3.9746551513671875, "rewards/rejected": -4.175133228302002, "step": 2120 }, { "epoch": 1.1145996860282574, "grad_norm": 35.01019130679176, "learning_rate": 3.9592218348204766e-07, "logits/chosen": -3.0276362895965576, "logits/rejected": -2.914118528366089, "logps/chosen": -267.53192138671875, "logps/rejected": -275.28863525390625, "loss": 0.0784, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2447108030319214, "rewards/margins": 4.070721626281738, "rewards/rejected": -4.315432071685791, "step": 2130 }, { "epoch": 1.119832548403977, "grad_norm": 26.246202916670846, "learning_rate": 3.946833435911423e-07, "logits/chosen": -3.058420419692993, "logits/rejected": -2.9097139835357666, "logps/chosen": -226.70803833007812, "logps/rejected": -251.3625030517578, "loss": 0.1247, "rewards/accuracies": 1.0, "rewards/chosen": -0.09658551216125488, "rewards/margins": 5.5951385498046875, "rewards/rejected": -5.6917243003845215, "step": 2140 }, { "epoch": 1.1250654107796965, "grad_norm": 10.723823775472926, "learning_rate": 3.9343913850062856e-07, "logits/chosen": -2.9560294151306152, "logits/rejected": -3.0658373832702637, "logps/chosen": -209.95236206054688, "logps/rejected": -312.0199279785156, "loss": 0.0961, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8991118669509888, "rewards/margins": 4.448459625244141, "rewards/rejected": -5.34757137298584, "step": 2150 }, { "epoch": 1.130298273155416, "grad_norm": 14.170795621597003, "learning_rate": 3.921896143485657e-07, "logits/chosen": -2.957063913345337, "logits/rejected": -2.8811850547790527, "logps/chosen": -261.2143249511719, "logps/rejected": -286.85308837890625, "loss": 0.1278, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5732889175415039, "rewards/margins": 4.4664483070373535, "rewards/rejected": -5.039736747741699, "step": 2160 }, { "epoch": 1.1355311355311355, "grad_norm": 54.21223212063504, "learning_rate": 3.9093481747025615e-07, "logits/chosen": -3.118234872817993, "logits/rejected": -3.0169854164123535, "logps/chosen": -286.1936340332031, "logps/rejected": -297.8741149902344, "loss": 0.1035, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.33529624342918396, "rewards/margins": 5.152597427368164, "rewards/rejected": -5.487893581390381, "step": 2170 }, { "epoch": 1.1407639979068551, "grad_norm": 14.160361482909245, "learning_rate": 3.896747943965275e-07, "logits/chosen": -3.088549852371216, "logits/rejected": -2.9224636554718018, "logps/chosen": -237.7678680419922, "logps/rejected": -271.75616455078125, "loss": 0.0881, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6482471227645874, "rewards/margins": 5.079795837402344, "rewards/rejected": -5.728043556213379, "step": 2180 }, { "epoch": 1.1459968602825745, "grad_norm": 7.461020244679668, "learning_rate": 3.8840959185200717e-07, "logits/chosen": -2.977137327194214, "logits/rejected": -3.014305353164673, "logps/chosen": -252.70071411132812, "logps/rejected": -280.7303771972656, "loss": 0.0723, "rewards/accuracies": 0.875, "rewards/chosen": -0.2845343351364136, "rewards/margins": 4.949723720550537, "rewards/rejected": -5.234258651733398, "step": 2190 }, { "epoch": 1.1512297226582942, "grad_norm": 38.91594311650636, "learning_rate": 3.871392567533893e-07, "logits/chosen": -3.05753755569458, "logits/rejected": -2.94245982170105, "logps/chosen": -302.5651550292969, "logps/rejected": -306.23455810546875, "loss": 0.083, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7702548503875732, "rewards/margins": 4.530703544616699, "rewards/rejected": -5.300958156585693, "step": 2200 }, { "epoch": 1.1564625850340136, "grad_norm": 23.179794299583637, "learning_rate": 3.858638362076953e-07, "logits/chosen": -2.953263282775879, "logits/rejected": -2.8362820148468018, "logps/chosen": -258.02001953125, "logps/rejected": -278.0293884277344, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": -0.07994568347930908, "rewards/margins": 4.916607856750488, "rewards/rejected": -4.996554374694824, "step": 2210 }, { "epoch": 1.1616954474097332, "grad_norm": 18.734630879266444, "learning_rate": 3.845833775105272e-07, "logits/chosen": -2.9967446327209473, "logits/rejected": -2.979264736175537, "logps/chosen": -237.64208984375, "logps/rejected": -306.78668212890625, "loss": 0.0548, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.22943219542503357, "rewards/margins": 5.639667987823486, "rewards/rejected": -5.869100093841553, "step": 2220 }, { "epoch": 1.1669283097854526, "grad_norm": 55.905583714173034, "learning_rate": 3.832979281443133e-07, "logits/chosen": -3.068455696105957, "logits/rejected": -3.0449776649475098, "logps/chosen": -248.92758178710938, "logps/rejected": -278.6637268066406, "loss": 0.1058, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3283912241458893, "rewards/margins": 4.862596035003662, "rewards/rejected": -5.190988063812256, "step": 2230 }, { "epoch": 1.1721611721611722, "grad_norm": 22.786546900366766, "learning_rate": 3.8200753577654765e-07, "logits/chosen": -3.026653528213501, "logits/rejected": -2.917473316192627, "logps/chosen": -227.84951782226562, "logps/rejected": -284.4237976074219, "loss": 0.1155, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5640532970428467, "rewards/margins": 5.25749397277832, "rewards/rejected": -5.821547031402588, "step": 2240 }, { "epoch": 1.1773940345368916, "grad_norm": 23.631022982674985, "learning_rate": 3.8071224825802273e-07, "logits/chosen": -3.0882997512817383, "logits/rejected": -3.0628597736358643, "logps/chosen": -289.17437744140625, "logps/rejected": -367.0567626953125, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": -0.4264296591281891, "rewards/margins": 5.180129051208496, "rewards/rejected": -5.606558322906494, "step": 2250 }, { "epoch": 1.1826268969126112, "grad_norm": 23.451307302160764, "learning_rate": 3.7941211362105453e-07, "logits/chosen": -3.090294122695923, "logits/rejected": -2.991959810256958, "logps/chosen": -289.88177490234375, "logps/rejected": -353.2331848144531, "loss": 0.0896, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0703296884894371, "rewards/margins": 5.362122535705566, "rewards/rejected": -5.291792869567871, "step": 2260 }, { "epoch": 1.1878597592883307, "grad_norm": 28.45922492219434, "learning_rate": 3.781071800777017e-07, "logits/chosen": -2.909813404083252, "logits/rejected": -2.8854784965515137, "logps/chosen": -283.5074462890625, "logps/rejected": -332.10955810546875, "loss": 0.0888, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.15381774306297302, "rewards/margins": 6.3244709968566895, "rewards/rejected": -6.478288173675537, "step": 2270 }, { "epoch": 1.1930926216640503, "grad_norm": 43.51762733886877, "learning_rate": 3.767974960179776e-07, "logits/chosen": -3.0325286388397217, "logits/rejected": -2.9897449016571045, "logps/chosen": -240.8324737548828, "logps/rejected": -285.1839904785156, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": -0.8381017446517944, "rewards/margins": 5.218292713165283, "rewards/rejected": -6.056394577026367, "step": 2280 }, { "epoch": 1.1983254840397697, "grad_norm": 40.21353684719865, "learning_rate": 3.7548311000805605e-07, "logits/chosen": -2.9304068088531494, "logits/rejected": -2.93696928024292, "logps/chosen": -253.8983612060547, "logps/rejected": -349.63885498046875, "loss": 0.104, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7673223614692688, "rewards/margins": 5.358363151550293, "rewards/rejected": -6.125685691833496, "step": 2290 }, { "epoch": 1.2035583464154893, "grad_norm": 23.383293979913763, "learning_rate": 3.7416407078847015e-07, "logits/chosen": -3.0815589427948, "logits/rejected": -3.046006679534912, "logps/chosen": -278.02520751953125, "logps/rejected": -335.1031188964844, "loss": 0.0745, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3998408317565918, "rewards/margins": 5.286677360534668, "rewards/rejected": -5.686518669128418, "step": 2300 }, { "epoch": 1.2087912087912087, "grad_norm": 13.387888525430816, "learning_rate": 3.7284042727230506e-07, "logits/chosen": -3.0553221702575684, "logits/rejected": -2.9283556938171387, "logps/chosen": -206.7539520263672, "logps/rejected": -265.8564453125, "loss": 0.0941, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7557197213172913, "rewards/margins": 5.524768829345703, "rewards/rejected": -6.280488967895508, "step": 2310 }, { "epoch": 1.2140240711669283, "grad_norm": 32.51555521966961, "learning_rate": 3.7151222854338413e-07, "logits/chosen": -3.0623388290405273, "logits/rejected": -2.886702299118042, "logps/chosen": -294.43878173828125, "logps/rejected": -312.8724670410156, "loss": 0.0924, "rewards/accuracies": 1.0, "rewards/chosen": -0.02350255288183689, "rewards/margins": 6.44631814956665, "rewards/rejected": -6.469820976257324, "step": 2320 }, { "epoch": 1.2192569335426477, "grad_norm": 37.54072190706394, "learning_rate": 3.701795238544488e-07, "logits/chosen": -3.0191171169281006, "logits/rejected": -2.929847240447998, "logps/chosen": -286.76507568359375, "logps/rejected": -321.422119140625, "loss": 0.0783, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5516226291656494, "rewards/margins": 5.549538612365723, "rewards/rejected": -6.101161479949951, "step": 2330 }, { "epoch": 1.2244897959183674, "grad_norm": 10.368978896633115, "learning_rate": 3.688423626253318e-07, "logits/chosen": -2.8838584423065186, "logits/rejected": -2.929598331451416, "logps/chosen": -201.93203735351562, "logps/rejected": -264.67938232421875, "loss": 0.0997, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6430965662002563, "rewards/margins": 5.315836429595947, "rewards/rejected": -5.958932399749756, "step": 2340 }, { "epoch": 1.2297226582940868, "grad_norm": 23.956960809997067, "learning_rate": 3.675007944411253e-07, "logits/chosen": -3.050785541534424, "logits/rejected": -2.9645841121673584, "logps/chosen": -279.9877624511719, "logps/rejected": -279.80743408203125, "loss": 0.1339, "rewards/accuracies": 1.0, "rewards/chosen": -0.18310296535491943, "rewards/margins": 5.065299987792969, "rewards/rejected": -5.2484025955200195, "step": 2350 }, { "epoch": 1.2349555206698064, "grad_norm": 12.238395680477895, "learning_rate": 3.6615486905034167e-07, "logits/chosen": -3.035884380340576, "logits/rejected": -2.971494197845459, "logps/chosen": -291.66021728515625, "logps/rejected": -292.61492919921875, "loss": 0.0876, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.576339840888977, "rewards/margins": 4.373812675476074, "rewards/rejected": -4.950152397155762, "step": 2360 }, { "epoch": 1.2401883830455258, "grad_norm": 82.92894714055515, "learning_rate": 3.6480463636306846e-07, "logits/chosen": -3.0620784759521484, "logits/rejected": -2.98693585395813, "logps/chosen": -307.61309814453125, "logps/rejected": -333.37298583984375, "loss": 0.1104, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.6986103057861328, "rewards/margins": 4.59494686126709, "rewards/rejected": -5.293557167053223, "step": 2370 }, { "epoch": 1.2454212454212454, "grad_norm": 38.91522588989638, "learning_rate": 3.634501464491183e-07, "logits/chosen": -3.0400471687316895, "logits/rejected": -2.9690637588500977, "logps/chosen": -239.689208984375, "logps/rejected": -299.9053649902344, "loss": 0.0984, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.3628920018672943, "rewards/margins": 5.034619331359863, "rewards/rejected": -5.397511959075928, "step": 2380 }, { "epoch": 1.250654107796965, "grad_norm": 36.45804577184584, "learning_rate": 3.6209144953617175e-07, "logits/chosen": -2.825417995452881, "logits/rejected": -2.8415687084198, "logps/chosen": -336.24261474609375, "logps/rejected": -438.61669921875, "loss": 0.0997, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.27739372849464417, "rewards/margins": 6.871772766113281, "rewards/rejected": -7.149165153503418, "step": 2390 }, { "epoch": 1.2558869701726845, "grad_norm": 50.941925895830245, "learning_rate": 3.607285960079146e-07, "logits/chosen": -3.0473155975341797, "logits/rejected": -2.954472064971924, "logps/chosen": -312.7008972167969, "logps/rejected": -339.1510009765625, "loss": 0.1287, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.06949774920940399, "rewards/margins": 6.339685916900635, "rewards/rejected": -6.409183502197266, "step": 2400 }, { "epoch": 1.2611198325484039, "grad_norm": 27.01682326315958, "learning_rate": 3.593616364021701e-07, "logits/chosen": -3.0986335277557373, "logits/rejected": -2.977479934692383, "logps/chosen": -285.5718078613281, "logps/rejected": -333.64801025390625, "loss": 0.1011, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.11033590883016586, "rewards/margins": 6.00553035736084, "rewards/rejected": -6.115866661071777, "step": 2410 }, { "epoch": 1.2663526949241235, "grad_norm": 44.649961374990966, "learning_rate": 3.5799062140902413e-07, "logits/chosen": -2.964820146560669, "logits/rejected": -2.853663444519043, "logps/chosen": -299.32794189453125, "logps/rejected": -297.76470947265625, "loss": 0.1056, "rewards/accuracies": 1.0, "rewards/chosen": -0.19874989986419678, "rewards/margins": 5.375239372253418, "rewards/rejected": -5.573988914489746, "step": 2420 }, { "epoch": 1.2715855572998431, "grad_norm": 34.13757368275467, "learning_rate": 3.566156018689462e-07, "logits/chosen": -3.016500949859619, "logits/rejected": -2.783592462539673, "logps/chosen": -266.65576171875, "logps/rejected": -253.0983428955078, "loss": 0.1379, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2731441259384155, "rewards/margins": 4.529138565063477, "rewards/rejected": -5.80228328704834, "step": 2430 }, { "epoch": 1.2768184196755625, "grad_norm": 32.44546226886494, "learning_rate": 3.552366287709038e-07, "logits/chosen": -2.9087882041931152, "logits/rejected": -2.986567974090576, "logps/chosen": -297.7473449707031, "logps/rejected": -337.3050537109375, "loss": 0.0935, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.19403524696826935, "rewards/margins": 6.510956764221191, "rewards/rejected": -6.704992771148682, "step": 2440 }, { "epoch": 1.282051282051282, "grad_norm": 43.273950632911784, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -2.9453835487365723, "logits/rejected": -2.949300527572632, "logps/chosen": -250.9259033203125, "logps/rejected": -292.3937072753906, "loss": 0.0721, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4169522821903229, "rewards/margins": 5.265911102294922, "rewards/rejected": -5.682863712310791, "step": 2450 }, { "epoch": 1.2872841444270016, "grad_norm": 18.630987989359536, "learning_rate": 3.524670265879353e-07, "logits/chosen": -2.9842844009399414, "logits/rejected": -2.9054853916168213, "logps/chosen": -221.57373046875, "logps/rejected": -255.801025390625, "loss": 0.0888, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.04223984479904175, "rewards/margins": 5.626054286956787, "rewards/rejected": -5.668294429779053, "step": 2460 }, { "epoch": 1.2925170068027212, "grad_norm": 11.466642886313057, "learning_rate": 3.510765002063901e-07, "logits/chosen": -2.963801145553589, "logits/rejected": -2.9503703117370605, "logps/chosen": -243.6164093017578, "logps/rejected": -322.0021667480469, "loss": 0.0784, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5315900444984436, "rewards/margins": 5.600189685821533, "rewards/rejected": -6.131779670715332, "step": 2470 }, { "epoch": 1.2977498691784406, "grad_norm": 25.139288095561046, "learning_rate": 3.4968222566983367e-07, "logits/chosen": -3.0824332237243652, "logits/rejected": -2.9414005279541016, "logps/chosen": -251.28067016601562, "logps/rejected": -252.15756225585938, "loss": 0.1047, "rewards/accuracies": 0.875, "rewards/chosen": -1.299600601196289, "rewards/margins": 4.348740100860596, "rewards/rejected": -5.648340702056885, "step": 2480 }, { "epoch": 1.30298273155416, "grad_norm": 22.06833443129562, "learning_rate": 3.482842546812543e-07, "logits/chosen": -3.055598497390747, "logits/rejected": -2.9183878898620605, "logps/chosen": -331.6462097167969, "logps/rejected": -336.825927734375, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": -0.4471103549003601, "rewards/margins": 5.988678455352783, "rewards/rejected": -6.435788631439209, "step": 2490 }, { "epoch": 1.3082155939298796, "grad_norm": 13.61957732508816, "learning_rate": 3.4688263908071307e-07, "logits/chosen": -2.949493169784546, "logits/rejected": -2.878563165664673, "logps/chosen": -230.7290496826172, "logps/rejected": -269.21527099609375, "loss": 0.1106, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.155723214149475, "rewards/margins": 4.836132526397705, "rewards/rejected": -5.991855621337891, "step": 2500 }, { "epoch": 1.3134484563055993, "grad_norm": 15.943018646289424, "learning_rate": 3.454774308434222e-07, "logits/chosen": -2.986858367919922, "logits/rejected": -2.944578170776367, "logps/chosen": -241.6566162109375, "logps/rejected": -347.42364501953125, "loss": 0.0787, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8483079075813293, "rewards/margins": 5.642345905303955, "rewards/rejected": -6.490653991699219, "step": 2510 }, { "epoch": 1.3186813186813187, "grad_norm": 26.633630588345365, "learning_rate": 3.4406868207781725e-07, "logits/chosen": -2.9771275520324707, "logits/rejected": -2.9106130599975586, "logps/chosen": -233.73934936523438, "logps/rejected": -243.2650146484375, "loss": 0.1095, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.33707278966903687, "rewards/margins": 6.005349159240723, "rewards/rejected": -6.342421531677246, "step": 2520 }, { "epoch": 1.323914181057038, "grad_norm": 32.26255918634188, "learning_rate": 3.426564450236249e-07, "logits/chosen": -2.998955488204956, "logits/rejected": -2.8419620990753174, "logps/chosen": -254.6608123779297, "logps/rejected": -260.8964538574219, "loss": 0.1056, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8031023740768433, "rewards/margins": 5.423132419586182, "rewards/rejected": -6.226234436035156, "step": 2530 }, { "epoch": 1.3291470434327577, "grad_norm": 48.10134557760728, "learning_rate": 3.4124077204992576e-07, "logits/chosen": -2.847639560699463, "logits/rejected": -2.82789945602417, "logps/chosen": -191.11929321289062, "logps/rejected": -272.513916015625, "loss": 0.0844, "rewards/accuracies": 1.0, "rewards/chosen": -0.180890291929245, "rewards/margins": 5.861083030700684, "rewards/rejected": -6.041973114013672, "step": 2540 }, { "epoch": 1.3343799058084773, "grad_norm": 13.737963186316012, "learning_rate": 3.398217156532125e-07, "logits/chosen": -3.062126636505127, "logits/rejected": -2.9471311569213867, "logps/chosen": -284.6077880859375, "logps/rejected": -316.8412170410156, "loss": 0.0626, "rewards/accuracies": 1.0, "rewards/chosen": -0.707090437412262, "rewards/margins": 6.346011161804199, "rewards/rejected": -7.053101539611816, "step": 2550 }, { "epoch": 1.3396127681841967, "grad_norm": 21.481670934609202, "learning_rate": 3.383993284554431e-07, "logits/chosen": -3.0201539993286133, "logits/rejected": -2.966002941131592, "logps/chosen": -252.81924438476562, "logps/rejected": -297.75836181640625, "loss": 0.0849, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.4976441264152527, "rewards/margins": 6.169718265533447, "rewards/rejected": -6.667363166809082, "step": 2560 }, { "epoch": 1.3448456305599163, "grad_norm": 50.47018378445504, "learning_rate": 3.3697366320208955e-07, "logits/chosen": -2.941006660461426, "logits/rejected": -2.877628803253174, "logps/chosen": -294.93780517578125, "logps/rejected": -320.22119140625, "loss": 0.0761, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8542757034301758, "rewards/margins": 6.09744930267334, "rewards/rejected": -6.951725006103516, "step": 2570 }, { "epoch": 1.3500784929356358, "grad_norm": 44.961326255520234, "learning_rate": 3.355447727601816e-07, "logits/chosen": -2.937790632247925, "logits/rejected": -2.821808099746704, "logps/chosen": -258.78167724609375, "logps/rejected": -311.67608642578125, "loss": 0.1, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2528257369995117, "rewards/margins": 5.724603652954102, "rewards/rejected": -6.977429389953613, "step": 2580 }, { "epoch": 1.3553113553113554, "grad_norm": 31.739399994558784, "learning_rate": 3.3411271011634697e-07, "logits/chosen": -2.9528229236602783, "logits/rejected": -2.9789559841156006, "logps/chosen": -302.6459655761719, "logps/rejected": -370.6207580566406, "loss": 0.1295, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.177910566329956, "rewards/margins": 5.77822732925415, "rewards/rejected": -6.956138610839844, "step": 2590 }, { "epoch": 1.3605442176870748, "grad_norm": 29.022208552252266, "learning_rate": 3.3267752837484587e-07, "logits/chosen": -2.906136989593506, "logits/rejected": -2.8690638542175293, "logps/chosen": -233.2480010986328, "logps/rejected": -274.82275390625, "loss": 0.0953, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9544634819030762, "rewards/margins": 5.1417412757873535, "rewards/rejected": -6.096203804016113, "step": 2600 }, { "epoch": 1.3657770800627944, "grad_norm": 52.234570472381684, "learning_rate": 3.31239280755602e-07, "logits/chosen": -2.9591028690338135, "logits/rejected": -2.866718053817749, "logps/chosen": -292.55218505859375, "logps/rejected": -298.78118896484375, "loss": 0.1039, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8524774312973022, "rewards/margins": 4.748727321624756, "rewards/rejected": -5.601204872131348, "step": 2610 }, { "epoch": 1.3710099424385138, "grad_norm": 16.937275915650034, "learning_rate": 3.2979802059222936e-07, "logits/chosen": -2.977233409881592, "logits/rejected": -2.85064697265625, "logps/chosen": -280.8937072753906, "logps/rejected": -272.0334777832031, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": -0.41921567916870117, "rewards/margins": 4.747152805328369, "rewards/rejected": -5.16636848449707, "step": 2620 }, { "epoch": 1.3762428048142334, "grad_norm": 38.73502041481118, "learning_rate": 3.283538013300537e-07, "logits/chosen": -2.862461805343628, "logits/rejected": -2.858710765838623, "logps/chosen": -210.4797821044922, "logps/rejected": -312.61614990234375, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": -0.5349727272987366, "rewards/margins": 5.527820110321045, "rewards/rejected": -6.062793731689453, "step": 2630 }, { "epoch": 1.3814756671899528, "grad_norm": 20.630990200844337, "learning_rate": 3.269066765241314e-07, "logits/chosen": -3.017667531967163, "logits/rejected": -2.9742352962493896, "logps/chosen": -274.62066650390625, "logps/rejected": -286.84454345703125, "loss": 0.0801, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2776591777801514, "rewards/margins": 4.2892351150512695, "rewards/rejected": -5.56689453125, "step": 2640 }, { "epoch": 1.3867085295656725, "grad_norm": 61.13652061596397, "learning_rate": 3.254566998372634e-07, "logits/chosen": -2.873039722442627, "logits/rejected": -2.9162306785583496, "logps/chosen": -203.89306640625, "logps/rejected": -302.4574890136719, "loss": 0.1246, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9670795202255249, "rewards/margins": 5.9818549156188965, "rewards/rejected": -6.948934078216553, "step": 2650 }, { "epoch": 1.3919413919413919, "grad_norm": 22.416398572712577, "learning_rate": 3.2400392503800477e-07, "logits/chosen": -2.9820892810821533, "logits/rejected": -2.9712131023406982, "logps/chosen": -298.67816162109375, "logps/rejected": -400.2839660644531, "loss": 0.0802, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5202631950378418, "rewards/margins": 5.253757953643799, "rewards/rejected": -5.774021148681641, "step": 2660 }, { "epoch": 1.3971742543171115, "grad_norm": 20.007250737929727, "learning_rate": 3.225484059986715e-07, "logits/chosen": -3.0013813972473145, "logits/rejected": -2.890977144241333, "logps/chosen": -247.83615112304688, "logps/rejected": -294.61407470703125, "loss": 0.0728, "rewards/accuracies": 1.0, "rewards/chosen": -0.5211168527603149, "rewards/margins": 5.1726813316345215, "rewards/rejected": -5.693798542022705, "step": 2670 }, { "epoch": 1.402407116692831, "grad_norm": 46.34881376495717, "learning_rate": 3.2109019669334215e-07, "logits/chosen": -2.9078080654144287, "logits/rejected": -2.8357596397399902, "logps/chosen": -328.3313293457031, "logps/rejected": -352.0106506347656, "loss": 0.085, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7543078660964966, "rewards/margins": 6.036242485046387, "rewards/rejected": -6.79055118560791, "step": 2680 }, { "epoch": 1.4076399790685505, "grad_norm": 22.133821840362398, "learning_rate": 3.19629351195857e-07, "logits/chosen": -2.951897144317627, "logits/rejected": -2.861912965774536, "logps/chosen": -250.5077667236328, "logps/rejected": -320.7691345214844, "loss": 0.0929, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5991395711898804, "rewards/margins": 5.9031572341918945, "rewards/rejected": -6.502297401428223, "step": 2690 }, { "epoch": 1.41287284144427, "grad_norm": 30.601613255685876, "learning_rate": 3.1816592367781236e-07, "logits/chosen": -2.886700391769409, "logits/rejected": -2.737668752670288, "logps/chosen": -309.296630859375, "logps/rejected": -309.95526123046875, "loss": 0.0978, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3904002904891968, "rewards/margins": 5.650885581970215, "rewards/rejected": -7.041285514831543, "step": 2700 }, { "epoch": 1.4181057038199896, "grad_norm": 34.14352313992961, "learning_rate": 3.166999684065521e-07, "logits/chosen": -2.9279563426971436, "logits/rejected": -2.8133740425109863, "logps/chosen": -256.2647399902344, "logps/rejected": -277.4833679199219, "loss": 0.1068, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.5506166219711304, "rewards/margins": 4.79842472076416, "rewards/rejected": -6.34904146194458, "step": 2710 }, { "epoch": 1.423338566195709, "grad_norm": 28.468259226171355, "learning_rate": 3.1523153974315497e-07, "logits/chosen": -2.9370522499084473, "logits/rejected": -2.887308120727539, "logps/chosen": -263.50213623046875, "logps/rejected": -305.27728271484375, "loss": 0.1023, "rewards/accuracies": 0.875, "rewards/chosen": -0.893061637878418, "rewards/margins": 5.906707286834717, "rewards/rejected": -6.799768924713135, "step": 2720 }, { "epoch": 1.4285714285714286, "grad_norm": 78.09831295891125, "learning_rate": 3.137606921404191e-07, "logits/chosen": -2.8777241706848145, "logits/rejected": -2.8062610626220703, "logps/chosen": -275.77105712890625, "logps/rejected": -275.1842346191406, "loss": 0.1611, "rewards/accuracies": 0.875, "rewards/chosen": -1.4379390478134155, "rewards/margins": 4.804083347320557, "rewards/rejected": -6.242022514343262, "step": 2730 }, { "epoch": 1.433804290947148, "grad_norm": 54.967093544123856, "learning_rate": 3.1228748014084243e-07, "logits/chosen": -2.7218832969665527, "logits/rejected": -2.688936233520508, "logps/chosen": -281.573974609375, "logps/rejected": -296.79144287109375, "loss": 0.1184, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1594982147216797, "rewards/margins": 4.915614128112793, "rewards/rejected": -6.075112342834473, "step": 2740 }, { "epoch": 1.4390371533228676, "grad_norm": 16.239012261097116, "learning_rate": 3.108119583746005e-07, "logits/chosen": -2.83630108833313, "logits/rejected": -2.814840793609619, "logps/chosen": -222.92904663085938, "logps/rejected": -289.79473876953125, "loss": 0.116, "rewards/accuracies": 1.0, "rewards/chosen": -0.1372254192829132, "rewards/margins": 5.56964635848999, "rewards/rejected": -5.70687198638916, "step": 2750 }, { "epoch": 1.4442700156985873, "grad_norm": 44.44528433919159, "learning_rate": 3.093341815575202e-07, "logits/chosen": -2.8627822399139404, "logits/rejected": -2.7667229175567627, "logps/chosen": -264.558349609375, "logps/rejected": -240.86599731445312, "loss": 0.0875, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5655437707901001, "rewards/margins": 4.9313201904296875, "rewards/rejected": -5.49686336517334, "step": 2760 }, { "epoch": 1.4495028780743067, "grad_norm": 23.672761698311415, "learning_rate": 3.078542044890513e-07, "logits/chosen": -2.9311320781707764, "logits/rejected": -2.7795305252075195, "logps/chosen": -317.44677734375, "logps/rejected": -337.5609436035156, "loss": 0.123, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8098549842834473, "rewards/margins": 5.57565975189209, "rewards/rejected": -6.3855156898498535, "step": 2770 }, { "epoch": 1.454735740450026, "grad_norm": 35.65805968415091, "learning_rate": 3.0637208205023386e-07, "logits/chosen": -3.028580665588379, "logits/rejected": -2.873582124710083, "logps/chosen": -297.55120849609375, "logps/rejected": -289.8152770996094, "loss": 0.1127, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6644346714019775, "rewards/margins": 5.660701274871826, "rewards/rejected": -6.325135707855225, "step": 2780 }, { "epoch": 1.4599686028257457, "grad_norm": 32.973540635925744, "learning_rate": 3.0488786920166343e-07, "logits/chosen": -2.8953874111175537, "logits/rejected": -2.9363508224487305, "logps/chosen": -287.4119567871094, "logps/rejected": -363.69775390625, "loss": 0.0881, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5275593400001526, "rewards/margins": 5.911239147186279, "rewards/rejected": -6.438798427581787, "step": 2790 }, { "epoch": 1.4652014652014653, "grad_norm": 31.34346310233405, "learning_rate": 3.034016209814529e-07, "logits/chosen": -2.9117588996887207, "logits/rejected": -2.8713459968566895, "logps/chosen": -251.2953338623047, "logps/rejected": -302.90814208984375, "loss": 0.102, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7041305899620056, "rewards/margins": 5.538697242736816, "rewards/rejected": -6.242827415466309, "step": 2800 }, { "epoch": 1.4704343275771847, "grad_norm": 22.938183256717316, "learning_rate": 3.0191339250319147e-07, "logits/chosen": -2.9245946407318115, "logits/rejected": -2.9519927501678467, "logps/chosen": -270.9739990234375, "logps/rejected": -349.18621826171875, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": -0.1178923100233078, "rewards/margins": 5.9581298828125, "rewards/rejected": -6.076023101806641, "step": 2810 }, { "epoch": 1.4756671899529041, "grad_norm": 15.50339182070402, "learning_rate": 3.004232389539011e-07, "logits/chosen": -3.0099282264709473, "logits/rejected": -2.954331874847412, "logps/chosen": -261.56475830078125, "logps/rejected": -315.17401123046875, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": -0.5278488993644714, "rewards/margins": 6.136358737945557, "rewards/rejected": -6.66420841217041, "step": 2820 }, { "epoch": 1.4809000523286238, "grad_norm": 4.730071724667568, "learning_rate": 2.989312155919898e-07, "logits/chosen": -2.929542064666748, "logits/rejected": -2.843005418777466, "logps/chosen": -248.89242553710938, "logps/rejected": -322.938232421875, "loss": 0.0741, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4635700285434723, "rewards/margins": 5.479854583740234, "rewards/rejected": -5.943425178527832, "step": 2830 }, { "epoch": 1.4861329147043434, "grad_norm": 29.753431344256935, "learning_rate": 2.9743737774520266e-07, "logits/chosen": -2.9428200721740723, "logits/rejected": -2.9172582626342773, "logps/chosen": -259.7123718261719, "logps/rejected": -317.86212158203125, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": -0.12774649262428284, "rewards/margins": 6.480625152587891, "rewards/rejected": -6.608371734619141, "step": 2840 }, { "epoch": 1.4913657770800628, "grad_norm": 18.436496762242186, "learning_rate": 2.959417808085702e-07, "logits/chosen": -2.883880138397217, "logits/rejected": -2.9055960178375244, "logps/chosen": -219.8786163330078, "logps/rejected": -269.0650329589844, "loss": 0.0999, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3107945919036865, "rewards/margins": 5.153830528259277, "rewards/rejected": -6.464625358581543, "step": 2850 }, { "epoch": 1.4965986394557822, "grad_norm": 59.998549224732635, "learning_rate": 2.944444802423542e-07, "logits/chosen": -3.047563076019287, "logits/rejected": -2.978602886199951, "logps/chosen": -298.4427795410156, "logps/rejected": -372.28143310546875, "loss": 0.103, "rewards/accuracies": 1.0, "rewards/chosen": -0.2781336307525635, "rewards/margins": 6.95473575592041, "rewards/rejected": -7.232868194580078, "step": 2860 }, { "epoch": 1.5018315018315018, "grad_norm": 32.397232151300464, "learning_rate": 2.929455315699908e-07, "logits/chosen": -2.911093235015869, "logits/rejected": -2.7437069416046143, "logps/chosen": -296.2510681152344, "logps/rejected": -344.44720458984375, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": -0.5999561548233032, "rewards/margins": 6.021127700805664, "rewards/rejected": -6.6210832595825195, "step": 2870 }, { "epoch": 1.5070643642072215, "grad_norm": 37.113192999674766, "learning_rate": 2.9144499037603204e-07, "logits/chosen": -2.999938726425171, "logits/rejected": -2.9040145874023438, "logps/chosen": -249.46566772460938, "logps/rejected": -286.53656005859375, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": -0.7570666670799255, "rewards/margins": 5.558635234832764, "rewards/rejected": -6.315701961517334, "step": 2880 }, { "epoch": 1.5122972265829409, "grad_norm": 40.843554535640834, "learning_rate": 2.899429123040843e-07, "logits/chosen": -3.0040383338928223, "logits/rejected": -2.9690637588500977, "logps/chosen": -248.6911163330078, "logps/rejected": -318.9759216308594, "loss": 0.0863, "rewards/accuracies": 1.0, "rewards/chosen": -1.1157524585723877, "rewards/margins": 5.613360404968262, "rewards/rejected": -6.729112148284912, "step": 2890 }, { "epoch": 1.5175300889586603, "grad_norm": 62.42500487498685, "learning_rate": 2.884393530547452e-07, "logits/chosen": -3.0819621086120605, "logits/rejected": -2.975093126296997, "logps/chosen": -287.20965576171875, "logps/rejected": -331.211181640625, "loss": 0.0944, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.39944010972976685, "rewards/margins": 5.308953285217285, "rewards/rejected": -5.708393573760986, "step": 2900 }, { "epoch": 1.5227629513343799, "grad_norm": 34.482798051149224, "learning_rate": 2.869343683835376e-07, "logits/chosen": -2.9905455112457275, "logits/rejected": -2.880870819091797, "logps/chosen": -230.02828979492188, "logps/rejected": -334.75177001953125, "loss": 0.0881, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7035460472106934, "rewards/margins": 6.056662559509277, "rewards/rejected": -6.7602081298828125, "step": 2910 }, { "epoch": 1.5279958137100995, "grad_norm": 13.029392874930261, "learning_rate": 2.8542801409884253e-07, "logits/chosen": -3.028442859649658, "logits/rejected": -2.9436566829681396, "logps/chosen": -316.7098083496094, "logps/rejected": -369.3524169921875, "loss": 0.0526, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.0176303386688232, "rewards/margins": 5.336720943450928, "rewards/rejected": -6.35435152053833, "step": 2920 }, { "epoch": 1.533228676085819, "grad_norm": 30.242776956972175, "learning_rate": 2.839203460598297e-07, "logits/chosen": -3.031217098236084, "logits/rejected": -3.0050437450408936, "logps/chosen": -332.63134765625, "logps/rejected": -378.0337219238281, "loss": 0.0908, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.6280286908149719, "rewards/margins": 6.327795028686523, "rewards/rejected": -6.955824375152588, "step": 2930 }, { "epoch": 1.5384615384615383, "grad_norm": 18.61395247355257, "learning_rate": 2.8241142017438557e-07, "logits/chosen": -3.026963710784912, "logits/rejected": -2.9979422092437744, "logps/chosen": -302.5912170410156, "logps/rejected": -325.7685241699219, "loss": 0.1015, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3258691430091858, "rewards/margins": 6.078970432281494, "rewards/rejected": -6.404839992523193, "step": 2940 }, { "epoch": 1.543694400837258, "grad_norm": 47.93525556205328, "learning_rate": 2.8090129239704083e-07, "logits/chosen": -3.00567364692688, "logits/rejected": -2.892261028289795, "logps/chosen": -291.2741394042969, "logps/rejected": -263.0060119628906, "loss": 0.1233, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.1496226787567139, "rewards/margins": 5.680020332336426, "rewards/rejected": -6.829643249511719, "step": 2950 }, { "epoch": 1.5489272632129776, "grad_norm": 27.219930395156002, "learning_rate": 2.7939001872689496e-07, "logits/chosen": -2.89626407623291, "logits/rejected": -2.846527099609375, "logps/chosen": -204.7397003173828, "logps/rejected": -239.25146484375, "loss": 0.1187, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.2600133419036865, "rewards/margins": 4.346137046813965, "rewards/rejected": -5.6061506271362305, "step": 2960 }, { "epoch": 1.554160125588697, "grad_norm": 50.435351589285844, "learning_rate": 2.778776552055398e-07, "logits/chosen": -2.9109950065612793, "logits/rejected": -2.7655975818634033, "logps/chosen": -295.6907043457031, "logps/rejected": -309.79736328125, "loss": 0.0828, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2689138650894165, "rewards/margins": 5.466970920562744, "rewards/rejected": -6.735884666442871, "step": 2970 }, { "epoch": 1.5593929879644164, "grad_norm": 23.734830834040732, "learning_rate": 2.763642579149817e-07, "logits/chosen": -2.8620216846466064, "logits/rejected": -2.83495831489563, "logps/chosen": -243.73861694335938, "logps/rejected": -302.7878723144531, "loss": 0.095, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0916752815246582, "rewards/margins": 4.832813739776611, "rewards/rejected": -5.9244890213012695, "step": 2980 }, { "epoch": 1.564625850340136, "grad_norm": 40.03151184538333, "learning_rate": 2.748498829755615e-07, "logits/chosen": -2.889631748199463, "logits/rejected": -2.840968608856201, "logps/chosen": -248.60791015625, "logps/rejected": -363.3118896484375, "loss": 0.068, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.20546475052833557, "rewards/margins": 6.797191619873047, "rewards/rejected": -7.00265645980835, "step": 2990 }, { "epoch": 1.5698587127158556, "grad_norm": 18.046787841515116, "learning_rate": 2.7333458654387344e-07, "logits/chosen": -2.9715352058410645, "logits/rejected": -2.944772243499756, "logps/chosen": -285.3953857421875, "logps/rejected": -317.0597229003906, "loss": 0.0727, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.23264732956886292, "rewards/margins": 6.140435695648193, "rewards/rejected": -6.373082637786865, "step": 3000 }, { "epoch": 1.575091575091575, "grad_norm": 39.201062672587085, "learning_rate": 2.718184248106828e-07, "logits/chosen": -3.0555896759033203, "logits/rejected": -2.931945323944092, "logps/chosen": -328.04486083984375, "logps/rejected": -364.1317138671875, "loss": 0.0803, "rewards/accuracies": 1.0, "rewards/chosen": -0.5059137344360352, "rewards/margins": 5.9586334228515625, "rewards/rejected": -6.464547157287598, "step": 3010 }, { "epoch": 1.5803244374672945, "grad_norm": 48.72913748805263, "learning_rate": 2.7030145399884275e-07, "logits/chosen": -2.9798293113708496, "logits/rejected": -2.8546149730682373, "logps/chosen": -352.45428466796875, "logps/rejected": -357.4016418457031, "loss": 0.0974, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9498438835144043, "rewards/margins": 5.737730979919434, "rewards/rejected": -6.687574863433838, "step": 3020 }, { "epoch": 1.585557299843014, "grad_norm": 14.485290817570968, "learning_rate": 2.687837303612085e-07, "logits/chosen": -3.0707411766052246, "logits/rejected": -2.9482741355895996, "logps/chosen": -328.9745788574219, "logps/rejected": -365.7391052246094, "loss": 0.102, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6016635894775391, "rewards/margins": 6.558728218078613, "rewards/rejected": -7.160391330718994, "step": 3030 }, { "epoch": 1.5907901622187337, "grad_norm": 25.755954095654527, "learning_rate": 2.672653101785519e-07, "logits/chosen": -2.8823635578155518, "logits/rejected": -2.8659863471984863, "logps/chosen": -288.79022216796875, "logps/rejected": -342.3415222167969, "loss": 0.0729, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4533253312110901, "rewards/margins": 6.304105281829834, "rewards/rejected": -6.7574310302734375, "step": 3040 }, { "epoch": 1.5960230245944533, "grad_norm": 21.88421097981173, "learning_rate": 2.657462497574747e-07, "logits/chosen": -3.0075151920318604, "logits/rejected": -2.998107433319092, "logps/chosen": -227.07632446289062, "logps/rejected": -276.89129638671875, "loss": 0.071, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.725974440574646, "rewards/margins": 4.849265098571777, "rewards/rejected": -5.575240135192871, "step": 3050 }, { "epoch": 1.6012558869701727, "grad_norm": 5.099767698274497, "learning_rate": 2.642266054283198e-07, "logits/chosen": -3.0432772636413574, "logits/rejected": -2.8562986850738525, "logps/chosen": -347.7922058105469, "logps/rejected": -279.85919189453125, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": -0.2676194906234741, "rewards/margins": 6.386515140533447, "rewards/rejected": -6.654134273529053, "step": 3060 }, { "epoch": 1.6064887493458921, "grad_norm": 44.87621981936915, "learning_rate": 2.627064335430829e-07, "logits/chosen": -2.979245662689209, "logits/rejected": -2.8612546920776367, "logps/chosen": -306.9947204589844, "logps/rejected": -330.82220458984375, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": -0.8024505376815796, "rewards/margins": 6.137944221496582, "rewards/rejected": -6.940393924713135, "step": 3070 }, { "epoch": 1.6117216117216118, "grad_norm": 49.912106032950746, "learning_rate": 2.611857904733227e-07, "logits/chosen": -2.9569969177246094, "logits/rejected": -2.7965855598449707, "logps/chosen": -299.54412841796875, "logps/rejected": -295.87548828125, "loss": 0.086, "rewards/accuracies": 1.0, "rewards/chosen": -1.317258596420288, "rewards/margins": 5.417449474334717, "rewards/rejected": -6.734707832336426, "step": 3080 }, { "epoch": 1.6169544740973314, "grad_norm": 23.234907216831658, "learning_rate": 2.5966473260807076e-07, "logits/chosen": -3.0244688987731934, "logits/rejected": -2.9242327213287354, "logps/chosen": -336.098388671875, "logps/rejected": -386.6656799316406, "loss": 0.0792, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.31826600432395935, "rewards/margins": 7.304460048675537, "rewards/rejected": -7.622725486755371, "step": 3090 }, { "epoch": 1.6221873364730508, "grad_norm": 37.026988654093245, "learning_rate": 2.5814331635173987e-07, "logits/chosen": -2.937436580657959, "logits/rejected": -2.8952934741973877, "logps/chosen": -298.97406005859375, "logps/rejected": -340.5735778808594, "loss": 0.1292, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.8033806085586548, "rewards/margins": 4.659232139587402, "rewards/rejected": -5.462612152099609, "step": 3100 }, { "epoch": 1.6274201988487702, "grad_norm": 60.44387379525024, "learning_rate": 2.566215981220331e-07, "logits/chosen": -2.886880397796631, "logits/rejected": -2.8125598430633545, "logps/chosen": -294.96209716796875, "logps/rejected": -353.826171875, "loss": 0.1046, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.357627272605896, "rewards/margins": 5.615719795227051, "rewards/rejected": -6.973346710205078, "step": 3110 }, { "epoch": 1.6326530612244898, "grad_norm": 40.60664813574127, "learning_rate": 2.550996343478514e-07, "logits/chosen": -2.9131360054016113, "logits/rejected": -2.8931052684783936, "logps/chosen": -291.5629577636719, "logps/rejected": -337.7886962890625, "loss": 0.1101, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4644583761692047, "rewards/margins": 6.634761810302734, "rewards/rejected": -7.09921932220459, "step": 3120 }, { "epoch": 1.6378859236002095, "grad_norm": 6.442973089160438, "learning_rate": 2.5357748146720076e-07, "logits/chosen": -2.91871976852417, "logits/rejected": -2.790741205215454, "logps/chosen": -202.78965759277344, "logps/rejected": -254.2498321533203, "loss": 0.076, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.0862606763839722, "rewards/margins": 4.93480110168457, "rewards/rejected": -6.021062850952148, "step": 3130 }, { "epoch": 1.6431187859759289, "grad_norm": 42.913887776580026, "learning_rate": 2.5205519592509993e-07, "logits/chosen": -2.9244229793548584, "logits/rejected": -2.837059736251831, "logps/chosen": -261.33209228515625, "logps/rejected": -315.60736083984375, "loss": 0.0853, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9527704119682312, "rewards/margins": 6.18652868270874, "rewards/rejected": -7.139298915863037, "step": 3140 }, { "epoch": 1.6483516483516483, "grad_norm": 29.024401488847513, "learning_rate": 2.505328341714873e-07, "logits/chosen": -3.0372185707092285, "logits/rejected": -2.8590736389160156, "logps/chosen": -305.61029052734375, "logps/rejected": -339.8692932128906, "loss": 0.0739, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6348286271095276, "rewards/margins": 6.644381523132324, "rewards/rejected": -7.279211521148682, "step": 3150 }, { "epoch": 1.653584510727368, "grad_norm": 27.667691920236233, "learning_rate": 2.4901045265912687e-07, "logits/chosen": -2.9930267333984375, "logits/rejected": -2.9300479888916016, "logps/chosen": -302.3780212402344, "logps/rejected": -366.72406005859375, "loss": 0.0859, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.47990530729293823, "rewards/margins": 6.747189998626709, "rewards/rejected": -7.227095603942871, "step": 3160 }, { "epoch": 1.6588173731030875, "grad_norm": 17.71095089815236, "learning_rate": 2.4748810784151555e-07, "logits/chosen": -2.9701192378997803, "logits/rejected": -2.9169044494628906, "logps/chosen": -320.234375, "logps/rejected": -315.2912292480469, "loss": 0.0733, "rewards/accuracies": 1.0, "rewards/chosen": -0.9635225534439087, "rewards/margins": 6.2165679931640625, "rewards/rejected": -7.180090427398682, "step": 3170 }, { "epoch": 1.664050235478807, "grad_norm": 66.82571827973516, "learning_rate": 2.459658561707898e-07, "logits/chosen": -2.9692416191101074, "logits/rejected": -2.8981778621673584, "logps/chosen": -307.89569091796875, "logps/rejected": -357.0768737792969, "loss": 0.103, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.1377623081207275, "rewards/margins": 5.656141757965088, "rewards/rejected": -6.7939043045043945, "step": 3180 }, { "epoch": 1.6692830978545263, "grad_norm": 19.399631348812303, "learning_rate": 2.4444375409563145e-07, "logits/chosen": -2.9965333938598633, "logits/rejected": -2.882585048675537, "logps/chosen": -305.0103454589844, "logps/rejected": -337.71282958984375, "loss": 0.0892, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.524758219718933, "rewards/margins": 5.996395111083984, "rewards/rejected": -7.521153450012207, "step": 3190 }, { "epoch": 1.674515960230246, "grad_norm": 22.422632694742795, "learning_rate": 2.429218580591753e-07, "logits/chosen": -2.8643314838409424, "logits/rejected": -2.7628695964813232, "logps/chosen": -316.1343688964844, "logps/rejected": -289.3740234375, "loss": 0.114, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9421480894088745, "rewards/margins": 5.61722469329834, "rewards/rejected": -6.559373378753662, "step": 3200 }, { "epoch": 1.6797488226059656, "grad_norm": 62.23282651464688, "learning_rate": 2.414002244969158e-07, "logits/chosen": -2.847851037979126, "logits/rejected": -2.8082852363586426, "logps/chosen": -279.5246276855469, "logps/rejected": -325.80816650390625, "loss": 0.1038, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7170674800872803, "rewards/margins": 5.6793904304504395, "rewards/rejected": -7.396457672119141, "step": 3210 }, { "epoch": 1.684981684981685, "grad_norm": 34.039705135346985, "learning_rate": 2.3987890983461403e-07, "logits/chosen": -3.0056445598602295, "logits/rejected": -2.910372257232666, "logps/chosen": -305.9912109375, "logps/rejected": -379.44757080078125, "loss": 0.1014, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0099217891693115, "rewards/margins": 6.441681861877441, "rewards/rejected": -7.451603889465332, "step": 3220 }, { "epoch": 1.6902145473574044, "grad_norm": 56.210004165791595, "learning_rate": 2.3835797048620564e-07, "logits/chosen": -3.0148122310638428, "logits/rejected": -2.9477195739746094, "logps/chosen": -278.8802185058594, "logps/rejected": -306.242431640625, "loss": 0.1053, "rewards/accuracies": 1.0, "rewards/chosen": -0.8668941259384155, "rewards/margins": 5.957779884338379, "rewards/rejected": -6.824674129486084, "step": 3230 }, { "epoch": 1.695447409733124, "grad_norm": 58.027504089360605, "learning_rate": 2.368374628517088e-07, "logits/chosen": -2.8214244842529297, "logits/rejected": -2.749276638031006, "logps/chosen": -280.22607421875, "logps/rejected": -315.31988525390625, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": -0.6232548952102661, "rewards/margins": 6.575970649719238, "rewards/rejected": -7.199225425720215, "step": 3240 }, { "epoch": 1.7006802721088436, "grad_norm": 8.580505101098622, "learning_rate": 2.3531744331513247e-07, "logits/chosen": -2.905571222305298, "logits/rejected": -2.9306349754333496, "logps/chosen": -233.11837768554688, "logps/rejected": -301.6280212402344, "loss": 0.0878, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7622843384742737, "rewards/margins": 6.068845748901367, "rewards/rejected": -6.831130027770996, "step": 3250 }, { "epoch": 1.705913134484563, "grad_norm": 57.45403183546928, "learning_rate": 2.3379796824238608e-07, "logits/chosen": -2.8800442218780518, "logits/rejected": -2.8686094284057617, "logps/chosen": -222.8167266845703, "logps/rejected": -256.17926025390625, "loss": 0.1374, "rewards/accuracies": 0.875, "rewards/chosen": -1.6925709247589111, "rewards/margins": 4.765946865081787, "rewards/rejected": -6.458517551422119, "step": 3260 }, { "epoch": 1.7111459968602825, "grad_norm": 14.549207853965402, "learning_rate": 2.3227909397918894e-07, "logits/chosen": -3.1244492530822754, "logits/rejected": -3.039151430130005, "logps/chosen": -317.5973815917969, "logps/rejected": -378.6702880859375, "loss": 0.0773, "rewards/accuracies": 1.0, "rewards/chosen": -0.20847468078136444, "rewards/margins": 7.307975769042969, "rewards/rejected": -7.516449928283691, "step": 3270 }, { "epoch": 1.716378859236002, "grad_norm": 37.13991812843785, "learning_rate": 2.3076087684898076e-07, "logits/chosen": -2.980071544647217, "logits/rejected": -2.8639683723449707, "logps/chosen": -266.0498962402344, "logps/rejected": -327.7710876464844, "loss": 0.0979, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0797480344772339, "rewards/margins": 5.549710273742676, "rewards/rejected": -6.629457950592041, "step": 3280 }, { "epoch": 1.7216117216117217, "grad_norm": 65.07829538813989, "learning_rate": 2.2924337315083353e-07, "logits/chosen": -3.0111212730407715, "logits/rejected": -2.8984410762786865, "logps/chosen": -360.9862976074219, "logps/rejected": -380.43511962890625, "loss": 0.0799, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.536628246307373, "rewards/margins": 6.152345657348633, "rewards/rejected": -6.688973903656006, "step": 3290 }, { "epoch": 1.7268445839874411, "grad_norm": 32.17586854451659, "learning_rate": 2.277266391573633e-07, "logits/chosen": -2.9658689498901367, "logits/rejected": -2.9409101009368896, "logps/chosen": -318.0333557128906, "logps/rejected": -331.3514404296875, "loss": 0.0836, "rewards/accuracies": 1.0, "rewards/chosen": 0.29693344235420227, "rewards/margins": 7.987631797790527, "rewards/rejected": -7.69069766998291, "step": 3300 }, { "epoch": 1.7320774463631605, "grad_norm": 38.47913673885275, "learning_rate": 2.2621073111264357e-07, "logits/chosen": -2.8291594982147217, "logits/rejected": -2.8325917720794678, "logps/chosen": -261.59246826171875, "logps/rejected": -277.89556884765625, "loss": 0.0685, "rewards/accuracies": 1.0, "rewards/chosen": -0.7481458783149719, "rewards/margins": 5.285660743713379, "rewards/rejected": -6.033806800842285, "step": 3310 }, { "epoch": 1.7373103087388801, "grad_norm": 33.19422160572144, "learning_rate": 2.2469570523011993e-07, "logits/chosen": -2.8709657192230225, "logits/rejected": -2.88303279876709, "logps/chosen": -258.5608825683594, "logps/rejected": -312.49761962890625, "loss": 0.1047, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.2499102354049683, "rewards/margins": 5.424338340759277, "rewards/rejected": -6.674247741699219, "step": 3320 }, { "epoch": 1.7425431711145998, "grad_norm": 2.7246770556377244, "learning_rate": 2.2318161769052525e-07, "logits/chosen": -2.976322889328003, "logits/rejected": -2.8597910404205322, "logps/chosen": -265.7853698730469, "logps/rejected": -337.48480224609375, "loss": 0.1059, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2388050556182861, "rewards/margins": 6.378020763397217, "rewards/rejected": -7.616826057434082, "step": 3330 }, { "epoch": 1.7477760334903192, "grad_norm": 12.807451563358699, "learning_rate": 2.2166852463979624e-07, "logits/chosen": -2.8794219493865967, "logits/rejected": -2.7902653217315674, "logps/chosen": -261.71697998046875, "logps/rejected": -280.11663818359375, "loss": 0.1052, "rewards/accuracies": 1.0, "rewards/chosen": -1.3503434658050537, "rewards/margins": 5.744359016418457, "rewards/rejected": -7.094702243804932, "step": 3340 }, { "epoch": 1.7530088958660386, "grad_norm": 12.621324261795577, "learning_rate": 2.20156482186992e-07, "logits/chosen": -2.8771462440490723, "logits/rejected": -2.8502235412597656, "logps/chosen": -268.9850769042969, "logps/rejected": -330.7921447753906, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": -1.1187727451324463, "rewards/margins": 6.160275459289551, "rewards/rejected": -7.279047966003418, "step": 3350 }, { "epoch": 1.7582417582417582, "grad_norm": 13.24125378186075, "learning_rate": 2.1864554640221244e-07, "logits/chosen": -2.786963939666748, "logits/rejected": -2.854315757751465, "logps/chosen": -206.97390747070312, "logps/rejected": -341.7965087890625, "loss": 0.0877, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3356707096099854, "rewards/margins": 7.020216464996338, "rewards/rejected": -8.355886459350586, "step": 3360 }, { "epoch": 1.7634746206174778, "grad_norm": 12.767733979968577, "learning_rate": 2.1713577331452016e-07, "logits/chosen": -2.994497299194336, "logits/rejected": -2.8859658241271973, "logps/chosen": -265.92633056640625, "logps/rejected": -286.20928955078125, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": -0.6347097158432007, "rewards/margins": 5.689723491668701, "rewards/rejected": -6.324433326721191, "step": 3370 }, { "epoch": 1.7687074829931972, "grad_norm": 43.49076545861488, "learning_rate": 2.1562721890986199e-07, "logits/chosen": -2.883199453353882, "logits/rejected": -2.757340908050537, "logps/chosen": -248.45993041992188, "logps/rejected": -270.29681396484375, "loss": 0.084, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.4014949798583984, "rewards/margins": 5.839360237121582, "rewards/rejected": -7.240854740142822, "step": 3380 }, { "epoch": 1.7739403453689166, "grad_norm": 7.133089937292718, "learning_rate": 2.1411993912899285e-07, "logits/chosen": -2.898311138153076, "logits/rejected": -2.9894628524780273, "logps/chosen": -242.5589599609375, "logps/rejected": -384.56903076171875, "loss": 0.0906, "rewards/accuracies": 1.0, "rewards/chosen": -0.8392623066902161, "rewards/margins": 6.078016757965088, "rewards/rejected": -6.917278289794922, "step": 3390 }, { "epoch": 1.7791732077446363, "grad_norm": 38.85160701165766, "learning_rate": 2.126139898654021e-07, "logits/chosen": -2.8840270042419434, "logits/rejected": -2.8573272228240967, "logps/chosen": -238.3805694580078, "logps/rejected": -302.7860412597656, "loss": 0.0769, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.587359070777893, "rewards/margins": 5.119924545288086, "rewards/rejected": -6.707283973693848, "step": 3400 }, { "epoch": 1.784406070120356, "grad_norm": 17.924533965611843, "learning_rate": 2.1110942696324012e-07, "logits/chosen": -3.1051347255706787, "logits/rejected": -3.0233821868896484, "logps/chosen": -319.886962890625, "logps/rejected": -332.4261169433594, "loss": 0.105, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8453801870346069, "rewards/margins": 5.213593482971191, "rewards/rejected": -6.05897331237793, "step": 3410 }, { "epoch": 1.7896389324960753, "grad_norm": 43.540232097139366, "learning_rate": 2.0960630621524762e-07, "logits/chosen": -2.8899734020233154, "logits/rejected": -2.8264780044555664, "logps/chosen": -310.78387451171875, "logps/rejected": -289.8320007324219, "loss": 0.0916, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5663490891456604, "rewards/margins": 6.335883140563965, "rewards/rejected": -6.902231693267822, "step": 3420 }, { "epoch": 1.7948717948717947, "grad_norm": 29.344722876978526, "learning_rate": 2.0810468336068697e-07, "logits/chosen": -2.8978066444396973, "logits/rejected": -2.9264578819274902, "logps/chosen": -243.4187774658203, "logps/rejected": -298.56719970703125, "loss": 0.0941, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.327157735824585, "rewards/margins": 5.754612922668457, "rewards/rejected": -7.081770896911621, "step": 3430 }, { "epoch": 1.8001046572475143, "grad_norm": 12.52060029462021, "learning_rate": 2.0660461408327535e-07, "logits/chosen": -3.0149242877960205, "logits/rejected": -2.9196088314056396, "logps/chosen": -305.9316101074219, "logps/rejected": -287.62896728515625, "loss": 0.0744, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.265438437461853, "rewards/margins": 5.253451824188232, "rewards/rejected": -6.518891334533691, "step": 3440 }, { "epoch": 1.805337519623234, "grad_norm": 7.979422579746135, "learning_rate": 2.0510615400911906e-07, "logits/chosen": -3.0431969165802, "logits/rejected": -2.9783592224121094, "logps/chosen": -275.7513427734375, "logps/rejected": -292.515625, "loss": 0.0896, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4673697352409363, "rewards/margins": 5.993512153625488, "rewards/rejected": -6.46088171005249, "step": 3450 }, { "epoch": 1.8105703819989536, "grad_norm": 28.16918861720906, "learning_rate": 2.0360935870465185e-07, "logits/chosen": -2.9939873218536377, "logits/rejected": -2.831987142562866, "logps/chosen": -335.2601623535156, "logps/rejected": -337.7147521972656, "loss": 0.0889, "rewards/accuracies": 1.0, "rewards/chosen": -0.2928660809993744, "rewards/margins": 6.129449844360352, "rewards/rejected": -6.422316551208496, "step": 3460 }, { "epoch": 1.815803244374673, "grad_norm": 60.11033434263851, "learning_rate": 2.021142836745739e-07, "logits/chosen": -2.9230611324310303, "logits/rejected": -2.833214282989502, "logps/chosen": -282.1880798339844, "logps/rejected": -310.4433288574219, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": -0.7712811231613159, "rewards/margins": 5.326231956481934, "rewards/rejected": -6.097513198852539, "step": 3470 }, { "epoch": 1.8210361067503924, "grad_norm": 46.40318492914859, "learning_rate": 2.0062098435979308e-07, "logits/chosen": -2.7931761741638184, "logits/rejected": -2.7565505504608154, "logps/chosen": -299.5055847167969, "logps/rejected": -293.8361511230469, "loss": 0.0986, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2282201051712036, "rewards/margins": 4.726022720336914, "rewards/rejected": -5.954242706298828, "step": 3480 }, { "epoch": 1.826268969126112, "grad_norm": 19.72393387180669, "learning_rate": 1.9912951613536997e-07, "logits/chosen": -2.993452548980713, "logits/rejected": -2.862706184387207, "logps/chosen": -295.91131591796875, "logps/rejected": -294.0292053222656, "loss": 0.0819, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0687099695205688, "rewards/margins": 6.073367118835449, "rewards/rejected": -7.142076015472412, "step": 3490 }, { "epoch": 1.8315018315018317, "grad_norm": 28.542274255526042, "learning_rate": 1.9763993430846392e-07, "logits/chosen": -2.993786334991455, "logits/rejected": -2.796332597732544, "logps/chosen": -270.9580383300781, "logps/rejected": -254.5969696044922, "loss": 0.0952, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2017524242401123, "rewards/margins": 5.413460731506348, "rewards/rejected": -6.615212440490723, "step": 3500 }, { "epoch": 1.836734693877551, "grad_norm": 12.987603067036108, "learning_rate": 1.9615229411628212e-07, "logits/chosen": -2.881495952606201, "logits/rejected": -2.857116460800171, "logps/chosen": -209.13272094726562, "logps/rejected": -315.0049743652344, "loss": 0.0781, "rewards/accuracies": 1.0, "rewards/chosen": -1.51089608669281, "rewards/margins": 5.136195659637451, "rewards/rejected": -6.647091865539551, "step": 3510 }, { "epoch": 1.8419675562532705, "grad_norm": 13.53120989556987, "learning_rate": 1.946666507240314e-07, "logits/chosen": -2.9498937129974365, "logits/rejected": -2.8784759044647217, "logps/chosen": -311.5005798339844, "logps/rejected": -351.9678039550781, "loss": 0.0835, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2483736276626587, "rewards/margins": 5.8507795333862305, "rewards/rejected": -7.0991530418396, "step": 3520 }, { "epoch": 1.84720041862899, "grad_norm": 15.175456524547657, "learning_rate": 1.9318305922287268e-07, "logits/chosen": -2.884366512298584, "logits/rejected": -2.867556571960449, "logps/chosen": -266.84967041015625, "logps/rejected": -305.1980285644531, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": -0.957389235496521, "rewards/margins": 6.541996955871582, "rewards/rejected": -7.499385833740234, "step": 3530 }, { "epoch": 1.8524332810047097, "grad_norm": 34.00226659057054, "learning_rate": 1.9170157462787762e-07, "logits/chosen": -2.9759156703948975, "logits/rejected": -2.8705861568450928, "logps/chosen": -327.4330139160156, "logps/rejected": -305.6136474609375, "loss": 0.0921, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6973533630371094, "rewards/margins": 5.863638877868652, "rewards/rejected": -6.560992240905762, "step": 3540 }, { "epoch": 1.8576661433804291, "grad_norm": 24.827131494281314, "learning_rate": 1.902222518759891e-07, "logits/chosen": -3.073078155517578, "logits/rejected": -2.8961453437805176, "logps/chosen": -354.58367919921875, "logps/rejected": -363.75030517578125, "loss": 0.1011, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5833311080932617, "rewards/margins": 5.8877105712890625, "rewards/rejected": -6.471041679382324, "step": 3550 }, { "epoch": 1.8628990057561485, "grad_norm": 25.724627305030385, "learning_rate": 1.8874514582398368e-07, "logits/chosen": -2.9343771934509277, "logits/rejected": -2.9873697757720947, "logps/chosen": -296.8329162597656, "logps/rejected": -342.500732421875, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": -0.39233070611953735, "rewards/margins": 7.163987636566162, "rewards/rejected": -7.556318759918213, "step": 3560 }, { "epoch": 1.8681318681318682, "grad_norm": 12.47463942684228, "learning_rate": 1.8727031124643738e-07, "logits/chosen": -2.9487969875335693, "logits/rejected": -2.8963401317596436, "logps/chosen": -228.98684692382812, "logps/rejected": -275.0185546875, "loss": 0.0821, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6753531098365784, "rewards/margins": 5.562913417816162, "rewards/rejected": -6.238266944885254, "step": 3570 }, { "epoch": 1.8733647305075878, "grad_norm": 27.453066384567453, "learning_rate": 1.8579780283369472e-07, "logits/chosen": -2.85052490234375, "logits/rejected": -2.715588092803955, "logps/chosen": -290.2726745605469, "logps/rejected": -270.888427734375, "loss": 0.0981, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.132917046546936, "rewards/margins": 5.187754154205322, "rewards/rejected": -6.3206706047058105, "step": 3580 }, { "epoch": 1.8785975928833072, "grad_norm": 12.0441652812134, "learning_rate": 1.8432767518984043e-07, "logits/chosen": -2.8838024139404297, "logits/rejected": -2.8034605979919434, "logps/chosen": -292.6697082519531, "logps/rejected": -311.16265869140625, "loss": 0.0978, "rewards/accuracies": 1.0, "rewards/chosen": -0.6306116580963135, "rewards/margins": 5.607513427734375, "rewards/rejected": -6.238125801086426, "step": 3590 }, { "epoch": 1.8838304552590266, "grad_norm": 89.27546397258557, "learning_rate": 1.8285998283067478e-07, "logits/chosen": -2.9904944896698, "logits/rejected": -2.928205966949463, "logps/chosen": -262.712890625, "logps/rejected": -300.67132568359375, "loss": 0.0929, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7022815942764282, "rewards/margins": 6.132050037384033, "rewards/rejected": -6.834331512451172, "step": 3600 }, { "epoch": 1.8890633176347462, "grad_norm": 44.40166413963045, "learning_rate": 1.8139478018169197e-07, "logits/chosen": -2.886094570159912, "logits/rejected": -2.835921049118042, "logps/chosen": -242.9951934814453, "logps/rejected": -281.91143798828125, "loss": 0.0841, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8270211219787598, "rewards/margins": 5.5995354652404785, "rewards/rejected": -6.426556587219238, "step": 3610 }, { "epoch": 1.8942961800104658, "grad_norm": 21.27065469592044, "learning_rate": 1.799321215760617e-07, "logits/chosen": -2.90838623046875, "logits/rejected": -2.882166862487793, "logps/chosen": -264.93646240234375, "logps/rejected": -281.915771484375, "loss": 0.1107, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3578184843063354, "rewards/margins": 5.0995774269104, "rewards/rejected": -6.457396030426025, "step": 3620 }, { "epoch": 1.8995290423861853, "grad_norm": 47.814792912758904, "learning_rate": 1.7847206125261476e-07, "logits/chosen": -2.908107280731201, "logits/rejected": -2.89654278755188, "logps/chosen": -236.9935302734375, "logps/rejected": -271.1524658203125, "loss": 0.113, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3795278072357178, "rewards/margins": 5.0340495109558105, "rewards/rejected": -6.413577079772949, "step": 3630 }, { "epoch": 1.9047619047619047, "grad_norm": 34.798225157019615, "learning_rate": 1.7701465335383148e-07, "logits/chosen": -2.963202476501465, "logits/rejected": -2.85608172416687, "logps/chosen": -274.09893798828125, "logps/rejected": -273.7052001953125, "loss": 0.0779, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9602730870246887, "rewards/margins": 4.646059989929199, "rewards/rejected": -5.606332778930664, "step": 3640 }, { "epoch": 1.9099947671376243, "grad_norm": 38.27306451884473, "learning_rate": 1.7555995192383377e-07, "logits/chosen": -2.8957676887512207, "logits/rejected": -2.9778990745544434, "logps/chosen": -238.47216796875, "logps/rejected": -417.1255798339844, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": -0.5256121158599854, "rewards/margins": 6.498622894287109, "rewards/rejected": -7.024234771728516, "step": 3650 }, { "epoch": 1.915227629513344, "grad_norm": 51.892233331739185, "learning_rate": 1.7410801090638166e-07, "logits/chosen": -2.93211030960083, "logits/rejected": -2.8617310523986816, "logps/chosen": -296.158447265625, "logps/rejected": -310.12689208984375, "loss": 0.0948, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7393634915351868, "rewards/margins": 5.988359451293945, "rewards/rejected": -6.727723121643066, "step": 3660 }, { "epoch": 1.9204604918890633, "grad_norm": 22.32680952556182, "learning_rate": 1.7265888414287245e-07, "logits/chosen": -2.971330165863037, "logits/rejected": -2.942361354827881, "logps/chosen": -275.4486083984375, "logps/rejected": -321.77764892578125, "loss": 0.0829, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6169160604476929, "rewards/margins": 6.724116325378418, "rewards/rejected": -7.3410325050354, "step": 3670 }, { "epoch": 1.9256933542647827, "grad_norm": 43.032017369290884, "learning_rate": 1.7121262537034396e-07, "logits/chosen": -3.007481098175049, "logits/rejected": -2.871833562850952, "logps/chosen": -300.15631103515625, "logps/rejected": -314.15020751953125, "loss": 0.0965, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8639653921127319, "rewards/margins": 5.244901657104492, "rewards/rejected": -6.108867168426514, "step": 3680 }, { "epoch": 1.9309262166405023, "grad_norm": 50.5163480699902, "learning_rate": 1.697692882194826e-07, "logits/chosen": -2.834320068359375, "logits/rejected": -2.8236618041992188, "logps/chosen": -224.5089569091797, "logps/rejected": -290.79803466796875, "loss": 0.0974, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.094691514968872, "rewards/margins": 4.980583190917969, "rewards/rejected": -6.075274467468262, "step": 3690 }, { "epoch": 1.936159079016222, "grad_norm": 29.79584616718695, "learning_rate": 1.6832892621263406e-07, "logits/chosen": -3.1706554889678955, "logits/rejected": -2.99463152885437, "logps/chosen": -335.56573486328125, "logps/rejected": -362.2666931152344, "loss": 0.0827, "rewards/accuracies": 1.0, "rewards/chosen": -0.2189219444990158, "rewards/margins": 6.463695526123047, "rewards/rejected": -6.682618141174316, "step": 3700 }, { "epoch": 1.9413919413919414, "grad_norm": 9.966530468610411, "learning_rate": 1.668915927618183e-07, "logits/chosen": -2.9023125171661377, "logits/rejected": -2.8977699279785156, "logps/chosen": -210.21157836914062, "logps/rejected": -286.28265380859375, "loss": 0.0719, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.914448082447052, "rewards/margins": 4.942419052124023, "rewards/rejected": -5.856866836547852, "step": 3710 }, { "epoch": 1.9466248037676608, "grad_norm": 12.511773734964656, "learning_rate": 1.6545734116674965e-07, "logits/chosen": -3.023350238800049, "logits/rejected": -2.967996120452881, "logps/chosen": -277.91796875, "logps/rejected": -273.8655700683594, "loss": 0.1101, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10800141096115112, "rewards/margins": 6.52965784072876, "rewards/rejected": -6.637658596038818, "step": 3720 }, { "epoch": 1.9518576661433804, "grad_norm": 34.3212343683574, "learning_rate": 1.6402622461286e-07, "logits/chosen": -2.8856921195983887, "logits/rejected": -2.8143417835235596, "logps/chosen": -298.9675598144531, "logps/rejected": -307.2413635253906, "loss": 0.0802, "rewards/accuracies": 1.0, "rewards/chosen": -0.908021092414856, "rewards/margins": 5.976801872253418, "rewards/rejected": -6.884822845458984, "step": 3730 }, { "epoch": 1.9570905285191, "grad_norm": 44.1016948538804, "learning_rate": 1.625982961693262e-07, "logits/chosen": -3.060004711151123, "logits/rejected": -2.865377902984619, "logps/chosen": -327.1813049316406, "logps/rejected": -293.9488220214844, "loss": 0.0815, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.44826745986938477, "rewards/margins": 6.143226146697998, "rewards/rejected": -6.591493129730225, "step": 3740 }, { "epoch": 1.9623233908948194, "grad_norm": 52.76598648080503, "learning_rate": 1.6117360878710266e-07, "logits/chosen": -3.0457653999328613, "logits/rejected": -2.8902697563171387, "logps/chosen": -303.8973693847656, "logps/rejected": -338.15557861328125, "loss": 0.1, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5990251302719116, "rewards/margins": 5.54285192489624, "rewards/rejected": -6.141877174377441, "step": 3750 }, { "epoch": 1.9675562532705388, "grad_norm": 48.11072475023021, "learning_rate": 1.5975221529695773e-07, "logits/chosen": -2.9446165561676025, "logits/rejected": -2.846625804901123, "logps/chosen": -221.36660766601562, "logps/rejected": -233.03173828125, "loss": 0.1038, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9599982500076294, "rewards/margins": 5.139828205108643, "rewards/rejected": -6.099826335906982, "step": 3760 }, { "epoch": 1.9727891156462585, "grad_norm": 10.21222882685982, "learning_rate": 1.5833416840751406e-07, "logits/chosen": -2.909883499145508, "logits/rejected": -2.719053268432617, "logps/chosen": -234.9148712158203, "logps/rejected": -232.953125, "loss": 0.0969, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9281904101371765, "rewards/margins": 5.767971992492676, "rewards/rejected": -6.696164131164551, "step": 3770 }, { "epoch": 1.978021978021978, "grad_norm": 26.49544219805348, "learning_rate": 1.5691952070329493e-07, "logits/chosen": -3.0307018756866455, "logits/rejected": -2.9728851318359375, "logps/chosen": -326.88861083984375, "logps/rejected": -383.60894775390625, "loss": 0.0825, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4664892256259918, "rewards/margins": 6.368628025054932, "rewards/rejected": -6.835117340087891, "step": 3780 }, { "epoch": 1.9832548403976975, "grad_norm": 52.16257682161655, "learning_rate": 1.555083246427734e-07, "logits/chosen": -2.8593192100524902, "logits/rejected": -2.8581061363220215, "logps/chosen": -313.5380554199219, "logps/rejected": -354.5934143066406, "loss": 0.0842, "rewards/accuracies": 1.0, "rewards/chosen": -0.9946999549865723, "rewards/margins": 7.37838888168335, "rewards/rejected": -8.373088836669922, "step": 3790 }, { "epoch": 1.988487702773417, "grad_norm": 16.474025266616128, "learning_rate": 1.5410063255642767e-07, "logits/chosen": -2.8452136516571045, "logits/rejected": -2.8394722938537598, "logps/chosen": -273.67205810546875, "logps/rejected": -312.1125183105469, "loss": 0.0914, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.4113245010375977, "rewards/margins": 5.629705905914307, "rewards/rejected": -7.0410308837890625, "step": 3800 }, { "epoch": 1.9937205651491365, "grad_norm": 8.623847460798391, "learning_rate": 1.5269649664480037e-07, "logits/chosen": -2.8858256340026855, "logits/rejected": -2.8683369159698486, "logps/chosen": -316.9082336425781, "logps/rejected": -362.3924560546875, "loss": 0.0954, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2730662822723389, "rewards/margins": 5.645076274871826, "rewards/rejected": -6.918142795562744, "step": 3810 }, { "epoch": 1.9989534275248562, "grad_norm": 42.86853921498298, "learning_rate": 1.5129596897656255e-07, "logits/chosen": -2.858860731124878, "logits/rejected": -2.775953769683838, "logps/chosen": -280.3272399902344, "logps/rejected": -293.852783203125, "loss": 0.062, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8031293153762817, "rewards/margins": 5.772340774536133, "rewards/rejected": -6.575469970703125, "step": 3820 }, { "epoch": 2.004186289900576, "grad_norm": 2.502128922517174, "learning_rate": 1.4989910148658324e-07, "logits/chosen": -3.004093647003174, "logits/rejected": -2.9333481788635254, "logps/chosen": -286.138671875, "logps/rejected": -339.6099853515625, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -1.1104406118392944, "rewards/margins": 5.947878360748291, "rewards/rejected": -7.058319091796875, "step": 3830 }, { "epoch": 2.009419152276295, "grad_norm": 8.8304469967401, "learning_rate": 1.485059459740035e-07, "logits/chosen": -2.95084810256958, "logits/rejected": -2.8358261585235596, "logps/chosen": -307.08660888671875, "logps/rejected": -365.07989501953125, "loss": 0.0145, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2372806072235107, "rewards/margins": 7.171760559082031, "rewards/rejected": -8.409042358398438, "step": 3840 }, { "epoch": 2.0146520146520146, "grad_norm": 6.935553509930375, "learning_rate": 1.4711655410031536e-07, "logits/chosen": -2.914429187774658, "logits/rejected": -2.8401126861572266, "logps/chosen": -238.0872802734375, "logps/rejected": -300.6097717285156, "loss": 0.0154, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9826270341873169, "rewards/margins": 7.598635673522949, "rewards/rejected": -8.581262588500977, "step": 3850 }, { "epoch": 2.0198848770277342, "grad_norm": 12.349763692246215, "learning_rate": 1.4573097738744623e-07, "logits/chosen": -2.871941089630127, "logits/rejected": -2.8905444145202637, "logps/chosen": -249.6139678955078, "logps/rejected": -330.462158203125, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -1.5438705682754517, "rewards/margins": 7.111802577972412, "rewards/rejected": -8.655673027038574, "step": 3860 }, { "epoch": 2.025117739403454, "grad_norm": 9.657443537805896, "learning_rate": 1.4434926721584865e-07, "logits/chosen": -2.9380996227264404, "logits/rejected": -2.803107738494873, "logps/chosen": -270.4034423828125, "logps/rejected": -346.66827392578125, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -1.2198808193206787, "rewards/margins": 7.96026086807251, "rewards/rejected": -9.18014144897461, "step": 3870 }, { "epoch": 2.030350601779173, "grad_norm": 1.4338188477500189, "learning_rate": 1.4297147482259424e-07, "logits/chosen": -2.9446866512298584, "logits/rejected": -2.8614068031311035, "logps/chosen": -267.55633544921875, "logps/rejected": -306.0637512207031, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -1.6963237524032593, "rewards/margins": 7.74579381942749, "rewards/rejected": -9.442116737365723, "step": 3880 }, { "epoch": 2.0355834641548927, "grad_norm": 2.295288826074271, "learning_rate": 1.4159765129947443e-07, "logits/chosen": -3.0193746089935303, "logits/rejected": -2.9775304794311523, "logps/chosen": -245.93017578125, "logps/rejected": -309.11065673828125, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -1.633589506149292, "rewards/margins": 8.402475357055664, "rewards/rejected": -10.036065101623535, "step": 3890 }, { "epoch": 2.0408163265306123, "grad_norm": 8.525650691300338, "learning_rate": 1.4022784759110576e-07, "logits/chosen": -2.8638484477996826, "logits/rejected": -2.788756847381592, "logps/chosen": -279.7586364746094, "logps/rejected": -349.2271423339844, "loss": 0.0086, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.4813990592956543, "rewards/margins": 7.096240043640137, "rewards/rejected": -9.57763957977295, "step": 3900 }, { "epoch": 2.046049188906332, "grad_norm": 9.036694139293859, "learning_rate": 1.3886211449304002e-07, "logits/chosen": -2.9092764854431152, "logits/rejected": -2.9151601791381836, "logps/chosen": -249.90771484375, "logps/rejected": -421.712646484375, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -2.4128258228302, "rewards/margins": 8.387086868286133, "rewards/rejected": -10.799911499023438, "step": 3910 }, { "epoch": 2.051282051282051, "grad_norm": 1.140491434113699, "learning_rate": 1.3750050264988172e-07, "logits/chosen": -2.8491408824920654, "logits/rejected": -2.9289326667785645, "logps/chosen": -191.1505889892578, "logps/rejected": -321.03790283203125, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -1.255506157875061, "rewards/margins": 9.087458610534668, "rewards/rejected": -10.342966079711914, "step": 3920 }, { "epoch": 2.0565149136577707, "grad_norm": 1.3925500317567314, "learning_rate": 1.3614306255340918e-07, "logits/chosen": -2.985907793045044, "logits/rejected": -2.801304817199707, "logps/chosen": -292.23712158203125, "logps/rejected": -304.3588562011719, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -1.6813970804214478, "rewards/margins": 8.123655319213867, "rewards/rejected": -9.805051803588867, "step": 3930 }, { "epoch": 2.0617477760334904, "grad_norm": 0.9872657227406865, "learning_rate": 1.347898445407027e-07, "logits/chosen": -2.8897337913513184, "logits/rejected": -2.800487995147705, "logps/chosen": -305.83197021484375, "logps/rejected": -368.384033203125, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -1.5269715785980225, "rewards/margins": 8.780084609985352, "rewards/rejected": -10.30705738067627, "step": 3940 }, { "epoch": 2.06698063840921, "grad_norm": 3.4061096708527274, "learning_rate": 1.3344089879227768e-07, "logits/chosen": -2.9304306507110596, "logits/rejected": -2.8476498126983643, "logps/chosen": -326.06573486328125, "logps/rejected": -354.74566650390625, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -2.519989252090454, "rewards/margins": 8.64919376373291, "rewards/rejected": -11.169183731079102, "step": 3950 }, { "epoch": 2.072213500784929, "grad_norm": 1.0018274823256332, "learning_rate": 1.3209627533022393e-07, "logits/chosen": -2.7748849391937256, "logits/rejected": -2.7915091514587402, "logps/chosen": -299.4941101074219, "logps/rejected": -359.0870361328125, "loss": 0.0071, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4167101383209229, "rewards/margins": 9.10537052154541, "rewards/rejected": -10.52208137512207, "step": 3960 }, { "epoch": 2.077446363160649, "grad_norm": 4.704504411130569, "learning_rate": 1.3075602401635056e-07, "logits/chosen": -2.862394094467163, "logits/rejected": -2.8107314109802246, "logps/chosen": -234.2840576171875, "logps/rejected": -251.3478546142578, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -3.2791244983673096, "rewards/margins": 7.321046352386475, "rewards/rejected": -10.600171089172363, "step": 3970 }, { "epoch": 2.0826792255363684, "grad_norm": 2.0453558031945573, "learning_rate": 1.2942019455033715e-07, "logits/chosen": -2.905698537826538, "logits/rejected": -2.879190683364868, "logps/chosen": -360.8655090332031, "logps/rejected": -395.4041442871094, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -2.4923291206359863, "rewards/margins": 8.856918334960938, "rewards/rejected": -11.349247932434082, "step": 3980 }, { "epoch": 2.087912087912088, "grad_norm": 0.8182708426977608, "learning_rate": 1.2808883646789088e-07, "logits/chosen": -2.9061179161071777, "logits/rejected": -2.8345258235931396, "logps/chosen": -273.5254821777344, "logps/rejected": -344.9653625488281, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -2.726759910583496, "rewards/margins": 8.393701553344727, "rewards/rejected": -11.120462417602539, "step": 3990 }, { "epoch": 2.0931449502878072, "grad_norm": 1.8294526866725775, "learning_rate": 1.2676199913890933e-07, "logits/chosen": -2.7774550914764404, "logits/rejected": -2.672849178314209, "logps/chosen": -299.40802001953125, "logps/rejected": -322.0281066894531, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -2.636845827102661, "rewards/margins": 7.216422080993652, "rewards/rejected": -9.853267669677734, "step": 4000 }, { "epoch": 2.0931449502878072, "eval_logits/chosen": -2.8271658420562744, "eval_logits/rejected": -2.781247138977051, "eval_logps/chosen": -304.9618835449219, "eval_logps/rejected": -337.4993591308594, "eval_loss": 0.7140955924987793, "eval_rewards/accuracies": 0.7890625, "eval_rewards/chosen": -5.334571361541748, "eval_rewards/margins": 2.977198600769043, "eval_rewards/rejected": -8.31177043914795, "eval_runtime": 82.5227, "eval_samples_per_second": 24.236, "eval_steps_per_second": 0.388, "step": 4000 }, { "epoch": 2.098377812663527, "grad_norm": 1.8557012884061816, "learning_rate": 1.2543973176565012e-07, "logits/chosen": -2.8214221000671387, "logits/rejected": -2.756491184234619, "logps/chosen": -226.8396453857422, "logps/rejected": -329.8763427734375, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -2.728241443634033, "rewards/margins": 9.391422271728516, "rewards/rejected": -12.119664192199707, "step": 4010 }, { "epoch": 2.1036106750392465, "grad_norm": 2.3126449381157954, "learning_rate": 1.2412208338090565e-07, "logits/chosen": -2.9196603298187256, "logits/rejected": -2.8625600337982178, "logps/chosen": -343.40972900390625, "logps/rejected": -411.28411865234375, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -2.0509331226348877, "rewards/margins": 9.095000267028809, "rewards/rejected": -11.145933151245117, "step": 4020 }, { "epoch": 2.108843537414966, "grad_norm": 1.2433369486270567, "learning_rate": 1.228091028461858e-07, "logits/chosen": -2.9145922660827637, "logits/rejected": -2.8534460067749023, "logps/chosen": -269.83636474609375, "logps/rejected": -377.8471984863281, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -2.775130033493042, "rewards/margins": 8.035322189331055, "rewards/rejected": -10.810452461242676, "step": 4030 }, { "epoch": 2.1140763997906853, "grad_norm": 2.2094701924703357, "learning_rate": 1.2150083884990536e-07, "logits/chosen": -2.9067559242248535, "logits/rejected": -2.8128645420074463, "logps/chosen": -288.21795654296875, "logps/rejected": -361.03485107421875, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -2.817099094390869, "rewards/margins": 8.897134780883789, "rewards/rejected": -11.714232444763184, "step": 4040 }, { "epoch": 2.119309262166405, "grad_norm": 2.256244774478571, "learning_rate": 1.201973399055788e-07, "logits/chosen": -2.994520664215088, "logits/rejected": -2.927452564239502, "logps/chosen": -328.3680725097656, "logps/rejected": -377.5857849121094, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -2.3808367252349854, "rewards/margins": 8.583681106567383, "rewards/rejected": -10.964516639709473, "step": 4050 }, { "epoch": 2.1245421245421245, "grad_norm": 1.784680946649328, "learning_rate": 1.1889865435002117e-07, "logits/chosen": -2.9689199924468994, "logits/rejected": -2.931243896484375, "logps/chosen": -292.87603759765625, "logps/rejected": -365.150146484375, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -2.053737163543701, "rewards/margins": 8.405862808227539, "rewards/rejected": -10.459599494934082, "step": 4060 }, { "epoch": 2.129774986917844, "grad_norm": 2.6574206720671767, "learning_rate": 1.1760483034155588e-07, "logits/chosen": -2.8788862228393555, "logits/rejected": -2.8460500240325928, "logps/chosen": -288.20147705078125, "logps/rejected": -377.51837158203125, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -3.1821186542510986, "rewards/margins": 9.431540489196777, "rewards/rejected": -12.61366081237793, "step": 4070 }, { "epoch": 2.1350078492935634, "grad_norm": 7.636784935029143, "learning_rate": 1.163159158582284e-07, "logits/chosen": -2.765641689300537, "logits/rejected": -2.7661733627319336, "logps/chosen": -290.2430114746094, "logps/rejected": -375.72698974609375, "loss": 0.0179, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.086550235748291, "rewards/margins": 10.100221633911133, "rewards/rejected": -12.186772346496582, "step": 4080 }, { "epoch": 2.140240711669283, "grad_norm": 1.9033474142740545, "learning_rate": 1.1503195869602766e-07, "logits/chosen": -2.8451523780822754, "logits/rejected": -2.717595338821411, "logps/chosen": -285.7832946777344, "logps/rejected": -348.42230224609375, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -3.342125654220581, "rewards/margins": 10.15296459197998, "rewards/rejected": -13.495091438293457, "step": 4090 }, { "epoch": 2.1454735740450026, "grad_norm": 5.599416365339591, "learning_rate": 1.137530064671135e-07, "logits/chosen": -2.81258487701416, "logits/rejected": -2.881831169128418, "logps/chosen": -248.1467742919922, "logps/rejected": -362.12432861328125, "loss": 0.0122, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.6162829399108887, "rewards/margins": 8.592986106872559, "rewards/rejected": -11.209269523620605, "step": 4100 }, { "epoch": 2.1507064364207222, "grad_norm": 5.479922922323834, "learning_rate": 1.1247910659805063e-07, "logits/chosen": -2.9109134674072266, "logits/rejected": -2.8325867652893066, "logps/chosen": -319.20648193359375, "logps/rejected": -315.79150390625, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -2.472533702850342, "rewards/margins": 9.471918106079102, "rewards/rejected": -11.944452285766602, "step": 4110 }, { "epoch": 2.155939298796442, "grad_norm": 1.35259855136301, "learning_rate": 1.112103063280509e-07, "logits/chosen": -2.8561511039733887, "logits/rejected": -2.724534511566162, "logps/chosen": -262.9708557128906, "logps/rejected": -410.6546936035156, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -1.8880354166030884, "rewards/margins": 9.488630294799805, "rewards/rejected": -11.376665115356445, "step": 4120 }, { "epoch": 2.161172161172161, "grad_norm": 2.1750957249948577, "learning_rate": 1.099466527072207e-07, "logits/chosen": -2.816138744354248, "logits/rejected": -2.83309006690979, "logps/chosen": -233.3096466064453, "logps/rejected": -366.50970458984375, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -2.4871339797973633, "rewards/margins": 8.823835372924805, "rewards/rejected": -11.310968399047852, "step": 4130 }, { "epoch": 2.1664050235478807, "grad_norm": 4.398527872821512, "learning_rate": 1.0868819259481638e-07, "logits/chosen": -2.8250362873077393, "logits/rejected": -2.661219835281372, "logps/chosen": -293.8455810546875, "logps/rejected": -306.2014465332031, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -3.206350326538086, "rewards/margins": 9.097640037536621, "rewards/rejected": -12.303991317749023, "step": 4140 }, { "epoch": 2.1716378859236003, "grad_norm": 2.416053107356778, "learning_rate": 1.0743497265750701e-07, "logits/chosen": -2.9696569442749023, "logits/rejected": -2.8937861919403076, "logps/chosen": -284.00323486328125, "logps/rejected": -362.27728271484375, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -2.4969325065612793, "rewards/margins": 8.173836708068848, "rewards/rejected": -10.670769691467285, "step": 4150 }, { "epoch": 2.17687074829932, "grad_norm": 8.854581386261337, "learning_rate": 1.0618703936764359e-07, "logits/chosen": -2.952521562576294, "logits/rejected": -2.795865535736084, "logps/chosen": -318.38525390625, "logps/rejected": -401.44342041015625, "loss": 0.0105, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5884785652160645, "rewards/margins": 9.206972122192383, "rewards/rejected": -12.795450210571289, "step": 4160 }, { "epoch": 2.182103610675039, "grad_norm": 1.3898396599708345, "learning_rate": 1.0494443900153557e-07, "logits/chosen": -2.9593417644500732, "logits/rejected": -2.781161308288574, "logps/chosen": -313.79510498046875, "logps/rejected": -368.4060974121094, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -2.666456699371338, "rewards/margins": 8.46729850769043, "rewards/rejected": -11.133755683898926, "step": 4170 }, { "epoch": 2.1873364730507587, "grad_norm": 6.718828681906219, "learning_rate": 1.0370721763773507e-07, "logits/chosen": -2.8983607292175293, "logits/rejected": -2.726405620574951, "logps/chosen": -341.4433288574219, "logps/rejected": -368.11700439453125, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -1.9923518896102905, "rewards/margins": 10.40168571472168, "rewards/rejected": -12.394037246704102, "step": 4180 }, { "epoch": 2.1925693354264784, "grad_norm": 8.045996450431112, "learning_rate": 1.0247542115532845e-07, "logits/chosen": -2.8419604301452637, "logits/rejected": -2.7885799407958984, "logps/chosen": -291.759521484375, "logps/rejected": -361.86785888671875, "loss": 0.0166, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.6647796630859375, "rewards/margins": 9.394464492797852, "rewards/rejected": -12.059244155883789, "step": 4190 }, { "epoch": 2.197802197802198, "grad_norm": 18.666432884156833, "learning_rate": 1.0124909523223418e-07, "logits/chosen": -2.837294101715088, "logits/rejected": -2.7985639572143555, "logps/chosen": -304.92498779296875, "logps/rejected": -367.143310546875, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -2.466035842895508, "rewards/margins": 9.639225959777832, "rewards/rejected": -12.10526180267334, "step": 4200 }, { "epoch": 2.203035060177917, "grad_norm": 0.2559865288625136, "learning_rate": 1.0002828534350987e-07, "logits/chosen": -2.936079740524292, "logits/rejected": -2.8262181282043457, "logps/chosen": -334.14105224609375, "logps/rejected": -363.6258850097656, "loss": 0.0168, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.7327065467834473, "rewards/margins": 8.197755813598633, "rewards/rejected": -10.930462837219238, "step": 4210 }, { "epoch": 2.208267922553637, "grad_norm": 2.4223752857252885, "learning_rate": 9.881303675966524e-08, "logits/chosen": -2.8731751441955566, "logits/rejected": -2.7721402645111084, "logps/chosen": -295.3141174316406, "logps/rejected": -363.4248046875, "loss": 0.0064, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.085136890411377, "rewards/margins": 8.890009880065918, "rewards/rejected": -11.97514820098877, "step": 4220 }, { "epoch": 2.2135007849293564, "grad_norm": 0.5333750349511613, "learning_rate": 9.760339454498393e-08, "logits/chosen": -2.7324042320251465, "logits/rejected": -2.744624614715576, "logps/chosen": -249.5013427734375, "logps/rejected": -329.3797302246094, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -3.2695374488830566, "rewards/margins": 9.1023530960083, "rewards/rejected": -12.371891021728516, "step": 4230 }, { "epoch": 2.218733647305076, "grad_norm": 14.851230447998592, "learning_rate": 9.639940355585218e-08, "logits/chosen": -2.9656949043273926, "logits/rejected": -2.9145922660827637, "logps/chosen": -297.51251220703125, "logps/rejected": -382.37628173828125, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -3.787256956100464, "rewards/margins": 7.464027404785156, "rewards/rejected": -11.2512845993042, "step": 4240 }, { "epoch": 2.2239665096807952, "grad_norm": 18.595421619672845, "learning_rate": 9.52011084390954e-08, "logits/chosen": -2.8767952919006348, "logits/rejected": -2.841043472290039, "logps/chosen": -278.73419189453125, "logps/rejected": -345.7110290527344, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -2.574826717376709, "rewards/margins": 8.830595016479492, "rewards/rejected": -11.40542221069336, "step": 4250 }, { "epoch": 2.229199372056515, "grad_norm": 5.161715911500888, "learning_rate": 9.400855363032262e-08, "logits/chosen": -2.9124603271484375, "logits/rejected": -2.939523458480835, "logps/chosen": -309.1670227050781, "logps/rejected": -392.32489013671875, "loss": 0.0123, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5240213871002197, "rewards/margins": 8.883042335510254, "rewards/rejected": -11.407062530517578, "step": 4260 }, { "epoch": 2.2344322344322345, "grad_norm": 2.013703695996106, "learning_rate": 9.282178335227883e-08, "logits/chosen": -2.862917423248291, "logits/rejected": -2.7960150241851807, "logps/chosen": -271.2635803222656, "logps/rejected": -373.2563171386719, "loss": 0.0048, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8724260330200195, "rewards/margins": 8.923782348632812, "rewards/rejected": -11.796208381652832, "step": 4270 }, { "epoch": 2.239665096807954, "grad_norm": 1.9691195566800963, "learning_rate": 9.164084161320471e-08, "logits/chosen": -2.8455491065979004, "logits/rejected": -2.7305119037628174, "logps/chosen": -276.65155029296875, "logps/rejected": -357.0572509765625, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -2.862553834915161, "rewards/margins": 10.012895584106445, "rewards/rejected": -12.875450134277344, "step": 4280 }, { "epoch": 2.2448979591836733, "grad_norm": 0.7049463782291924, "learning_rate": 9.046577220520518e-08, "logits/chosen": -2.857173442840576, "logits/rejected": -2.7811245918273926, "logps/chosen": -276.25726318359375, "logps/rejected": -348.5311279296875, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -3.2030816078186035, "rewards/margins": 8.287731170654297, "rewards/rejected": -11.490813255310059, "step": 4290 }, { "epoch": 2.250130821559393, "grad_norm": 21.80373988683648, "learning_rate": 8.929661870262525e-08, "logits/chosen": -3.043079137802124, "logits/rejected": -2.920424222946167, "logps/chosen": -387.53302001953125, "logps/rejected": -384.66485595703125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -2.3552753925323486, "rewards/margins": 8.978832244873047, "rewards/rejected": -11.334107398986816, "step": 4300 }, { "epoch": 2.2553636839351126, "grad_norm": 2.3094455274330503, "learning_rate": 8.813342446043423e-08, "logits/chosen": -2.9377517700195312, "logits/rejected": -2.804396867752075, "logps/chosen": -283.4033203125, "logps/rejected": -323.5387878417969, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -2.404630184173584, "rewards/margins": 9.045892715454102, "rewards/rejected": -11.450523376464844, "step": 4310 }, { "epoch": 2.260596546310832, "grad_norm": 2.611672393100722, "learning_rate": 8.697623261261788e-08, "logits/chosen": -2.845099687576294, "logits/rejected": -2.8392512798309326, "logps/chosen": -262.60467529296875, "logps/rejected": -374.4058532714844, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -2.3563427925109863, "rewards/margins": 10.261815071105957, "rewards/rejected": -12.618158340454102, "step": 4320 }, { "epoch": 2.2658294086865514, "grad_norm": 4.186354167735096, "learning_rate": 8.58250860705792e-08, "logits/chosen": -3.0040595531463623, "logits/rejected": -2.9145419597625732, "logps/chosen": -331.91717529296875, "logps/rejected": -381.52398681640625, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -2.6650617122650146, "rewards/margins": 8.398420333862305, "rewards/rejected": -11.063481330871582, "step": 4330 }, { "epoch": 2.271062271062271, "grad_norm": 1.574738240716016, "learning_rate": 8.468002752154671e-08, "logits/chosen": -3.010850191116333, "logits/rejected": -2.8809456825256348, "logps/chosen": -321.1930236816406, "logps/rejected": -353.4471130371094, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -2.3641421794891357, "rewards/margins": 8.63088607788086, "rewards/rejected": -10.995028495788574, "step": 4340 }, { "epoch": 2.2762951334379906, "grad_norm": 1.6824896935566453, "learning_rate": 8.354109942699208e-08, "logits/chosen": -2.9308817386627197, "logits/rejected": -2.8746654987335205, "logps/chosen": -288.91571044921875, "logps/rejected": -355.26348876953125, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -2.6068153381347656, "rewards/margins": 8.824785232543945, "rewards/rejected": -11.431601524353027, "step": 4350 }, { "epoch": 2.2815279958137102, "grad_norm": 0.9965226938088154, "learning_rate": 8.240834402105524e-08, "logits/chosen": -2.8435070514678955, "logits/rejected": -2.7604994773864746, "logps/chosen": -317.35357666015625, "logps/rejected": -343.9361877441406, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -2.156468629837036, "rewards/margins": 8.740184783935547, "rewards/rejected": -10.896653175354004, "step": 4360 }, { "epoch": 2.2867608581894294, "grad_norm": 4.741290391924799, "learning_rate": 8.128180330897791e-08, "logits/chosen": -2.781571626663208, "logits/rejected": -2.8373541831970215, "logps/chosen": -290.853759765625, "logps/rejected": -410.56988525390625, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -2.2830965518951416, "rewards/margins": 9.717386245727539, "rewards/rejected": -12.000480651855469, "step": 4370 }, { "epoch": 2.291993720565149, "grad_norm": 1.0533267661714782, "learning_rate": 8.016151906554683e-08, "logits/chosen": -2.904536724090576, "logits/rejected": -2.912773609161377, "logps/chosen": -279.301513671875, "logps/rejected": -450.22210693359375, "loss": 0.0137, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.872162342071533, "rewards/margins": 9.188623428344727, "rewards/rejected": -12.060786247253418, "step": 4380 }, { "epoch": 2.2972265829408687, "grad_norm": 2.8120199323748727, "learning_rate": 7.90475328335439e-08, "logits/chosen": -2.869044542312622, "logits/rejected": -2.808676242828369, "logps/chosen": -244.701904296875, "logps/rejected": -311.186767578125, "loss": 0.0165, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.5861008167266846, "rewards/margins": 7.807511806488037, "rewards/rejected": -11.393610954284668, "step": 4390 }, { "epoch": 2.3024594453165883, "grad_norm": 2.209069709982538, "learning_rate": 7.793988592220568e-08, "logits/chosen": -2.8650224208831787, "logits/rejected": -2.7655138969421387, "logps/chosen": -295.19708251953125, "logps/rejected": -347.73248291015625, "loss": 0.0224, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.378282070159912, "rewards/margins": 8.248431205749512, "rewards/rejected": -11.626713752746582, "step": 4400 }, { "epoch": 2.3076923076923075, "grad_norm": 6.985486603119641, "learning_rate": 7.683861940569217e-08, "logits/chosen": -2.900050640106201, "logits/rejected": -2.8193471431732178, "logps/chosen": -353.52972412109375, "logps/rejected": -372.1658630371094, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -3.009934663772583, "rewards/margins": 8.790477752685547, "rewards/rejected": -11.80041217803955, "step": 4410 }, { "epoch": 2.312925170068027, "grad_norm": 21.118796687822105, "learning_rate": 7.574377412156291e-08, "logits/chosen": -2.9211108684539795, "logits/rejected": -2.7273151874542236, "logps/chosen": -296.21331787109375, "logps/rejected": -336.29339599609375, "loss": 0.021, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7813594341278076, "rewards/margins": 8.324694633483887, "rewards/rejected": -12.106054306030273, "step": 4420 }, { "epoch": 2.3181580324437467, "grad_norm": 1.0303305877961897, "learning_rate": 7.465539066926322e-08, "logits/chosen": -2.838982343673706, "logits/rejected": -2.8010878562927246, "logps/chosen": -308.4911193847656, "logps/rejected": -351.95172119140625, "loss": 0.015, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.320422649383545, "rewards/margins": 9.343027114868164, "rewards/rejected": -12.66344928741455, "step": 4430 }, { "epoch": 2.3233908948194664, "grad_norm": 10.78928901535609, "learning_rate": 7.357350940861845e-08, "logits/chosen": -2.939601182937622, "logits/rejected": -2.8932018280029297, "logps/chosen": -340.8313293457031, "logps/rejected": -451.13372802734375, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -2.731177806854248, "rewards/margins": 9.879323959350586, "rewards/rejected": -12.610502243041992, "step": 4440 }, { "epoch": 2.328623757195186, "grad_norm": 1.1484119605229575, "learning_rate": 7.249817045833726e-08, "logits/chosen": -2.826831102371216, "logits/rejected": -2.7933297157287598, "logps/chosen": -286.6219177246094, "logps/rejected": -333.69488525390625, "loss": 0.0148, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.009855270385742, "rewards/margins": 8.995770454406738, "rewards/rejected": -12.00562572479248, "step": 4450 }, { "epoch": 2.333856619570905, "grad_norm": 1.9237839371279364, "learning_rate": 7.14294136945241e-08, "logits/chosen": -2.8781328201293945, "logits/rejected": -2.793633222579956, "logps/chosen": -295.270263671875, "logps/rejected": -379.59625244140625, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -2.1001155376434326, "rewards/margins": 10.206942558288574, "rewards/rejected": -12.307058334350586, "step": 4460 }, { "epoch": 2.339089481946625, "grad_norm": 13.67260351672764, "learning_rate": 7.036727874920043e-08, "logits/chosen": -2.7330899238586426, "logits/rejected": -2.719641923904419, "logps/chosen": -283.1666259765625, "logps/rejected": -386.0990295410156, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -3.4672768115997314, "rewards/margins": 9.18285846710205, "rewards/rejected": -12.65013599395752, "step": 4470 }, { "epoch": 2.3443223443223444, "grad_norm": 3.0475567218172537, "learning_rate": 6.931180500883484e-08, "logits/chosen": -2.861558437347412, "logits/rejected": -2.802945852279663, "logps/chosen": -241.9453125, "logps/rejected": -298.2765808105469, "loss": 0.0103, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.037928581237793, "rewards/margins": 8.126497268676758, "rewards/rejected": -11.164424896240234, "step": 4480 }, { "epoch": 2.3495552066980636, "grad_norm": 1.4352098841711, "learning_rate": 6.826303161288302e-08, "logits/chosen": -2.707292079925537, "logits/rejected": -2.6136269569396973, "logps/chosen": -258.99066162109375, "logps/rejected": -345.9324951171875, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -3.4183459281921387, "rewards/margins": 9.519994735717773, "rewards/rejected": -12.938342094421387, "step": 4490 }, { "epoch": 2.3547880690737832, "grad_norm": 2.6158869749215303, "learning_rate": 6.722099745233594e-08, "logits/chosen": -3.014127254486084, "logits/rejected": -2.8602631092071533, "logps/chosen": -336.11431884765625, "logps/rejected": -382.06817626953125, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -2.806563138961792, "rewards/margins": 9.499139785766602, "rewards/rejected": -12.305704116821289, "step": 4500 }, { "epoch": 2.360020931449503, "grad_norm": 18.008620032297163, "learning_rate": 6.618574116827786e-08, "logits/chosen": -2.8721041679382324, "logits/rejected": -2.8400163650512695, "logps/chosen": -262.0733642578125, "logps/rejected": -348.70343017578125, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -2.9667296409606934, "rewards/margins": 8.90965461730957, "rewards/rejected": -11.876383781433105, "step": 4510 }, { "epoch": 2.3652537938252225, "grad_norm": 1.664243919937722, "learning_rate": 6.515730115045339e-08, "logits/chosen": -2.952232599258423, "logits/rejected": -2.8259382247924805, "logps/chosen": -329.37139892578125, "logps/rejected": -385.1444091796875, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -2.510221481323242, "rewards/margins": 10.674477577209473, "rewards/rejected": -13.184698104858398, "step": 4520 }, { "epoch": 2.370486656200942, "grad_norm": 3.1878658038571457, "learning_rate": 6.413571553584399e-08, "logits/chosen": -2.836963176727295, "logits/rejected": -2.7493526935577393, "logps/chosen": -295.43524169921875, "logps/rejected": -363.8216552734375, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -4.03454065322876, "rewards/margins": 7.886300086975098, "rewards/rejected": -11.9208402633667, "step": 4530 }, { "epoch": 2.3757195185766613, "grad_norm": 1.3081687033011764, "learning_rate": 6.312102220725346e-08, "logits/chosen": -2.981173038482666, "logits/rejected": -2.808628797531128, "logps/chosen": -376.0943298339844, "logps/rejected": -396.55926513671875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -3.5244765281677246, "rewards/margins": 9.487384796142578, "rewards/rejected": -13.011860847473145, "step": 4540 }, { "epoch": 2.380952380952381, "grad_norm": 2.005406246684602, "learning_rate": 6.21132587919036e-08, "logits/chosen": -2.928478956222534, "logits/rejected": -2.819777727127075, "logps/chosen": -314.96112060546875, "logps/rejected": -382.585693359375, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -3.17319393157959, "rewards/margins": 8.83165168762207, "rewards/rejected": -12.004846572875977, "step": 4550 }, { "epoch": 2.3861852433281006, "grad_norm": 1.7282438279247125, "learning_rate": 6.111246266003859e-08, "logits/chosen": -2.78440523147583, "logits/rejected": -2.723749876022339, "logps/chosen": -340.67681884765625, "logps/rejected": -438.6639709472656, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -2.9083125591278076, "rewards/margins": 10.545395851135254, "rewards/rejected": -13.453707695007324, "step": 4560 }, { "epoch": 2.3914181057038197, "grad_norm": 4.86912231415302, "learning_rate": 6.011867092353934e-08, "logits/chosen": -2.8875651359558105, "logits/rejected": -2.742415428161621, "logps/chosen": -317.0046081542969, "logps/rejected": -335.314697265625, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -3.254955291748047, "rewards/margins": 9.052940368652344, "rewards/rejected": -12.307896614074707, "step": 4570 }, { "epoch": 2.3966509680795394, "grad_norm": 1.475056645440992, "learning_rate": 5.9131920434547235e-08, "logits/chosen": -2.7750096321105957, "logits/rejected": -2.81473970413208, "logps/chosen": -348.4535827636719, "logps/rejected": -451.83734130859375, "loss": 0.0138, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0967555046081543, "rewards/margins": 10.137144088745117, "rewards/rejected": -13.23390007019043, "step": 4580 }, { "epoch": 2.401883830455259, "grad_norm": 0.858100929485638, "learning_rate": 5.8152247784097664e-08, "logits/chosen": -2.8970320224761963, "logits/rejected": -2.8206725120544434, "logps/chosen": -346.96429443359375, "logps/rejected": -426.76544189453125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -2.045712471008301, "rewards/margins": 11.384037971496582, "rewards/rejected": -13.429750442504883, "step": 4590 }, { "epoch": 2.4071166928309786, "grad_norm": 0.6721181698087139, "learning_rate": 5.717968930076289e-08, "logits/chosen": -2.9083378314971924, "logits/rejected": -2.8689537048339844, "logps/chosen": -253.82522583007812, "logps/rejected": -347.12774658203125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.02974534034729, "rewards/margins": 10.358114242553711, "rewards/rejected": -13.387860298156738, "step": 4600 }, { "epoch": 2.4123495552066982, "grad_norm": 1.5672411116374825, "learning_rate": 5.621428104930528e-08, "logits/chosen": -2.705878973007202, "logits/rejected": -2.662104368209839, "logps/chosen": -234.02694702148438, "logps/rejected": -356.3685302734375, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -4.257853031158447, "rewards/margins": 10.108559608459473, "rewards/rejected": -14.366412162780762, "step": 4610 }, { "epoch": 2.4175824175824174, "grad_norm": 1.9023001376563708, "learning_rate": 5.525605882933965e-08, "logits/chosen": -2.8304178714752197, "logits/rejected": -2.8306803703308105, "logps/chosen": -292.9932861328125, "logps/rejected": -374.8326721191406, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -3.5671279430389404, "rewards/margins": 9.288130760192871, "rewards/rejected": -12.855259895324707, "step": 4620 }, { "epoch": 2.422815279958137, "grad_norm": 7.360536696953532, "learning_rate": 5.4305058174005853e-08, "logits/chosen": -2.7550246715545654, "logits/rejected": -2.737776517868042, "logps/chosen": -401.20220947265625, "logps/rejected": -455.78668212890625, "loss": 0.0096, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1972873210906982, "rewards/margins": 11.285956382751465, "rewards/rejected": -13.483243942260742, "step": 4630 }, { "epoch": 2.4280481423338567, "grad_norm": 0.7163266255752935, "learning_rate": 5.33613143486511e-08, "logits/chosen": -2.83345627784729, "logits/rejected": -2.675952434539795, "logps/chosen": -337.17108154296875, "logps/rejected": -346.447021484375, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -2.1417653560638428, "rewards/margins": 9.936838150024414, "rewards/rejected": -12.078604698181152, "step": 4640 }, { "epoch": 2.4332810047095763, "grad_norm": 1.370139809492605, "learning_rate": 5.242486234952206e-08, "logits/chosen": -2.8537354469299316, "logits/rejected": -2.7590529918670654, "logps/chosen": -308.4748229980469, "logps/rejected": -357.12701416015625, "loss": 0.0071, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1934545040130615, "rewards/margins": 9.083258628845215, "rewards/rejected": -12.276715278625488, "step": 4650 }, { "epoch": 2.4385138670852955, "grad_norm": 1.5965278671876666, "learning_rate": 5.149573690246758e-08, "logits/chosen": -2.8379387855529785, "logits/rejected": -2.774552345275879, "logps/chosen": -337.2934875488281, "logps/rejected": -388.73870849609375, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -3.371943950653076, "rewards/margins": 10.062592506408691, "rewards/rejected": -13.434535026550293, "step": 4660 }, { "epoch": 2.443746729461015, "grad_norm": 2.1627742038377153, "learning_rate": 5.057397246165052e-08, "logits/chosen": -2.876816511154175, "logits/rejected": -2.7892343997955322, "logps/chosen": -381.94525146484375, "logps/rejected": -389.64556884765625, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -3.3894219398498535, "rewards/margins": 9.73163890838623, "rewards/rejected": -13.121060371398926, "step": 4670 }, { "epoch": 2.4489795918367347, "grad_norm": 3.514309496191984, "learning_rate": 4.9659603208270173e-08, "logits/chosen": -2.9651684761047363, "logits/rejected": -2.758455276489258, "logps/chosen": -368.32781982421875, "logps/rejected": -363.78680419921875, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -3.060225009918213, "rewards/margins": 8.99160099029541, "rewards/rejected": -12.051824569702148, "step": 4680 }, { "epoch": 2.4542124542124544, "grad_norm": 6.234029288363789, "learning_rate": 4.875266304929496e-08, "logits/chosen": -2.667440891265869, "logits/rejected": -2.586848258972168, "logps/chosen": -251.42919921875, "logps/rejected": -316.4362487792969, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -3.415973663330078, "rewards/margins": 8.94587516784668, "rewards/rejected": -12.361847877502441, "step": 4690 }, { "epoch": 2.4594453165881736, "grad_norm": 7.036762703948189, "learning_rate": 4.785318561620511e-08, "logits/chosen": -2.738654851913452, "logits/rejected": -2.735853672027588, "logps/chosen": -264.09539794921875, "logps/rejected": -371.7897033691406, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -4.563920497894287, "rewards/margins": 8.509939193725586, "rewards/rejected": -13.073860168457031, "step": 4700 }, { "epoch": 2.464678178963893, "grad_norm": 2.416057343800875, "learning_rate": 4.696120426374503e-08, "logits/chosen": -2.6708297729492188, "logits/rejected": -2.7059757709503174, "logps/chosen": -269.0534973144531, "logps/rejected": -386.5177917480469, "loss": 0.0117, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.766289472579956, "rewards/margins": 9.819225311279297, "rewards/rejected": -13.585515022277832, "step": 4710 }, { "epoch": 2.469911041339613, "grad_norm": 33.41284614590583, "learning_rate": 4.607675206868705e-08, "logits/chosen": -2.9334309101104736, "logits/rejected": -2.837580919265747, "logps/chosen": -296.67706298828125, "logps/rejected": -338.26300048828125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -3.7836005687713623, "rewards/margins": 9.105581283569336, "rewards/rejected": -12.889181137084961, "step": 4720 }, { "epoch": 2.4751439037153324, "grad_norm": 3.433404841876578, "learning_rate": 4.519986182860452e-08, "logits/chosen": -2.8307602405548096, "logits/rejected": -2.6817924976348877, "logps/chosen": -305.6940612792969, "logps/rejected": -329.53314208984375, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -2.854097843170166, "rewards/margins": 8.564054489135742, "rewards/rejected": -11.41815185546875, "step": 4730 }, { "epoch": 2.4803767660910516, "grad_norm": 1.7503452735037357, "learning_rate": 4.433056606065552e-08, "logits/chosen": -2.862783908843994, "logits/rejected": -2.829057455062866, "logps/chosen": -254.3591766357422, "logps/rejected": -356.7373352050781, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -2.8471312522888184, "rewards/margins": 8.464635848999023, "rewards/rejected": -11.311765670776367, "step": 4740 }, { "epoch": 2.4856096284667712, "grad_norm": 12.864352885762367, "learning_rate": 4.3468897000377427e-08, "logits/chosen": -3.0122060775756836, "logits/rejected": -2.9135868549346924, "logps/chosen": -298.08050537109375, "logps/rejected": -346.1724853515625, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -2.4133388996124268, "rewards/margins": 9.11259937286377, "rewards/rejected": -11.5259370803833, "step": 4750 }, { "epoch": 2.490842490842491, "grad_norm": 7.294784994187585, "learning_rate": 4.2614886600491115e-08, "logits/chosen": -2.9196221828460693, "logits/rejected": -2.854626178741455, "logps/chosen": -305.66046142578125, "logps/rejected": -403.1209411621094, "loss": 0.0083, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.070751667022705, "rewards/margins": 10.068588256835938, "rewards/rejected": -14.1393404006958, "step": 4760 }, { "epoch": 2.4960753532182105, "grad_norm": 53.620721241713376, "learning_rate": 4.1768566529716415e-08, "logits/chosen": -2.822803258895874, "logits/rejected": -2.8117191791534424, "logps/chosen": -264.75482177734375, "logps/rejected": -355.4839782714844, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -4.027609825134277, "rewards/margins": 9.619600296020508, "rewards/rejected": -13.647211074829102, "step": 4770 }, { "epoch": 2.50130821559393, "grad_norm": 14.217377621738937, "learning_rate": 4.0929968171597526e-08, "logits/chosen": -2.7630136013031006, "logits/rejected": -2.716597080230713, "logps/chosen": -293.35772705078125, "logps/rejected": -309.22637939453125, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -2.939805507659912, "rewards/margins": 9.167088508605957, "rewards/rejected": -12.106892585754395, "step": 4780 }, { "epoch": 2.5065410779696493, "grad_norm": 0.9197327911529343, "learning_rate": 4.009912262333942e-08, "logits/chosen": -2.900611162185669, "logits/rejected": -2.800503730773926, "logps/chosen": -292.6119079589844, "logps/rejected": -365.04583740234375, "loss": 0.016, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.015660285949707, "rewards/margins": 9.221795082092285, "rewards/rejected": -14.237455368041992, "step": 4790 }, { "epoch": 2.511773940345369, "grad_norm": 44.18921841073164, "learning_rate": 3.927606069465442e-08, "logits/chosen": -2.783557891845703, "logits/rejected": -2.628039598464966, "logps/chosen": -330.0908508300781, "logps/rejected": -366.28497314453125, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -3.5947022438049316, "rewards/margins": 10.150771141052246, "rewards/rejected": -13.745473861694336, "step": 4800 }, { "epoch": 2.5170068027210886, "grad_norm": 3.6221598161343618, "learning_rate": 3.8460812906620037e-08, "logits/chosen": -2.90415358543396, "logits/rejected": -2.7906334400177, "logps/chosen": -319.799072265625, "logps/rejected": -397.00885009765625, "loss": 0.0094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.030406951904297, "rewards/margins": 9.888630867004395, "rewards/rejected": -12.919038772583008, "step": 4810 }, { "epoch": 2.5222396650968077, "grad_norm": 50.51999072650945, "learning_rate": 3.765340949054696e-08, "logits/chosen": -2.842414379119873, "logits/rejected": -2.71669340133667, "logps/chosen": -329.24957275390625, "logps/rejected": -342.67218017578125, "loss": 0.0146, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7498135566711426, "rewards/margins": 9.324239730834961, "rewards/rejected": -12.074052810668945, "step": 4820 }, { "epoch": 2.5274725274725274, "grad_norm": 10.675236498961617, "learning_rate": 3.685388038685811e-08, "logits/chosen": -2.866079092025757, "logits/rejected": -2.8104681968688965, "logps/chosen": -367.0174255371094, "logps/rejected": -444.5892639160156, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -3.1451611518859863, "rewards/margins": 10.330235481262207, "rewards/rejected": -13.475397109985352, "step": 4830 }, { "epoch": 2.532705389848247, "grad_norm": 11.914375795742423, "learning_rate": 3.60622552439783e-08, "logits/chosen": -2.7594618797302246, "logits/rejected": -2.6851165294647217, "logps/chosen": -292.93084716796875, "logps/rejected": -390.409423828125, "loss": 0.0202, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0230164527893066, "rewards/margins": 11.294994354248047, "rewards/rejected": -14.318010330200195, "step": 4840 }, { "epoch": 2.5379382522239666, "grad_norm": 2.3607224683580306, "learning_rate": 3.527856341723479e-08, "logits/chosen": -2.7595577239990234, "logits/rejected": -2.738102436065674, "logps/chosen": -245.0532989501953, "logps/rejected": -391.9998474121094, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -4.248190879821777, "rewards/margins": 10.610750198364258, "rewards/rejected": -14.858942031860352, "step": 4850 }, { "epoch": 2.5431711145996863, "grad_norm": 6.102369874731456, "learning_rate": 3.4502833967768816e-08, "logits/chosen": -2.8204293251037598, "logits/rejected": -2.8057093620300293, "logps/chosen": -344.6221618652344, "logps/rejected": -381.9562072753906, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -3.7023468017578125, "rewards/margins": 9.226800918579102, "rewards/rejected": -12.92914867401123, "step": 4860 }, { "epoch": 2.5484039769754054, "grad_norm": 0.9102087379738567, "learning_rate": 3.373509566145793e-08, "logits/chosen": -2.860015392303467, "logits/rejected": -2.716541290283203, "logps/chosen": -398.91314697265625, "logps/rejected": -395.56353759765625, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -3.7158119678497314, "rewards/margins": 9.005880355834961, "rewards/rejected": -12.72169303894043, "step": 4870 }, { "epoch": 2.553636839351125, "grad_norm": 3.09647593133853, "learning_rate": 3.2975376967849104e-08, "logits/chosen": -2.8626489639282227, "logits/rejected": -2.74863862991333, "logps/chosen": -281.8648986816406, "logps/rejected": -366.81195068359375, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -2.710486650466919, "rewards/margins": 9.427326202392578, "rewards/rejected": -12.137812614440918, "step": 4880 }, { "epoch": 2.5588697017268447, "grad_norm": 1.5532621213138285, "learning_rate": 3.222370605910332e-08, "logits/chosen": -2.8417959213256836, "logits/rejected": -2.777325391769409, "logps/chosen": -313.1429443359375, "logps/rejected": -368.9937744140625, "loss": 0.0108, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5751020908355713, "rewards/margins": 10.313004493713379, "rewards/rejected": -12.888105392456055, "step": 4890 }, { "epoch": 2.564102564102564, "grad_norm": 6.631595257208379, "learning_rate": 3.1480110808950746e-08, "logits/chosen": -2.6974239349365234, "logits/rejected": -2.763197183609009, "logps/chosen": -272.7969055175781, "logps/rejected": -399.4237976074219, "loss": 0.0091, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.687346935272217, "rewards/margins": 8.591886520385742, "rewards/rejected": -12.279233932495117, "step": 4900 }, { "epoch": 2.5693354264782835, "grad_norm": 0.44809711978467054, "learning_rate": 3.07446187916568e-08, "logits/chosen": -2.873383045196533, "logits/rejected": -2.8284332752227783, "logps/chosen": -308.70599365234375, "logps/rejected": -391.76348876953125, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -3.7383251190185547, "rewards/margins": 9.195091247558594, "rewards/rejected": -12.933416366577148, "step": 4910 }, { "epoch": 2.574568288854003, "grad_norm": 2.2369639934197303, "learning_rate": 3.001725728100021e-08, "logits/chosen": -2.8800582885742188, "logits/rejected": -2.7648346424102783, "logps/chosen": -333.2757263183594, "logps/rejected": -348.5023498535156, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -3.9780056476593018, "rewards/margins": 9.151884078979492, "rewards/rejected": -13.129888534545898, "step": 4920 }, { "epoch": 2.5798011512297228, "grad_norm": 5.657886258104803, "learning_rate": 2.9298053249261238e-08, "logits/chosen": -2.8016602993011475, "logits/rejected": -2.841737985610962, "logps/chosen": -227.00192260742188, "logps/rejected": -323.0851745605469, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -3.712557315826416, "rewards/margins": 8.412759780883789, "rewards/rejected": -12.12531852722168, "step": 4930 }, { "epoch": 2.5850340136054424, "grad_norm": 11.59473401355241, "learning_rate": 2.8587033366221534e-08, "logits/chosen": -2.787672281265259, "logits/rejected": -2.780534505844116, "logps/chosen": -261.76129150390625, "logps/rejected": -363.7298889160156, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": -3.805992603302002, "rewards/margins": 9.105320930480957, "rewards/rejected": -12.911314010620117, "step": 4940 }, { "epoch": 2.5902668759811616, "grad_norm": 2.6942207165997663, "learning_rate": 2.7884223998175248e-08, "logits/chosen": -2.873673677444458, "logits/rejected": -2.815357208251953, "logps/chosen": -275.468505859375, "logps/rejected": -385.5788269042969, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -3.989290714263916, "rewards/margins": 9.044259071350098, "rewards/rejected": -13.033549308776855, "step": 4950 }, { "epoch": 2.595499738356881, "grad_norm": 1.122492586197552, "learning_rate": 2.718965120695141e-08, "logits/chosen": -2.890367269515991, "logits/rejected": -2.919936180114746, "logps/chosen": -308.1990051269531, "logps/rejected": -405.6199645996094, "loss": 0.0111, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9192490577697754, "rewards/margins": 9.57872200012207, "rewards/rejected": -12.497970581054688, "step": 4960 }, { "epoch": 2.600732600732601, "grad_norm": 4.291511768455033, "learning_rate": 2.6503340748947083e-08, "logits/chosen": -2.8792991638183594, "logits/rejected": -2.909409999847412, "logps/chosen": -314.22723388671875, "logps/rejected": -470.26043701171875, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -3.1178078651428223, "rewards/margins": 10.709100723266602, "rewards/rejected": -13.82690715789795, "step": 4970 }, { "epoch": 2.60596546310832, "grad_norm": 23.478269906648457, "learning_rate": 2.5825318074172763e-08, "logits/chosen": -2.9769644737243652, "logits/rejected": -2.848550796508789, "logps/chosen": -309.47967529296875, "logps/rejected": -383.3211364746094, "loss": 0.0099, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4848246574401855, "rewards/margins": 9.416452407836914, "rewards/rejected": -12.901275634765625, "step": 4980 }, { "epoch": 2.6111983254840396, "grad_norm": 3.1552205893368797, "learning_rate": 2.5155608325308358e-08, "logits/chosen": -2.9016048908233643, "logits/rejected": -2.762575626373291, "logps/chosen": -342.06072998046875, "logps/rejected": -409.46014404296875, "loss": 0.0093, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.788952350616455, "rewards/margins": 10.575352668762207, "rewards/rejected": -13.36430549621582, "step": 4990 }, { "epoch": 2.6164311878597593, "grad_norm": 0.7599255992845267, "learning_rate": 2.4494236336770695e-08, "logits/chosen": -2.910125255584717, "logits/rejected": -2.8650388717651367, "logps/chosen": -298.43170166015625, "logps/rejected": -396.6000671386719, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -3.7643840312957764, "rewards/margins": 9.222797393798828, "rewards/rejected": -12.9871826171875, "step": 5000 }, { "epoch": 2.621664050235479, "grad_norm": 12.970910490123991, "learning_rate": 2.3841226633792983e-08, "logits/chosen": -2.8564035892486572, "logits/rejected": -2.729964017868042, "logps/chosen": -343.878173828125, "logps/rejected": -371.1933288574219, "loss": 0.0046, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.737527370452881, "rewards/margins": 8.40316390991211, "rewards/rejected": -12.140690803527832, "step": 5010 }, { "epoch": 2.6268969126111985, "grad_norm": 25.916441494763838, "learning_rate": 2.319660343151511e-08, "logits/chosen": -2.8791275024414062, "logits/rejected": -2.8084702491760254, "logps/chosen": -287.1297302246094, "logps/rejected": -326.0321044921875, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -2.9665133953094482, "rewards/margins": 9.391793251037598, "rewards/rejected": -12.358306884765625, "step": 5020 }, { "epoch": 2.6321297749869177, "grad_norm": 5.055659317114852, "learning_rate": 2.2560390634085715e-08, "logits/chosen": -2.656431198120117, "logits/rejected": -2.679764986038208, "logps/chosen": -267.49029541015625, "logps/rejected": -433.2579040527344, "loss": 0.0144, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.033551216125488, "rewards/margins": 10.259428977966309, "rewards/rejected": -14.292981147766113, "step": 5030 }, { "epoch": 2.6373626373626373, "grad_norm": 2.488784476333342, "learning_rate": 2.1932611833775843e-08, "logits/chosen": -2.8333637714385986, "logits/rejected": -2.727128505706787, "logps/chosen": -273.6491394042969, "logps/rejected": -364.9803161621094, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -3.2481048107147217, "rewards/margins": 11.017594337463379, "rewards/rejected": -14.265698432922363, "step": 5040 }, { "epoch": 2.642595499738357, "grad_norm": 7.702936037771005, "learning_rate": 2.1313290310103897e-08, "logits/chosen": -2.8388736248016357, "logits/rejected": -2.753382921218872, "logps/chosen": -254.21572875976562, "logps/rejected": -362.6852111816406, "loss": 0.0072, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4444668292999268, "rewards/margins": 9.6504545211792, "rewards/rejected": -13.094922065734863, "step": 5050 }, { "epoch": 2.647828362114076, "grad_norm": 6.106879107523412, "learning_rate": 2.0702449028972696e-08, "logits/chosen": -2.7956011295318604, "logits/rejected": -2.8213047981262207, "logps/chosen": -295.2662048339844, "logps/rejected": -396.3173828125, "loss": 0.0112, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.826176881790161, "rewards/margins": 9.507055282592773, "rewards/rejected": -12.333231925964355, "step": 5060 }, { "epoch": 2.6530612244897958, "grad_norm": 1.332839683983502, "learning_rate": 2.0100110641817547e-08, "logits/chosen": -2.8495430946350098, "logits/rejected": -2.7025628089904785, "logps/chosen": -316.6834716796875, "logps/rejected": -378.25616455078125, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -3.185817241668701, "rewards/margins": 11.556318283081055, "rewards/rejected": -14.742134094238281, "step": 5070 }, { "epoch": 2.6582940868655154, "grad_norm": 6.863802797588559, "learning_rate": 1.9506297484766427e-08, "logits/chosen": -2.9166717529296875, "logits/rejected": -2.8051648139953613, "logps/chosen": -402.67236328125, "logps/rejected": -343.987060546875, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -2.770043134689331, "rewards/margins": 9.530643463134766, "rewards/rejected": -12.300686836242676, "step": 5080 }, { "epoch": 2.663526949241235, "grad_norm": 32.8651043281032, "learning_rate": 1.8921031577811692e-08, "logits/chosen": -2.6460587978363037, "logits/rejected": -2.592620849609375, "logps/chosen": -284.3311462402344, "logps/rejected": -376.4635925292969, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -3.755556583404541, "rewards/margins": 10.209291458129883, "rewards/rejected": -13.964848518371582, "step": 5090 }, { "epoch": 2.6687598116169546, "grad_norm": 1.02540734861296, "learning_rate": 1.834433462399351e-08, "logits/chosen": -2.872791051864624, "logits/rejected": -2.7638444900512695, "logps/chosen": -310.2535705566406, "logps/rejected": -379.98565673828125, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -3.7113349437713623, "rewards/margins": 8.955098152160645, "rewards/rejected": -12.666433334350586, "step": 5100 }, { "epoch": 2.6739926739926743, "grad_norm": 0.8906061015652305, "learning_rate": 1.7776228008594962e-08, "logits/chosen": -2.8720507621765137, "logits/rejected": -2.8419528007507324, "logps/chosen": -294.3254699707031, "logps/rejected": -435.17742919921875, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -3.210043430328369, "rewards/margins": 10.167320251464844, "rewards/rejected": -13.377363204956055, "step": 5110 }, { "epoch": 2.6792255363683934, "grad_norm": 3.743999615947606, "learning_rate": 1.721673279834926e-08, "logits/chosen": -2.80963397026062, "logits/rejected": -2.715552806854248, "logps/chosen": -296.8659362792969, "logps/rejected": -358.91448974609375, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -4.570162773132324, "rewards/margins": 9.030353546142578, "rewards/rejected": -13.600517272949219, "step": 5120 }, { "epoch": 2.684458398744113, "grad_norm": 0.5720399353702275, "learning_rate": 1.666586974065831e-08, "logits/chosen": -2.843332290649414, "logits/rejected": -2.835294246673584, "logps/chosen": -325.23748779296875, "logps/rejected": -448.0404357910156, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -3.327688694000244, "rewards/margins": 10.649667739868164, "rewards/rejected": -13.977354049682617, "step": 5130 }, { "epoch": 2.6896912611198327, "grad_norm": 0.501633619042611, "learning_rate": 1.6123659262823497e-08, "logits/chosen": -2.817737579345703, "logits/rejected": -2.7480359077453613, "logps/chosen": -312.1848449707031, "logps/rejected": -341.4622802734375, "loss": 0.0095, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3892722129821777, "rewards/margins": 9.231483459472656, "rewards/rejected": -12.620755195617676, "step": 5140 }, { "epoch": 2.694924123495552, "grad_norm": 2.1213698580107696, "learning_rate": 1.5590121471288104e-08, "logits/chosen": -2.747671604156494, "logits/rejected": -2.7753236293792725, "logps/chosen": -231.80209350585938, "logps/rejected": -355.83154296875, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.0472300052642822, "rewards/margins": 11.057841300964355, "rewards/rejected": -14.105072021484375, "step": 5150 }, { "epoch": 2.7001569858712715, "grad_norm": 1.4220726140543474, "learning_rate": 1.5065276150891787e-08, "logits/chosen": -2.7651257514953613, "logits/rejected": -2.730579137802124, "logps/chosen": -272.3035583496094, "logps/rejected": -380.696044921875, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -4.214627265930176, "rewards/margins": 9.828374862670898, "rewards/rejected": -14.043004035949707, "step": 5160 }, { "epoch": 2.705389848246991, "grad_norm": 1.0516778304415866, "learning_rate": 1.4549142764136768e-08, "logits/chosen": -2.7688686847686768, "logits/rejected": -2.6597981452941895, "logps/chosen": -282.28900146484375, "logps/rejected": -381.85528564453125, "loss": 0.0144, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.350693225860596, "rewards/margins": 9.804131507873535, "rewards/rejected": -14.154825210571289, "step": 5170 }, { "epoch": 2.7106227106227108, "grad_norm": 2.0624676285254098, "learning_rate": 1.4041740450466383e-08, "logits/chosen": -2.7915375232696533, "logits/rejected": -2.7800753116607666, "logps/chosen": -288.1428527832031, "logps/rejected": -393.03900146484375, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -3.4694926738739014, "rewards/margins": 10.24232292175293, "rewards/rejected": -13.711816787719727, "step": 5180 }, { "epoch": 2.7158555729984304, "grad_norm": 1.207146086609638, "learning_rate": 1.3543088025555094e-08, "logits/chosen": -2.759786605834961, "logits/rejected": -2.743520975112915, "logps/chosen": -288.2125244140625, "logps/rejected": -330.58380126953125, "loss": 0.01, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4429144859313965, "rewards/margins": 8.501128196716309, "rewards/rejected": -11.944043159484863, "step": 5190 }, { "epoch": 2.7210884353741496, "grad_norm": 0.67696500737907, "learning_rate": 1.3053203980610744e-08, "logits/chosen": -2.731365919113159, "logits/rejected": -2.7339062690734863, "logps/chosen": -338.1434326171875, "logps/rejected": -423.2023010253906, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.036250114440918, "rewards/margins": 10.574939727783203, "rewards/rejected": -13.611188888549805, "step": 5200 }, { "epoch": 2.726321297749869, "grad_norm": 2.50280286723359, "learning_rate": 1.2572106481689243e-08, "logits/chosen": -2.7959039211273193, "logits/rejected": -2.689049482345581, "logps/chosen": -268.6997985839844, "logps/rejected": -334.78045654296875, "loss": 0.013, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.286745548248291, "rewards/margins": 8.401272773742676, "rewards/rejected": -12.688016891479492, "step": 5210 }, { "epoch": 2.731554160125589, "grad_norm": 0.863838650118753, "learning_rate": 1.2099813369020467e-08, "logits/chosen": -2.882978677749634, "logits/rejected": -2.826648473739624, "logps/chosen": -316.676025390625, "logps/rejected": -418.959716796875, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -3.397019863128662, "rewards/margins": 9.40782642364502, "rewards/rejected": -12.804845809936523, "step": 5220 }, { "epoch": 2.736787022501308, "grad_norm": 0.6815554548342774, "learning_rate": 1.1636342156346846e-08, "logits/chosen": -2.863192081451416, "logits/rejected": -2.722346544265747, "logps/chosen": -290.1851501464844, "logps/rejected": -364.11370849609375, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -4.035996437072754, "rewards/margins": 8.95928955078125, "rewards/rejected": -12.995285034179688, "step": 5230 }, { "epoch": 2.7420198848770276, "grad_norm": 0.5552531172868459, "learning_rate": 1.1181710030274043e-08, "logits/chosen": -2.6296334266662598, "logits/rejected": -2.5234222412109375, "logps/chosen": -241.07223510742188, "logps/rejected": -330.5494384765625, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -3.236060380935669, "rewards/margins": 10.941947937011719, "rewards/rejected": -14.178007125854492, "step": 5240 }, { "epoch": 2.7472527472527473, "grad_norm": 2.617159971201451, "learning_rate": 1.0735933849633561e-08, "logits/chosen": -2.859862804412842, "logits/rejected": -2.741220474243164, "logps/chosen": -334.3736267089844, "logps/rejected": -349.4825439453125, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -3.3494479656219482, "rewards/margins": 9.855086326599121, "rewards/rejected": -13.204533576965332, "step": 5250 }, { "epoch": 2.752485609628467, "grad_norm": 5.263139658176844, "learning_rate": 1.0299030144857445e-08, "logits/chosen": -2.7687082290649414, "logits/rejected": -2.8085827827453613, "logps/chosen": -257.25408935546875, "logps/rejected": -372.70440673828125, "loss": 0.0103, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.268962860107422, "rewards/margins": 10.460187911987305, "rewards/rejected": -14.729150772094727, "step": 5260 }, { "epoch": 2.7577184720041865, "grad_norm": 0.6524117487683248, "learning_rate": 9.871015117365516e-09, "logits/chosen": -2.8147265911102295, "logits/rejected": -2.780487537384033, "logps/chosen": -252.65658569335938, "logps/rejected": -331.97412109375, "loss": 0.0089, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.7238247394561768, "rewards/margins": 9.096030235290527, "rewards/rejected": -12.819854736328125, "step": 5270 }, { "epoch": 2.7629513343799057, "grad_norm": 3.4162174338191442, "learning_rate": 9.451904638964447e-09, "logits/chosen": -2.8511269092559814, "logits/rejected": -2.7249796390533447, "logps/chosen": -339.9266662597656, "logps/rejected": -360.003662109375, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -3.539249897003174, "rewards/margins": 9.068693161010742, "rewards/rejected": -12.607943534851074, "step": 5280 }, { "epoch": 2.7681841967556253, "grad_norm": 2.4343934919292507, "learning_rate": 9.041714251259214e-09, "logits/chosen": -2.7060048580169678, "logits/rejected": -2.5530850887298584, "logps/chosen": -307.7421875, "logps/rejected": -380.89599609375, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -3.511340379714966, "rewards/margins": 10.741117477416992, "rewards/rejected": -14.252458572387695, "step": 5290 }, { "epoch": 2.773417059131345, "grad_norm": 9.978830763246204, "learning_rate": 8.640459165076857e-09, "logits/chosen": -2.7529006004333496, "logits/rejected": -2.8561973571777344, "logps/chosen": -244.1162109375, "logps/rejected": -389.89593505859375, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -3.882157802581787, "rewards/margins": 10.100847244262695, "rewards/rejected": -13.983004570007324, "step": 5300 }, { "epoch": 2.778649921507064, "grad_norm": 7.492917817240637, "learning_rate": 8.248154259902246e-09, "logits/chosen": -2.8315601348876953, "logits/rejected": -2.649303913116455, "logps/chosen": -313.1072692871094, "logps/rejected": -328.4375, "loss": 0.0109, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.388908386230469, "rewards/margins": 8.733987808227539, "rewards/rejected": -13.122896194458008, "step": 5310 }, { "epoch": 2.7838827838827838, "grad_norm": 0.9139975721239019, "learning_rate": 7.86481408332651e-09, "logits/chosen": -2.8557746410369873, "logits/rejected": -2.7217860221862793, "logps/chosen": -257.1571044921875, "logps/rejected": -346.288330078125, "loss": 0.0093, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.985661029815674, "rewards/margins": 8.736552238464355, "rewards/rejected": -12.722212791442871, "step": 5320 }, { "epoch": 2.7891156462585034, "grad_norm": 1.7906375898687616, "learning_rate": 7.490452850507506e-09, "logits/chosen": -2.8483726978302, "logits/rejected": -2.781266689300537, "logps/chosen": -293.47613525390625, "logps/rejected": -337.77850341796875, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -3.844744920730591, "rewards/margins": 8.693450927734375, "rewards/rejected": -12.538195610046387, "step": 5330 }, { "epoch": 2.794348508634223, "grad_norm": 1.519457845709969, "learning_rate": 7.1250844436426535e-09, "logits/chosen": -2.721013307571411, "logits/rejected": -2.6416993141174316, "logps/chosen": -261.25341796875, "logps/rejected": -359.62005615234375, "loss": 0.0089, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.205333232879639, "rewards/margins": 10.659531593322754, "rewards/rejected": -14.864863395690918, "step": 5340 }, { "epoch": 2.7995813710099426, "grad_norm": 1.840927408855351, "learning_rate": 6.768722411454153e-09, "logits/chosen": -2.7464046478271484, "logits/rejected": -2.7040112018585205, "logps/chosen": -279.9874572753906, "logps/rejected": -358.63482666015625, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -4.264468669891357, "rewards/margins": 9.235993385314941, "rewards/rejected": -13.500460624694824, "step": 5350 }, { "epoch": 2.804814233385662, "grad_norm": 9.46525377600493, "learning_rate": 6.421379968686663e-09, "logits/chosen": -2.9282453060150146, "logits/rejected": -2.78389048576355, "logps/chosen": -397.5576477050781, "logps/rejected": -406.98651123046875, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -3.326833724975586, "rewards/margins": 8.562904357910156, "rewards/rejected": -11.889738082885742, "step": 5360 }, { "epoch": 2.8100470957613815, "grad_norm": 2.0780866700148115, "learning_rate": 6.083069995617113e-09, "logits/chosen": -2.7446446418762207, "logits/rejected": -2.5998244285583496, "logps/chosen": -289.5511169433594, "logps/rejected": -369.8989562988281, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -4.226802825927734, "rewards/margins": 10.15228271484375, "rewards/rejected": -14.379084587097168, "step": 5370 }, { "epoch": 2.815279958137101, "grad_norm": 36.534531216151066, "learning_rate": 5.753805037577192e-09, "logits/chosen": -2.653334856033325, "logits/rejected": -2.7123851776123047, "logps/chosen": -281.099365234375, "logps/rejected": -354.68426513671875, "loss": 0.0108, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.273808240890503, "rewards/margins": 9.460447311401367, "rewards/rejected": -12.73425579071045, "step": 5380 }, { "epoch": 2.8205128205128203, "grad_norm": 4.303928149236192, "learning_rate": 5.433597304488113e-09, "logits/chosen": -2.810227632522583, "logits/rejected": -2.6698215007781982, "logps/chosen": -320.6605529785156, "logps/rejected": -434.53961181640625, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -3.0288872718811035, "rewards/margins": 10.409846305847168, "rewards/rejected": -13.43873405456543, "step": 5390 }, { "epoch": 2.82574568288854, "grad_norm": 2.1090518582708224, "learning_rate": 5.122458670407836e-09, "logits/chosen": -2.886220693588257, "logits/rejected": -2.7037885189056396, "logps/chosen": -271.8846435546875, "logps/rejected": -290.8267822265625, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -2.9898478984832764, "rewards/margins": 8.048036575317383, "rewards/rejected": -11.037884712219238, "step": 5400 }, { "epoch": 2.8309785452642595, "grad_norm": 1.1107113980565553, "learning_rate": 4.820400673090669e-09, "logits/chosen": -2.7903153896331787, "logits/rejected": -2.851806879043579, "logps/chosen": -344.9774475097656, "logps/rejected": -442.70245361328125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -3.5053601264953613, "rewards/margins": 10.806097030639648, "rewards/rejected": -14.311456680297852, "step": 5410 }, { "epoch": 2.836211407639979, "grad_norm": 4.168337198150935, "learning_rate": 4.5274345135595525e-09, "logits/chosen": -2.8443143367767334, "logits/rejected": -2.8058857917785645, "logps/chosen": -370.59796142578125, "logps/rejected": -433.8174743652344, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -2.6774582862854004, "rewards/margins": 11.402239799499512, "rewards/rejected": -14.079699516296387, "step": 5420 }, { "epoch": 2.8414442700156988, "grad_norm": 2.0459245628224654, "learning_rate": 4.243571055690648e-09, "logits/chosen": -2.93727445602417, "logits/rejected": -2.899395704269409, "logps/chosen": -369.89190673828125, "logps/rejected": -441.6454162597656, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.533994197845459, "rewards/margins": 9.58867073059082, "rewards/rejected": -13.122665405273438, "step": 5430 }, { "epoch": 2.846677132391418, "grad_norm": 10.080175200184115, "learning_rate": 3.968820825810431e-09, "logits/chosen": -2.6326746940612793, "logits/rejected": -2.534114122390747, "logps/chosen": -275.90081787109375, "logps/rejected": -332.41644287109375, "loss": 0.0121, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.2677104473114014, "rewards/margins": 9.558293342590332, "rewards/rejected": -12.826004028320312, "step": 5440 }, { "epoch": 2.8519099947671376, "grad_norm": 9.182759164729383, "learning_rate": 3.7031940123053997e-09, "logits/chosen": -2.719938039779663, "logits/rejected": -2.671802282333374, "logps/chosen": -260.7502136230469, "logps/rejected": -372.0321350097656, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -3.513227939605713, "rewards/margins": 9.95615005493164, "rewards/rejected": -13.469378471374512, "step": 5450 }, { "epoch": 2.857142857142857, "grad_norm": 1.8244214050116199, "learning_rate": 3.4467004652442842e-09, "logits/chosen": -2.6700356006622314, "logits/rejected": -2.6170523166656494, "logps/chosen": -240.4885711669922, "logps/rejected": -334.0597839355469, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -3.383838653564453, "rewards/margins": 9.104108810424805, "rewards/rejected": -12.487947463989258, "step": 5460 }, { "epoch": 2.8623757195185764, "grad_norm": 27.197318857646856, "learning_rate": 3.1993496960127653e-09, "logits/chosen": -2.7899322509765625, "logits/rejected": -2.7347030639648438, "logps/chosen": -255.9981231689453, "logps/rejected": -327.0517883300781, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -3.804147243499756, "rewards/margins": 9.399972915649414, "rewards/rejected": -13.204119682312012, "step": 5470 }, { "epoch": 2.867608581894296, "grad_norm": 6.435912142248445, "learning_rate": 2.9611508769606663e-09, "logits/chosen": -2.8758127689361572, "logits/rejected": -2.898636817932129, "logps/chosen": -328.01678466796875, "logps/rejected": -394.6129150390625, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -4.162623405456543, "rewards/margins": 9.01399040222168, "rewards/rejected": -13.176612854003906, "step": 5480 }, { "epoch": 2.8728414442700156, "grad_norm": 2.6180736266408946, "learning_rate": 2.7321128410620344e-09, "logits/chosen": -2.7072675228118896, "logits/rejected": -2.526613712310791, "logps/chosen": -256.47113037109375, "logps/rejected": -310.18328857421875, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -3.5599682331085205, "rewards/margins": 8.551800727844238, "rewards/rejected": -12.111769676208496, "step": 5490 }, { "epoch": 2.8780743066457353, "grad_norm": 5.493094873462439, "learning_rate": 2.5122440815873724e-09, "logits/chosen": -2.8046538829803467, "logits/rejected": -2.6390182971954346, "logps/chosen": -350.84197998046875, "logps/rejected": -341.56817626953125, "loss": 0.0083, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8398940563201904, "rewards/margins": 9.738390922546387, "rewards/rejected": -12.578286170959473, "step": 5500 }, { "epoch": 2.883307169021455, "grad_norm": 2.875722105541665, "learning_rate": 2.301552751788838e-09, "logits/chosen": -2.6933045387268066, "logits/rejected": -2.7587409019470215, "logps/chosen": -289.3899841308594, "logps/rejected": -430.6471252441406, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -2.9077162742614746, "rewards/margins": 10.210702896118164, "rewards/rejected": -13.11841869354248, "step": 5510 }, { "epoch": 2.8885400313971745, "grad_norm": 11.110783877041788, "learning_rate": 2.1000466645978433e-09, "logits/chosen": -2.912329912185669, "logits/rejected": -2.856783151626587, "logps/chosen": -241.56387329101562, "logps/rejected": -322.1996765136719, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -3.3059451580047607, "rewards/margins": 9.324197769165039, "rewards/rejected": -12.630143165588379, "step": 5520 }, { "epoch": 2.8937728937728937, "grad_norm": 9.186921951104162, "learning_rate": 1.9077332923353728e-09, "logits/chosen": -2.8002140522003174, "logits/rejected": -2.7607316970825195, "logps/chosen": -335.98663330078125, "logps/rejected": -411.86663818359375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -3.6601040363311768, "rewards/margins": 9.689733505249023, "rewards/rejected": -13.349836349487305, "step": 5530 }, { "epoch": 2.8990057561486133, "grad_norm": 5.749306971675307, "learning_rate": 1.7246197664347872e-09, "logits/chosen": -2.9372615814208984, "logits/rejected": -2.863039493560791, "logps/chosen": -323.7273864746094, "logps/rejected": -489.85394287109375, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -3.1434175968170166, "rewards/margins": 9.491623878479004, "rewards/rejected": -12.635041236877441, "step": 5540 }, { "epoch": 2.904238618524333, "grad_norm": 19.80184799967056, "learning_rate": 1.5507128771775346e-09, "logits/chosen": -2.7384345531463623, "logits/rejected": -2.6649088859558105, "logps/chosen": -296.66400146484375, "logps/rejected": -399.456298828125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -4.249185085296631, "rewards/margins": 10.072070121765137, "rewards/rejected": -14.321255683898926, "step": 5550 }, { "epoch": 2.909471480900052, "grad_norm": 3.108883640560441, "learning_rate": 1.3860190734411858e-09, "logits/chosen": -2.835890293121338, "logits/rejected": -2.7130818367004395, "logps/chosen": -332.89617919921875, "logps/rejected": -409.10870361328125, "loss": 0.0177, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.203044891357422, "rewards/margins": 8.749673843383789, "rewards/rejected": -11.952718734741211, "step": 5560 }, { "epoch": 2.9147043432757718, "grad_norm": 2.6730396047168186, "learning_rate": 1.2305444624604034e-09, "logits/chosen": -2.9026782512664795, "logits/rejected": -2.892371416091919, "logps/chosen": -323.1064758300781, "logps/rejected": -410.4134216308594, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -2.4592738151550293, "rewards/margins": 10.656106948852539, "rewards/rejected": -13.115381240844727, "step": 5570 }, { "epoch": 2.9199372056514914, "grad_norm": 2.4642807715342343, "learning_rate": 1.0842948096004835e-09, "logits/chosen": -2.7803101539611816, "logits/rejected": -2.733808994293213, "logps/chosen": -269.18170166015625, "logps/rejected": -383.1356201171875, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -3.340747833251953, "rewards/margins": 9.614646911621094, "rewards/rejected": -12.95539379119873, "step": 5580 }, { "epoch": 2.925170068027211, "grad_norm": 5.0936140554749905, "learning_rate": 9.472755381434161e-10, "logits/chosen": -2.7640509605407715, "logits/rejected": -2.594125270843506, "logps/chosen": -312.6282653808594, "logps/rejected": -319.25518798828125, "loss": 0.0122, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.360225200653076, "rewards/margins": 9.879701614379883, "rewards/rejected": -13.2399263381958, "step": 5590 }, { "epoch": 2.9304029304029307, "grad_norm": 1.8322697079723698, "learning_rate": 8.194917290869907e-10, "logits/chosen": -2.8278939723968506, "logits/rejected": -2.7503864765167236, "logps/chosen": -327.7463684082031, "logps/rejected": -389.7525329589844, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -3.055058002471924, "rewards/margins": 10.197959899902344, "rewards/rejected": -13.253018379211426, "step": 5600 }, { "epoch": 2.93563579277865, "grad_norm": 2.4795212119044434, "learning_rate": 7.009481209561685e-10, "logits/chosen": -2.8461666107177734, "logits/rejected": -2.792017698287964, "logps/chosen": -254.1486053466797, "logps/rejected": -383.47406005859375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -3.8368611335754395, "rewards/margins": 9.87394905090332, "rewards/rejected": -13.710809707641602, "step": 5610 }, { "epoch": 2.9408686551543695, "grad_norm": 0.6429875051879272, "learning_rate": 5.916491096275845e-10, "logits/chosen": -2.89123272895813, "logits/rejected": -2.854510545730591, "logps/chosen": -313.09100341796875, "logps/rejected": -416.30621337890625, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -3.4978785514831543, "rewards/margins": 11.085000038146973, "rewards/rejected": -14.582880020141602, "step": 5620 }, { "epoch": 2.946101517530089, "grad_norm": 8.205104208036113, "learning_rate": 4.915987481662887e-10, "logits/chosen": -2.6969799995422363, "logits/rejected": -2.650015115737915, "logps/chosen": -255.4471893310547, "logps/rejected": -349.42706298828125, "loss": 0.0081, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.9879844188690186, "rewards/margins": 9.7333345413208, "rewards/rejected": -13.721318244934082, "step": 5630 }, { "epoch": 2.9513343799058083, "grad_norm": 0.47360528335858865, "learning_rate": 4.0080074667570017e-10, "logits/chosen": -2.831747055053711, "logits/rejected": -2.7555124759674072, "logps/chosen": -272.45703125, "logps/rejected": -420.6329040527344, "loss": 0.0121, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.155839920043945, "rewards/margins": 9.454890251159668, "rewards/rejected": -13.61072826385498, "step": 5640 }, { "epoch": 2.956567242281528, "grad_norm": 0.11320673607728095, "learning_rate": 3.1925847215980017e-10, "logits/chosen": -2.8791775703430176, "logits/rejected": -2.773256540298462, "logps/chosen": -289.65753173828125, "logps/rejected": -384.1953125, "loss": 0.0162, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.059587001800537, "rewards/margins": 9.536903381347656, "rewards/rejected": -12.596489906311035, "step": 5650 }, { "epoch": 2.9618001046572475, "grad_norm": 11.540740013104552, "learning_rate": 2.469749483985095e-10, "logits/chosen": -2.7931549549102783, "logits/rejected": -2.694794178009033, "logps/chosen": -287.82244873046875, "logps/rejected": -382.6214294433594, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -3.5354487895965576, "rewards/margins": 9.668214797973633, "rewards/rejected": -13.20366382598877, "step": 5660 }, { "epoch": 2.967032967032967, "grad_norm": 2.0125397015508257, "learning_rate": 1.8395285583530652e-10, "logits/chosen": -2.7975234985351562, "logits/rejected": -2.707702159881592, "logps/chosen": -314.2371826171875, "logps/rejected": -361.87982177734375, "loss": 0.0127, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5953078269958496, "rewards/margins": 9.611360549926758, "rewards/rejected": -13.206669807434082, "step": 5670 }, { "epoch": 2.9722658294086868, "grad_norm": 0.6707876259231261, "learning_rate": 1.3019453147805614e-10, "logits/chosen": -2.833800792694092, "logits/rejected": -2.7089126110076904, "logps/chosen": -307.8482666015625, "logps/rejected": -402.6416931152344, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -4.7450151443481445, "rewards/margins": 10.314329147338867, "rewards/rejected": -15.059346199035645, "step": 5680 }, { "epoch": 2.977498691784406, "grad_norm": 1.5080333761549847, "learning_rate": 8.570196881216297e-11, "logits/chosen": -2.598090410232544, "logits/rejected": -2.626593828201294, "logps/chosen": -243.28610229492188, "logps/rejected": -374.43829345703125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -2.6521449089050293, "rewards/margins": 10.693000793457031, "rewards/rejected": -13.345146179199219, "step": 5690 }, { "epoch": 2.9827315541601256, "grad_norm": 1.0096462074317578, "learning_rate": 5.0476817726852194e-11, "logits/chosen": -2.7691233158111572, "logits/rejected": -2.8181586265563965, "logps/chosen": -331.1947021484375, "logps/rejected": -446.33697509765625, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": -3.7296948432922363, "rewards/margins": 10.267393112182617, "rewards/rejected": -13.997087478637695, "step": 5700 }, { "epoch": 2.987964416535845, "grad_norm": 0.2285862220795119, "learning_rate": 2.4520384453746712e-11, "logits/chosen": -2.706902265548706, "logits/rejected": -2.689692258834839, "logps/chosen": -326.62713623046875, "logps/rejected": -442.3352966308594, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -4.0081987380981445, "rewards/margins": 10.280616760253906, "rewards/rejected": -14.288816452026367, "step": 5710 }, { "epoch": 2.9931972789115644, "grad_norm": 0.953746197958993, "learning_rate": 7.833631518627815e-12, "logits/chosen": -2.675525665283203, "logits/rejected": -2.6633100509643555, "logps/chosen": -286.07476806640625, "logps/rejected": -381.57354736328125, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -3.86041522026062, "rewards/margins": 9.643239974975586, "rewards/rejected": -13.503654479980469, "step": 5720 }, { "epoch": 2.998430141287284, "grad_norm": 27.93831602300026, "learning_rate": 4.1717770565830033e-13, "logits/chosen": -2.8829872608184814, "logits/rejected": -2.8055663108825684, "logps/chosen": -296.50177001953125, "logps/rejected": -327.78375244140625, "loss": 0.0149, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4994888305664062, "rewards/margins": 8.469379425048828, "rewards/rejected": -11.968867301940918, "step": 5730 }, { "epoch": 3.0, "step": 5733, "total_flos": 0.0, "train_loss": 0.22024025758457094, "train_runtime": 20844.5321, "train_samples_per_second": 8.799, "train_steps_per_second": 0.275 } ], "logging_steps": 10, "max_steps": 5733, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }