diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7970 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998527896363903, + "eval_steps": 100, + "global_step": 5094, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 9.803921568627451e-09, + "logits/chosen": -2.973508358001709, + "logits/rejected": -3.0340657234191895, + "logps/chosen": -228.45870971679688, + "logps/rejected": -221.87188720703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 9.803921568627452e-08, + "logits/chosen": -2.862595558166504, + "logits/rejected": -2.9409985542297363, + "logps/chosen": -276.31146240234375, + "logps/rejected": -225.0379180908203, + "loss": 0.6933, + "rewards/accuracies": 0.5555555820465088, + "rewards/chosen": 0.00011433070176281035, + "rewards/margins": 0.0006582156638614833, + "rewards/rejected": -0.0005438849912025034, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.9607843137254904e-07, + "logits/chosen": -2.9764370918273926, + "logits/rejected": -2.8498854637145996, + "logps/chosen": -256.53411865234375, + "logps/rejected": -240.7257080078125, + "loss": 0.6926, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0033233885187655687, + "rewards/margins": 0.001273788744583726, + "rewards/rejected": 0.0020496000070124865, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 2.9411764705882356e-07, + "logits/chosen": -3.0605838298797607, + "logits/rejected": -3.031383991241455, + "logps/chosen": -228.6864013671875, + "logps/rejected": -264.53155517578125, + "loss": 0.6921, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.004886351525783539, + "rewards/margins": 0.0012393262004479766, + "rewards/rejected": 0.0036470252089202404, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 3.921568627450981e-07, + "logits/chosen": -2.8909895420074463, + "logits/rejected": -2.947922468185425, + "logps/chosen": -255.6454315185547, + "logps/rejected": -217.5404815673828, + "loss": 0.6919, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.014554759487509727, + "rewards/margins": 0.0036192506086081266, + "rewards/rejected": 0.010935508646070957, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -2.9534642696380615, + "logits/rejected": -2.8623435497283936, + "logps/chosen": -310.3286437988281, + "logps/rejected": -262.513427734375, + "loss": 0.6909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.023258253931999207, + "rewards/margins": 0.004231014288961887, + "rewards/rejected": 0.019027236849069595, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 5.882352941176471e-07, + "logits/chosen": -2.904386043548584, + "logits/rejected": -2.8669381141662598, + "logps/chosen": -252.37451171875, + "logps/rejected": -206.24642944335938, + "loss": 0.6893, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02650384232401848, + "rewards/margins": 0.010579499416053295, + "rewards/rejected": 0.015924345701932907, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 6.862745098039217e-07, + "logits/chosen": -2.994832754135132, + "logits/rejected": -2.9528610706329346, + "logps/chosen": -273.45733642578125, + "logps/rejected": -279.2757873535156, + "loss": 0.6917, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.028729554265737534, + "rewards/margins": 0.003322303295135498, + "rewards/rejected": 0.025407250970602036, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 7.843137254901962e-07, + "logits/chosen": -2.755485773086548, + "logits/rejected": -2.800650119781494, + "logps/chosen": -242.27700805664062, + "logps/rejected": -236.52713012695312, + "loss": 0.6917, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.027540069073438644, + "rewards/margins": 0.0055449483916163445, + "rewards/rejected": 0.021995123475790024, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 8.823529411764707e-07, + "logits/chosen": -2.7677536010742188, + "logits/rejected": -2.863986015319824, + "logps/chosen": -220.959716796875, + "logps/rejected": -232.0678253173828, + "loss": 0.6881, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.030912240967154503, + "rewards/margins": 0.008048823103308678, + "rewards/rejected": 0.022863419726490974, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -2.8327622413635254, + "logits/rejected": -2.894792318344116, + "logps/chosen": -234.09976196289062, + "logps/rejected": -229.82955932617188, + "loss": 0.6884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.031112805008888245, + "rewards/margins": 0.009142523631453514, + "rewards/rejected": 0.02197028137743473, + "step": 100 + }, + { + "epoch": 0.02, + "eval_logits/chosen": -2.336580991744995, + "eval_logits/rejected": -2.3475990295410156, + "eval_logps/chosen": -259.33624267578125, + "eval_logps/rejected": -230.2778778076172, + "eval_loss": 0.6868417859077454, + "eval_rewards/accuracies": 0.6145833134651184, + "eval_rewards/chosen": 0.039044398814439774, + "eval_rewards/margins": 0.010639672167599201, + "eval_rewards/rejected": 0.028404729440808296, + "eval_runtime": 477.4451, + "eval_samples_per_second": 4.189, + "eval_steps_per_second": 0.176, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 1.0784313725490197e-06, + "logits/chosen": -2.8984100818634033, + "logits/rejected": -2.901028633117676, + "logps/chosen": -224.906494140625, + "logps/rejected": -201.27830505371094, + "loss": 0.6873, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03233223035931587, + "rewards/margins": 0.009733730927109718, + "rewards/rejected": 0.022598499432206154, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 1.1764705882352942e-06, + "logits/chosen": -2.9261550903320312, + "logits/rejected": -2.8849735260009766, + "logps/chosen": -229.10400390625, + "logps/rejected": -204.9363555908203, + "loss": 0.6821, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03767221421003342, + "rewards/margins": 0.0093983830884099, + "rewards/rejected": 0.02827383577823639, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 1.2745098039215686e-06, + "logits/chosen": -2.832059383392334, + "logits/rejected": -2.76809024810791, + "logps/chosen": -241.0215301513672, + "logps/rejected": -262.1598205566406, + "loss": 0.6838, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.04861343652009964, + "rewards/margins": 0.0215291790664196, + "rewards/rejected": 0.02708425745368004, + "step": 130 + }, + { + "epoch": 0.03, + "learning_rate": 1.3725490196078434e-06, + "logits/chosen": -2.9472458362579346, + "logits/rejected": -2.9169039726257324, + "logps/chosen": -270.29833984375, + "logps/rejected": -233.29098510742188, + "loss": 0.6818, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.054968465119600296, + "rewards/margins": 0.03214170038700104, + "rewards/rejected": 0.022826772183179855, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 1.4705882352941177e-06, + "logits/chosen": -2.8875279426574707, + "logits/rejected": -2.8734564781188965, + "logps/chosen": -274.0886535644531, + "logps/rejected": -231.80947875976562, + "loss": 0.6811, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.05963331460952759, + "rewards/margins": 0.05601944774389267, + "rewards/rejected": 0.003613865002989769, + "step": 150 + }, + { + "epoch": 0.03, + "learning_rate": 1.5686274509803923e-06, + "logits/chosen": -2.930586338043213, + "logits/rejected": -2.765791654586792, + "logps/chosen": -236.7968292236328, + "logps/rejected": -213.68270874023438, + "loss": 0.674, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.041575588285923004, + "rewards/margins": 0.03660685569047928, + "rewards/rejected": 0.004968726541846991, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 1.6666666666666667e-06, + "logits/chosen": -2.907088041305542, + "logits/rejected": -2.8094048500061035, + "logps/chosen": -249.4150390625, + "logps/rejected": -232.6487274169922, + "loss": 0.6793, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.041034214198589325, + "rewards/margins": 0.0442991778254509, + "rewards/rejected": -0.003264959901571274, + "step": 170 + }, + { + "epoch": 0.04, + "learning_rate": 1.7647058823529414e-06, + "logits/chosen": -2.873412847518921, + "logits/rejected": -2.9365124702453613, + "logps/chosen": -230.6173858642578, + "logps/rejected": -237.4728546142578, + "loss": 0.6789, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.018897254019975662, + "rewards/margins": 0.024724815040826797, + "rewards/rejected": -0.00582756195217371, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 1.8627450980392158e-06, + "logits/chosen": -2.8295395374298096, + "logits/rejected": -2.749323844909668, + "logps/chosen": -257.933837890625, + "logps/rejected": -229.88223266601562, + "loss": 0.6653, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03761621564626694, + "rewards/margins": 0.06116952374577522, + "rewards/rejected": -0.023553304374217987, + "step": 190 + }, + { + "epoch": 0.04, + "learning_rate": 1.96078431372549e-06, + "logits/chosen": -2.7251996994018555, + "logits/rejected": -2.806915760040283, + "logps/chosen": -205.859619140625, + "logps/rejected": -223.43838500976562, + "loss": 0.6654, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.014988506212830544, + "rewards/margins": 0.04508345201611519, + "rewards/rejected": -0.030094945803284645, + "step": 200 + }, + { + "epoch": 0.04, + "eval_logits/chosen": -2.258488178253174, + "eval_logits/rejected": -2.263549327850342, + "eval_logps/chosen": -259.9051818847656, + "eval_logps/rejected": -235.06222534179688, + "eval_loss": 0.6657090783119202, + "eval_rewards/accuracies": 0.6398809552192688, + "eval_rewards/chosen": 0.03335539624094963, + "eval_rewards/margins": 0.052794355899095535, + "eval_rewards/rejected": -0.019438959658145905, + "eval_runtime": 472.5343, + "eval_samples_per_second": 4.232, + "eval_steps_per_second": 0.178, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 2.058823529411765e-06, + "logits/chosen": -2.9805192947387695, + "logits/rejected": -2.8744161128997803, + "logps/chosen": -288.2845764160156, + "logps/rejected": -254.07949829101562, + "loss": 0.6676, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005574061069637537, + "rewards/margins": 0.061119239777326584, + "rewards/rejected": -0.05554518848657608, + "step": 210 + }, + { + "epoch": 0.04, + "learning_rate": 2.1568627450980393e-06, + "logits/chosen": -2.911360025405884, + "logits/rejected": -2.8783349990844727, + "logps/chosen": -235.96401977539062, + "logps/rejected": -217.7150421142578, + "loss": 0.6562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04486488178372383, + "rewards/margins": 0.06540326774120331, + "rewards/rejected": -0.11026813834905624, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 2.254901960784314e-06, + "logits/chosen": -2.9542288780212402, + "logits/rejected": -2.878969192504883, + "logps/chosen": -260.88934326171875, + "logps/rejected": -296.89776611328125, + "loss": 0.6446, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05443768948316574, + "rewards/margins": 0.04301251843571663, + "rewards/rejected": -0.09745021164417267, + "step": 230 + }, + { + "epoch": 0.05, + "learning_rate": 2.3529411764705885e-06, + "logits/chosen": -2.817126512527466, + "logits/rejected": -2.8219687938690186, + "logps/chosen": -273.76239013671875, + "logps/rejected": -246.7056121826172, + "loss": 0.6503, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.019011136144399643, + "rewards/margins": 0.09237994253635406, + "rewards/rejected": -0.111391082406044, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 2.450980392156863e-06, + "logits/chosen": -2.8546347618103027, + "logits/rejected": -2.9686641693115234, + "logps/chosen": -275.5454406738281, + "logps/rejected": -434.2581481933594, + "loss": 0.6438, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03764430433511734, + "rewards/margins": 0.1496858149766922, + "rewards/rejected": -0.18733009696006775, + "step": 250 + }, + { + "epoch": 0.05, + "learning_rate": 2.549019607843137e-06, + "logits/chosen": -2.9442734718322754, + "logits/rejected": -2.8972764015197754, + "logps/chosen": -357.8099060058594, + "logps/rejected": -298.7733459472656, + "loss": 0.6497, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22853727638721466, + "rewards/margins": 0.1591685563325882, + "rewards/rejected": -0.38770583271980286, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 2.647058823529412e-06, + "logits/chosen": -2.94228196144104, + "logits/rejected": -2.8907318115234375, + "logps/chosen": -284.27130126953125, + "logps/rejected": -285.43927001953125, + "loss": 0.6602, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2485658824443817, + "rewards/margins": 0.1097467914223671, + "rewards/rejected": -0.3583126664161682, + "step": 270 + }, + { + "epoch": 0.05, + "learning_rate": 2.7450980392156867e-06, + "logits/chosen": -2.926578998565674, + "logits/rejected": -2.8990461826324463, + "logps/chosen": -319.3984680175781, + "logps/rejected": -296.30694580078125, + "loss": 0.69, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.27285149693489075, + "rewards/margins": 0.02198920026421547, + "rewards/rejected": -0.2948406934738159, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 2.843137254901961e-06, + "logits/chosen": -2.916912078857422, + "logits/rejected": -2.9760031700134277, + "logps/chosen": -317.46405029296875, + "logps/rejected": -306.2522277832031, + "loss": 0.6347, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.08358258008956909, + "rewards/margins": 0.14197704195976257, + "rewards/rejected": -0.22555962204933167, + "step": 290 + }, + { + "epoch": 0.06, + "learning_rate": 2.9411764705882355e-06, + "logits/chosen": -2.7295899391174316, + "logits/rejected": -2.7394189834594727, + "logps/chosen": -269.99737548828125, + "logps/rejected": -244.5452880859375, + "loss": 0.6346, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11713893711566925, + "rewards/margins": 0.10082261264324188, + "rewards/rejected": -0.21796154975891113, + "step": 300 + }, + { + "epoch": 0.06, + "eval_logits/chosen": -2.2216763496398926, + "eval_logits/rejected": -2.2106869220733643, + "eval_logps/chosen": -288.878662109375, + "eval_logps/rejected": -270.03985595703125, + "eval_loss": 0.6431363821029663, + "eval_rewards/accuracies": 0.6532738208770752, + "eval_rewards/chosen": -0.2563799023628235, + "eval_rewards/margins": 0.11283508688211441, + "eval_rewards/rejected": -0.3692150413990021, + "eval_runtime": 474.5307, + "eval_samples_per_second": 4.215, + "eval_steps_per_second": 0.177, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 3.03921568627451e-06, + "logits/chosen": -2.82896089553833, + "logits/rejected": -2.8039159774780273, + "logps/chosen": -269.8193359375, + "logps/rejected": -215.48959350585938, + "loss": 0.6101, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24714116752147675, + "rewards/margins": 0.13896790146827698, + "rewards/rejected": -0.38610905408859253, + "step": 310 + }, + { + "epoch": 0.06, + "learning_rate": 3.1372549019607846e-06, + "logits/chosen": -2.8394603729248047, + "logits/rejected": -2.9360859394073486, + "logps/chosen": -293.5857849121094, + "logps/rejected": -340.95806884765625, + "loss": 0.6425, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12740099430084229, + "rewards/margins": 0.14778587222099304, + "rewards/rejected": -0.2751868665218353, + "step": 320 + }, + { + "epoch": 0.06, + "learning_rate": 3.2352941176470594e-06, + "logits/chosen": -2.8742594718933105, + "logits/rejected": -2.81569504737854, + "logps/chosen": -277.4770202636719, + "logps/rejected": -286.01336669921875, + "loss": 0.6182, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.13143302500247955, + "rewards/margins": 0.09543346613645554, + "rewards/rejected": -0.2268664538860321, + "step": 330 + }, + { + "epoch": 0.07, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -2.8065128326416016, + "logits/rejected": -2.8967134952545166, + "logps/chosen": -331.92132568359375, + "logps/rejected": -347.8213806152344, + "loss": 0.5963, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2281709909439087, + "rewards/margins": 0.25101691484451294, + "rewards/rejected": -0.47918787598609924, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 3.431372549019608e-06, + "logits/chosen": -2.7618370056152344, + "logits/rejected": -2.784069538116455, + "logps/chosen": -300.3914794921875, + "logps/rejected": -342.68963623046875, + "loss": 0.6123, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3338148295879364, + "rewards/margins": 0.19719263911247253, + "rewards/rejected": -0.5310074687004089, + "step": 350 + }, + { + "epoch": 0.07, + "learning_rate": 3.529411764705883e-06, + "logits/chosen": -2.8726532459259033, + "logits/rejected": -2.837023973464966, + "logps/chosen": -314.0732421875, + "logps/rejected": -322.673583984375, + "loss": 0.6246, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22630402445793152, + "rewards/margins": 0.29907548427581787, + "rewards/rejected": -0.5253795385360718, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 3.6274509803921573e-06, + "logits/chosen": -2.6191344261169434, + "logits/rejected": -2.686654567718506, + "logps/chosen": -249.3698272705078, + "logps/rejected": -269.16033935546875, + "loss": 0.6096, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1989968717098236, + "rewards/margins": 0.3198801875114441, + "rewards/rejected": -0.5188770890235901, + "step": 370 + }, + { + "epoch": 0.07, + "learning_rate": 3.7254901960784316e-06, + "logits/chosen": -2.65582013130188, + "logits/rejected": -2.6156890392303467, + "logps/chosen": -316.6374206542969, + "logps/rejected": -299.0272216796875, + "loss": 0.5863, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3338475227355957, + "rewards/margins": 0.2835896611213684, + "rewards/rejected": -0.6174371838569641, + "step": 380 + }, + { + "epoch": 0.08, + "learning_rate": 3.8235294117647055e-06, + "logits/chosen": -2.82440447807312, + "logits/rejected": -2.8799052238464355, + "logps/chosen": -353.44732666015625, + "logps/rejected": -368.5728454589844, + "loss": 0.5932, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.632414698600769, + "rewards/margins": 0.2788391709327698, + "rewards/rejected": -0.9112539291381836, + "step": 390 + }, + { + "epoch": 0.08, + "learning_rate": 3.92156862745098e-06, + "logits/chosen": -2.7008981704711914, + "logits/rejected": -2.69873309135437, + "logps/chosen": -269.3892517089844, + "logps/rejected": -307.84246826171875, + "loss": 0.5888, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.38206809759140015, + "rewards/margins": 0.2594672739505768, + "rewards/rejected": -0.6415354013442993, + "step": 400 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -1.9904673099517822, + "eval_logits/rejected": -1.9579482078552246, + "eval_logps/chosen": -305.1883850097656, + "eval_logps/rejected": -296.24200439453125, + "eval_loss": 0.6161777973175049, + "eval_rewards/accuracies": 0.6517857313156128, + "eval_rewards/chosen": -0.4194769263267517, + "eval_rewards/margins": 0.21175935864448547, + "eval_rewards/rejected": -0.6312363147735596, + "eval_runtime": 472.4636, + "eval_samples_per_second": 4.233, + "eval_steps_per_second": 0.178, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 4.019607843137255e-06, + "logits/chosen": -2.676201581954956, + "logits/rejected": -2.6336846351623535, + "logps/chosen": -273.431396484375, + "logps/rejected": -287.57550048828125, + "loss": 0.6385, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3741230368614197, + "rewards/margins": 0.1745728999376297, + "rewards/rejected": -0.5486959218978882, + "step": 410 + }, + { + "epoch": 0.08, + "learning_rate": 4.11764705882353e-06, + "logits/chosen": -2.614751100540161, + "logits/rejected": -2.624110221862793, + "logps/chosen": -290.9742431640625, + "logps/rejected": -326.0084533691406, + "loss": 0.5882, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3570622503757477, + "rewards/margins": 0.3694344162940979, + "rewards/rejected": -0.7264967560768127, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 4.215686274509805e-06, + "logits/chosen": -2.639557361602783, + "logits/rejected": -2.620819568634033, + "logps/chosen": -268.70245361328125, + "logps/rejected": -312.71990966796875, + "loss": 0.6095, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3011035919189453, + "rewards/margins": 0.4379960894584656, + "rewards/rejected": -0.7390996813774109, + "step": 430 + }, + { + "epoch": 0.09, + "learning_rate": 4.313725490196079e-06, + "logits/chosen": -2.6626319885253906, + "logits/rejected": -2.6842870712280273, + "logps/chosen": -302.835205078125, + "logps/rejected": -280.4335021972656, + "loss": 0.6034, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.45034360885620117, + "rewards/margins": 0.2182246446609497, + "rewards/rejected": -0.6685682535171509, + "step": 440 + }, + { + "epoch": 0.09, + "learning_rate": 4.411764705882353e-06, + "logits/chosen": -2.625410556793213, + "logits/rejected": -2.690067768096924, + "logps/chosen": -291.3028869628906, + "logps/rejected": -334.9422912597656, + "loss": 0.5729, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5663882493972778, + "rewards/margins": 0.3346121907234192, + "rewards/rejected": -0.9010004997253418, + "step": 450 + }, + { + "epoch": 0.09, + "learning_rate": 4.509803921568628e-06, + "logits/chosen": -2.628746509552002, + "logits/rejected": -2.707404851913452, + "logps/chosen": -299.4609375, + "logps/rejected": -319.9123229980469, + "loss": 0.5846, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6919635534286499, + "rewards/margins": 0.19277557730674744, + "rewards/rejected": -0.884739100933075, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 4.607843137254902e-06, + "logits/chosen": -2.645448684692383, + "logits/rejected": -2.391211748123169, + "logps/chosen": -413.65765380859375, + "logps/rejected": -357.4613952636719, + "loss": 0.6402, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6778701543807983, + "rewards/margins": 0.4130594730377197, + "rewards/rejected": -1.090929627418518, + "step": 470 + }, + { + "epoch": 0.09, + "learning_rate": 4.705882352941177e-06, + "logits/chosen": -2.6332736015319824, + "logits/rejected": -2.4814000129699707, + "logps/chosen": -250.36038208007812, + "logps/rejected": -264.98663330078125, + "loss": 0.5656, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4399610161781311, + "rewards/margins": 0.33872875571250916, + "rewards/rejected": -0.7786898016929626, + "step": 480 + }, + { + "epoch": 0.1, + "learning_rate": 4.803921568627452e-06, + "logits/chosen": -2.584453821182251, + "logits/rejected": -2.591069221496582, + "logps/chosen": -395.53118896484375, + "logps/rejected": -409.98126220703125, + "loss": 0.5925, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5409294366836548, + "rewards/margins": 0.5661200881004333, + "rewards/rejected": -1.107049584388733, + "step": 490 + }, + { + "epoch": 0.1, + "learning_rate": 4.901960784313726e-06, + "logits/chosen": -2.58937931060791, + "logits/rejected": -2.5619587898254395, + "logps/chosen": -391.55841064453125, + "logps/rejected": -380.4855041503906, + "loss": 0.5806, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9605814218521118, + "rewards/margins": 0.352530300617218, + "rewards/rejected": -1.3131117820739746, + "step": 500 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -0.5252847075462341, + "eval_logits/rejected": -0.4989687204360962, + "eval_logps/chosen": -394.94683837890625, + "eval_logps/rejected": -398.19195556640625, + "eval_loss": 0.5915763974189758, + "eval_rewards/accuracies": 0.663690447807312, + "eval_rewards/chosen": -1.3170610666275024, + "eval_rewards/margins": 0.33367499709129333, + "eval_rewards/rejected": -1.6507360935211182, + "eval_runtime": 466.1502, + "eval_samples_per_second": 4.29, + "eval_steps_per_second": 0.18, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 5e-06, + "logits/chosen": -2.3698482513427734, + "logits/rejected": -2.3730268478393555, + "logps/chosen": -325.0603942871094, + "logps/rejected": -333.634033203125, + "loss": 0.6341, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3440351486206055, + "rewards/margins": 0.16630719602108002, + "rewards/rejected": -1.5103422403335571, + "step": 510 + }, + { + "epoch": 0.1, + "learning_rate": 4.999941289086112e-06, + "logits/chosen": -2.434908628463745, + "logits/rejected": -2.4711365699768066, + "logps/chosen": -319.7598571777344, + "logps/rejected": -411.1217346191406, + "loss": 0.6063, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8812028169631958, + "rewards/margins": 0.4554789662361145, + "rewards/rejected": -1.336681842803955, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 4.999765159102025e-06, + "logits/chosen": -2.4896693229675293, + "logits/rejected": -2.3514962196350098, + "logps/chosen": -338.57122802734375, + "logps/rejected": -392.3268737792969, + "loss": 0.6066, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4506074786186218, + "rewards/margins": 0.2730453908443451, + "rewards/rejected": -0.7236528992652893, + "step": 530 + }, + { + "epoch": 0.11, + "learning_rate": 4.999471618320339e-06, + "logits/chosen": -2.5596091747283936, + "logits/rejected": -2.569035291671753, + "logps/chosen": -285.6130065917969, + "logps/rejected": -337.4367370605469, + "loss": 0.5838, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35268205404281616, + "rewards/margins": 0.43236231803894043, + "rewards/rejected": -0.7850444912910461, + "step": 540 + }, + { + "epoch": 0.11, + "learning_rate": 4.999060680528294e-06, + "logits/chosen": -2.639185667037964, + "logits/rejected": -2.4026732444763184, + "logps/chosen": -354.9365234375, + "logps/rejected": -350.83575439453125, + "loss": 0.5779, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7679035067558289, + "rewards/margins": 0.30885177850723267, + "rewards/rejected": -1.0767552852630615, + "step": 550 + }, + { + "epoch": 0.11, + "learning_rate": 4.998532365027117e-06, + "logits/chosen": -2.2880897521972656, + "logits/rejected": -2.203477621078491, + "logps/chosen": -402.0857849121094, + "logps/rejected": -435.74090576171875, + "loss": 0.5588, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.303902268409729, + "rewards/margins": 0.5334252715110779, + "rewards/rejected": -1.8373275995254517, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 4.997886696631115e-06, + "logits/chosen": -2.271165132522583, + "logits/rejected": -2.3455963134765625, + "logps/chosen": -351.71917724609375, + "logps/rejected": -414.587646484375, + "loss": 0.5306, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.247244119644165, + "rewards/margins": 0.45136338472366333, + "rewards/rejected": -1.6986074447631836, + "step": 570 + }, + { + "epoch": 0.11, + "learning_rate": 4.997123705666514e-06, + "logits/chosen": -2.1570160388946533, + "logits/rejected": -2.1810998916625977, + "logps/chosen": -408.73883056640625, + "logps/rejected": -413.47174072265625, + "loss": 0.6342, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6730220317840576, + "rewards/margins": 0.255155473947525, + "rewards/rejected": -1.928177833557129, + "step": 580 + }, + { + "epoch": 0.12, + "learning_rate": 4.996243427970032e-06, + "logits/chosen": -2.1544055938720703, + "logits/rejected": -1.9909731149673462, + "logps/chosen": -351.2218933105469, + "logps/rejected": -395.60662841796875, + "loss": 0.6315, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1272488832473755, + "rewards/margins": 0.46109557151794434, + "rewards/rejected": -1.5883444547653198, + "step": 590 + }, + { + "epoch": 0.12, + "learning_rate": 4.995245904887195e-06, + "logits/chosen": -2.532106876373291, + "logits/rejected": -2.246403932571411, + "logps/chosen": -447.5849609375, + "logps/rejected": -391.7081298828125, + "loss": 0.6219, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1101863384246826, + "rewards/margins": 0.3275432586669922, + "rewards/rejected": -1.4377295970916748, + "step": 600 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -0.03613191470503807, + "eval_logits/rejected": 0.03838645666837692, + "eval_logps/chosen": -376.6808166503906, + "eval_logps/rejected": -383.747802734375, + "eval_loss": 0.5753170847892761, + "eval_rewards/accuracies": 0.6502976417541504, + "eval_rewards/chosen": -1.1344010829925537, + "eval_rewards/margins": 0.37189337611198425, + "eval_rewards/rejected": -1.5062944889068604, + "eval_runtime": 466.8014, + "eval_samples_per_second": 4.284, + "eval_steps_per_second": 0.18, + "step": 600 + }, + { + "epoch": 0.12, + "learning_rate": 4.994131183270396e-06, + "logits/chosen": -2.3710973262786865, + "logits/rejected": -2.3201870918273926, + "logps/chosen": -421.33709716796875, + "logps/rejected": -424.4642639160156, + "loss": 0.5701, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1958863735198975, + "rewards/margins": 0.5684593915939331, + "rewards/rejected": -1.7643455266952515, + "step": 610 + }, + { + "epoch": 0.12, + "learning_rate": 4.992899315476696e-06, + "logits/chosen": -2.2475314140319824, + "logits/rejected": -2.1929378509521484, + "logps/chosen": -438.3310546875, + "logps/rejected": -469.04241943359375, + "loss": 0.5701, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3635677099227905, + "rewards/margins": 0.46668586134910583, + "rewards/rejected": -1.8302538394927979, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 4.99155035936536e-06, + "logits/chosen": -2.449263095855713, + "logits/rejected": -2.265713930130005, + "logps/chosen": -329.15960693359375, + "logps/rejected": -357.1077880859375, + "loss": 0.5563, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0894893407821655, + "rewards/margins": 0.3795085847377777, + "rewards/rejected": -1.4689979553222656, + "step": 630 + }, + { + "epoch": 0.13, + "learning_rate": 4.990084378295148e-06, + "logits/chosen": -2.448363780975342, + "logits/rejected": -2.4051525592803955, + "logps/chosen": -341.86956787109375, + "logps/rejected": -338.0180969238281, + "loss": 0.5833, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.8054157495498657, + "rewards/margins": 0.5508281588554382, + "rewards/rejected": -1.3562438488006592, + "step": 640 + }, + { + "epoch": 0.13, + "learning_rate": 4.988501441121328e-06, + "logits/chosen": -2.3474011421203613, + "logits/rejected": -2.023271322250366, + "logps/chosen": -382.7298889160156, + "logps/rejected": -369.0820007324219, + "loss": 0.6044, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0760247707366943, + "rewards/margins": 0.42178216576576233, + "rewards/rejected": -1.4978069067001343, + "step": 650 + }, + { + "epoch": 0.13, + "learning_rate": 4.986801622192453e-06, + "logits/chosen": -2.241762638092041, + "logits/rejected": -1.996416449546814, + "logps/chosen": -416.619384765625, + "logps/rejected": -449.72052001953125, + "loss": 0.5281, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3046057224273682, + "rewards/margins": 0.7088319659233093, + "rewards/rejected": -2.0134377479553223, + "step": 660 + }, + { + "epoch": 0.13, + "learning_rate": 4.984985001346859e-06, + "logits/chosen": -2.369713544845581, + "logits/rejected": -2.3672068119049072, + "logps/chosen": -391.0481262207031, + "logps/rejected": -429.03759765625, + "loss": 0.5965, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2604517936706543, + "rewards/margins": 0.23743471503257751, + "rewards/rejected": -1.4978865385055542, + "step": 670 + }, + { + "epoch": 0.13, + "learning_rate": 4.9830516639089226e-06, + "logits/chosen": -2.217136859893799, + "logits/rejected": -2.181342124938965, + "logps/chosen": -373.64794921875, + "logps/rejected": -409.3558654785156, + "loss": 0.5203, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8603825569152832, + "rewards/margins": 0.7127417325973511, + "rewards/rejected": -1.5731241703033447, + "step": 680 + }, + { + "epoch": 0.14, + "learning_rate": 4.981001700685051e-06, + "logits/chosen": -2.3387327194213867, + "logits/rejected": -2.1862881183624268, + "logps/chosen": -413.24298095703125, + "logps/rejected": -404.2867126464844, + "loss": 0.5476, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.955763041973114, + "rewards/margins": 0.55617755651474, + "rewards/rejected": -1.5119404792785645, + "step": 690 + }, + { + "epoch": 0.14, + "learning_rate": 4.978835207959414e-06, + "logits/chosen": -2.3319435119628906, + "logits/rejected": -2.234999418258667, + "logps/chosen": -312.42431640625, + "logps/rejected": -341.87005615234375, + "loss": 0.5586, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9548459053039551, + "rewards/margins": 0.35879063606262207, + "rewards/rejected": -1.3136365413665771, + "step": 700 + }, + { + "epoch": 0.14, + "eval_logits/chosen": 0.24727466702461243, + "eval_logits/rejected": 0.30730071663856506, + "eval_logps/chosen": -342.16094970703125, + "eval_logps/rejected": -351.8956604003906, + "eval_loss": 0.5732717514038086, + "eval_rewards/accuracies": 0.6666666865348816, + "eval_rewards/chosen": -0.7892022132873535, + "eval_rewards/margins": 0.3985709249973297, + "eval_rewards/rejected": -1.1877731084823608, + "eval_runtime": 465.6349, + "eval_samples_per_second": 4.295, + "eval_steps_per_second": 0.18, + "step": 700 + }, + { + "epoch": 0.14, + "learning_rate": 4.976552287489427e-06, + "logits/chosen": -2.3588404655456543, + "logits/rejected": -2.3238019943237305, + "logps/chosen": -366.0771179199219, + "logps/rejected": -426.2799377441406, + "loss": 0.4964, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7471452951431274, + "rewards/margins": 0.7910875678062439, + "rewards/rejected": -1.5382329225540161, + "step": 710 + }, + { + "epoch": 0.14, + "learning_rate": 4.9741530465009665e-06, + "logits/chosen": -2.0894782543182373, + "logits/rejected": -2.11095929145813, + "logps/chosen": -360.30963134765625, + "logps/rejected": -429.9984436035156, + "loss": 0.5533, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0334492921829224, + "rewards/margins": 0.8144800066947937, + "rewards/rejected": -1.8479293584823608, + "step": 720 + }, + { + "epoch": 0.14, + "learning_rate": 4.9716375976833395e-06, + "logits/chosen": -2.13502836227417, + "logits/rejected": -2.0187134742736816, + "logps/chosen": -387.08819580078125, + "logps/rejected": -384.268798828125, + "loss": 0.5997, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.14211905002594, + "rewards/margins": 0.3813985288143158, + "rewards/rejected": -1.5235174894332886, + "step": 730 + }, + { + "epoch": 0.15, + "learning_rate": 4.969006059183984e-06, + "logits/chosen": -2.2879302501678467, + "logits/rejected": -2.1529295444488525, + "logps/chosen": -307.6477966308594, + "logps/rejected": -326.3321838378906, + "loss": 0.5228, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8938900232315063, + "rewards/margins": 0.3973291218280792, + "rewards/rejected": -1.2912189960479736, + "step": 740 + }, + { + "epoch": 0.15, + "learning_rate": 4.966258554602924e-06, + "logits/chosen": -2.2723472118377686, + "logits/rejected": -2.1424341201782227, + "logps/chosen": -370.5997619628906, + "logps/rejected": -356.42242431640625, + "loss": 0.6127, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.07357919216156, + "rewards/margins": 0.4219037890434265, + "rewards/rejected": -1.4954830408096313, + "step": 750 + }, + { + "epoch": 0.15, + "learning_rate": 4.963395212986964e-06, + "logits/chosen": -2.4523584842681885, + "logits/rejected": -2.3470985889434814, + "logps/chosen": -400.797607421875, + "logps/rejected": -381.6387939453125, + "loss": 0.5702, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7941485643386841, + "rewards/margins": 0.6219018697738647, + "rewards/rejected": -1.4160504341125488, + "step": 760 + }, + { + "epoch": 0.15, + "learning_rate": 4.960416168823626e-06, + "logits/chosen": -2.389075756072998, + "logits/rejected": -2.132652759552002, + "logps/chosen": -314.0247802734375, + "logps/rejected": -338.5572509765625, + "loss": 0.5706, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6960195302963257, + "rewards/margins": 0.6828244924545288, + "rewards/rejected": -1.3788437843322754, + "step": 770 + }, + { + "epoch": 0.15, + "learning_rate": 4.957321562034833e-06, + "logits/chosen": -2.3235738277435303, + "logits/rejected": -2.2064554691314697, + "logps/chosen": -361.6338195800781, + "logps/rejected": -368.1969299316406, + "loss": 0.536, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8635967373847961, + "rewards/margins": 0.5974106788635254, + "rewards/rejected": -1.4610074758529663, + "step": 780 + }, + { + "epoch": 0.16, + "learning_rate": 4.954111537970342e-06, + "logits/chosen": -2.2395503520965576, + "logits/rejected": -2.0655577182769775, + "logps/chosen": -382.55023193359375, + "logps/rejected": -423.2119140625, + "loss": 0.6418, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3395254611968994, + "rewards/margins": 0.3794713616371155, + "rewards/rejected": -1.7189967632293701, + "step": 790 + }, + { + "epoch": 0.16, + "learning_rate": 4.950786247400908e-06, + "logits/chosen": -2.2357563972473145, + "logits/rejected": -2.2734086513519287, + "logps/chosen": -346.0960693359375, + "logps/rejected": -380.20074462890625, + "loss": 0.6123, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.31546151638031, + "rewards/margins": 0.2703830301761627, + "rewards/rejected": -1.58584463596344, + "step": 800 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 1.0326988697052002, + "eval_logits/rejected": 1.0808866024017334, + "eval_logps/chosen": -390.5542297363281, + "eval_logps/rejected": -403.5396728515625, + "eval_loss": 0.5578105449676514, + "eval_rewards/accuracies": 0.6651785969734192, + "eval_rewards/chosen": -1.2731355428695679, + "eval_rewards/margins": 0.43107810616493225, + "eval_rewards/rejected": -1.7042136192321777, + "eval_runtime": 471.0801, + "eval_samples_per_second": 4.246, + "eval_steps_per_second": 0.178, + "step": 800 + }, + { + "epoch": 0.16, + "learning_rate": 4.94734584651121e-06, + "logits/chosen": -2.2077815532684326, + "logits/rejected": -1.9737892150878906, + "logps/chosen": -449.19549560546875, + "logps/rejected": -509.70611572265625, + "loss": 0.4952, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3227405548095703, + "rewards/margins": 0.795570969581604, + "rewards/rejected": -2.1183114051818848, + "step": 810 + }, + { + "epoch": 0.16, + "learning_rate": 4.943790496892513e-06, + "logits/chosen": -2.080096960067749, + "logits/rejected": -1.8696342706680298, + "logps/chosen": -482.72174072265625, + "logps/rejected": -451.3267517089844, + "loss": 0.5183, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5850732326507568, + "rewards/margins": 0.7216095328330994, + "rewards/rejected": -2.306682586669922, + "step": 820 + }, + { + "epoch": 0.16, + "learning_rate": 4.940120365535076e-06, + "logits/chosen": -2.4171366691589355, + "logits/rejected": -2.279550075531006, + "logps/chosen": -424.75775146484375, + "logps/rejected": -457.83026123046875, + "loss": 0.5598, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3365710973739624, + "rewards/margins": 0.7002229690551758, + "rewards/rejected": -2.0367941856384277, + "step": 830 + }, + { + "epoch": 0.16, + "learning_rate": 4.936335624820313e-06, + "logits/chosen": -2.015204429626465, + "logits/rejected": -2.105721950531006, + "logps/chosen": -332.3876953125, + "logps/rejected": -497.28656005859375, + "loss": 0.4966, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1477575302124023, + "rewards/margins": 1.0808641910552979, + "rewards/rejected": -2.2286219596862793, + "step": 840 + }, + { + "epoch": 0.17, + "learning_rate": 4.932436452512693e-06, + "logits/chosen": -2.215301275253296, + "logits/rejected": -2.0272057056427, + "logps/chosen": -424.7681579589844, + "logps/rejected": -471.20709228515625, + "loss": 0.5339, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3136613368988037, + "rewards/margins": 0.7162275314331055, + "rewards/rejected": -2.029888868331909, + "step": 850 + }, + { + "epoch": 0.17, + "learning_rate": 4.9284230317513906e-06, + "logits/chosen": -2.259880304336548, + "logits/rejected": -2.076293706893921, + "logps/chosen": -361.982177734375, + "logps/rejected": -447.18798828125, + "loss": 0.5459, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.242569088935852, + "rewards/margins": 0.8317493200302124, + "rewards/rejected": -2.0743184089660645, + "step": 860 + }, + { + "epoch": 0.17, + "learning_rate": 4.924295551041688e-06, + "logits/chosen": -2.0979232788085938, + "logits/rejected": -2.2426350116729736, + "logps/chosen": -359.0535888671875, + "logps/rejected": -432.423095703125, + "loss": 0.5804, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3003675937652588, + "rewards/margins": 0.2097325325012207, + "rewards/rejected": -1.51010000705719, + "step": 870 + }, + { + "epoch": 0.17, + "learning_rate": 4.920054204246116e-06, + "logits/chosen": -1.8829069137573242, + "logits/rejected": -1.774928331375122, + "logps/chosen": -295.80743408203125, + "logps/rejected": -424.815673828125, + "loss": 0.5379, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0513074398040771, + "rewards/margins": 0.9006274938583374, + "rewards/rejected": -1.9519351720809937, + "step": 880 + }, + { + "epoch": 0.17, + "learning_rate": 4.915699190575349e-06, + "logits/chosen": -2.2078824043273926, + "logits/rejected": -1.8246761560440063, + "logps/chosen": -414.6767578125, + "logps/rejected": -450.5083923339844, + "loss": 0.5016, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1404173374176025, + "rewards/margins": 0.949181854724884, + "rewards/rejected": -2.089599132537842, + "step": 890 + }, + { + "epoch": 0.18, + "learning_rate": 4.911230714578858e-06, + "logits/chosen": -2.33544659614563, + "logits/rejected": -2.159890651702881, + "logps/chosen": -484.42315673828125, + "logps/rejected": -484.75543212890625, + "loss": 0.555, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2610318660736084, + "rewards/margins": 0.7130531072616577, + "rewards/rejected": -1.9740852117538452, + "step": 900 + }, + { + "epoch": 0.18, + "eval_logits/chosen": 1.3993161916732788, + "eval_logits/rejected": 1.4158341884613037, + "eval_logps/chosen": -382.6491394042969, + "eval_logps/rejected": -413.9874572753906, + "eval_loss": 0.5461392998695374, + "eval_rewards/accuracies": 0.6770833134651184, + "eval_rewards/chosen": -1.1940844058990479, + "eval_rewards/margins": 0.6146063208580017, + "eval_rewards/rejected": -1.8086905479431152, + "eval_runtime": 464.4097, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.181, + "step": 900 + }, + { + "epoch": 0.18, + "learning_rate": 4.9066489861352875e-06, + "logits/chosen": -2.130706310272217, + "logits/rejected": -2.1322884559631348, + "logps/chosen": -396.38226318359375, + "logps/rejected": -399.5631103515625, + "loss": 0.578, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1745949983596802, + "rewards/margins": 0.6246682405471802, + "rewards/rejected": -1.79926335811615, + "step": 910 + }, + { + "epoch": 0.18, + "learning_rate": 4.90195422044261e-06, + "logits/chosen": -2.1645100116729736, + "logits/rejected": -2.146820545196533, + "logps/chosen": -313.99578857421875, + "logps/rejected": -400.88330078125, + "loss": 0.4906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.011773705482483, + "rewards/margins": 0.7602158188819885, + "rewards/rejected": -1.7719894647598267, + "step": 920 + }, + { + "epoch": 0.18, + "learning_rate": 4.897146638008012e-06, + "logits/chosen": -2.0529541969299316, + "logits/rejected": -2.055722713470459, + "logps/chosen": -401.8716125488281, + "logps/rejected": -484.28863525390625, + "loss": 0.4835, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.197890043258667, + "rewards/margins": 0.9039093852043152, + "rewards/rejected": -2.101799488067627, + "step": 930 + }, + { + "epoch": 0.18, + "learning_rate": 4.89222646463754e-06, + "logits/chosen": -1.9853671789169312, + "logits/rejected": -1.9570581912994385, + "logps/chosen": -390.3425598144531, + "logps/rejected": -452.4058532714844, + "loss": 0.5724, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5371025800704956, + "rewards/margins": 0.6626251339912415, + "rewards/rejected": -2.199727773666382, + "step": 940 + }, + { + "epoch": 0.19, + "learning_rate": 4.8871939314254965e-06, + "logits/chosen": -2.0246262550354004, + "logits/rejected": -1.9982950687408447, + "logps/chosen": -401.33062744140625, + "logps/rejected": -527.936767578125, + "loss": 0.4559, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.588289499282837, + "rewards/margins": 1.0147461891174316, + "rewards/rejected": -2.6030356884002686, + "step": 950 + }, + { + "epoch": 0.19, + "learning_rate": 4.882049274743578e-06, + "logits/chosen": -2.163276195526123, + "logits/rejected": -1.980111837387085, + "logps/chosen": -394.1162109375, + "logps/rejected": -423.0694885253906, + "loss": 0.5912, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2555248737335205, + "rewards/margins": 0.48558419942855835, + "rewards/rejected": -1.7411092519760132, + "step": 960 + }, + { + "epoch": 0.19, + "learning_rate": 4.876792736229782e-06, + "logits/chosen": -2.1688385009765625, + "logits/rejected": -1.9922654628753662, + "logps/chosen": -377.38580322265625, + "logps/rejected": -441.0692443847656, + "loss": 0.5064, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8046746253967285, + "rewards/margins": 0.9893644452095032, + "rewards/rejected": -1.7940391302108765, + "step": 970 + }, + { + "epoch": 0.19, + "learning_rate": 4.8714245627770515e-06, + "logits/chosen": -2.1530489921569824, + "logits/rejected": -2.2191214561462402, + "logps/chosen": -370.20111083984375, + "logps/rejected": -379.66998291015625, + "loss": 0.604, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9098675847053528, + "rewards/margins": 0.4256000518798828, + "rewards/rejected": -1.3354675769805908, + "step": 980 + }, + { + "epoch": 0.19, + "learning_rate": 4.865945006521684e-06, + "logits/chosen": -2.262906789779663, + "logits/rejected": -2.044600009918213, + "logps/chosen": -346.9558410644531, + "logps/rejected": -393.48828125, + "loss": 0.5445, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7731464505195618, + "rewards/margins": 0.82574862241745, + "rewards/rejected": -1.5988948345184326, + "step": 990 + }, + { + "epoch": 0.2, + "learning_rate": 4.860354324831482e-06, + "logits/chosen": -2.0484061241149902, + "logits/rejected": -1.9440243244171143, + "logps/chosen": -324.58270263671875, + "logps/rejected": -407.22796630859375, + "loss": 0.4905, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.867997944355011, + "rewards/margins": 0.8739334344863892, + "rewards/rejected": -1.7419313192367554, + "step": 1000 + }, + { + "epoch": 0.2, + "eval_logits/chosen": 0.7731575965881348, + "eval_logits/rejected": 0.8211134076118469, + "eval_logps/chosen": -387.9333801269531, + "eval_logps/rejected": -428.39447021484375, + "eval_loss": 0.5462982058525085, + "eval_rewards/accuracies": 0.6889880895614624, + "eval_rewards/chosen": -1.2469266653060913, + "eval_rewards/margins": 0.7058340907096863, + "eval_rewards/rejected": -1.952761173248291, + "eval_runtime": 475.5201, + "eval_samples_per_second": 4.206, + "eval_steps_per_second": 0.177, + "step": 1000 + }, + { + "epoch": 0.2, + "learning_rate": 4.854652780293672e-06, + "logits/chosen": -1.7674760818481445, + "logits/rejected": -1.483852744102478, + "logps/chosen": -344.6761779785156, + "logps/rejected": -485.04400634765625, + "loss": 0.4088, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1883604526519775, + "rewards/margins": 1.2560036182403564, + "rewards/rejected": -2.444364309310913, + "step": 1010 + }, + { + "epoch": 0.2, + "learning_rate": 4.848840640702565e-06, + "logits/chosen": -1.7526578903198242, + "logits/rejected": -1.849961280822754, + "logps/chosen": -413.9766540527344, + "logps/rejected": -477.40582275390625, + "loss": 0.6059, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.663875937461853, + "rewards/margins": 0.5298476815223694, + "rewards/rejected": -2.193723440170288, + "step": 1020 + }, + { + "epoch": 0.2, + "learning_rate": 4.842918179046982e-06, + "logits/chosen": -1.7181034088134766, + "logits/rejected": -1.4771192073822021, + "logps/chosen": -380.33673095703125, + "logps/rejected": -443.24139404296875, + "loss": 0.5238, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0469777584075928, + "rewards/margins": 1.1572082042694092, + "rewards/rejected": -2.204185962677002, + "step": 1030 + }, + { + "epoch": 0.2, + "learning_rate": 4.836885673497435e-06, + "logits/chosen": -1.8101253509521484, + "logits/rejected": -1.8072385787963867, + "logps/chosen": -405.07415771484375, + "logps/rejected": -431.8306579589844, + "loss": 0.5149, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1933597326278687, + "rewards/margins": 0.6611484885215759, + "rewards/rejected": -1.8545081615447998, + "step": 1040 + }, + { + "epoch": 0.21, + "learning_rate": 4.830743407393052e-06, + "logits/chosen": -1.8477604389190674, + "logits/rejected": -1.9941295385360718, + "logps/chosen": -387.93194580078125, + "logps/rejected": -495.73565673828125, + "loss": 0.5424, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1492812633514404, + "rewards/margins": 0.8768144845962524, + "rewards/rejected": -2.0260956287384033, + "step": 1050 + }, + { + "epoch": 0.21, + "learning_rate": 4.824491669228279e-06, + "logits/chosen": -2.023118257522583, + "logits/rejected": -1.845958948135376, + "logps/chosen": -384.5801696777344, + "logps/rejected": -453.61260986328125, + "loss": 0.5636, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9846547842025757, + "rewards/margins": 0.5864224433898926, + "rewards/rejected": -1.5710772275924683, + "step": 1060 + }, + { + "epoch": 0.21, + "learning_rate": 4.818130752639326e-06, + "logits/chosen": -1.8387682437896729, + "logits/rejected": -1.9775362014770508, + "logps/chosen": -323.5730895996094, + "logps/rejected": -408.73388671875, + "loss": 0.5621, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1858361959457397, + "rewards/margins": 0.44976431131362915, + "rewards/rejected": -1.6356006860733032, + "step": 1070 + }, + { + "epoch": 0.21, + "learning_rate": 4.811660956390372e-06, + "logits/chosen": -2.032257556915283, + "logits/rejected": -1.8938792943954468, + "logps/chosen": -384.43597412109375, + "logps/rejected": -397.3373107910156, + "loss": 0.6228, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0033721923828125, + "rewards/margins": 0.4804312288761139, + "rewards/rejected": -1.483803391456604, + "step": 1080 + }, + { + "epoch": 0.21, + "learning_rate": 4.8050825843595395e-06, + "logits/chosen": -2.114607572555542, + "logits/rejected": -2.0265519618988037, + "logps/chosen": -308.5718688964844, + "logps/rejected": -327.57550048828125, + "loss": 0.5083, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.671250581741333, + "rewards/margins": 0.6738994717597961, + "rewards/rejected": -1.3451499938964844, + "step": 1090 + }, + { + "epoch": 0.22, + "learning_rate": 4.798395945524615e-06, + "logits/chosen": -2.040956735610962, + "logits/rejected": -1.9049040079116821, + "logps/chosen": -340.9862365722656, + "logps/rejected": -406.28216552734375, + "loss": 0.5214, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0534942150115967, + "rewards/margins": 0.7783910036087036, + "rewards/rejected": -1.8318853378295898, + "step": 1100 + }, + { + "epoch": 0.22, + "eval_logits/chosen": 1.416340708732605, + "eval_logits/rejected": 1.3945404291152954, + "eval_logps/chosen": -391.100830078125, + "eval_logps/rejected": -423.03472900390625, + "eval_loss": 0.5355843305587769, + "eval_rewards/accuracies": 0.6979166865348816, + "eval_rewards/chosen": -1.2786011695861816, + "eval_rewards/margins": 0.6205627918243408, + "eval_rewards/rejected": -1.8991637229919434, + "eval_runtime": 474.4879, + "eval_samples_per_second": 4.215, + "eval_steps_per_second": 0.177, + "step": 1100 + }, + { + "epoch": 0.22, + "learning_rate": 4.791601353948537e-06, + "logits/chosen": -2.277050495147705, + "logits/rejected": -1.9915672540664673, + "logps/chosen": -436.3802795410156, + "logps/rejected": -471.61761474609375, + "loss": 0.5151, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1520277261734009, + "rewards/margins": 0.8716039657592773, + "rewards/rejected": -2.0236315727233887, + "step": 1110 + }, + { + "epoch": 0.22, + "learning_rate": 4.784699128764654e-06, + "logits/chosen": -2.116421699523926, + "logits/rejected": -2.0355710983276367, + "logps/chosen": -420.1725158691406, + "logps/rejected": -476.16168212890625, + "loss": 0.5504, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3343623876571655, + "rewards/margins": 0.5101630687713623, + "rewards/rejected": -1.8445253372192383, + "step": 1120 + }, + { + "epoch": 0.22, + "learning_rate": 4.777689594161724e-06, + "logits/chosen": -2.114661931991577, + "logits/rejected": -2.0693721771240234, + "logps/chosen": -463.98077392578125, + "logps/rejected": -556.939453125, + "loss": 0.5139, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.175616979598999, + "rewards/margins": 0.8436921238899231, + "rewards/rejected": -2.0193092823028564, + "step": 1130 + }, + { + "epoch": 0.22, + "learning_rate": 4.770573079368691e-06, + "logits/chosen": -2.093860149383545, + "logits/rejected": -1.8888766765594482, + "logps/chosen": -430.1224060058594, + "logps/rejected": -518.7135009765625, + "loss": 0.5418, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3075679540634155, + "rewards/margins": 0.8580087423324585, + "rewards/rejected": -2.165576696395874, + "step": 1140 + }, + { + "epoch": 0.23, + "learning_rate": 4.763349918639228e-06, + "logits/chosen": -2.113792657852173, + "logits/rejected": -1.906060814857483, + "logps/chosen": -448.8399353027344, + "logps/rejected": -523.9088134765625, + "loss": 0.5246, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4472620487213135, + "rewards/margins": 0.853011429309845, + "rewards/rejected": -2.3002734184265137, + "step": 1150 + }, + { + "epoch": 0.23, + "learning_rate": 4.756020451236025e-06, + "logits/chosen": -2.009995222091675, + "logits/rejected": -1.607735276222229, + "logps/chosen": -415.8470153808594, + "logps/rejected": -441.7735290527344, + "loss": 0.5287, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5157968997955322, + "rewards/margins": 0.8386972546577454, + "rewards/rejected": -2.354494094848633, + "step": 1160 + }, + { + "epoch": 0.23, + "learning_rate": 4.748585021414869e-06, + "logits/chosen": -1.892051100730896, + "logits/rejected": -1.7459224462509155, + "logps/chosen": -378.3355407714844, + "logps/rejected": -477.277587890625, + "loss": 0.5629, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5294681787490845, + "rewards/margins": 0.621840238571167, + "rewards/rejected": -2.151308536529541, + "step": 1170 + }, + { + "epoch": 0.23, + "learning_rate": 4.741043978408463e-06, + "logits/chosen": -1.8768208026885986, + "logits/rejected": -1.7806380987167358, + "logps/chosen": -328.5000305175781, + "logps/rejected": -318.7469177246094, + "loss": 0.5865, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0944186449050903, + "rewards/margins": 0.3815223276615143, + "rewards/rejected": -1.4759409427642822, + "step": 1180 + }, + { + "epoch": 0.23, + "learning_rate": 4.733397676410027e-06, + "logits/chosen": -2.0375049114227295, + "logits/rejected": -1.5592041015625, + "logps/chosen": -374.0691223144531, + "logps/rejected": -392.29302978515625, + "loss": 0.5295, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3160943984985352, + "rewards/margins": 0.6633793711662292, + "rewards/rejected": -1.9794738292694092, + "step": 1190 + }, + { + "epoch": 0.24, + "learning_rate": 4.725646474556666e-06, + "logits/chosen": -2.212542772293091, + "logits/rejected": -1.9363590478897095, + "logps/chosen": -367.44024658203125, + "logps/rejected": -388.07452392578125, + "loss": 0.4988, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1194865703582764, + "rewards/margins": 0.8652739524841309, + "rewards/rejected": -1.9847608804702759, + "step": 1200 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 0.9227811098098755, + "eval_logits/rejected": 1.0273418426513672, + "eval_logps/chosen": -385.0260925292969, + "eval_logps/rejected": -426.0503234863281, + "eval_loss": 0.530683159828186, + "eval_rewards/accuracies": 0.6979166865348816, + "eval_rewards/chosen": -1.2178539037704468, + "eval_rewards/margins": 0.7114654183387756, + "eval_rewards/rejected": -1.9293192625045776, + "eval_runtime": 471.1353, + "eval_samples_per_second": 4.245, + "eval_steps_per_second": 0.178, + "step": 1200 + }, + { + "epoch": 0.24, + "learning_rate": 4.717790736912493e-06, + "logits/chosen": -1.9181499481201172, + "logits/rejected": -1.7514560222625732, + "logps/chosen": -359.93341064453125, + "logps/rejected": -441.38916015625, + "loss": 0.4777, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.209984540939331, + "rewards/margins": 1.0826075077056885, + "rewards/rejected": -2.2925918102264404, + "step": 1210 + }, + { + "epoch": 0.24, + "learning_rate": 4.709830832451538e-06, + "logits/chosen": -1.9338840246200562, + "logits/rejected": -1.9117820262908936, + "logps/chosen": -480.5819396972656, + "logps/rejected": -532.8638916015625, + "loss": 0.5032, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4112778902053833, + "rewards/margins": 0.9129534959793091, + "rewards/rejected": -2.3242313861846924, + "step": 1220 + }, + { + "epoch": 0.24, + "learning_rate": 4.701767135040415e-06, + "logits/chosen": -1.8094873428344727, + "logits/rejected": -1.5869756937026978, + "logps/chosen": -513.3035888671875, + "logps/rejected": -593.2393798828125, + "loss": 0.5695, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2972371578216553, + "rewards/margins": 0.9080797433853149, + "rewards/rejected": -3.2053170204162598, + "step": 1230 + }, + { + "epoch": 0.24, + "learning_rate": 4.693600023420758e-06, + "logits/chosen": -1.4798624515533447, + "logits/rejected": -1.369323968887329, + "logps/chosen": -486.832275390625, + "logps/rejected": -542.3109130859375, + "loss": 0.5068, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3903822898864746, + "rewards/margins": 0.7768516540527344, + "rewards/rejected": -3.167233943939209, + "step": 1240 + }, + { + "epoch": 0.25, + "learning_rate": 4.685329881191436e-06, + "logits/chosen": -1.6226552724838257, + "logits/rejected": -1.513686180114746, + "logps/chosen": -399.8258056640625, + "logps/rejected": -447.3807678222656, + "loss": 0.5162, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9063441753387451, + "rewards/margins": 0.6103194355964661, + "rewards/rejected": -2.5166637897491455, + "step": 1250 + }, + { + "epoch": 0.25, + "learning_rate": 4.676957096790536e-06, + "logits/chosen": -1.4838886260986328, + "logits/rejected": -1.0853869915008545, + "logps/chosen": -432.89801025390625, + "logps/rejected": -512.7747802734375, + "loss": 0.5367, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6043317317962646, + "rewards/margins": 1.2039530277252197, + "rewards/rejected": -2.8082849979400635, + "step": 1260 + }, + { + "epoch": 0.25, + "learning_rate": 4.668482063477118e-06, + "logits/chosen": -1.923230528831482, + "logits/rejected": -1.7049680948257446, + "logps/chosen": -423.11859130859375, + "logps/rejected": -456.056884765625, + "loss": 0.5425, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4797232151031494, + "rewards/margins": 0.6203845739364624, + "rewards/rejected": -2.1001076698303223, + "step": 1270 + }, + { + "epoch": 0.25, + "learning_rate": 4.659905179312743e-06, + "logits/chosen": -1.4056613445281982, + "logits/rejected": -1.4088342189788818, + "logps/chosen": -336.7528381347656, + "logps/rejected": -490.25946044921875, + "loss": 0.5005, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4386407136917114, + "rewards/margins": 1.2846283912658691, + "rewards/rejected": -2.723268985748291, + "step": 1280 + }, + { + "epoch": 0.25, + "learning_rate": 4.651226847142774e-06, + "logits/chosen": -1.3955827951431274, + "logits/rejected": -1.3704919815063477, + "logps/chosen": -420.107177734375, + "logps/rejected": -611.5444946289062, + "loss": 0.5074, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8919422626495361, + "rewards/margins": 1.1938669681549072, + "rewards/rejected": -3.0858092308044434, + "step": 1290 + }, + { + "epoch": 0.26, + "learning_rate": 4.642447474577466e-06, + "logits/chosen": -1.661499261856079, + "logits/rejected": -1.3011276721954346, + "logps/chosen": -438.72747802734375, + "logps/rejected": -499.0279235839844, + "loss": 0.5324, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.768543004989624, + "rewards/margins": 0.987531840801239, + "rewards/rejected": -2.756074905395508, + "step": 1300 + }, + { + "epoch": 0.26, + "eval_logits/chosen": 0.591680645942688, + "eval_logits/rejected": 0.9343855977058411, + "eval_logps/chosen": -408.3595275878906, + "eval_logps/rejected": -450.9059753417969, + "eval_loss": 0.5320248007774353, + "eval_rewards/accuracies": 0.7023809552192688, + "eval_rewards/chosen": -1.451188087463379, + "eval_rewards/margins": 0.7266876697540283, + "eval_rewards/rejected": -2.1778759956359863, + "eval_runtime": 469.1631, + "eval_samples_per_second": 4.263, + "eval_steps_per_second": 0.179, + "step": 1300 + }, + { + "epoch": 0.26, + "learning_rate": 4.6335674739728055e-06, + "logits/chosen": -1.9182145595550537, + "logits/rejected": -1.6547549962997437, + "logps/chosen": -414.7605895996094, + "logps/rejected": -529.4044799804688, + "loss": 0.5226, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5170066356658936, + "rewards/margins": 1.003164529800415, + "rewards/rejected": -2.5201709270477295, + "step": 1310 + }, + { + "epoch": 0.26, + "learning_rate": 4.6245872624111535e-06, + "logits/chosen": -1.8730331659317017, + "logits/rejected": -1.5008450746536255, + "logps/chosen": -348.73748779296875, + "logps/rejected": -431.8541564941406, + "loss": 0.5453, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0170891284942627, + "rewards/margins": 0.7871019244194031, + "rewards/rejected": -1.804190993309021, + "step": 1320 + }, + { + "epoch": 0.26, + "learning_rate": 4.6155072616816515e-06, + "logits/chosen": -1.6007864475250244, + "logits/rejected": -1.7298269271850586, + "logps/chosen": -354.00042724609375, + "logps/rejected": -439.82025146484375, + "loss": 0.5378, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1786034107208252, + "rewards/margins": 0.829128623008728, + "rewards/rejected": -2.0077319145202637, + "step": 1330 + }, + { + "epoch": 0.26, + "learning_rate": 4.606327898260413e-06, + "logits/chosen": -1.7862968444824219, + "logits/rejected": -1.6234447956085205, + "logps/chosen": -358.39483642578125, + "logps/rejected": -438.0057678222656, + "loss": 0.5152, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1921870708465576, + "rewards/margins": 1.013359785079956, + "rewards/rejected": -2.2055468559265137, + "step": 1340 + }, + { + "epoch": 0.26, + "learning_rate": 4.597049603290491e-06, + "logits/chosen": -1.755611777305603, + "logits/rejected": -1.4681942462921143, + "logps/chosen": -490.5035095214844, + "logps/rejected": -585.6339111328125, + "loss": 0.4694, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7060209512710571, + "rewards/margins": 1.0946344137191772, + "rewards/rejected": -2.8006553649902344, + "step": 1350 + }, + { + "epoch": 0.27, + "learning_rate": 4.587672812561626e-06, + "logits/chosen": -1.5236080884933472, + "logits/rejected": -1.5720051527023315, + "logps/chosen": -467.0594787597656, + "logps/rejected": -558.3744506835938, + "loss": 0.5319, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.960383653640747, + "rewards/margins": 0.865696907043457, + "rewards/rejected": -2.826080322265625, + "step": 1360 + }, + { + "epoch": 0.27, + "learning_rate": 4.578197966489782e-06, + "logits/chosen": -1.61127507686615, + "logits/rejected": -1.5698819160461426, + "logps/chosen": -402.5616149902344, + "logps/rejected": -409.2900390625, + "loss": 0.5398, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4020943641662598, + "rewards/margins": 0.8227679133415222, + "rewards/rejected": -2.2248623371124268, + "step": 1370 + }, + { + "epoch": 0.27, + "learning_rate": 4.5686255100964535e-06, + "logits/chosen": -1.72307550907135, + "logits/rejected": -1.580277442932129, + "logps/chosen": -336.98004150390625, + "logps/rejected": -388.698486328125, + "loss": 0.5668, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2258248329162598, + "rewards/margins": 0.6155509948730469, + "rewards/rejected": -1.8413759469985962, + "step": 1380 + }, + { + "epoch": 0.27, + "learning_rate": 4.558955892987774e-06, + "logits/chosen": -1.59917414188385, + "logits/rejected": -1.4982532262802124, + "logps/chosen": -340.16107177734375, + "logps/rejected": -405.5443420410156, + "loss": 0.4999, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.325260877609253, + "rewards/margins": 0.938653826713562, + "rewards/rejected": -2.2639145851135254, + "step": 1390 + }, + { + "epoch": 0.27, + "learning_rate": 4.549189569333387e-06, + "logits/chosen": -1.7914392948150635, + "logits/rejected": -1.4878028631210327, + "logps/chosen": -396.49798583984375, + "logps/rejected": -399.7130432128906, + "loss": 0.5286, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.035138726234436, + "rewards/margins": 0.9291532635688782, + "rewards/rejected": -1.9642921686172485, + "step": 1400 + }, + { + "epoch": 0.27, + "eval_logits/chosen": 0.8243531584739685, + "eval_logits/rejected": 1.1978684663772583, + "eval_logps/chosen": -401.0144958496094, + "eval_logps/rejected": -447.237060546875, + "eval_loss": 0.5193303227424622, + "eval_rewards/accuracies": 0.7038690447807312, + "eval_rewards/chosen": -1.3777379989624023, + "eval_rewards/margins": 0.7634496092796326, + "eval_rewards/rejected": -2.1411876678466797, + "eval_runtime": 469.87, + "eval_samples_per_second": 4.256, + "eval_steps_per_second": 0.179, + "step": 1400 + }, + { + "epoch": 0.28, + "learning_rate": 4.539326997845124e-06, + "logits/chosen": -1.8032891750335693, + "logits/rejected": -1.5840117931365967, + "logps/chosen": -414.5916442871094, + "logps/rejected": -584.2218627929688, + "loss": 0.4502, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7965669631958008, + "rewards/margins": 1.4432768821716309, + "rewards/rejected": -3.2398440837860107, + "step": 1410 + }, + { + "epoch": 0.28, + "learning_rate": 4.529368641755453e-06, + "logits/chosen": -1.791632890701294, + "logits/rejected": -1.1489166021347046, + "logps/chosen": -516.554443359375, + "logps/rejected": -596.7518920898438, + "loss": 0.5538, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.201895236968994, + "rewards/margins": 1.2632453441619873, + "rewards/rejected": -3.4651405811309814, + "step": 1420 + }, + { + "epoch": 0.28, + "learning_rate": 4.519314968795722e-06, + "logits/chosen": -1.3863189220428467, + "logits/rejected": -1.6445220708847046, + "logps/chosen": -467.303466796875, + "logps/rejected": -603.4534301757812, + "loss": 0.6027, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.094290018081665, + "rewards/margins": 0.9386318325996399, + "rewards/rejected": -3.03292179107666, + "step": 1430 + }, + { + "epoch": 0.28, + "learning_rate": 4.509166451174194e-06, + "logits/chosen": -1.7560551166534424, + "logits/rejected": -1.5599005222320557, + "logps/chosen": -516.143310546875, + "logps/rejected": -626.3974609375, + "loss": 0.4954, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.342982053756714, + "rewards/margins": 1.2926456928253174, + "rewards/rejected": -3.6356277465820312, + "step": 1440 + }, + { + "epoch": 0.28, + "learning_rate": 4.498923565553866e-06, + "logits/chosen": -1.6948440074920654, + "logits/rejected": -1.644818663597107, + "logps/chosen": -414.959716796875, + "logps/rejected": -533.6055908203125, + "loss": 0.5304, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.96126389503479, + "rewards/margins": 0.956006646156311, + "rewards/rejected": -2.9172706604003906, + "step": 1450 + }, + { + "epoch": 0.29, + "learning_rate": 4.488586793030075e-06, + "logits/chosen": -1.698326826095581, + "logits/rejected": -1.4736413955688477, + "logps/chosen": -370.10833740234375, + "logps/rejected": -512.392333984375, + "loss": 0.4624, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6409461498260498, + "rewards/margins": 1.054914116859436, + "rewards/rejected": -2.6958603858947754, + "step": 1460 + }, + { + "epoch": 0.29, + "learning_rate": 4.478156619107912e-06, + "logits/chosen": -1.7624685764312744, + "logits/rejected": -1.3393685817718506, + "logps/chosen": -386.123046875, + "logps/rejected": -550.3563232421875, + "loss": 0.4852, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5561819076538086, + "rewards/margins": 1.3703248500823975, + "rewards/rejected": -2.926506519317627, + "step": 1470 + }, + { + "epoch": 0.29, + "learning_rate": 4.4676335336794125e-06, + "logits/chosen": -2.015625238418579, + "logits/rejected": -1.9299089908599854, + "logps/chosen": -498.6893615722656, + "logps/rejected": -575.2039184570312, + "loss": 0.615, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8964248895645142, + "rewards/margins": 0.6595728397369385, + "rewards/rejected": -2.555997848510742, + "step": 1480 + }, + { + "epoch": 0.29, + "learning_rate": 4.457018031000544e-06, + "logits/chosen": -1.7110872268676758, + "logits/rejected": -1.4807096719741821, + "logps/chosen": -418.84381103515625, + "logps/rejected": -466.87628173828125, + "loss": 0.4524, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6023814678192139, + "rewards/margins": 0.7802639007568359, + "rewards/rejected": -2.38264536857605, + "step": 1490 + }, + { + "epoch": 0.29, + "learning_rate": 4.446310609668001e-06, + "logits/chosen": -1.6997228860855103, + "logits/rejected": -1.5844206809997559, + "logps/chosen": -328.55035400390625, + "logps/rejected": -412.58380126953125, + "loss": 0.6095, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.467379093170166, + "rewards/margins": 0.6344095468521118, + "rewards/rejected": -2.1017885208129883, + "step": 1500 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -0.023842444643378258, + "eval_logits/rejected": 0.35978996753692627, + "eval_logps/chosen": -380.54217529296875, + "eval_logps/rejected": -421.9496765136719, + "eval_loss": 0.5205935835838318, + "eval_rewards/accuracies": 0.7008928656578064, + "eval_rewards/chosen": -1.1730148792266846, + "eval_rewards/margins": 0.7152983546257019, + "eval_rewards/rejected": -1.8883132934570312, + "eval_runtime": 472.7173, + "eval_samples_per_second": 4.231, + "eval_steps_per_second": 0.178, + "step": 1500 + }, + { + "epoch": 0.3, + "learning_rate": 4.435511772595773e-06, + "logits/chosen": -1.9284782409667969, + "logits/rejected": -1.6309083700180054, + "logps/chosen": -413.587158203125, + "logps/rejected": -419.9436950683594, + "loss": 0.5097, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2852447032928467, + "rewards/margins": 0.7189093232154846, + "rewards/rejected": -2.0041542053222656, + "step": 1510 + }, + { + "epoch": 0.3, + "learning_rate": 4.424622026991536e-06, + "logits/chosen": -1.9414260387420654, + "logits/rejected": -1.683161973953247, + "logps/chosen": -374.2628173828125, + "logps/rejected": -393.2518615722656, + "loss": 0.5011, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1332727670669556, + "rewards/margins": 0.7838243246078491, + "rewards/rejected": -1.9170968532562256, + "step": 1520 + }, + { + "epoch": 0.3, + "learning_rate": 4.413641884332825e-06, + "logits/chosen": -1.6223335266113281, + "logits/rejected": -1.4534587860107422, + "logps/chosen": -372.34564208984375, + "logps/rejected": -485.110107421875, + "loss": 0.4454, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1431056261062622, + "rewards/margins": 1.2465258836746216, + "rewards/rejected": -2.389631748199463, + "step": 1530 + }, + { + "epoch": 0.3, + "learning_rate": 4.402571860343006e-06, + "logits/chosen": -1.517801284790039, + "logits/rejected": -1.4638478755950928, + "logps/chosen": -393.1158447265625, + "logps/rejected": -419.7225036621094, + "loss": 0.5361, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1784864664077759, + "rewards/margins": 0.9864957928657532, + "rewards/rejected": -2.164982318878174, + "step": 1540 + }, + { + "epoch": 0.3, + "learning_rate": 4.39141247496706e-06, + "logits/chosen": -1.835381269454956, + "logits/rejected": -1.4641658067703247, + "logps/chosen": -422.1026916503906, + "logps/rejected": -509.72357177734375, + "loss": 0.5483, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7700493335723877, + "rewards/margins": 0.6548973321914673, + "rewards/rejected": -2.4249467849731445, + "step": 1550 + }, + { + "epoch": 0.31, + "learning_rate": 4.3801642523471585e-06, + "logits/chosen": -1.53899347782135, + "logits/rejected": -1.6525852680206299, + "logps/chosen": -452.6863708496094, + "logps/rejected": -562.87109375, + "loss": 0.481, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9491571187973022, + "rewards/margins": 1.0457651615142822, + "rewards/rejected": -2.994922161102295, + "step": 1560 + }, + { + "epoch": 0.31, + "learning_rate": 4.368827720798044e-06, + "logits/chosen": -1.6177898645401, + "logits/rejected": -1.3273764848709106, + "logps/chosen": -503.0707092285156, + "logps/rejected": -591.0535888671875, + "loss": 0.5171, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2399299144744873, + "rewards/margins": 1.0294549465179443, + "rewards/rejected": -3.2693848609924316, + "step": 1570 + }, + { + "epoch": 0.31, + "learning_rate": 4.35740341278222e-06, + "logits/chosen": -1.937767744064331, + "logits/rejected": -1.4204972982406616, + "logps/chosen": -516.6253662109375, + "logps/rejected": -531.3732299804688, + "loss": 0.4762, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.137434244155884, + "rewards/margins": 1.3851698637008667, + "rewards/rejected": -3.522603988647461, + "step": 1580 + }, + { + "epoch": 0.31, + "learning_rate": 4.345891864884937e-06, + "logits/chosen": -1.4484498500823975, + "logits/rejected": -1.2074967622756958, + "logps/chosen": -376.8973388671875, + "logps/rejected": -465.946044921875, + "loss": 0.4636, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6752738952636719, + "rewards/margins": 1.1741887331008911, + "rewards/rejected": -2.8494625091552734, + "step": 1590 + }, + { + "epoch": 0.31, + "learning_rate": 4.334293617788992e-06, + "logits/chosen": -1.7261905670166016, + "logits/rejected": -1.6486454010009766, + "logps/chosen": -483.2078552246094, + "logps/rejected": -598.8380126953125, + "loss": 0.5627, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0185751914978027, + "rewards/margins": 1.0393562316894531, + "rewards/rejected": -3.057931423187256, + "step": 1600 + }, + { + "epoch": 0.31, + "eval_logits/chosen": 0.41467130184173584, + "eval_logits/rejected": 0.7394571304321289, + "eval_logps/chosen": -451.3461608886719, + "eval_logps/rejected": -510.4462890625, + "eval_loss": 0.5224563479423523, + "eval_rewards/accuracies": 0.6934523582458496, + "eval_rewards/chosen": -1.8810548782348633, + "eval_rewards/margins": 0.892224907875061, + "eval_rewards/rejected": -2.773279905319214, + "eval_runtime": 468.6159, + "eval_samples_per_second": 4.268, + "eval_steps_per_second": 0.179, + "step": 1600 + }, + { + "epoch": 0.32, + "learning_rate": 4.322609216249336e-06, + "logits/chosen": -1.4203035831451416, + "logits/rejected": -1.3660808801651, + "logps/chosen": -459.4410095214844, + "logps/rejected": -574.9068603515625, + "loss": 0.5023, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7188522815704346, + "rewards/margins": 1.460274577140808, + "rewards/rejected": -3.179126739501953, + "step": 1610 + }, + { + "epoch": 0.32, + "learning_rate": 4.310839209067482e-06, + "logits/chosen": -1.4383463859558105, + "logits/rejected": -1.3699233531951904, + "logps/chosen": -462.7704162597656, + "logps/rejected": -559.3999633789062, + "loss": 0.5211, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2110602855682373, + "rewards/margins": 0.9121319055557251, + "rewards/rejected": -3.123192310333252, + "step": 1620 + }, + { + "epoch": 0.32, + "learning_rate": 4.298984149065732e-06, + "logits/chosen": -1.6822255849838257, + "logits/rejected": -1.4348416328430176, + "logps/chosen": -506.04083251953125, + "logps/rejected": -583.9845581054688, + "loss": 0.5117, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2623586654663086, + "rewards/margins": 0.8611604571342468, + "rewards/rejected": -3.1235194206237793, + "step": 1630 + }, + { + "epoch": 0.32, + "learning_rate": 4.2870445930612135e-06, + "logits/chosen": -1.5166553258895874, + "logits/rejected": -1.214342713356018, + "logps/chosen": -446.37664794921875, + "logps/rejected": -529.3927612304688, + "loss": 0.3845, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0724778175354004, + "rewards/margins": 1.1939537525177002, + "rewards/rejected": -3.2664313316345215, + "step": 1640 + }, + { + "epoch": 0.32, + "learning_rate": 4.2750211018397204e-06, + "logits/chosen": -1.3522977828979492, + "logits/rejected": -1.2287527322769165, + "logps/chosen": -498.2793884277344, + "logps/rejected": -644.9459228515625, + "loss": 0.4669, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.349169969558716, + "rewards/margins": 1.3009414672851562, + "rewards/rejected": -3.650111675262451, + "step": 1650 + }, + { + "epoch": 0.33, + "learning_rate": 4.262914240129379e-06, + "logits/chosen": -1.5904858112335205, + "logits/rejected": -1.40458083152771, + "logps/chosen": -495.546630859375, + "logps/rejected": -591.2596435546875, + "loss": 0.4949, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0124459266662598, + "rewards/margins": 1.3423213958740234, + "rewards/rejected": -3.354767322540283, + "step": 1660 + }, + { + "epoch": 0.33, + "learning_rate": 4.2507245765741215e-06, + "logits/chosen": -1.656916618347168, + "logits/rejected": -1.5722547769546509, + "logps/chosen": -529.6829833984375, + "logps/rejected": -617.2437744140625, + "loss": 0.4921, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2331345081329346, + "rewards/margins": 1.2079585790634155, + "rewards/rejected": -3.4410929679870605, + "step": 1670 + }, + { + "epoch": 0.33, + "learning_rate": 4.238452683706979e-06, + "logits/chosen": -1.775490403175354, + "logits/rejected": -1.6295232772827148, + "logps/chosen": -435.9142150878906, + "logps/rejected": -568.1779174804688, + "loss": 0.5471, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8802047967910767, + "rewards/margins": 1.2522530555725098, + "rewards/rejected": -3.132458209991455, + "step": 1680 + }, + { + "epoch": 0.33, + "learning_rate": 4.226099137923186e-06, + "logits/chosen": -1.973921537399292, + "logits/rejected": -1.6960344314575195, + "logps/chosen": -485.93902587890625, + "logps/rejected": -519.7362060546875, + "loss": 0.5535, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8466163873672485, + "rewards/margins": 0.6419102549552917, + "rewards/rejected": -2.4885265827178955, + "step": 1690 + }, + { + "epoch": 0.33, + "learning_rate": 4.213664519453115e-06, + "logits/chosen": -1.9954742193222046, + "logits/rejected": -2.0354678630828857, + "logps/chosen": -381.51885986328125, + "logps/rejected": -468.69818115234375, + "loss": 0.5222, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.402010440826416, + "rewards/margins": 0.8014278411865234, + "rewards/rejected": -2.2034382820129395, + "step": 1700 + }, + { + "epoch": 0.33, + "eval_logits/chosen": -0.384397953748703, + "eval_logits/rejected": -0.06426750123500824, + "eval_logps/chosen": -382.0738830566406, + "eval_logps/rejected": -417.88531494140625, + "eval_loss": 0.5209601521492004, + "eval_rewards/accuracies": 0.7142857313156128, + "eval_rewards/chosen": -1.188332200050354, + "eval_rewards/margins": 0.6593378186225891, + "eval_rewards/rejected": -1.847670078277588, + "eval_runtime": 458.9623, + "eval_samples_per_second": 4.358, + "eval_steps_per_second": 0.183, + "step": 1700 + }, + { + "epoch": 0.34, + "learning_rate": 4.201149412335015e-06, + "logits/chosen": -2.1409153938293457, + "logits/rejected": -1.844313621520996, + "logps/chosen": -423.0439453125, + "logps/rejected": -440.78570556640625, + "loss": 0.4777, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.185486912727356, + "rewards/margins": 0.8028791546821594, + "rewards/rejected": -1.988365888595581, + "step": 1710 + }, + { + "epoch": 0.34, + "learning_rate": 4.188554404387588e-06, + "logits/chosen": -1.8138986825942993, + "logits/rejected": -1.6512314081192017, + "logps/chosen": -327.36737060546875, + "logps/rejected": -398.70184326171875, + "loss": 0.5029, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.018953800201416, + "rewards/margins": 0.9757841229438782, + "rewards/rejected": -1.994738221168518, + "step": 1720 + }, + { + "epoch": 0.34, + "learning_rate": 4.175880087182376e-06, + "logits/chosen": -1.8932291269302368, + "logits/rejected": -1.6494147777557373, + "logps/chosen": -279.64373779296875, + "logps/rejected": -345.70465087890625, + "loss": 0.5683, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9910870790481567, + "rewards/margins": 0.7539299726486206, + "rewards/rejected": -1.7450170516967773, + "step": 1730 + }, + { + "epoch": 0.34, + "learning_rate": 4.163127056015975e-06, + "logits/chosen": -2.0070395469665527, + "logits/rejected": -1.9813343286514282, + "logps/chosen": -383.7837829589844, + "logps/rejected": -468.212646484375, + "loss": 0.561, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9646474123001099, + "rewards/margins": 1.0487464666366577, + "rewards/rejected": -2.0133938789367676, + "step": 1740 + }, + { + "epoch": 0.34, + "learning_rate": 4.1502959098820774e-06, + "logits/chosen": -1.8485866785049438, + "logits/rejected": -1.7759544849395752, + "logps/chosen": -352.3072814941406, + "logps/rejected": -425.56915283203125, + "loss": 0.4764, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.904167652130127, + "rewards/margins": 1.056064486503601, + "rewards/rejected": -1.960232138633728, + "step": 1750 + }, + { + "epoch": 0.35, + "learning_rate": 4.137387251443335e-06, + "logits/chosen": -1.5388623476028442, + "logits/rejected": -1.4535671472549438, + "logps/chosen": -339.34197998046875, + "logps/rejected": -419.2979431152344, + "loss": 0.531, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3059513568878174, + "rewards/margins": 0.8590526580810547, + "rewards/rejected": -2.165003776550293, + "step": 1760 + }, + { + "epoch": 0.35, + "learning_rate": 4.124401687003057e-06, + "logits/chosen": -1.820469856262207, + "logits/rejected": -1.6758521795272827, + "logps/chosen": -412.8130798339844, + "logps/rejected": -451.430419921875, + "loss": 0.5253, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3753070831298828, + "rewards/margins": 0.783947229385376, + "rewards/rejected": -2.159254550933838, + "step": 1770 + }, + { + "epoch": 0.35, + "learning_rate": 4.111339826476725e-06, + "logits/chosen": -1.655381441116333, + "logits/rejected": -1.4131075143814087, + "logps/chosen": -486.42401123046875, + "logps/rejected": -557.5438842773438, + "loss": 0.5744, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7467525005340576, + "rewards/margins": 0.8533447980880737, + "rewards/rejected": -2.600097179412842, + "step": 1780 + }, + { + "epoch": 0.35, + "learning_rate": 4.098202283363356e-06, + "logits/chosen": -1.9893325567245483, + "logits/rejected": -1.7300984859466553, + "logps/chosen": -499.3247985839844, + "logps/rejected": -464.3089294433594, + "loss": 0.5828, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4985158443450928, + "rewards/margins": 0.48281264305114746, + "rewards/rejected": -1.9813286066055298, + "step": 1790 + }, + { + "epoch": 0.35, + "learning_rate": 4.084989674716679e-06, + "logits/chosen": -1.932824730873108, + "logits/rejected": -1.5683988332748413, + "logps/chosen": -370.2239685058594, + "logps/rejected": -427.7416076660156, + "loss": 0.5163, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3531526327133179, + "rewards/margins": 0.8490608334541321, + "rewards/rejected": -2.2022132873535156, + "step": 1800 + }, + { + "epoch": 0.35, + "eval_logits/chosen": 0.9604912996292114, + "eval_logits/rejected": 1.2999762296676636, + "eval_logps/chosen": -381.0427551269531, + "eval_logps/rejected": -430.9522399902344, + "eval_loss": 0.5219135880470276, + "eval_rewards/accuracies": 0.7247023582458496, + "eval_rewards/chosen": -1.1780204772949219, + "eval_rewards/margins": 0.8003180623054504, + "eval_rewards/rejected": -1.9783387184143066, + "eval_runtime": 466.9785, + "eval_samples_per_second": 4.283, + "eval_steps_per_second": 0.18, + "step": 1800 + }, + { + "epoch": 0.36, + "learning_rate": 4.071702621116158e-06, + "logits/chosen": -1.9070345163345337, + "logits/rejected": -1.732208490371704, + "logps/chosen": -396.03912353515625, + "logps/rejected": -419.9725646972656, + "loss": 0.6269, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3162269592285156, + "rewards/margins": 0.49873286485671997, + "rewards/rejected": -1.8149597644805908, + "step": 1810 + }, + { + "epoch": 0.36, + "learning_rate": 4.05834174663784e-06, + "logits/chosen": -1.7320168018341064, + "logits/rejected": -1.5021142959594727, + "logps/chosen": -367.4055480957031, + "logps/rejected": -415.22900390625, + "loss": 0.5183, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2845791578292847, + "rewards/margins": 0.863175094127655, + "rewards/rejected": -2.147754430770874, + "step": 1820 + }, + { + "epoch": 0.36, + "learning_rate": 4.044907678825045e-06, + "logits/chosen": -1.7646843194961548, + "logits/rejected": -1.7249200344085693, + "logps/chosen": -432.54925537109375, + "logps/rejected": -497.01348876953125, + "loss": 0.5647, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4014708995819092, + "rewards/margins": 0.8739999532699585, + "rewards/rejected": -2.2754709720611572, + "step": 1830 + }, + { + "epoch": 0.36, + "learning_rate": 4.031401048658892e-06, + "logits/chosen": -2.0431199073791504, + "logits/rejected": -1.9735429286956787, + "logps/chosen": -399.82501220703125, + "logps/rejected": -501.8202209472656, + "loss": 0.4795, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3438560962677002, + "rewards/margins": 1.3123345375061035, + "rewards/rejected": -2.656190872192383, + "step": 1840 + }, + { + "epoch": 0.36, + "learning_rate": 4.017822490528664e-06, + "logits/chosen": -1.529136061668396, + "logits/rejected": -1.4915677309036255, + "logps/chosen": -423.474365234375, + "logps/rejected": -527.9143676757812, + "loss": 0.4903, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7233842611312866, + "rewards/margins": 0.9296349287033081, + "rewards/rejected": -2.653019428253174, + "step": 1850 + }, + { + "epoch": 0.37, + "learning_rate": 4.004172642202002e-06, + "logits/chosen": -1.5431911945343018, + "logits/rejected": -1.4479010105133057, + "logps/chosen": -502.7958984375, + "logps/rejected": -612.9650268554688, + "loss": 0.54, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8878120183944702, + "rewards/margins": 1.3048069477081299, + "rewards/rejected": -3.1926190853118896, + "step": 1860 + }, + { + "epoch": 0.37, + "learning_rate": 3.990452144794966e-06, + "logits/chosen": -1.468390941619873, + "logits/rejected": -1.2413341999053955, + "logps/chosen": -449.906982421875, + "logps/rejected": -506.55914306640625, + "loss": 0.5596, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0285403728485107, + "rewards/margins": 0.9599047899246216, + "rewards/rejected": -2.9884450435638428, + "step": 1870 + }, + { + "epoch": 0.37, + "learning_rate": 3.976661642741908e-06, + "logits/chosen": -1.447554111480713, + "logits/rejected": -1.3410959243774414, + "logps/chosen": -457.1023864746094, + "logps/rejected": -557.6286010742188, + "loss": 0.4901, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9820743799209595, + "rewards/margins": 0.9573952555656433, + "rewards/rejected": -2.939469575881958, + "step": 1880 + }, + { + "epoch": 0.37, + "learning_rate": 3.96280178376521e-06, + "logits/chosen": -1.8144747018814087, + "logits/rejected": -1.8249022960662842, + "logps/chosen": -422.5291442871094, + "logps/rejected": -498.2322692871094, + "loss": 0.5081, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5139400959014893, + "rewards/margins": 0.7199869751930237, + "rewards/rejected": -2.2339272499084473, + "step": 1890 + }, + { + "epoch": 0.37, + "learning_rate": 3.948873218844863e-06, + "logits/chosen": -1.4795005321502686, + "logits/rejected": -1.4016082286834717, + "logps/chosen": -480.5079040527344, + "logps/rejected": -600.5020751953125, + "loss": 0.511, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.804473638534546, + "rewards/margins": 1.339569330215454, + "rewards/rejected": -3.144043207168579, + "step": 1900 + }, + { + "epoch": 0.37, + "eval_logits/chosen": 0.9550392627716064, + "eval_logits/rejected": 1.305156946182251, + "eval_logps/chosen": -448.5622253417969, + "eval_logps/rejected": -507.06622314453125, + "eval_loss": 0.5213829278945923, + "eval_rewards/accuracies": 0.71875, + "eval_rewards/chosen": -1.8532155752182007, + "eval_rewards/margins": 0.8862631916999817, + "eval_rewards/rejected": -2.739478588104248, + "eval_runtime": 466.4383, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.18, + "step": 1900 + }, + { + "epoch": 0.37, + "learning_rate": 3.934876602187886e-06, + "logits/chosen": -1.4697811603546143, + "logits/rejected": -1.375469446182251, + "logps/chosen": -453.0664978027344, + "logps/rejected": -478.17547607421875, + "loss": 0.5752, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7024272680282593, + "rewards/margins": 0.9104480743408203, + "rewards/rejected": -2.612875461578369, + "step": 1910 + }, + { + "epoch": 0.38, + "learning_rate": 3.920812591197604e-06, + "logits/chosen": -1.5655367374420166, + "logits/rejected": -1.5533018112182617, + "logps/chosen": -464.52154541015625, + "logps/rejected": -574.5903930664062, + "loss": 0.5062, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1701035499572754, + "rewards/margins": 0.9189977645874023, + "rewards/rejected": -3.0891010761260986, + "step": 1920 + }, + { + "epoch": 0.38, + "learning_rate": 3.906681846442768e-06, + "logits/chosen": -1.3456695079803467, + "logits/rejected": -1.4312808513641357, + "logps/chosen": -495.2659606933594, + "logps/rejected": -612.2733154296875, + "loss": 0.4389, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6250083446502686, + "rewards/margins": 1.3098533153533936, + "rewards/rejected": -3.934861421585083, + "step": 1930 + }, + { + "epoch": 0.38, + "learning_rate": 3.892485031626527e-06, + "logits/chosen": -1.3476498126983643, + "logits/rejected": -1.0824012756347656, + "logps/chosen": -602.011474609375, + "logps/rejected": -693.4718017578125, + "loss": 0.4966, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9910778999328613, + "rewards/margins": 1.296937346458435, + "rewards/rejected": -4.2880144119262695, + "step": 1940 + }, + { + "epoch": 0.38, + "learning_rate": 3.8782228135552615e-06, + "logits/chosen": -1.182939052581787, + "logits/rejected": -1.1389211416244507, + "logps/chosen": -505.2765197753906, + "logps/rejected": -689.6798095703125, + "loss": 0.5595, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.8137130737304688, + "rewards/margins": 1.3322536945343018, + "rewards/rejected": -4.145966529846191, + "step": 1950 + }, + { + "epoch": 0.38, + "learning_rate": 3.863895862107255e-06, + "logits/chosen": -1.6203094720840454, + "logits/rejected": -1.1558005809783936, + "logps/chosen": -561.7947387695312, + "logps/rejected": -610.0185546875, + "loss": 0.4369, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.4451165199279785, + "rewards/margins": 1.4709765911102295, + "rewards/rejected": -3.91609263420105, + "step": 1960 + }, + { + "epoch": 0.39, + "learning_rate": 3.849504850201238e-06, + "logits/chosen": -1.3694090843200684, + "logits/rejected": -1.1556737422943115, + "logps/chosen": -539.959228515625, + "logps/rejected": -617.424072265625, + "loss": 0.4763, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7902231216430664, + "rewards/margins": 0.9595049023628235, + "rewards/rejected": -3.749727964401245, + "step": 1970 + }, + { + "epoch": 0.39, + "learning_rate": 3.835050453764779e-06, + "logits/chosen": -1.4200727939605713, + "logits/rejected": -1.1518549919128418, + "logps/chosen": -501.21063232421875, + "logps/rejected": -642.7298583984375, + "loss": 0.582, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.5669314861297607, + "rewards/margins": 1.3281501531600952, + "rewards/rejected": -3.8950817584991455, + "step": 1980 + }, + { + "epoch": 0.39, + "learning_rate": 3.820533351702538e-06, + "logits/chosen": -1.4720011949539185, + "logits/rejected": -1.189164161682129, + "logps/chosen": -465.8736877441406, + "logps/rejected": -565.3887329101562, + "loss": 0.5744, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1700820922851562, + "rewards/margins": 1.2808657884597778, + "rewards/rejected": -3.4509479999542236, + "step": 1990 + }, + { + "epoch": 0.39, + "learning_rate": 3.80595422586438e-06, + "logits/chosen": -1.685773491859436, + "logits/rejected": -1.3915890455245972, + "logps/chosen": -437.60894775390625, + "logps/rejected": -498.50390625, + "loss": 0.484, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.956373929977417, + "rewards/margins": 0.9637616872787476, + "rewards/rejected": -2.920135736465454, + "step": 2000 + }, + { + "epoch": 0.39, + "eval_logits/chosen": 1.3132091760635376, + "eval_logits/rejected": 1.6338907480239868, + "eval_logps/chosen": -441.24273681640625, + "eval_logps/rejected": -494.93695068359375, + "eval_loss": 0.5161046981811523, + "eval_rewards/accuracies": 0.71875, + "eval_rewards/chosen": -1.7800202369689941, + "eval_rewards/margins": 0.8381654024124146, + "eval_rewards/rejected": -2.6181857585906982, + "eval_runtime": 461.8655, + "eval_samples_per_second": 4.33, + "eval_steps_per_second": 0.182, + "step": 2000 + }, + { + "epoch": 0.39, + "learning_rate": 3.791313761013343e-06, + "logits/chosen": -1.5194200277328491, + "logits/rejected": -1.4752867221832275, + "logps/chosen": -414.8877868652344, + "logps/rejected": -529.7425537109375, + "loss": 0.392, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.467829942703247, + "rewards/margins": 1.1718862056732178, + "rewards/rejected": -2.639716386795044, + "step": 2010 + }, + { + "epoch": 0.4, + "learning_rate": 3.7766126447934857e-06, + "logits/chosen": -1.9778658151626587, + "logits/rejected": -1.9723155498504639, + "logps/chosen": -564.9736938476562, + "logps/rejected": -619.1234130859375, + "loss": 0.4866, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.397613525390625, + "rewards/margins": 0.49791449308395386, + "rewards/rejected": -2.8955278396606445, + "step": 2020 + }, + { + "epoch": 0.4, + "learning_rate": 3.761851567697583e-06, + "logits/chosen": -1.4296815395355225, + "logits/rejected": -1.3542410135269165, + "logps/chosen": -450.0517578125, + "logps/rejected": -552.33935546875, + "loss": 0.5166, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0276377201080322, + "rewards/margins": 1.0687463283538818, + "rewards/rejected": -3.096384048461914, + "step": 2030 + }, + { + "epoch": 0.4, + "learning_rate": 3.7470312230346955e-06, + "logits/chosen": -1.493628978729248, + "logits/rejected": -1.6857808828353882, + "logps/chosen": -440.001708984375, + "logps/rejected": -583.5567626953125, + "loss": 0.5048, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.050156354904175, + "rewards/margins": 1.209169626235962, + "rewards/rejected": -3.2593257427215576, + "step": 2040 + }, + { + "epoch": 0.4, + "learning_rate": 3.7321523068976068e-06, + "logits/chosen": -1.536556363105774, + "logits/rejected": -1.583799123764038, + "logps/chosen": -450.343505859375, + "logps/rejected": -600.0687255859375, + "loss": 0.4018, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8308045864105225, + "rewards/margins": 1.2747979164123535, + "rewards/rejected": -3.105602741241455, + "step": 2050 + }, + { + "epoch": 0.4, + "learning_rate": 3.717215518130127e-06, + "logits/chosen": -1.6683518886566162, + "logits/rejected": -1.5575300455093384, + "logps/chosen": -513.941650390625, + "logps/rejected": -543.3946533203125, + "loss": 0.5745, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.285006046295166, + "rewards/margins": 0.6656274199485779, + "rewards/rejected": -2.9506335258483887, + "step": 2060 + }, + { + "epoch": 0.41, + "learning_rate": 3.702221558294274e-06, + "logits/chosen": -1.1688969135284424, + "logits/rejected": -0.8642789125442505, + "logps/chosen": -405.437255859375, + "logps/rejected": -475.583740234375, + "loss": 0.5127, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.129958152770996, + "rewards/margins": 1.0819867849349976, + "rewards/rejected": -3.211945056915283, + "step": 2070 + }, + { + "epoch": 0.41, + "learning_rate": 3.687171131637314e-06, + "logits/chosen": -1.8774950504302979, + "logits/rejected": -1.746014952659607, + "logps/chosen": -484.9676818847656, + "logps/rejected": -528.9475708007812, + "loss": 0.4464, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9119985103607178, + "rewards/margins": 0.8228870630264282, + "rewards/rejected": -2.7348859310150146, + "step": 2080 + }, + { + "epoch": 0.41, + "learning_rate": 3.6720649450586885e-06, + "logits/chosen": -1.770233154296875, + "logits/rejected": -1.5432703495025635, + "logps/chosen": -523.8516845703125, + "logps/rejected": -624.4163818359375, + "loss": 0.462, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.295456886291504, + "rewards/margins": 1.4275439977645874, + "rewards/rejected": -3.7230007648468018, + "step": 2090 + }, + { + "epoch": 0.41, + "learning_rate": 3.6569037080768153e-06, + "logits/chosen": -1.7754666805267334, + "logits/rejected": -1.4459218978881836, + "logps/chosen": -549.3494873046875, + "logps/rejected": -605.2619018554688, + "loss": 0.4863, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.489126205444336, + "rewards/margins": 1.4342973232269287, + "rewards/rejected": -3.9234230518341064, + "step": 2100 + }, + { + "epoch": 0.41, + "eval_logits/chosen": 2.0461196899414062, + "eval_logits/rejected": 2.342752456665039, + "eval_logps/chosen": -541.5034790039062, + "eval_logps/rejected": -617.3856811523438, + "eval_loss": 0.5182604789733887, + "eval_rewards/accuracies": 0.7157738208770752, + "eval_rewards/chosen": -2.782627820968628, + "eval_rewards/margins": 1.0600451231002808, + "eval_rewards/rejected": -3.8426730632781982, + "eval_runtime": 471.2755, + "eval_samples_per_second": 4.244, + "eval_steps_per_second": 0.178, + "step": 2100 + }, + { + "epoch": 0.41, + "learning_rate": 3.641688132795757e-06, + "logits/chosen": -1.4198346138000488, + "logits/rejected": -1.179573655128479, + "logps/chosen": -549.3745727539062, + "logps/rejected": -682.312744140625, + "loss": 0.4812, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.051333427429199, + "rewards/margins": 1.439754843711853, + "rewards/rejected": -4.491088390350342, + "step": 2110 + }, + { + "epoch": 0.42, + "learning_rate": 3.6264189338717766e-06, + "logits/chosen": -1.679993987083435, + "logits/rejected": -1.485656499862671, + "logps/chosen": -512.0293579101562, + "logps/rejected": -534.0633544921875, + "loss": 0.5092, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.6621744632720947, + "rewards/margins": 0.41850295662879944, + "rewards/rejected": -3.0806775093078613, + "step": 2120 + }, + { + "epoch": 0.42, + "learning_rate": 3.611096828479773e-06, + "logits/chosen": -1.690553069114685, + "logits/rejected": -1.6504080295562744, + "logps/chosen": -460.89923095703125, + "logps/rejected": -589.2086791992188, + "loss": 0.503, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.381263256072998, + "rewards/margins": 1.3044815063476562, + "rewards/rejected": -3.6857447624206543, + "step": 2130 + }, + { + "epoch": 0.42, + "learning_rate": 3.595722536279595e-06, + "logits/chosen": -1.7249847650527954, + "logits/rejected": -1.7725975513458252, + "logps/chosen": -478.90478515625, + "logps/rejected": -575.4903564453125, + "loss": 0.5849, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.228055953979492, + "rewards/margins": 1.0864596366882324, + "rewards/rejected": -3.3145155906677246, + "step": 2140 + }, + { + "epoch": 0.42, + "learning_rate": 3.5802967793822386e-06, + "logits/chosen": -1.5533924102783203, + "logits/rejected": -1.4917786121368408, + "logps/chosen": -496.40606689453125, + "logps/rejected": -567.6104736328125, + "loss": 0.454, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.6233153343200684, + "rewards/margins": 1.0902141332626343, + "rewards/rejected": -3.7135300636291504, + "step": 2150 + }, + { + "epoch": 0.42, + "learning_rate": 3.5648202823159317e-06, + "logits/chosen": -1.7521919012069702, + "logits/rejected": -1.4811947345733643, + "logps/chosen": -518.8137817382812, + "logps/rejected": -560.3478393554688, + "loss": 0.4623, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.16514253616333, + "rewards/margins": 0.9891520738601685, + "rewards/rejected": -3.154294490814209, + "step": 2160 + }, + { + "epoch": 0.43, + "learning_rate": 3.549293771992104e-06, + "logits/chosen": -1.439638376235962, + "logits/rejected": -1.3014863729476929, + "logps/chosen": -445.81585693359375, + "logps/rejected": -580.5036010742188, + "loss": 0.5646, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1650853157043457, + "rewards/margins": 0.892579197883606, + "rewards/rejected": -3.057664394378662, + "step": 2170 + }, + { + "epoch": 0.43, + "learning_rate": 3.5337179776712427e-06, + "logits/chosen": -1.9127401113510132, + "logits/rejected": -1.8155453205108643, + "logps/chosen": -468.47650146484375, + "logps/rejected": -502.29949951171875, + "loss": 0.5507, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7308883666992188, + "rewards/margins": 0.747651219367981, + "rewards/rejected": -2.478539228439331, + "step": 2180 + }, + { + "epoch": 0.43, + "learning_rate": 3.5180936309286444e-06, + "logits/chosen": -1.9631290435791016, + "logits/rejected": -1.778114676475525, + "logps/chosen": -451.0347595214844, + "logps/rejected": -454.935546875, + "loss": 0.4834, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4917702674865723, + "rewards/margins": 0.7677947878837585, + "rewards/rejected": -2.2595651149749756, + "step": 2190 + }, + { + "epoch": 0.43, + "learning_rate": 3.5024214656200497e-06, + "logits/chosen": -1.657544493675232, + "logits/rejected": -1.5524613857269287, + "logps/chosen": -398.22076416015625, + "logps/rejected": -510.81866455078125, + "loss": 0.5233, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5267362594604492, + "rewards/margins": 1.0521165132522583, + "rewards/rejected": -2.578852653503418, + "step": 2200 + }, + { + "epoch": 0.43, + "eval_logits/chosen": 0.5627753734588623, + "eval_logits/rejected": 0.9790993928909302, + "eval_logps/chosen": -440.25799560546875, + "eval_logps/rejected": -494.96429443359375, + "eval_loss": 0.5114842653274536, + "eval_rewards/accuracies": 0.7172619104385376, + "eval_rewards/chosen": -1.7701728343963623, + "eval_rewards/margins": 0.8482868671417236, + "eval_rewards/rejected": -2.618459701538086, + "eval_runtime": 471.2173, + "eval_samples_per_second": 4.244, + "eval_steps_per_second": 0.178, + "step": 2200 + }, + { + "epoch": 0.43, + "learning_rate": 3.4867022178471764e-06, + "logits/chosen": -1.8048145771026611, + "logits/rejected": -1.6141141653060913, + "logps/chosen": -382.29986572265625, + "logps/rejected": -459.41729736328125, + "loss": 0.5518, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8070600032806396, + "rewards/margins": 1.0074267387390137, + "rewards/rejected": -2.8144869804382324, + "step": 2210 + }, + { + "epoch": 0.44, + "learning_rate": 3.4709366259231468e-06, + "logits/chosen": -1.8121106624603271, + "logits/rejected": -1.7930854558944702, + "logps/chosen": -457.7802734375, + "logps/rejected": -583.01708984375, + "loss": 0.5169, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9206135272979736, + "rewards/margins": 0.919543445110321, + "rewards/rejected": -2.8401570320129395, + "step": 2220 + }, + { + "epoch": 0.44, + "learning_rate": 3.455125430337809e-06, + "logits/chosen": -1.637721300125122, + "logits/rejected": -1.5897992849349976, + "logps/chosen": -431.33209228515625, + "logps/rejected": -502.7018127441406, + "loss": 0.6024, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1560745239257812, + "rewards/margins": 0.6406152844429016, + "rewards/rejected": -2.796689987182617, + "step": 2230 + }, + { + "epoch": 0.44, + "learning_rate": 3.439269373722957e-06, + "logits/chosen": -1.791049599647522, + "logits/rejected": -1.5320322513580322, + "logps/chosen": -456.51739501953125, + "logps/rejected": -544.6226196289062, + "loss": 0.5001, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.033076763153076, + "rewards/margins": 0.6280455589294434, + "rewards/rejected": -2.6611223220825195, + "step": 2240 + }, + { + "epoch": 0.44, + "learning_rate": 3.4233692008174497e-06, + "logits/chosen": -1.7971560955047607, + "logits/rejected": -1.3178821802139282, + "logps/chosen": -524.2515869140625, + "logps/rejected": -556.8560791015625, + "loss": 0.5692, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.5150840282440186, + "rewards/margins": 0.8231359720230103, + "rewards/rejected": -3.3382201194763184, + "step": 2250 + }, + { + "epoch": 0.44, + "learning_rate": 3.4074256584322336e-06, + "logits/chosen": -1.49863600730896, + "logits/rejected": -1.141994595527649, + "logps/chosen": -595.3875122070312, + "logps/rejected": -646.2301025390625, + "loss": 0.55, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.773069381713867, + "rewards/margins": 0.7329319715499878, + "rewards/rejected": -3.5060012340545654, + "step": 2260 + }, + { + "epoch": 0.45, + "learning_rate": 3.3914394954152635e-06, + "logits/chosen": -1.5777288675308228, + "logits/rejected": -1.4726307392120361, + "logps/chosen": -527.77099609375, + "logps/rejected": -569.65625, + "loss": 0.5508, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.4085521697998047, + "rewards/margins": 0.8079524040222168, + "rewards/rejected": -3.2165043354034424, + "step": 2270 + }, + { + "epoch": 0.45, + "learning_rate": 3.375411462616332e-06, + "logits/chosen": -1.5236730575561523, + "logits/rejected": -1.6202980279922485, + "logps/chosen": -460.6907653808594, + "logps/rejected": -555.6349487304688, + "loss": 0.5557, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1377615928649902, + "rewards/margins": 0.86217200756073, + "rewards/rejected": -2.9999334812164307, + "step": 2280 + }, + { + "epoch": 0.45, + "learning_rate": 3.3593423128518017e-06, + "logits/chosen": -1.9396699666976929, + "logits/rejected": -1.71761155128479, + "logps/chosen": -395.2947082519531, + "logps/rejected": -500.8861389160156, + "loss": 0.4944, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.609452486038208, + "rewards/margins": 1.191273808479309, + "rewards/rejected": -2.8007264137268066, + "step": 2290 + }, + { + "epoch": 0.45, + "learning_rate": 3.343232800869247e-06, + "logits/chosen": -1.7285282611846924, + "logits/rejected": -1.6788495779037476, + "logps/chosen": -368.57952880859375, + "logps/rejected": -448.85833740234375, + "loss": 0.5343, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4113600254058838, + "rewards/margins": 1.0198975801467896, + "rewards/rejected": -2.431257724761963, + "step": 2300 + }, + { + "epoch": 0.45, + "eval_logits/chosen": 0.5468556880950928, + "eval_logits/rejected": 1.0255200862884521, + "eval_logps/chosen": -406.37005615234375, + "eval_logps/rejected": -455.22125244140625, + "eval_loss": 0.507876455783844, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": -1.4312934875488281, + "eval_rewards/margins": 0.7897354960441589, + "eval_rewards/rejected": -2.2210288047790527, + "eval_runtime": 469.5669, + "eval_samples_per_second": 4.259, + "eval_steps_per_second": 0.179, + "step": 2300 + }, + { + "epoch": 0.45, + "learning_rate": 3.3270836833120047e-06, + "logits/chosen": -1.9568431377410889, + "logits/rejected": -1.5243427753448486, + "logps/chosen": -324.5955810546875, + "logps/rejected": -381.22576904296875, + "loss": 0.5557, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.318694829940796, + "rewards/margins": 0.8176606893539429, + "rewards/rejected": -2.1363556385040283, + "step": 2310 + }, + { + "epoch": 0.46, + "learning_rate": 3.310895718683635e-06, + "logits/chosen": -2.0503430366516113, + "logits/rejected": -1.8878549337387085, + "logps/chosen": -412.74017333984375, + "logps/rejected": -501.390625, + "loss": 0.5432, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5697447061538696, + "rewards/margins": 0.8855142593383789, + "rewards/rejected": -2.455258846282959, + "step": 2320 + }, + { + "epoch": 0.46, + "learning_rate": 3.2946696673122953e-06, + "logits/chosen": -1.760880708694458, + "logits/rejected": -1.8217281103134155, + "logps/chosen": -435.4695739746094, + "logps/rejected": -527.7000732421875, + "loss": 0.4251, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.736303687095642, + "rewards/margins": 1.1922600269317627, + "rewards/rejected": -2.9285635948181152, + "step": 2330 + }, + { + "epoch": 0.46, + "learning_rate": 3.27840629131503e-06, + "logits/chosen": -1.9441564083099365, + "logits/rejected": -1.6510957479476929, + "logps/chosen": -512.7154541015625, + "logps/rejected": -557.56005859375, + "loss": 0.5342, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9222028255462646, + "rewards/margins": 1.151485562324524, + "rewards/rejected": -3.073688268661499, + "step": 2340 + }, + { + "epoch": 0.46, + "learning_rate": 3.2621063545619734e-06, + "logits/chosen": -1.6034061908721924, + "logits/rejected": -1.4517968893051147, + "logps/chosen": -548.5872192382812, + "logps/rejected": -585.1336669921875, + "loss": 0.5077, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.5587801933288574, + "rewards/margins": 0.6855789422988892, + "rewards/rejected": -3.244358777999878, + "step": 2350 + }, + { + "epoch": 0.46, + "learning_rate": 3.2457706226404715e-06, + "logits/chosen": -1.538956880569458, + "logits/rejected": -1.595827579498291, + "logps/chosen": -428.7093811035156, + "logps/rejected": -501.70111083984375, + "loss": 0.5667, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.207473039627075, + "rewards/margins": 0.5854024887084961, + "rewards/rejected": -2.792875051498413, + "step": 2360 + }, + { + "epoch": 0.47, + "learning_rate": 3.2293998628191246e-06, + "logits/chosen": -1.7881847620010376, + "logits/rejected": -1.5611097812652588, + "logps/chosen": -468.3409118652344, + "logps/rejected": -515.4591064453125, + "loss": 0.5295, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.007239818572998, + "rewards/margins": 0.803979754447937, + "rewards/rejected": -2.8112196922302246, + "step": 2370 + }, + { + "epoch": 0.47, + "learning_rate": 3.2129948440117487e-06, + "logits/chosen": -1.548412561416626, + "logits/rejected": -1.2934997081756592, + "logps/chosen": -426.29937744140625, + "logps/rejected": -491.78662109375, + "loss": 0.4824, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7749338150024414, + "rewards/margins": 0.7233313918113708, + "rewards/rejected": -2.498265027999878, + "step": 2380 + }, + { + "epoch": 0.47, + "learning_rate": 3.196556336741261e-06, + "logits/chosen": -1.6830675601959229, + "logits/rejected": -1.5950696468353271, + "logps/chosen": -488.90960693359375, + "logps/rejected": -620.7691650390625, + "loss": 0.4662, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2164549827575684, + "rewards/margins": 1.2778276205062866, + "rewards/rejected": -3.4942824840545654, + "step": 2390 + }, + { + "epoch": 0.47, + "learning_rate": 3.1800851131034904e-06, + "logits/chosen": -1.626853346824646, + "logits/rejected": -1.2772800922393799, + "logps/chosen": -600.2614135742188, + "logps/rejected": -646.8006591796875, + "loss": 0.5251, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8196444511413574, + "rewards/margins": 1.0009969472885132, + "rewards/rejected": -3.820641279220581, + "step": 2400 + }, + { + "epoch": 0.47, + "eval_logits/chosen": 1.5133353471755981, + "eval_logits/rejected": 2.1152803897857666, + "eval_logps/chosen": -534.41259765625, + "eval_logps/rejected": -613.07080078125, + "eval_loss": 0.5087560415267944, + "eval_rewards/accuracies": 0.7172619104385376, + "eval_rewards/chosen": -2.711719512939453, + "eval_rewards/margins": 1.0878052711486816, + "eval_rewards/rejected": -3.7995245456695557, + "eval_runtime": 461.5152, + "eval_samples_per_second": 4.334, + "eval_steps_per_second": 0.182, + "step": 2400 + }, + { + "epoch": 0.47, + "learning_rate": 3.1635819467309094e-06, + "logits/chosen": -1.764386773109436, + "logits/rejected": -1.4338537454605103, + "logps/chosen": -613.195556640625, + "logps/rejected": -632.0240478515625, + "loss": 0.4615, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8152177333831787, + "rewards/margins": 1.2248064279556274, + "rewards/rejected": -4.040024280548096, + "step": 2410 + }, + { + "epoch": 0.47, + "learning_rate": 3.147047612756302e-06, + "logits/chosen": -1.6126506328582764, + "logits/rejected": -1.3426209688186646, + "logps/chosen": -522.5135498046875, + "logps/rejected": -672.3236083984375, + "loss": 0.4471, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.4577982425689697, + "rewards/margins": 1.5147340297698975, + "rewards/rejected": -3.9725327491760254, + "step": 2420 + }, + { + "epoch": 0.48, + "learning_rate": 3.1304828877763567e-06, + "logits/chosen": -1.835339903831482, + "logits/rejected": -1.46001398563385, + "logps/chosen": -562.58642578125, + "logps/rejected": -616.4159545898438, + "loss": 0.4956, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.563161849975586, + "rewards/margins": 1.3803926706314087, + "rewards/rejected": -3.943554639816284, + "step": 2430 + }, + { + "epoch": 0.48, + "learning_rate": 3.1138885498151843e-06, + "logits/chosen": -1.6502008438110352, + "logits/rejected": -1.3979710340499878, + "logps/chosen": -537.5780029296875, + "logps/rejected": -613.78857421875, + "loss": 0.5772, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.5154900550842285, + "rewards/margins": 1.337890386581421, + "rewards/rejected": -3.8533802032470703, + "step": 2440 + }, + { + "epoch": 0.48, + "learning_rate": 3.0972653782877836e-06, + "logits/chosen": -1.7864410877227783, + "logits/rejected": -1.657470941543579, + "logps/chosen": -552.2943115234375, + "logps/rejected": -602.1121826171875, + "loss": 0.556, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.7135872840881348, + "rewards/margins": 0.8077222108840942, + "rewards/rejected": -3.5213096141815186, + "step": 2450 + }, + { + "epoch": 0.48, + "learning_rate": 3.0806141539634294e-06, + "logits/chosen": -1.750838041305542, + "logits/rejected": -1.445452094078064, + "logps/chosen": -595.9046630859375, + "logps/rejected": -634.5581665039062, + "loss": 0.4801, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9126906394958496, + "rewards/margins": 0.7003231048583984, + "rewards/rejected": -3.613013505935669, + "step": 2460 + }, + { + "epoch": 0.48, + "learning_rate": 3.063935658928998e-06, + "logits/chosen": -1.584479808807373, + "logits/rejected": -1.4214916229248047, + "logps/chosen": -502.6748962402344, + "logps/rejected": -579.9150390625, + "loss": 0.5253, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5569519996643066, + "rewards/margins": 1.0509430170059204, + "rewards/rejected": -3.6078953742980957, + "step": 2470 + }, + { + "epoch": 0.49, + "learning_rate": 3.0472306765522393e-06, + "logits/chosen": -1.4984403848648071, + "logits/rejected": -1.5491224527359009, + "logps/chosen": -531.972900390625, + "logps/rejected": -635.1728515625, + "loss": 0.4886, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8651509284973145, + "rewards/margins": 1.0753536224365234, + "rewards/rejected": -3.940504550933838, + "step": 2480 + }, + { + "epoch": 0.49, + "learning_rate": 3.0304999914449774e-06, + "logits/chosen": -1.3658727407455444, + "logits/rejected": -1.5433199405670166, + "logps/chosen": -462.2303771972656, + "logps/rejected": -598.8155517578125, + "loss": 0.5015, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.40022349357605, + "rewards/margins": 0.9658218622207642, + "rewards/rejected": -3.3660457134246826, + "step": 2490 + }, + { + "epoch": 0.49, + "learning_rate": 3.0137443894262634e-06, + "logits/chosen": -1.8412716388702393, + "logits/rejected": -1.5159828662872314, + "logps/chosen": -576.944580078125, + "logps/rejected": -619.55810546875, + "loss": 0.5104, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.637674570083618, + "rewards/margins": 1.1486151218414307, + "rewards/rejected": -3.786289691925049, + "step": 2500 + }, + { + "epoch": 0.49, + "eval_logits/chosen": 1.7461397647857666, + "eval_logits/rejected": 2.2888615131378174, + "eval_logps/chosen": -562.9376831054688, + "eval_logps/rejected": -633.336181640625, + "eval_loss": 0.5005843639373779, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": -2.9969699382781982, + "eval_rewards/margins": 1.0052084922790527, + "eval_rewards/rejected": -4.002178192138672, + "eval_runtime": 466.0167, + "eval_samples_per_second": 4.292, + "eval_steps_per_second": 0.18, + "step": 2500 + }, + { + "epoch": 0.49, + "learning_rate": 2.9969646574854632e-06, + "logits/chosen": -1.2289443016052246, + "logits/rejected": -1.1734968423843384, + "logps/chosen": -571.4437866210938, + "logps/rejected": -637.2028198242188, + "loss": 0.4959, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.083714008331299, + "rewards/margins": 1.2261943817138672, + "rewards/rejected": -4.309908390045166, + "step": 2510 + }, + { + "epoch": 0.49, + "learning_rate": 2.980161583745294e-06, + "logits/chosen": -1.57248055934906, + "logits/rejected": -1.629582405090332, + "logps/chosen": -613.6909790039062, + "logps/rejected": -710.1002807617188, + "loss": 0.5703, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2303519248962402, + "rewards/margins": 0.9834693670272827, + "rewards/rejected": -4.213820934295654, + "step": 2520 + }, + { + "epoch": 0.5, + "learning_rate": 2.9633359574248077e-06, + "logits/chosen": -1.5341389179229736, + "logits/rejected": -1.6534173488616943, + "logps/chosen": -466.84539794921875, + "logps/rejected": -630.6109619140625, + "loss": 0.5298, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.717761754989624, + "rewards/margins": 0.9395642280578613, + "rewards/rejected": -3.6573257446289062, + "step": 2530 + }, + { + "epoch": 0.5, + "learning_rate": 2.946488568802324e-06, + "logits/chosen": -1.7299247980117798, + "logits/rejected": -1.6176536083221436, + "logps/chosen": -538.8867797851562, + "logps/rejected": -684.6195068359375, + "loss": 0.5598, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.5870797634124756, + "rewards/margins": 1.0093539953231812, + "rewards/rejected": -3.5964341163635254, + "step": 2540 + }, + { + "epoch": 0.5, + "learning_rate": 2.929620209178307e-06, + "logits/chosen": -1.6182016134262085, + "logits/rejected": -1.2783520221710205, + "logps/chosen": -498.810546875, + "logps/rejected": -527.8185424804688, + "loss": 0.4818, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.53918719291687, + "rewards/margins": 0.693315863609314, + "rewards/rejected": -3.2325031757354736, + "step": 2550 + }, + { + "epoch": 0.5, + "learning_rate": 2.912731670838207e-06, + "logits/chosen": -1.6861143112182617, + "logits/rejected": -1.5500279664993286, + "logps/chosen": -578.6981201171875, + "logps/rejected": -656.0988159179688, + "loss": 0.4895, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.022346019744873, + "rewards/margins": 0.8760727047920227, + "rewards/rejected": -3.898418426513672, + "step": 2560 + }, + { + "epoch": 0.5, + "learning_rate": 2.8958237470152374e-06, + "logits/chosen": -1.2931665182113647, + "logits/rejected": -1.2862474918365479, + "logps/chosen": -509.349365234375, + "logps/rejected": -610.8724365234375, + "loss": 0.5523, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.792919874191284, + "rewards/margins": 1.0914205312728882, + "rewards/rejected": -3.884340763092041, + "step": 2570 + }, + { + "epoch": 0.51, + "learning_rate": 2.8788972318531272e-06, + "logits/chosen": -1.5627813339233398, + "logits/rejected": -1.2355066537857056, + "logps/chosen": -600.7859497070312, + "logps/rejected": -690.1842041015625, + "loss": 0.4223, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.2663826942443848, + "rewards/margins": 1.125218152999878, + "rewards/rejected": -4.391600608825684, + "step": 2580 + }, + { + "epoch": 0.51, + "learning_rate": 2.861952920368816e-06, + "logits/chosen": -1.438971757888794, + "logits/rejected": -1.2704181671142578, + "logps/chosen": -601.17724609375, + "logps/rejected": -654.62890625, + "loss": 0.4558, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.0297670364379883, + "rewards/margins": 1.1394041776657104, + "rewards/rejected": -4.169171333312988, + "step": 2590 + }, + { + "epoch": 0.51, + "learning_rate": 2.844991608415113e-06, + "logits/chosen": -1.4737740755081177, + "logits/rejected": -1.1703132390975952, + "logps/chosen": -623.9537353515625, + "logps/rejected": -725.4031372070312, + "loss": 0.429, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3853325843811035, + "rewards/margins": 1.5982491970062256, + "rewards/rejected": -4.98358154296875, + "step": 2600 + }, + { + "epoch": 0.51, + "eval_logits/chosen": 3.2826781272888184, + "eval_logits/rejected": 3.6630711555480957, + "eval_logps/chosen": -626.0599975585938, + "eval_logps/rejected": -713.4385986328125, + "eval_loss": 0.5238474011421204, + "eval_rewards/accuracies": 0.7142857313156128, + "eval_rewards/chosen": -3.6281931400299072, + "eval_rewards/margins": 1.175009846687317, + "eval_rewards/rejected": -4.8032026290893555, + "eval_runtime": 470.5782, + "eval_samples_per_second": 4.25, + "eval_steps_per_second": 0.179, + "step": 2600 + }, + { + "epoch": 0.51, + "learning_rate": 2.828014092643319e-06, + "logits/chosen": -1.471954345703125, + "logits/rejected": -1.1723235845565796, + "logps/chosen": -665.859619140625, + "logps/rejected": -789.55859375, + "loss": 0.6042, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.7893319129943848, + "rewards/margins": 1.5368101596832275, + "rewards/rejected": -5.326141834259033, + "step": 2610 + }, + { + "epoch": 0.51, + "learning_rate": 2.8110211704658073e-06, + "logits/chosen": -1.1027779579162598, + "logits/rejected": -1.2894227504730225, + "logps/chosen": -625.0130004882812, + "logps/rejected": -680.197021484375, + "loss": 0.5153, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4413421154022217, + "rewards/margins": 1.2280861139297485, + "rewards/rejected": -4.669427871704102, + "step": 2620 + }, + { + "epoch": 0.52, + "learning_rate": 2.7940136400185697e-06, + "logits/chosen": -1.7961403131484985, + "logits/rejected": -1.3579736948013306, + "logps/chosen": -618.7360229492188, + "logps/rejected": -626.2679443359375, + "loss": 0.5125, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.7992968559265137, + "rewards/margins": 1.030220627784729, + "rewards/rejected": -3.829517364501953, + "step": 2630 + }, + { + "epoch": 0.52, + "learning_rate": 2.776992300123732e-06, + "logits/chosen": -1.4044650793075562, + "logits/rejected": -1.0620936155319214, + "logps/chosen": -550.3770751953125, + "logps/rejected": -686.0360107421875, + "loss": 0.453, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9266915321350098, + "rewards/margins": 1.4235899448394775, + "rewards/rejected": -4.350281715393066, + "step": 2640 + }, + { + "epoch": 0.52, + "learning_rate": 2.7599579502520295e-06, + "logits/chosen": -1.5171329975128174, + "logits/rejected": -1.2877377271652222, + "logps/chosen": -535.6190185546875, + "logps/rejected": -594.6309814453125, + "loss": 0.5086, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.649843692779541, + "rewards/margins": 0.9460121989250183, + "rewards/rejected": -3.595855712890625, + "step": 2650 + }, + { + "epoch": 0.52, + "learning_rate": 2.742911390485262e-06, + "logits/chosen": -1.7375767230987549, + "logits/rejected": -1.5891015529632568, + "logps/chosen": -522.1452026367188, + "logps/rejected": -644.7877197265625, + "loss": 0.5278, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4621710777282715, + "rewards/margins": 1.264937400817871, + "rewards/rejected": -3.7271084785461426, + "step": 2660 + }, + { + "epoch": 0.52, + "learning_rate": 2.7258534214787108e-06, + "logits/chosen": -1.1750578880310059, + "logits/rejected": -1.2237205505371094, + "logps/chosen": -444.18402099609375, + "logps/rejected": -584.5846557617188, + "loss": 0.4825, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.4036126136779785, + "rewards/margins": 1.157496452331543, + "rewards/rejected": -3.5611088275909424, + "step": 2670 + }, + { + "epoch": 0.53, + "learning_rate": 2.7087848444235354e-06, + "logits/chosen": -1.661940574645996, + "logits/rejected": -1.6658750772476196, + "logps/chosen": -430.6551818847656, + "logps/rejected": -487.06341552734375, + "loss": 0.4763, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0942399501800537, + "rewards/margins": 0.7797794342041016, + "rewards/rejected": -2.8740196228027344, + "step": 2680 + }, + { + "epoch": 0.53, + "learning_rate": 2.6917064610091425e-06, + "logits/chosen": -1.58583664894104, + "logits/rejected": -1.0581375360488892, + "logps/chosen": -522.0615234375, + "logps/rejected": -650.8190307617188, + "loss": 0.4368, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.498887538909912, + "rewards/margins": 1.6336562633514404, + "rewards/rejected": -4.132543563842773, + "step": 2690 + }, + { + "epoch": 0.53, + "learning_rate": 2.674619073385531e-06, + "logits/chosen": -1.503377079963684, + "logits/rejected": -1.3497763872146606, + "logps/chosen": -531.3106079101562, + "logps/rejected": -552.6693115234375, + "loss": 0.4255, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2223851680755615, + "rewards/margins": 1.1123954057693481, + "rewards/rejected": -3.33478045463562, + "step": 2700 + }, + { + "epoch": 0.53, + "eval_logits/chosen": 1.687303066253662, + "eval_logits/rejected": 2.191955089569092, + "eval_logps/chosen": -512.7009887695312, + "eval_logps/rejected": -583.7889404296875, + "eval_loss": 0.49927037954330444, + "eval_rewards/accuracies": 0.71875, + "eval_rewards/chosen": -2.494602680206299, + "eval_rewards/margins": 1.0121026039123535, + "eval_rewards/rejected": -3.5067055225372314, + "eval_runtime": 467.7621, + "eval_samples_per_second": 4.276, + "eval_steps_per_second": 0.18, + "step": 2700 + }, + { + "epoch": 0.53, + "learning_rate": 2.6575234841256137e-06, + "logits/chosen": -1.3108127117156982, + "logits/rejected": -1.0921776294708252, + "logps/chosen": -561.8636474609375, + "logps/rejected": -687.85888671875, + "loss": 0.5514, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.914976119995117, + "rewards/margins": 1.1241728067398071, + "rewards/rejected": -4.039148807525635, + "step": 2710 + }, + { + "epoch": 0.53, + "learning_rate": 2.640420496187528e-06, + "logits/chosen": -1.6478326320648193, + "logits/rejected": -1.1835671663284302, + "logps/chosen": -631.3289794921875, + "logps/rejected": -626.5182495117188, + "loss": 0.5263, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.000012159347534, + "rewards/margins": 0.643200159072876, + "rewards/rejected": -3.6432127952575684, + "step": 2720 + }, + { + "epoch": 0.54, + "learning_rate": 2.6233109128769134e-06, + "logits/chosen": -1.5925486087799072, + "logits/rejected": -1.4888114929199219, + "logps/chosen": -582.1151123046875, + "logps/rejected": -646.9674072265625, + "loss": 0.4845, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.2123794555664062, + "rewards/margins": 0.7758103013038635, + "rewards/rejected": -3.988189697265625, + "step": 2730 + }, + { + "epoch": 0.54, + "learning_rate": 2.6061955378091896e-06, + "logits/chosen": -1.295643925666809, + "logits/rejected": -1.2166447639465332, + "logps/chosen": -478.65655517578125, + "logps/rejected": -537.48486328125, + "loss": 0.4722, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5640273094177246, + "rewards/margins": 1.0961014032363892, + "rewards/rejected": -3.660128355026245, + "step": 2740 + }, + { + "epoch": 0.54, + "learning_rate": 2.5890751748718055e-06, + "logits/chosen": -0.9939683675765991, + "logits/rejected": -1.0293607711791992, + "logps/chosen": -543.2905883789062, + "logps/rejected": -722.3085327148438, + "loss": 0.4421, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.28556752204895, + "rewards/margins": 1.4245612621307373, + "rewards/rejected": -4.7101287841796875, + "step": 2750 + }, + { + "epoch": 0.54, + "learning_rate": 2.5719506281864838e-06, + "logits/chosen": -1.103952169418335, + "logits/rejected": -0.954735279083252, + "logps/chosen": -631.2025756835938, + "logps/rejected": -684.2635498046875, + "loss": 0.6389, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.1589555740356445, + "rewards/margins": 0.8297792673110962, + "rewards/rejected": -4.988734722137451, + "step": 2760 + }, + { + "epoch": 0.54, + "learning_rate": 2.5548227020714532e-06, + "logits/chosen": -0.9842194318771362, + "logits/rejected": -0.25584936141967773, + "logps/chosen": -638.1835327148438, + "logps/rejected": -731.91162109375, + "loss": 0.5314, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.321416854858398, + "rewards/margins": 1.1731635332107544, + "rewards/rejected": -5.494580268859863, + "step": 2770 + }, + { + "epoch": 0.55, + "learning_rate": 2.537692201003671e-06, + "logits/chosen": -1.3658416271209717, + "logits/rejected": -0.6274362802505493, + "logps/chosen": -674.0128173828125, + "logps/rejected": -775.1448974609375, + "loss": 0.4476, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.8319332599639893, + "rewards/margins": 1.8677552938461304, + "rewards/rejected": -5.699688911437988, + "step": 2780 + }, + { + "epoch": 0.55, + "learning_rate": 2.520559929581034e-06, + "logits/chosen": -0.9689489603042603, + "logits/rejected": -1.237882137298584, + "logps/chosen": -640.4005126953125, + "logps/rejected": -691.1866455078125, + "loss": 0.5249, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.775344133377075, + "rewards/margins": 0.899651825428009, + "rewards/rejected": -4.674995422363281, + "step": 2790 + }, + { + "epoch": 0.55, + "learning_rate": 2.503426692484594e-06, + "logits/chosen": -1.564483642578125, + "logits/rejected": -1.5032641887664795, + "logps/chosen": -535.062744140625, + "logps/rejected": -703.3568725585938, + "loss": 0.4733, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2556099891662598, + "rewards/margins": 1.3399698734283447, + "rewards/rejected": -4.595579624176025, + "step": 2800 + }, + { + "epoch": 0.55, + "eval_logits/chosen": 2.2111027240753174, + "eval_logits/rejected": 2.679600715637207, + "eval_logps/chosen": -584.3987426757812, + "eval_logps/rejected": -661.117431640625, + "eval_loss": 0.49895724654197693, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": -3.211580514907837, + "eval_rewards/margins": 1.068410873413086, + "eval_rewards/rejected": -4.279991149902344, + "eval_runtime": 474.1567, + "eval_samples_per_second": 4.218, + "eval_steps_per_second": 0.177, + "step": 2800 + }, + { + "epoch": 0.55, + "learning_rate": 2.486293294440755e-06, + "logits/chosen": -1.2708990573883057, + "logits/rejected": -1.044060468673706, + "logps/chosen": -603.8212890625, + "logps/rejected": -696.2957763671875, + "loss": 0.4856, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.46049427986145, + "rewards/margins": 1.102307677268982, + "rewards/rejected": -4.562802314758301, + "step": 2810 + }, + { + "epoch": 0.55, + "learning_rate": 2.4691605401834843e-06, + "logits/chosen": -1.3584177494049072, + "logits/rejected": -1.0103107690811157, + "logps/chosen": -636.3997802734375, + "logps/rejected": -636.8760375976562, + "loss": 0.5038, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3813788890838623, + "rewards/margins": 0.9568415880203247, + "rewards/rejected": -4.338220596313477, + "step": 2820 + }, + { + "epoch": 0.56, + "learning_rate": 2.4520292344165093e-06, + "logits/chosen": -1.0483958721160889, + "logits/rejected": -0.6678327918052673, + "logps/chosen": -531.8612060546875, + "logps/rejected": -628.7724609375, + "loss": 0.4416, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.131394863128662, + "rewards/margins": 1.3460534811019897, + "rewards/rejected": -4.477448463439941, + "step": 2830 + }, + { + "epoch": 0.56, + "learning_rate": 2.434900181775524e-06, + "logits/chosen": -1.2988386154174805, + "logits/rejected": -1.310418725013733, + "logps/chosen": -623.8057861328125, + "logps/rejected": -765.7599487304688, + "loss": 0.6217, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3501858711242676, + "rewards/margins": 1.2648541927337646, + "rewards/rejected": -4.615039825439453, + "step": 2840 + }, + { + "epoch": 0.56, + "learning_rate": 2.4177741867903966e-06, + "logits/chosen": -1.3026063442230225, + "logits/rejected": -1.5730775594711304, + "logps/chosen": -574.4251098632812, + "logps/rejected": -738.9527587890625, + "loss": 0.4522, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1266930103302, + "rewards/margins": 1.183801531791687, + "rewards/rejected": -4.310494422912598, + "step": 2850 + }, + { + "epoch": 0.56, + "learning_rate": 2.40065205384738e-06, + "logits/chosen": -1.2829475402832031, + "logits/rejected": -0.7965912818908691, + "logps/chosen": -579.4163208007812, + "logps/rejected": -643.0167846679688, + "loss": 0.4394, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2237465381622314, + "rewards/margins": 1.3293319940567017, + "rewards/rejected": -4.553078651428223, + "step": 2860 + }, + { + "epoch": 0.56, + "learning_rate": 2.3835345871513334e-06, + "logits/chosen": -1.3497217893600464, + "logits/rejected": -1.2243283987045288, + "logps/chosen": -546.4749145507812, + "logps/rejected": -704.9342041015625, + "loss": 0.4332, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.956820011138916, + "rewards/margins": 1.5760740041732788, + "rewards/rejected": -4.532893657684326, + "step": 2870 + }, + { + "epoch": 0.57, + "learning_rate": 2.3664225906879452e-06, + "logits/chosen": -1.3499679565429688, + "logits/rejected": -1.1703636646270752, + "logps/chosen": -603.4747314453125, + "logps/rejected": -666.8853759765625, + "loss": 0.4698, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.120850086212158, + "rewards/margins": 1.1407297849655151, + "rewards/rejected": -4.261579990386963, + "step": 2880 + }, + { + "epoch": 0.57, + "learning_rate": 2.3493168681859782e-06, + "logits/chosen": -1.677674651145935, + "logits/rejected": -1.3583574295043945, + "logps/chosen": -669.3980102539062, + "logps/rejected": -768.7742919921875, + "loss": 0.5308, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.5255637168884277, + "rewards/margins": 1.2949529886245728, + "rewards/rejected": -4.820516586303711, + "step": 2890 + }, + { + "epoch": 0.57, + "learning_rate": 2.3322182230795127e-06, + "logits/chosen": -1.104689359664917, + "logits/rejected": -0.9737561345100403, + "logps/chosen": -493.5654296875, + "logps/rejected": -645.1444091796875, + "loss": 0.5394, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.879586696624756, + "rewards/margins": 1.7333507537841797, + "rewards/rejected": -4.612936973571777, + "step": 2900 + }, + { + "epoch": 0.57, + "eval_logits/chosen": 1.2350561618804932, + "eval_logits/rejected": 1.7757776975631714, + "eval_logps/chosen": -554.5653076171875, + "eval_logps/rejected": -625.8765869140625, + "eval_loss": 0.5039872527122498, + "eval_rewards/accuracies": 0.7157738208770752, + "eval_rewards/chosen": -2.9132461547851562, + "eval_rewards/margins": 1.014336347579956, + "eval_rewards/rejected": -3.927582263946533, + "eval_runtime": 471.9119, + "eval_samples_per_second": 4.238, + "eval_steps_per_second": 0.178, + "step": 2900 + }, + { + "epoch": 0.57, + "learning_rate": 2.315127458470212e-06, + "logits/chosen": -1.6969263553619385, + "logits/rejected": -1.1446783542633057, + "logps/chosen": -537.927734375, + "logps/rejected": -597.5512084960938, + "loss": 0.4702, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.5726828575134277, + "rewards/margins": 1.4696344137191772, + "rewards/rejected": -4.0423173904418945, + "step": 2910 + }, + { + "epoch": 0.57, + "learning_rate": 2.298045377089604e-06, + "logits/chosen": -1.631068468093872, + "logits/rejected": -1.5715925693511963, + "logps/chosen": -588.5338134765625, + "logps/rejected": -680.6761474609375, + "loss": 0.5311, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.739121913909912, + "rewards/margins": 1.2334686517715454, + "rewards/rejected": -3.972590684890747, + "step": 2920 + }, + { + "epoch": 0.58, + "learning_rate": 2.2809727812613767e-06, + "logits/chosen": -1.3248388767242432, + "logits/rejected": -1.031965970993042, + "logps/chosen": -521.406005859375, + "logps/rejected": -598.8995361328125, + "loss": 0.5472, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8669657707214355, + "rewards/margins": 1.0183131694793701, + "rewards/rejected": -3.8852791786193848, + "step": 2930 + }, + { + "epoch": 0.58, + "learning_rate": 2.2639104728636915e-06, + "logits/chosen": -1.6601577997207642, + "logits/rejected": -1.537408471107483, + "logps/chosen": -529.3999633789062, + "logps/rejected": -581.3070678710938, + "loss": 0.5026, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.78721284866333, + "rewards/margins": 0.9390803575515747, + "rewards/rejected": -3.7262930870056152, + "step": 2940 + }, + { + "epoch": 0.58, + "learning_rate": 2.246859253291524e-06, + "logits/chosen": -1.3986241817474365, + "logits/rejected": -1.3699901103973389, + "logps/chosen": -501.624267578125, + "logps/rejected": -596.663818359375, + "loss": 0.4539, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6009819507598877, + "rewards/margins": 1.3324124813079834, + "rewards/rejected": -3.933394193649292, + "step": 2950 + }, + { + "epoch": 0.58, + "learning_rate": 2.2298199234190236e-06, + "logits/chosen": -1.3165680170059204, + "logits/rejected": -1.3338545560836792, + "logps/chosen": -550.9847412109375, + "logps/rejected": -629.339111328125, + "loss": 0.5604, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.0295987129211426, + "rewards/margins": 0.9783576726913452, + "rewards/rejected": -4.007956504821777, + "step": 2960 + }, + { + "epoch": 0.58, + "learning_rate": 2.21279328356189e-06, + "logits/chosen": -1.407828688621521, + "logits/rejected": -1.4380414485931396, + "logps/chosen": -561.3121337890625, + "logps/rejected": -591.61572265625, + "loss": 0.509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8780205249786377, + "rewards/margins": 0.8007872700691223, + "rewards/rejected": -3.6788082122802734, + "step": 2970 + }, + { + "epoch": 0.58, + "learning_rate": 2.195780133439794e-06, + "logits/chosen": -1.6875905990600586, + "logits/rejected": -1.2853671312332153, + "logps/chosen": -455.3389587402344, + "logps/rejected": -507.90771484375, + "loss": 0.469, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.5273537635803223, + "rewards/margins": 1.1238353252410889, + "rewards/rejected": -3.6511893272399902, + "step": 2980 + }, + { + "epoch": 0.59, + "learning_rate": 2.1787812721388093e-06, + "logits/chosen": -1.6108009815216064, + "logits/rejected": -1.2731918096542358, + "logps/chosen": -545.8873291015625, + "logps/rejected": -659.3258666992188, + "loss": 0.4554, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7631006240844727, + "rewards/margins": 1.0572516918182373, + "rewards/rejected": -3.820352077484131, + "step": 2990 + }, + { + "epoch": 0.59, + "learning_rate": 2.1617974980738814e-06, + "logits/chosen": -1.4267470836639404, + "logits/rejected": -1.2199398279190063, + "logps/chosen": -505.8406677246094, + "logps/rejected": -642.9313354492188, + "loss": 0.5128, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7473788261413574, + "rewards/margins": 1.0760217905044556, + "rewards/rejected": -3.8234009742736816, + "step": 3000 + }, + { + "epoch": 0.59, + "eval_logits/chosen": 1.666269063949585, + "eval_logits/rejected": 2.128396987915039, + "eval_logps/chosen": -522.9818115234375, + "eval_logps/rejected": -590.36376953125, + "eval_loss": 0.5060966610908508, + "eval_rewards/accuracies": 0.7172619104385376, + "eval_rewards/chosen": -2.5974111557006836, + "eval_rewards/margins": 0.975042462348938, + "eval_rewards/rejected": -3.572453737258911, + "eval_runtime": 471.1115, + "eval_samples_per_second": 4.245, + "eval_steps_per_second": 0.178, + "step": 3000 + }, + { + "epoch": 0.59, + "learning_rate": 2.1448296089513273e-06, + "logits/chosen": -1.5748159885406494, + "logits/rejected": -1.455926537513733, + "logps/chosen": -551.4447021484375, + "logps/rejected": -678.2323608398438, + "loss": 0.5886, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5907955169677734, + "rewards/margins": 0.9479290246963501, + "rewards/rejected": -3.538724422454834, + "step": 3010 + }, + { + "epoch": 0.59, + "learning_rate": 2.1278784017313688e-06, + "logits/chosen": -1.8420231342315674, + "logits/rejected": -1.4978781938552856, + "logps/chosen": -536.6190185546875, + "logps/rejected": -554.1761474609375, + "loss": 0.488, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.728389263153076, + "rewards/margins": 0.605846107006073, + "rewards/rejected": -3.334235429763794, + "step": 3020 + }, + { + "epoch": 0.59, + "learning_rate": 2.1109446725907003e-06, + "logits/chosen": -1.6853519678115845, + "logits/rejected": -1.6203676462173462, + "logps/chosen": -522.2265014648438, + "logps/rejected": -587.4464111328125, + "loss": 0.5519, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5095019340515137, + "rewards/margins": 0.8812414407730103, + "rewards/rejected": -3.3907437324523926, + "step": 3030 + }, + { + "epoch": 0.6, + "learning_rate": 2.0940292168850913e-06, + "logits/chosen": -1.5753730535507202, + "logits/rejected": -1.2198309898376465, + "logps/chosen": -500.56121826171875, + "logps/rejected": -598.4393310546875, + "loss": 0.5011, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.4829251766204834, + "rewards/margins": 1.1780023574829102, + "rewards/rejected": -3.6609275341033936, + "step": 3040 + }, + { + "epoch": 0.6, + "learning_rate": 2.0771328291120336e-06, + "logits/chosen": -1.6674007177352905, + "logits/rejected": -1.412819743156433, + "logps/chosen": -471.34857177734375, + "logps/rejected": -570.1939697265625, + "loss": 0.4816, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2399003505706787, + "rewards/margins": 0.8762845993041992, + "rewards/rejected": -3.116184949874878, + "step": 3050 + }, + { + "epoch": 0.6, + "learning_rate": 2.060256302873421e-06, + "logits/chosen": -1.6781727075576782, + "logits/rejected": -1.4560127258300781, + "logps/chosen": -543.1824951171875, + "logps/rejected": -595.8843383789062, + "loss": 0.4934, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.42879056930542, + "rewards/margins": 0.9694629907608032, + "rewards/rejected": -3.3982536792755127, + "step": 3060 + }, + { + "epoch": 0.6, + "learning_rate": 2.043400430838276e-06, + "logits/chosen": -1.662009835243225, + "logits/rejected": -1.1407781839370728, + "logps/chosen": -496.57666015625, + "logps/rejected": -568.4881591796875, + "loss": 0.563, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.3798727989196777, + "rewards/margins": 1.0104573965072632, + "rewards/rejected": -3.3903305530548096, + "step": 3070 + }, + { + "epoch": 0.6, + "learning_rate": 2.02656600470552e-06, + "logits/chosen": -1.5956240892410278, + "logits/rejected": -1.5949556827545166, + "logps/chosen": -416.9606018066406, + "logps/rejected": -524.3873291015625, + "loss": 0.5409, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.056086540222168, + "rewards/margins": 0.749538004398346, + "rewards/rejected": -2.80562424659729, + "step": 3080 + }, + { + "epoch": 0.61, + "learning_rate": 2.0097538151667885e-06, + "logits/chosen": -1.6122442483901978, + "logits/rejected": -1.130645990371704, + "logps/chosen": -440.63494873046875, + "logps/rejected": -500.908203125, + "loss": 0.4872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.053252696990967, + "rewards/margins": 1.218098759651184, + "rewards/rejected": -3.2713515758514404, + "step": 3090 + }, + { + "epoch": 0.61, + "learning_rate": 1.99296465186929e-06, + "logits/chosen": -1.49817955493927, + "logits/rejected": -1.393099308013916, + "logps/chosen": -449.64447021484375, + "logps/rejected": -532.80712890625, + "loss": 0.5215, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.427433490753174, + "rewards/margins": 0.646077036857605, + "rewards/rejected": -3.0735104084014893, + "step": 3100 + }, + { + "epoch": 0.61, + "eval_logits/chosen": 0.8593930006027222, + "eval_logits/rejected": 1.4432045221328735, + "eval_logps/chosen": -489.5559997558594, + "eval_logps/rejected": -551.8787231445312, + "eval_loss": 0.4959636628627777, + "eval_rewards/accuracies": 0.71875, + "eval_rewards/chosen": -2.263152599334717, + "eval_rewards/margins": 0.9244504570960999, + "eval_rewards/rejected": -3.187603235244751, + "eval_runtime": 469.4229, + "eval_samples_per_second": 4.261, + "eval_steps_per_second": 0.179, + "step": 3100 + }, + { + "epoch": 0.61, + "learning_rate": 1.9761993033787206e-06, + "logits/chosen": -1.436286211013794, + "logits/rejected": -1.667724847793579, + "logps/chosen": -481.9756774902344, + "logps/rejected": -577.6883544921875, + "loss": 0.4815, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.447153091430664, + "rewards/margins": 0.920573353767395, + "rewards/rejected": -3.3677260875701904, + "step": 3110 + }, + { + "epoch": 0.61, + "learning_rate": 1.959458557142228e-06, + "logits/chosen": -1.811018705368042, + "logits/rejected": -1.5022584199905396, + "logps/chosen": -477.35906982421875, + "logps/rejected": -560.0203247070312, + "loss": 0.4815, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.4165022373199463, + "rewards/margins": 1.0257890224456787, + "rewards/rejected": -3.442291259765625, + "step": 3120 + }, + { + "epoch": 0.61, + "learning_rate": 1.942743199451418e-06, + "logits/chosen": -1.4638209342956543, + "logits/rejected": -1.3204270601272583, + "logps/chosen": -535.1734008789062, + "logps/rejected": -584.9055786132812, + "loss": 0.5191, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5391428470611572, + "rewards/margins": 0.8269332051277161, + "rewards/rejected": -3.3660759925842285, + "step": 3130 + }, + { + "epoch": 0.62, + "learning_rate": 1.9260540154054317e-06, + "logits/chosen": -1.6943273544311523, + "logits/rejected": -1.4195598363876343, + "logps/chosen": -619.236572265625, + "logps/rejected": -740.2224731445312, + "loss": 0.4769, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.336930274963379, + "rewards/margins": 1.6488691568374634, + "rewards/rejected": -3.9857993125915527, + "step": 3140 + }, + { + "epoch": 0.62, + "learning_rate": 1.909391788874069e-06, + "logits/chosen": -1.7227598428726196, + "logits/rejected": -1.6687244176864624, + "logps/chosen": -519.3486328125, + "logps/rejected": -590.3075561523438, + "loss": 0.497, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.510953187942505, + "rewards/margins": 0.8232176899909973, + "rewards/rejected": -3.3341705799102783, + "step": 3150 + }, + { + "epoch": 0.62, + "learning_rate": 1.8927573024609666e-06, + "logits/chosen": -1.7152988910675049, + "logits/rejected": -1.376106858253479, + "logps/chosen": -565.9855346679688, + "logps/rejected": -634.7908935546875, + "loss": 0.4533, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.826343536376953, + "rewards/margins": 1.0604288578033447, + "rewards/rejected": -3.8867721557617188, + "step": 3160 + }, + { + "epoch": 0.62, + "learning_rate": 1.8761513374668434e-06, + "logits/chosen": -1.7361100912094116, + "logits/rejected": -1.1922258138656616, + "logps/chosen": -569.1446533203125, + "logps/rejected": -591.5360107421875, + "loss": 0.4621, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.7468013763427734, + "rewards/margins": 0.9878908395767212, + "rewards/rejected": -3.734692096710205, + "step": 3170 + }, + { + "epoch": 0.62, + "learning_rate": 1.8595746738528045e-06, + "logits/chosen": -1.5199767351150513, + "logits/rejected": -1.3576875925064087, + "logps/chosen": -579.2857055664062, + "logps/rejected": -665.6951904296875, + "loss": 0.473, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.663944959640503, + "rewards/margins": 1.08833909034729, + "rewards/rejected": -3.752284288406372, + "step": 3180 + }, + { + "epoch": 0.63, + "learning_rate": 1.8430280902037061e-06, + "logits/chosen": -1.3695231676101685, + "logits/rejected": -1.2491718530654907, + "logps/chosen": -510.26409912109375, + "logps/rejected": -649.3592529296875, + "loss": 0.4412, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.6402225494384766, + "rewards/margins": 1.2626936435699463, + "rewards/rejected": -3.902916669845581, + "step": 3190 + }, + { + "epoch": 0.63, + "learning_rate": 1.826512363691586e-06, + "logits/chosen": -0.9787014722824097, + "logits/rejected": -1.1823240518569946, + "logps/chosen": -531.9524536132812, + "logps/rejected": -697.2850341796875, + "loss": 0.5023, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.7318313121795654, + "rewards/margins": 1.3929023742675781, + "rewards/rejected": -4.124733924865723, + "step": 3200 + }, + { + "epoch": 0.63, + "eval_logits/chosen": 1.2951397895812988, + "eval_logits/rejected": 1.9057204723358154, + "eval_logps/chosen": -549.5391845703125, + "eval_logps/rejected": -629.523681640625, + "eval_loss": 0.49993908405303955, + "eval_rewards/accuracies": 0.7127976417541504, + "eval_rewards/chosen": -2.8629848957061768, + "eval_rewards/margins": 1.101068377494812, + "eval_rewards/rejected": -3.964053153991699, + "eval_runtime": 469.417, + "eval_samples_per_second": 4.261, + "eval_steps_per_second": 0.179, + "step": 3200 + }, + { + "epoch": 0.63, + "learning_rate": 1.8100282700391616e-06, + "logits/chosen": -1.81111741065979, + "logits/rejected": -1.0964924097061157, + "logps/chosen": -580.1653442382812, + "logps/rejected": -659.6453247070312, + "loss": 0.3998, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.867839813232422, + "rewards/margins": 1.4525978565216064, + "rewards/rejected": -4.320437908172607, + "step": 3210 + }, + { + "epoch": 0.63, + "learning_rate": 1.7935765834833966e-06, + "logits/chosen": -1.3006770610809326, + "logits/rejected": -1.370444655418396, + "logps/chosen": -544.4671630859375, + "logps/rejected": -650.7494506835938, + "loss": 0.4532, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8837952613830566, + "rewards/margins": 1.2802571058273315, + "rewards/rejected": -4.164052486419678, + "step": 3220 + }, + { + "epoch": 0.63, + "learning_rate": 1.7771580767391314e-06, + "logits/chosen": -1.5054481029510498, + "logits/rejected": -1.3543479442596436, + "logps/chosen": -568.9913330078125, + "logps/rejected": -634.9281005859375, + "loss": 0.4947, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.2038333415985107, + "rewards/margins": 0.8554115295410156, + "rewards/rejected": -4.0592451095581055, + "step": 3230 + }, + { + "epoch": 0.64, + "learning_rate": 1.7607735209627953e-06, + "logits/chosen": -1.2933645248413086, + "logits/rejected": -0.849597156047821, + "logps/chosen": -549.0391235351562, + "logps/rejected": -660.4158935546875, + "loss": 0.5686, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.0153212547302246, + "rewards/margins": 1.2810765504837036, + "rewards/rejected": -4.2963972091674805, + "step": 3240 + }, + { + "epoch": 0.64, + "learning_rate": 1.7444236857161837e-06, + "logits/chosen": -1.6714146137237549, + "logits/rejected": -1.5842548608779907, + "logps/chosen": -662.2576293945312, + "logps/rejected": -656.3047485351562, + "loss": 0.4996, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8321645259857178, + "rewards/margins": 0.9911035299301147, + "rewards/rejected": -3.823268175125122, + "step": 3250 + }, + { + "epoch": 0.64, + "learning_rate": 1.7281093389303105e-06, + "logits/chosen": -1.5343830585479736, + "logits/rejected": -1.2284438610076904, + "logps/chosen": -543.3328857421875, + "logps/rejected": -612.6687622070312, + "loss": 0.4262, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.54597806930542, + "rewards/margins": 1.2225624322891235, + "rewards/rejected": -3.768540620803833, + "step": 3260 + }, + { + "epoch": 0.64, + "learning_rate": 1.7118312468693437e-06, + "logits/chosen": -1.4476630687713623, + "logits/rejected": -1.4361945390701294, + "logps/chosen": -556.40673828125, + "logps/rejected": -581.2120361328125, + "loss": 0.5645, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6290464401245117, + "rewards/margins": 0.753653883934021, + "rewards/rejected": -3.3827004432678223, + "step": 3270 + }, + { + "epoch": 0.64, + "learning_rate": 1.6955901740946136e-06, + "logits/chosen": -1.3370723724365234, + "logits/rejected": -1.1809570789337158, + "logps/chosen": -486.3965759277344, + "logps/rejected": -560.33740234375, + "loss": 0.4332, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.7506370544433594, + "rewards/margins": 0.7360703349113464, + "rewards/rejected": -3.4867072105407715, + "step": 3280 + }, + { + "epoch": 0.65, + "learning_rate": 1.6793868834286985e-06, + "logits/chosen": -1.5083470344543457, + "logits/rejected": -1.303236722946167, + "logps/chosen": -488.62933349609375, + "logps/rejected": -601.1016845703125, + "loss": 0.5114, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.4180212020874023, + "rewards/margins": 1.666049599647522, + "rewards/rejected": -4.084070682525635, + "step": 3290 + }, + { + "epoch": 0.65, + "learning_rate": 1.663222135919601e-06, + "logits/chosen": -1.4300639629364014, + "logits/rejected": -1.349562168121338, + "logps/chosen": -599.2969970703125, + "logps/rejected": -618.0079956054688, + "loss": 0.5042, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.0840084552764893, + "rewards/margins": 0.6745599508285522, + "rewards/rejected": -3.758568525314331, + "step": 3300 + }, + { + "epoch": 0.65, + "eval_logits/chosen": 1.4333837032318115, + "eval_logits/rejected": 1.9776458740234375, + "eval_logps/chosen": -547.7244873046875, + "eval_logps/rejected": -621.050048828125, + "eval_loss": 0.4904102087020874, + "eval_rewards/accuracies": 0.730654776096344, + "eval_rewards/chosen": -2.8448381423950195, + "eval_rewards/margins": 1.0344792604446411, + "eval_rewards/rejected": -3.879317045211792, + "eval_runtime": 468.097, + "eval_samples_per_second": 4.273, + "eval_steps_per_second": 0.179, + "step": 3300 + }, + { + "epoch": 0.65, + "learning_rate": 1.6470966908050012e-06, + "logits/chosen": -1.3912785053253174, + "logits/rejected": -1.166416883468628, + "logps/chosen": -551.7232666015625, + "logps/rejected": -542.6461181640625, + "loss": 0.4594, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.709670066833496, + "rewards/margins": 1.1980925798416138, + "rewards/rejected": -3.9077625274658203, + "step": 3310 + }, + { + "epoch": 0.65, + "learning_rate": 1.6310113054765947e-06, + "logits/chosen": -1.1876062154769897, + "logits/rejected": -1.081268310546875, + "logps/chosen": -555.0560913085938, + "logps/rejected": -684.9098510742188, + "loss": 0.4173, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.7197089195251465, + "rewards/margins": 1.6264817714691162, + "rewards/rejected": -4.346190929412842, + "step": 3320 + }, + { + "epoch": 0.65, + "learning_rate": 1.6149667354445192e-06, + "logits/chosen": -1.2864583730697632, + "logits/rejected": -1.1489123106002808, + "logps/chosen": -517.0961303710938, + "logps/rejected": -634.1941528320312, + "loss": 0.4567, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.976073741912842, + "rewards/margins": 1.2558077573776245, + "rewards/rejected": -4.231881141662598, + "step": 3330 + }, + { + "epoch": 0.66, + "learning_rate": 1.5989637343018705e-06, + "logits/chosen": -1.4901988506317139, + "logits/rejected": -1.3862135410308838, + "logps/chosen": -543.125, + "logps/rejected": -639.5211181640625, + "loss": 0.5195, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8310227394104004, + "rewards/margins": 0.9975448846817017, + "rewards/rejected": -3.8285675048828125, + "step": 3340 + }, + { + "epoch": 0.66, + "learning_rate": 1.5830030536893066e-06, + "logits/chosen": -1.5147039890289307, + "logits/rejected": -1.5230892896652222, + "logps/chosen": -561.2169189453125, + "logps/rejected": -679.5860595703125, + "loss": 0.5184, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9714980125427246, + "rewards/margins": 1.08120596408844, + "rewards/rejected": -4.052703857421875, + "step": 3350 + }, + { + "epoch": 0.66, + "learning_rate": 1.5670854432597433e-06, + "logits/chosen": -1.0411088466644287, + "logits/rejected": -1.4086737632751465, + "logps/chosen": -509.64947509765625, + "logps/rejected": -623.0701904296875, + "loss": 0.5161, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.043079376220703, + "rewards/margins": 0.5725584030151367, + "rewards/rejected": -3.615638017654419, + "step": 3360 + }, + { + "epoch": 0.66, + "learning_rate": 1.551211650643144e-06, + "logits/chosen": -1.7322018146514893, + "logits/rejected": -1.328801155090332, + "logps/chosen": -521.364990234375, + "logps/rejected": -571.3067626953125, + "loss": 0.5073, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.0033740997314453, + "rewards/margins": 0.6566835641860962, + "rewards/rejected": -3.660057544708252, + "step": 3370 + }, + { + "epoch": 0.66, + "learning_rate": 1.5353824214114075e-06, + "logits/chosen": -1.5441231727600098, + "logits/rejected": -1.23960280418396, + "logps/chosen": -581.4852294921875, + "logps/rejected": -667.9786376953125, + "loss": 0.488, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.996382474899292, + "rewards/margins": 1.474982500076294, + "rewards/rejected": -4.471364974975586, + "step": 3380 + }, + { + "epoch": 0.67, + "learning_rate": 1.5195984990433437e-06, + "logits/chosen": -1.4372183084487915, + "logits/rejected": -1.19015371799469, + "logps/chosen": -541.4854125976562, + "logps/rejected": -616.9935913085938, + "loss": 0.4854, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.807796001434326, + "rewards/margins": 1.1931097507476807, + "rewards/rejected": -4.000905990600586, + "step": 3390 + }, + { + "epoch": 0.67, + "learning_rate": 1.5038606248897586e-06, + "logits/chosen": -1.525448203086853, + "logits/rejected": -1.3807704448699951, + "logps/chosen": -604.7877807617188, + "logps/rejected": -673.3497924804688, + "loss": 0.498, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8307852745056152, + "rewards/margins": 1.112122893333435, + "rewards/rejected": -3.942908525466919, + "step": 3400 + }, + { + "epoch": 0.67, + "eval_logits/chosen": 0.960801362991333, + "eval_logits/rejected": 1.4781014919281006, + "eval_logps/chosen": -547.4754028320312, + "eval_logps/rejected": -614.0842895507812, + "eval_loss": 0.48787009716033936, + "eval_rewards/accuracies": 0.7321428656578064, + "eval_rewards/chosen": -2.842346668243408, + "eval_rewards/margins": 0.9673130512237549, + "eval_rewards/rejected": -3.809659719467163, + "eval_runtime": 465.7039, + "eval_samples_per_second": 4.295, + "eval_steps_per_second": 0.18, + "step": 3400 + }, + { + "epoch": 0.67, + "learning_rate": 1.4881695381386324e-06, + "logits/chosen": -1.2873122692108154, + "logits/rejected": -0.9784881472587585, + "logps/chosen": -531.7821044921875, + "logps/rejected": -645.1023559570312, + "loss": 0.4981, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.941249132156372, + "rewards/margins": 1.3310232162475586, + "rewards/rejected": -4.272271633148193, + "step": 3410 + }, + { + "epoch": 0.67, + "learning_rate": 1.4725259757803983e-06, + "logits/chosen": -1.3161420822143555, + "logits/rejected": -1.2805860042572021, + "logps/chosen": -540.3150634765625, + "logps/rejected": -688.1172485351562, + "loss": 0.4693, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.891829013824463, + "rewards/margins": 1.5289371013641357, + "rewards/rejected": -4.420766830444336, + "step": 3420 + }, + { + "epoch": 0.67, + "learning_rate": 1.4569306725733313e-06, + "logits/chosen": -1.1628152132034302, + "logits/rejected": -0.943515419960022, + "logps/chosen": -589.7276000976562, + "logps/rejected": -602.8516845703125, + "loss": 0.5659, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.100240468978882, + "rewards/margins": 0.6115292310714722, + "rewards/rejected": -3.7117698192596436, + "step": 3430 + }, + { + "epoch": 0.68, + "learning_rate": 1.4413843610090342e-06, + "logits/chosen": -1.3137940168380737, + "logits/rejected": -1.1455672979354858, + "logps/chosen": -529.7459716796875, + "logps/rejected": -633.06787109375, + "loss": 0.5939, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.8938117027282715, + "rewards/margins": 0.768166184425354, + "rewards/rejected": -3.661977767944336, + "step": 3440 + }, + { + "epoch": 0.68, + "learning_rate": 1.4258877712780333e-06, + "logits/chosen": -1.950718641281128, + "logits/rejected": -1.7199163436889648, + "logps/chosen": -601.2168579101562, + "logps/rejected": -665.1631469726562, + "loss": 0.4123, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4955344200134277, + "rewards/margins": 1.3623603582382202, + "rewards/rejected": -3.8578948974609375, + "step": 3450 + }, + { + "epoch": 0.68, + "learning_rate": 1.410441631235487e-06, + "logits/chosen": -1.2827733755111694, + "logits/rejected": -1.3301194906234741, + "logps/chosen": -592.0504760742188, + "logps/rejected": -645.4057006835938, + "loss": 0.491, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.829989194869995, + "rewards/margins": 1.2216827869415283, + "rewards/rejected": -4.051672458648682, + "step": 3460 + }, + { + "epoch": 0.68, + "learning_rate": 1.3950466663669915e-06, + "logits/chosen": -1.3424937725067139, + "logits/rejected": -1.0797219276428223, + "logps/chosen": -557.3580322265625, + "logps/rejected": -691.0580444335938, + "loss": 0.5238, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.78297758102417, + "rewards/margins": 1.1057649850845337, + "rewards/rejected": -3.888741970062256, + "step": 3470 + }, + { + "epoch": 0.68, + "learning_rate": 1.3797035997545144e-06, + "logits/chosen": -1.8202447891235352, + "logits/rejected": -1.4908593893051147, + "logps/chosen": -573.6468505859375, + "logps/rejected": -617.0345458984375, + "loss": 0.4676, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.885648488998413, + "rewards/margins": 1.005799651145935, + "rewards/rejected": -3.8914477825164795, + "step": 3480 + }, + { + "epoch": 0.69, + "learning_rate": 1.3644131520424241e-06, + "logits/chosen": -1.4213025569915771, + "logits/rejected": -1.0844852924346924, + "logps/chosen": -557.9754638671875, + "logps/rejected": -613.9381713867188, + "loss": 0.5547, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9787003993988037, + "rewards/margins": 1.2177660465240479, + "rewards/rejected": -4.196466445922852, + "step": 3490 + }, + { + "epoch": 0.69, + "learning_rate": 1.3491760414036478e-06, + "logits/chosen": -1.622127890586853, + "logits/rejected": -1.628751516342163, + "logps/chosen": -499.74346923828125, + "logps/rejected": -582.7059326171875, + "loss": 0.4987, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.6528587341308594, + "rewards/margins": 0.9285345077514648, + "rewards/rejected": -3.5813934803009033, + "step": 3500 + }, + { + "epoch": 0.69, + "eval_logits/chosen": 0.8556860685348511, + "eval_logits/rejected": 1.38192617893219, + "eval_logps/chosen": -532.4976806640625, + "eval_logps/rejected": -604.8372192382812, + "eval_loss": 0.4902108907699585, + "eval_rewards/accuracies": 0.730654776096344, + "eval_rewards/chosen": -2.6925694942474365, + "eval_rewards/margins": 1.024619698524475, + "eval_rewards/rejected": -3.717189311981201, + "eval_runtime": 468.3987, + "eval_samples_per_second": 4.27, + "eval_steps_per_second": 0.179, + "step": 3500 + }, + { + "epoch": 0.69, + "learning_rate": 1.3339929835059393e-06, + "logits/chosen": -1.4586498737335205, + "logits/rejected": -1.3849519491195679, + "logps/chosen": -517.4207153320312, + "logps/rejected": -552.0170288085938, + "loss": 0.5863, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.6804182529449463, + "rewards/margins": 0.6542779207229614, + "rewards/rejected": -3.334695816040039, + "step": 3510 + }, + { + "epoch": 0.69, + "learning_rate": 1.3188646914782616e-06, + "logits/chosen": -1.5962340831756592, + "logits/rejected": -1.6393506526947021, + "logps/chosen": -480.1661682128906, + "logps/rejected": -566.5606689453125, + "loss": 0.4092, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3056910037994385, + "rewards/margins": 1.0150161981582642, + "rewards/rejected": -3.320706844329834, + "step": 3520 + }, + { + "epoch": 0.69, + "learning_rate": 1.3037918758772944e-06, + "logits/chosen": -1.2742021083831787, + "logits/rejected": -1.164294958114624, + "logps/chosen": -438.3392639160156, + "logps/rejected": -590.9483032226562, + "loss": 0.4378, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.328174114227295, + "rewards/margins": 1.5971230268478394, + "rewards/rejected": -3.925297260284424, + "step": 3530 + }, + { + "epoch": 0.69, + "learning_rate": 1.288775244654062e-06, + "logits/chosen": -1.6658971309661865, + "logits/rejected": -1.5197378396987915, + "logps/chosen": -534.6138916015625, + "logps/rejected": -623.6290283203125, + "loss": 0.4726, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.696544647216797, + "rewards/margins": 1.2441034317016602, + "rewards/rejected": -3.940647840499878, + "step": 3540 + }, + { + "epoch": 0.7, + "learning_rate": 1.2738155031206772e-06, + "logits/chosen": -1.400800108909607, + "logits/rejected": -1.4668363332748413, + "logps/chosen": -527.6932373046875, + "logps/rejected": -605.9529418945312, + "loss": 0.537, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.713974952697754, + "rewards/margins": 0.906460165977478, + "rewards/rejected": -3.6204352378845215, + "step": 3550 + }, + { + "epoch": 0.7, + "learning_rate": 1.2589133539172193e-06, + "logits/chosen": -1.618035912513733, + "logits/rejected": -1.2135871648788452, + "logps/chosen": -610.2980346679688, + "logps/rejected": -685.7574462890625, + "loss": 0.5337, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.3299853801727295, + "rewards/margins": 0.8561728596687317, + "rewards/rejected": -4.186158180236816, + "step": 3560 + }, + { + "epoch": 0.7, + "learning_rate": 1.2440694969787262e-06, + "logits/chosen": -1.460566759109497, + "logits/rejected": -1.4396517276763916, + "logps/chosen": -508.64617919921875, + "logps/rejected": -639.2283935546875, + "loss": 0.4811, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7212297916412354, + "rewards/margins": 1.1402428150177002, + "rewards/rejected": -3.8614726066589355, + "step": 3570 + }, + { + "epoch": 0.7, + "learning_rate": 1.2292846295023222e-06, + "logits/chosen": -0.9604961276054382, + "logits/rejected": -0.9926923513412476, + "logps/chosen": -546.4825439453125, + "logps/rejected": -696.2088623046875, + "loss": 0.504, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.0244810581207275, + "rewards/margins": 1.4495601654052734, + "rewards/rejected": -4.474040508270264, + "step": 3580 + }, + { + "epoch": 0.7, + "learning_rate": 1.2145594459144745e-06, + "logits/chosen": -1.2483197450637817, + "logits/rejected": -1.1393179893493652, + "logps/chosen": -451.2967224121094, + "logps/rejected": -571.0396728515625, + "loss": 0.4789, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4650654792785645, + "rewards/margins": 1.3195736408233643, + "rewards/rejected": -3.7846388816833496, + "step": 3590 + }, + { + "epoch": 0.71, + "learning_rate": 1.19989463783837e-06, + "logits/chosen": -1.7845804691314697, + "logits/rejected": -1.4175323247909546, + "logps/chosen": -554.6177368164062, + "logps/rejected": -556.827880859375, + "loss": 0.5824, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.5787031650543213, + "rewards/margins": 0.7001349925994873, + "rewards/rejected": -3.2788383960723877, + "step": 3600 + }, + { + "epoch": 0.71, + "eval_logits/chosen": 0.5335975289344788, + "eval_logits/rejected": 1.10366690158844, + "eval_logps/chosen": -519.9661254882812, + "eval_logps/rejected": -592.4444580078125, + "eval_loss": 0.49079516530036926, + "eval_rewards/accuracies": 0.7291666865348816, + "eval_rewards/chosen": -2.567253589630127, + "eval_rewards/margins": 1.0260074138641357, + "eval_rewards/rejected": -3.5932610034942627, + "eval_runtime": 470.105, + "eval_samples_per_second": 4.254, + "eval_steps_per_second": 0.179, + "step": 3600 + }, + { + "epoch": 0.71, + "learning_rate": 1.1852908940614354e-06, + "logits/chosen": -1.411493182182312, + "logits/rejected": -1.0653778314590454, + "logps/chosen": -560.4616088867188, + "logps/rejected": -618.6390380859375, + "loss": 0.5021, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9032797813415527, + "rewards/margins": 1.0411841869354248, + "rewards/rejected": -3.9444642066955566, + "step": 3610 + }, + { + "epoch": 0.71, + "learning_rate": 1.1707489005029877e-06, + "logits/chosen": -1.6911699771881104, + "logits/rejected": -1.442684531211853, + "logps/chosen": -532.0489501953125, + "logps/rejected": -603.7266235351562, + "loss": 0.4439, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5354647636413574, + "rewards/margins": 0.9961854815483093, + "rewards/rejected": -3.5316505432128906, + "step": 3620 + }, + { + "epoch": 0.71, + "learning_rate": 1.1562693401820094e-06, + "logits/chosen": -1.2883455753326416, + "logits/rejected": -1.1316763162612915, + "logps/chosen": -515.0657958984375, + "logps/rejected": -637.5360107421875, + "loss": 0.5508, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.779428720474243, + "rewards/margins": 1.0291640758514404, + "rewards/rejected": -3.8085930347442627, + "step": 3630 + }, + { + "epoch": 0.71, + "learning_rate": 1.1418528931850781e-06, + "logits/chosen": -1.702050805091858, + "logits/rejected": -1.3528387546539307, + "logps/chosen": -614.9039916992188, + "logps/rejected": -675.7921142578125, + "loss": 0.4977, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8123397827148438, + "rewards/margins": 1.1464719772338867, + "rewards/rejected": -3.9588119983673096, + "step": 3640 + }, + { + "epoch": 0.72, + "learning_rate": 1.1275002366344156e-06, + "logits/chosen": -1.5493879318237305, + "logits/rejected": -1.2522351741790771, + "logps/chosen": -499.3773498535156, + "logps/rejected": -630.6110229492188, + "loss": 0.3831, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6639785766601562, + "rewards/margins": 1.3068674802780151, + "rewards/rejected": -3.970846176147461, + "step": 3650 + }, + { + "epoch": 0.72, + "learning_rate": 1.113212044656087e-06, + "logits/chosen": -1.2208716869354248, + "logits/rejected": -0.8246925473213196, + "logps/chosen": -522.56689453125, + "logps/rejected": -544.2653198242188, + "loss": 0.5248, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8272600173950195, + "rewards/margins": 0.7849901914596558, + "rewards/rejected": -3.6122500896453857, + "step": 3660 + }, + { + "epoch": 0.72, + "learning_rate": 1.0989889883483415e-06, + "logits/chosen": -1.6956714391708374, + "logits/rejected": -1.2789404392242432, + "logps/chosen": -505.72564697265625, + "logps/rejected": -618.1361083984375, + "loss": 0.4234, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5855579376220703, + "rewards/margins": 1.574141263961792, + "rewards/rejected": -4.159698963165283, + "step": 3670 + }, + { + "epoch": 0.72, + "learning_rate": 1.0848317357500854e-06, + "logits/chosen": -1.5158052444458008, + "logits/rejected": -1.1376941204071045, + "logps/chosen": -522.5565185546875, + "logps/rejected": -601.7191162109375, + "loss": 0.5567, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8235135078430176, + "rewards/margins": 1.2751871347427368, + "rewards/rejected": -4.098701000213623, + "step": 3680 + }, + { + "epoch": 0.72, + "learning_rate": 1.070740951809508e-06, + "logits/chosen": -1.6038143634796143, + "logits/rejected": -1.3182604312896729, + "logps/chosen": -525.1019897460938, + "logps/rejected": -647.8762817382812, + "loss": 0.4621, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.785275936126709, + "rewards/margins": 1.263718605041504, + "rewards/rejected": -4.048994064331055, + "step": 3690 + }, + { + "epoch": 0.73, + "learning_rate": 1.0567172983528534e-06, + "logits/chosen": -1.3500463962554932, + "logits/rejected": -1.1551401615142822, + "logps/chosen": -501.09039306640625, + "logps/rejected": -653.4024658203125, + "loss": 0.425, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.015165328979492, + "rewards/margins": 1.1308244466781616, + "rewards/rejected": -4.145989894866943, + "step": 3700 + }, + { + "epoch": 0.73, + "eval_logits/chosen": 0.7257053256034851, + "eval_logits/rejected": 1.2902551889419556, + "eval_logps/chosen": -539.9019775390625, + "eval_logps/rejected": -615.5826416015625, + "eval_loss": 0.49059832096099854, + "eval_rewards/accuracies": 0.730654776096344, + "eval_rewards/chosen": -2.7666125297546387, + "eval_rewards/margins": 1.0580310821533203, + "eval_rewards/rejected": -3.82464337348938, + "eval_runtime": 464.5737, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.181, + "step": 3700 + }, + { + "epoch": 0.73, + "learning_rate": 1.0427614340533293e-06, + "logits/chosen": -1.522006630897522, + "logits/rejected": -1.1353822946548462, + "logps/chosen": -548.3036499023438, + "logps/rejected": -561.1580810546875, + "loss": 0.4571, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8132216930389404, + "rewards/margins": 1.0134377479553223, + "rewards/rejected": -3.8266589641571045, + "step": 3710 + }, + { + "epoch": 0.73, + "learning_rate": 1.0288740144001722e-06, + "logits/chosen": -0.8532799482345581, + "logits/rejected": -1.0594323873519897, + "logps/chosen": -472.4169921875, + "logps/rejected": -640.5179443359375, + "loss": 0.454, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8810791969299316, + "rewards/margins": 1.3560783863067627, + "rewards/rejected": -4.237157344818115, + "step": 3720 + }, + { + "epoch": 0.73, + "learning_rate": 1.0150556916678634e-06, + "logits/chosen": -1.344792366027832, + "logits/rejected": -1.2084754705429077, + "logps/chosen": -511.25238037109375, + "logps/rejected": -632.6343383789062, + "loss": 0.385, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9333691596984863, + "rewards/margins": 1.295652985572815, + "rewards/rejected": -4.229022026062012, + "step": 3730 + }, + { + "epoch": 0.73, + "learning_rate": 1.0013071148854861e-06, + "logits/chosen": -1.4463289976119995, + "logits/rejected": -1.3817778825759888, + "logps/chosen": -558.93603515625, + "logps/rejected": -659.0678100585938, + "loss": 0.5267, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1432852745056152, + "rewards/margins": 1.226470708847046, + "rewards/rejected": -4.36975622177124, + "step": 3740 + }, + { + "epoch": 0.74, + "learning_rate": 9.876289298062478e-07, + "logits/chosen": -1.5644800662994385, + "logits/rejected": -1.373430609703064, + "logps/chosen": -536.2015380859375, + "logps/rejected": -613.3848876953125, + "loss": 0.4981, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.0300514698028564, + "rewards/margins": 0.7052406668663025, + "rewards/rejected": -3.735292434692383, + "step": 3750 + }, + { + "epoch": 0.74, + "learning_rate": 9.740217788771453e-07, + "logits/chosen": -1.371816635131836, + "logits/rejected": -1.263338565826416, + "logps/chosen": -574.6948852539062, + "logps/rejected": -703.0294799804688, + "loss": 0.5132, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.1160013675689697, + "rewards/margins": 1.5144450664520264, + "rewards/rejected": -4.630446434020996, + "step": 3760 + }, + { + "epoch": 0.74, + "learning_rate": 9.604863012087904e-07, + "logits/chosen": -1.4769089221954346, + "logits/rejected": -1.448307752609253, + "logps/chosen": -574.9603271484375, + "logps/rejected": -687.00341796875, + "loss": 0.4775, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8347721099853516, + "rewards/margins": 1.0564663410186768, + "rewards/rejected": -3.8912384510040283, + "step": 3770 + }, + { + "epoch": 0.74, + "learning_rate": 9.470231325453958e-07, + "logits/chosen": -1.3595274686813354, + "logits/rejected": -1.3797645568847656, + "logps/chosen": -597.6848754882812, + "logps/rejected": -693.9461059570312, + "loss": 0.4352, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.229882001876831, + "rewards/margins": 1.3679691553115845, + "rewards/rejected": -4.597851276397705, + "step": 3780 + }, + { + "epoch": 0.74, + "learning_rate": 9.336329052349089e-07, + "logits/chosen": -1.5314759016036987, + "logits/rejected": -1.400506854057312, + "logps/chosen": -585.5628051757812, + "logps/rejected": -757.356689453125, + "loss": 0.43, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8826019763946533, + "rewards/margins": 1.7076170444488525, + "rewards/rejected": -4.590218544006348, + "step": 3790 + }, + { + "epoch": 0.75, + "learning_rate": 9.203162481993175e-07, + "logits/chosen": -1.3001272678375244, + "logits/rejected": -0.6133405566215515, + "logps/chosen": -554.6573486328125, + "logps/rejected": -680.6116943359375, + "loss": 0.4756, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.721282482147217, + "rewards/margins": 1.9358084201812744, + "rewards/rejected": -4.65709114074707, + "step": 3800 + }, + { + "epoch": 0.75, + "eval_logits/chosen": 0.9387251734733582, + "eval_logits/rejected": 1.50146484375, + "eval_logps/chosen": -550.5606689453125, + "eval_logps/rejected": -629.0961303710938, + "eval_loss": 0.4916338324546814, + "eval_rewards/accuracies": 0.7291666865348816, + "eval_rewards/chosen": -2.873199939727783, + "eval_rewards/margins": 1.0865780115127563, + "eval_rewards/rejected": -3.959777355194092, + "eval_runtime": 473.6033, + "eval_samples_per_second": 4.223, + "eval_steps_per_second": 0.177, + "step": 3800 + }, + { + "epoch": 0.75, + "learning_rate": 9.070737869051044e-07, + "logits/chosen": -1.2892731428146362, + "logits/rejected": -1.412314772605896, + "logps/chosen": -556.5120239257812, + "logps/rejected": -668.5825805664062, + "loss": 0.54, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.2580204010009766, + "rewards/margins": 0.9198251962661743, + "rewards/rejected": -4.1778459548950195, + "step": 3810 + }, + { + "epoch": 0.75, + "learning_rate": 8.939061433338722e-07, + "logits/chosen": -1.6973645687103271, + "logits/rejected": -1.2744815349578857, + "logps/chosen": -602.6815185546875, + "logps/rejected": -683.7181396484375, + "loss": 0.4363, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.169668674468994, + "rewards/margins": 1.1083095073699951, + "rewards/rejected": -4.277978420257568, + "step": 3820 + }, + { + "epoch": 0.75, + "learning_rate": 8.808139359531332e-07, + "logits/chosen": -1.469407558441162, + "logits/rejected": -1.1006393432617188, + "logps/chosen": -486.2688903808594, + "logps/rejected": -569.169677734375, + "loss": 0.4169, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.552382707595825, + "rewards/margins": 1.328578233718872, + "rewards/rejected": -3.8809609413146973, + "step": 3830 + }, + { + "epoch": 0.75, + "learning_rate": 8.677977796872541e-07, + "logits/chosen": -1.5187252759933472, + "logits/rejected": -1.0123035907745361, + "logps/chosen": -576.05908203125, + "logps/rejected": -661.7920532226562, + "loss": 0.5426, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.05285382270813, + "rewards/margins": 1.5546613931655884, + "rewards/rejected": -4.60751485824585, + "step": 3840 + }, + { + "epoch": 0.76, + "learning_rate": 8.548582858885787e-07, + "logits/chosen": -1.2918612957000732, + "logits/rejected": -1.4479345083236694, + "logps/chosen": -565.8729248046875, + "logps/rejected": -629.3079833984375, + "loss": 0.5346, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.0743348598480225, + "rewards/margins": 0.7325077056884766, + "rewards/rejected": -3.80684232711792, + "step": 3850 + }, + { + "epoch": 0.76, + "learning_rate": 8.419960623087129e-07, + "logits/chosen": -1.4159595966339111, + "logits/rejected": -1.4038660526275635, + "logps/chosen": -540.0975952148438, + "logps/rejected": -623.6644287109375, + "loss": 0.4268, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.547733783721924, + "rewards/margins": 0.9830751419067383, + "rewards/rejected": -3.530808925628662, + "step": 3860 + }, + { + "epoch": 0.76, + "learning_rate": 8.292117130699767e-07, + "logits/chosen": -1.3750841617584229, + "logits/rejected": -0.923875629901886, + "logps/chosen": -544.1021728515625, + "logps/rejected": -588.0936279296875, + "loss": 0.5068, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6934802532196045, + "rewards/margins": 0.9981153607368469, + "rewards/rejected": -3.6915955543518066, + "step": 3870 + }, + { + "epoch": 0.76, + "learning_rate": 8.165058386370314e-07, + "logits/chosen": -1.3146508932113647, + "logits/rejected": -1.3720710277557373, + "logps/chosen": -520.4107055664062, + "logps/rejected": -645.3299560546875, + "loss": 0.4759, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.5611908435821533, + "rewards/margins": 1.5067590475082397, + "rewards/rejected": -4.067950248718262, + "step": 3880 + }, + { + "epoch": 0.76, + "learning_rate": 8.038790357886783e-07, + "logits/chosen": -1.5785200595855713, + "logits/rejected": -1.4798448085784912, + "logps/chosen": -547.60595703125, + "logps/rejected": -614.8184814453125, + "loss": 0.5455, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.773311138153076, + "rewards/margins": 0.8587993383407593, + "rewards/rejected": -3.632110118865967, + "step": 3890 + }, + { + "epoch": 0.77, + "learning_rate": 7.913318975898238e-07, + "logits/chosen": -1.518733263015747, + "logits/rejected": -1.3821210861206055, + "logps/chosen": -609.395263671875, + "logps/rejected": -702.1522216796875, + "loss": 0.4597, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.131622791290283, + "rewards/margins": 1.0383737087249756, + "rewards/rejected": -4.169996738433838, + "step": 3900 + }, + { + "epoch": 0.77, + "eval_logits/chosen": 0.7636324763298035, + "eval_logits/rejected": 1.3349512815475464, + "eval_logps/chosen": -549.4085693359375, + "eval_logps/rejected": -627.3712158203125, + "eval_loss": 0.48964768648147583, + "eval_rewards/accuracies": 0.7276785969734192, + "eval_rewards/chosen": -2.8616786003112793, + "eval_rewards/margins": 1.0808496475219727, + "eval_rewards/rejected": -3.942528486251831, + "eval_runtime": 474.3988, + "eval_samples_per_second": 4.216, + "eval_steps_per_second": 0.177, + "step": 3900 + }, + { + "epoch": 0.77, + "learning_rate": 7.788650133636291e-07, + "logits/chosen": -1.2482768297195435, + "logits/rejected": -1.1767711639404297, + "logps/chosen": -543.4593505859375, + "logps/rejected": -596.6537475585938, + "loss": 0.5069, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.8678791522979736, + "rewards/margins": 1.308966875076294, + "rewards/rejected": -4.176846504211426, + "step": 3910 + }, + { + "epoch": 0.77, + "learning_rate": 7.664789686638272e-07, + "logits/chosen": -1.4836885929107666, + "logits/rejected": -1.18732488155365, + "logps/chosen": -538.8760986328125, + "logps/rejected": -629.8192138671875, + "loss": 0.458, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.757962226867676, + "rewards/margins": 1.2660038471221924, + "rewards/rejected": -4.023966312408447, + "step": 3920 + }, + { + "epoch": 0.77, + "learning_rate": 7.541743452472194e-07, + "logits/chosen": -1.8099521398544312, + "logits/rejected": -1.2252228260040283, + "logps/chosen": -555.2513427734375, + "logps/rejected": -627.4388427734375, + "loss": 0.5447, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.7958285808563232, + "rewards/margins": 1.049216866493225, + "rewards/rejected": -3.845045566558838, + "step": 3930 + }, + { + "epoch": 0.77, + "learning_rate": 7.41951721046357e-07, + "logits/chosen": -1.4922353029251099, + "logits/rejected": -1.045175313949585, + "logps/chosen": -594.3184814453125, + "logps/rejected": -714.8767700195312, + "loss": 0.4802, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1308560371398926, + "rewards/margins": 1.3848567008972168, + "rewards/rejected": -4.515712738037109, + "step": 3940 + }, + { + "epoch": 0.78, + "learning_rate": 7.298116701423874e-07, + "logits/chosen": -1.6804969310760498, + "logits/rejected": -1.3681905269622803, + "logps/chosen": -653.5281982421875, + "logps/rejected": -714.237060546875, + "loss": 0.4969, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.151542901992798, + "rewards/margins": 1.1915686130523682, + "rewards/rejected": -4.343111515045166, + "step": 3950 + }, + { + "epoch": 0.78, + "learning_rate": 7.177547627380987e-07, + "logits/chosen": -1.7028090953826904, + "logits/rejected": -1.2910749912261963, + "logps/chosen": -559.255859375, + "logps/rejected": -671.7525634765625, + "loss": 0.494, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8281595706939697, + "rewards/margins": 1.2134965658187866, + "rewards/rejected": -4.041655540466309, + "step": 3960 + }, + { + "epoch": 0.78, + "learning_rate": 7.057815651311323e-07, + "logits/chosen": -1.2011252641677856, + "logits/rejected": -1.1789474487304688, + "logps/chosen": -517.1002197265625, + "logps/rejected": -617.9232788085938, + "loss": 0.4652, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9817757606506348, + "rewards/margins": 1.4597489833831787, + "rewards/rejected": -4.441524505615234, + "step": 3970 + }, + { + "epoch": 0.78, + "learning_rate": 6.93892639687386e-07, + "logits/chosen": -1.2308969497680664, + "logits/rejected": -1.37088143825531, + "logps/chosen": -484.86016845703125, + "logps/rejected": -606.0040283203125, + "loss": 0.4863, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.809051036834717, + "rewards/margins": 0.9818013906478882, + "rewards/rejected": -3.7908523082733154, + "step": 3980 + }, + { + "epoch": 0.78, + "learning_rate": 6.820885448146041e-07, + "logits/chosen": -1.4448660612106323, + "logits/rejected": -1.356806993484497, + "logps/chosen": -596.8998413085938, + "logps/rejected": -694.8192138671875, + "loss": 0.5442, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.2882466316223145, + "rewards/margins": 1.0220199823379517, + "rewards/rejected": -4.310266971588135, + "step": 3990 + }, + { + "epoch": 0.79, + "learning_rate": 6.703698349361437e-07, + "logits/chosen": -1.4850349426269531, + "logits/rejected": -1.4774724245071411, + "logps/chosen": -545.3355712890625, + "logps/rejected": -660.8809814453125, + "loss": 0.4649, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9125723838806152, + "rewards/margins": 0.9985507130622864, + "rewards/rejected": -3.9111227989196777, + "step": 4000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": 0.7213255167007446, + "eval_logits/rejected": 1.2903335094451904, + "eval_logps/chosen": -550.0615234375, + "eval_logps/rejected": -626.822998046875, + "eval_loss": 0.4885156452655792, + "eval_rewards/accuracies": 0.7232142686843872, + "eval_rewards/chosen": -2.868208408355713, + "eval_rewards/margins": 1.0688380002975464, + "eval_rewards/rejected": -3.937046527862549, + "eval_runtime": 474.6508, + "eval_samples_per_second": 4.214, + "eval_steps_per_second": 0.177, + "step": 4000 + }, + { + "epoch": 0.79, + "learning_rate": 6.587370604649373e-07, + "logits/chosen": -1.7546825408935547, + "logits/rejected": -1.379854440689087, + "logps/chosen": -639.4371337890625, + "logps/rejected": -648.8902587890625, + "loss": 0.4445, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.9304933547973633, + "rewards/margins": 0.9470575451850891, + "rewards/rejected": -3.877551317214966, + "step": 4010 + }, + { + "epoch": 0.79, + "learning_rate": 6.471907677776426e-07, + "logits/chosen": -1.4482325315475464, + "logits/rejected": -1.013795256614685, + "logps/chosen": -637.1885986328125, + "logps/rejected": -682.3165893554688, + "loss": 0.456, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1467432975769043, + "rewards/margins": 1.3057441711425781, + "rewards/rejected": -4.452487945556641, + "step": 4020 + }, + { + "epoch": 0.79, + "learning_rate": 6.357314991889757e-07, + "logits/chosen": -1.1399719715118408, + "logits/rejected": -1.3395874500274658, + "logps/chosen": -552.9012451171875, + "logps/rejected": -661.7059326171875, + "loss": 0.4233, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.0356509685516357, + "rewards/margins": 1.240136981010437, + "rewards/rejected": -4.275787830352783, + "step": 4030 + }, + { + "epoch": 0.79, + "learning_rate": 6.243597929262404e-07, + "logits/chosen": -1.6289745569229126, + "logits/rejected": -1.1411449909210205, + "logps/chosen": -638.5223999023438, + "logps/rejected": -637.6964111328125, + "loss": 0.5376, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.073040008544922, + "rewards/margins": 0.9560378193855286, + "rewards/rejected": -4.029077529907227, + "step": 4040 + }, + { + "epoch": 0.79, + "learning_rate": 6.130761831040522e-07, + "logits/chosen": -1.2583967447280884, + "logits/rejected": -1.3109838962554932, + "logps/chosen": -503.34527587890625, + "logps/rejected": -617.5255126953125, + "loss": 0.497, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6558876037597656, + "rewards/margins": 1.1685144901275635, + "rewards/rejected": -3.82440185546875, + "step": 4050 + }, + { + "epoch": 0.8, + "learning_rate": 6.018811996992455e-07, + "logits/chosen": -1.5982011556625366, + "logits/rejected": -0.981221079826355, + "logps/chosen": -584.2902221679688, + "logps/rejected": -656.946044921875, + "loss": 0.4365, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.050417423248291, + "rewards/margins": 1.3010116815567017, + "rewards/rejected": -4.351428985595703, + "step": 4060 + }, + { + "epoch": 0.8, + "learning_rate": 5.907753685259865e-07, + "logits/chosen": -1.481526494026184, + "logits/rejected": -1.2903486490249634, + "logps/chosen": -581.9069213867188, + "logps/rejected": -682.1727294921875, + "loss": 0.5691, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.04373836517334, + "rewards/margins": 1.1073167324066162, + "rewards/rejected": -4.151054859161377, + "step": 4070 + }, + { + "epoch": 0.8, + "learning_rate": 5.797592112110734e-07, + "logits/chosen": -1.7861783504486084, + "logits/rejected": -1.5701062679290771, + "logps/chosen": -582.6986083984375, + "logps/rejected": -750.8484497070312, + "loss": 0.5171, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.053048849105835, + "rewards/margins": 1.0844833850860596, + "rewards/rejected": -4.137532711029053, + "step": 4080 + }, + { + "epoch": 0.8, + "learning_rate": 5.688332451694356e-07, + "logits/chosen": -1.3515706062316895, + "logits/rejected": -1.1656490564346313, + "logps/chosen": -552.8885498046875, + "logps/rejected": -614.0411376953125, + "loss": 0.5751, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6918797492980957, + "rewards/margins": 0.9679886102676392, + "rewards/rejected": -3.6598687171936035, + "step": 4090 + }, + { + "epoch": 0.8, + "learning_rate": 5.579979835798361e-07, + "logits/chosen": -1.4821619987487793, + "logits/rejected": -1.256227970123291, + "logps/chosen": -478.2623596191406, + "logps/rejected": -660.5684814453125, + "loss": 0.4689, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.627953052520752, + "rewards/margins": 1.5396177768707275, + "rewards/rejected": -4.167571067810059, + "step": 4100 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 0.6762639284133911, + "eval_logits/rejected": 1.249507188796997, + "eval_logps/chosen": -547.4949951171875, + "eval_logps/rejected": -623.7166137695312, + "eval_loss": 0.48798251152038574, + "eval_rewards/accuracies": 0.7232142686843872, + "eval_rewards/chosen": -2.842543125152588, + "eval_rewards/margins": 1.0634390115737915, + "eval_rewards/rejected": -3.905982255935669, + "eval_runtime": 475.2471, + "eval_samples_per_second": 4.208, + "eval_steps_per_second": 0.177, + "step": 4100 + }, + { + "epoch": 0.81, + "learning_rate": 5.472539353607612e-07, + "logits/chosen": -1.6128085851669312, + "logits/rejected": -1.0963213443756104, + "logps/chosen": -575.4742431640625, + "logps/rejected": -641.0703735351562, + "loss": 0.4269, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5407283306121826, + "rewards/margins": 1.1561939716339111, + "rewards/rejected": -3.6969223022460938, + "step": 4110 + }, + { + "epoch": 0.81, + "learning_rate": 5.366016051465245e-07, + "logits/chosen": -1.2906649112701416, + "logits/rejected": -0.9485572576522827, + "logps/chosen": -501.63824462890625, + "logps/rejected": -602.8499145507812, + "loss": 0.5234, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7279627323150635, + "rewards/margins": 1.3095009326934814, + "rewards/rejected": -4.037463188171387, + "step": 4120 + }, + { + "epoch": 0.81, + "learning_rate": 5.260414932635588e-07, + "logits/chosen": -1.6067225933074951, + "logits/rejected": -1.5598185062408447, + "logps/chosen": -568.68798828125, + "logps/rejected": -691.7966918945312, + "loss": 0.4267, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.682281970977783, + "rewards/margins": 1.302312970161438, + "rewards/rejected": -3.9845948219299316, + "step": 4130 + }, + { + "epoch": 0.81, + "learning_rate": 5.155740957069186e-07, + "logits/chosen": -1.5909394025802612, + "logits/rejected": -1.1699573993682861, + "logps/chosen": -647.465087890625, + "logps/rejected": -690.4301147460938, + "loss": 0.5324, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.0774474143981934, + "rewards/margins": 1.313281774520874, + "rewards/rejected": -4.390728950500488, + "step": 4140 + }, + { + "epoch": 0.81, + "learning_rate": 5.051999041169869e-07, + "logits/chosen": -1.702135443687439, + "logits/rejected": -1.2952522039413452, + "logps/chosen": -592.5914916992188, + "logps/rejected": -648.1393432617188, + "loss": 0.6196, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9406609535217285, + "rewards/margins": 1.1990137100219727, + "rewards/rejected": -4.139674663543701, + "step": 4150 + }, + { + "epoch": 0.82, + "learning_rate": 4.949194057563783e-07, + "logits/chosen": -1.4475417137145996, + "logits/rejected": -1.2416096925735474, + "logps/chosen": -536.6510009765625, + "logps/rejected": -681.0042724609375, + "loss": 0.4172, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.51859712600708, + "rewards/margins": 1.5448805093765259, + "rewards/rejected": -4.063477516174316, + "step": 4160 + }, + { + "epoch": 0.82, + "learning_rate": 4.847330834870551e-07, + "logits/chosen": -1.7177613973617554, + "logits/rejected": -1.2784730195999146, + "logps/chosen": -523.9808349609375, + "logps/rejected": -643.7147216796875, + "loss": 0.4968, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.432467460632324, + "rewards/margins": 1.484404444694519, + "rewards/rejected": -3.916872024536133, + "step": 4170 + }, + { + "epoch": 0.82, + "learning_rate": 4.746414157476506e-07, + "logits/chosen": -1.6163629293441772, + "logits/rejected": -1.0138940811157227, + "logps/chosen": -532.6239013671875, + "logps/rejected": -668.1685791015625, + "loss": 0.4849, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.767432451248169, + "rewards/margins": 1.6345386505126953, + "rewards/rejected": -4.401970863342285, + "step": 4180 + }, + { + "epoch": 0.82, + "learning_rate": 4.6464487653099216e-07, + "logits/chosen": -1.466838002204895, + "logits/rejected": -1.3406709432601929, + "logps/chosen": -547.8306274414062, + "logps/rejected": -684.18408203125, + "loss": 0.5063, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.955652952194214, + "rewards/margins": 1.1953189373016357, + "rewards/rejected": -4.15097188949585, + "step": 4190 + }, + { + "epoch": 0.82, + "learning_rate": 4.5474393536184214e-07, + "logits/chosen": -1.5597014427185059, + "logits/rejected": -1.5640283823013306, + "logps/chosen": -577.12451171875, + "logps/rejected": -654.9713134765625, + "loss": 0.4275, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.894909620285034, + "rewards/margins": 0.7931472659111023, + "rewards/rejected": -3.6880574226379395, + "step": 4200 + }, + { + "epoch": 0.82, + "eval_logits/chosen": 0.7330583930015564, + "eval_logits/rejected": 1.3066866397857666, + "eval_logps/chosen": -549.9531860351562, + "eval_logps/rejected": -626.6477661132812, + "eval_loss": 0.487714946269989, + "eval_rewards/accuracies": 0.7232142686843872, + "eval_rewards/chosen": -2.8671252727508545, + "eval_rewards/margins": 1.0681687593460083, + "eval_rewards/rejected": -3.9352939128875732, + "eval_runtime": 467.6637, + "eval_samples_per_second": 4.277, + "eval_steps_per_second": 0.18, + "step": 4200 + }, + { + "epoch": 0.83, + "learning_rate": 4.449390572748449e-07, + "logits/chosen": -1.133298635482788, + "logits/rejected": -0.997468113899231, + "logps/chosen": -500.6832580566406, + "logps/rejected": -642.2027587890625, + "loss": 0.5158, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7464077472686768, + "rewards/margins": 1.693267583847046, + "rewards/rejected": -4.4396748542785645, + "step": 4210 + }, + { + "epoch": 0.83, + "learning_rate": 4.352307027926828e-07, + "logits/chosen": -1.6892297267913818, + "logits/rejected": -1.145814299583435, + "logps/chosen": -536.3302001953125, + "logps/rejected": -664.6512451171875, + "loss": 0.3924, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.630311965942383, + "rewards/margins": 1.869051218032837, + "rewards/rejected": -4.499362945556641, + "step": 4220 + }, + { + "epoch": 0.83, + "learning_rate": 4.2561932790444597e-07, + "logits/chosen": -1.4355401992797852, + "logits/rejected": -1.4584405422210693, + "logps/chosen": -499.212158203125, + "logps/rejected": -660.339111328125, + "loss": 0.5618, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6407253742218018, + "rewards/margins": 1.5666038990020752, + "rewards/rejected": -4.207329273223877, + "step": 4230 + }, + { + "epoch": 0.83, + "learning_rate": 4.1610538404421837e-07, + "logits/chosen": -1.648999810218811, + "logits/rejected": -1.5709302425384521, + "logps/chosen": -538.1671752929688, + "logps/rejected": -669.4725341796875, + "loss": 0.4689, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.68080472946167, + "rewards/margins": 1.298196792602539, + "rewards/rejected": -3.979001998901367, + "step": 4240 + }, + { + "epoch": 0.83, + "learning_rate": 4.0668931806987e-07, + "logits/chosen": -1.6326240301132202, + "logits/rejected": -1.6104761362075806, + "logps/chosen": -607.5662231445312, + "logps/rejected": -701.7105102539062, + "loss": 0.4732, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.885305166244507, + "rewards/margins": 1.4834986925125122, + "rewards/rejected": -4.368803977966309, + "step": 4250 + }, + { + "epoch": 0.84, + "learning_rate": 3.9737157224207265e-07, + "logits/chosen": -1.6464459896087646, + "logits/rejected": -1.6791051626205444, + "logps/chosen": -548.1013793945312, + "logps/rejected": -617.052734375, + "loss": 0.4732, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.502802848815918, + "rewards/margins": 1.1017303466796875, + "rewards/rejected": -3.6045334339141846, + "step": 4260 + }, + { + "epoch": 0.84, + "learning_rate": 3.8815258420352385e-07, + "logits/chosen": -1.4971342086791992, + "logits/rejected": -1.3662328720092773, + "logps/chosen": -561.4806518554688, + "logps/rejected": -697.7352905273438, + "loss": 0.411, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7723212242126465, + "rewards/margins": 1.7536299228668213, + "rewards/rejected": -4.525950908660889, + "step": 4270 + }, + { + "epoch": 0.84, + "learning_rate": 3.7903278695839456e-07, + "logits/chosen": -1.6506578922271729, + "logits/rejected": -1.4599170684814453, + "logps/chosen": -518.5300903320312, + "logps/rejected": -640.587646484375, + "loss": 0.4808, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7276813983917236, + "rewards/margins": 1.2205212116241455, + "rewards/rejected": -3.9482028484344482, + "step": 4280 + }, + { + "epoch": 0.84, + "learning_rate": 3.7001260885198925e-07, + "logits/chosen": -1.7103252410888672, + "logits/rejected": -1.1314291954040527, + "logps/chosen": -595.9619750976562, + "logps/rejected": -664.2386474609375, + "loss": 0.4952, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.961540937423706, + "rewards/margins": 1.683045744895935, + "rewards/rejected": -4.644586086273193, + "step": 4290 + }, + { + "epoch": 0.84, + "learning_rate": 3.610924735506274e-07, + "logits/chosen": -1.5473103523254395, + "logits/rejected": -1.102410078048706, + "logps/chosen": -491.91522216796875, + "logps/rejected": -658.9654541015625, + "loss": 0.5325, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.771312713623047, + "rewards/margins": 1.5326588153839111, + "rewards/rejected": -4.303971290588379, + "step": 4300 + }, + { + "epoch": 0.84, + "eval_logits/chosen": 0.8069880604743958, + "eval_logits/rejected": 1.3795324563980103, + "eval_logps/chosen": -551.7904663085938, + "eval_logps/rejected": -629.420166015625, + "eval_loss": 0.4881390929222107, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -2.885497570037842, + "eval_rewards/margins": 1.0775203704833984, + "eval_rewards/rejected": -3.9630184173583984, + "eval_runtime": 468.0535, + "eval_samples_per_second": 4.273, + "eval_steps_per_second": 0.179, + "step": 4300 + }, + { + "epoch": 0.85, + "learning_rate": 3.5227280002174626e-07, + "logits/chosen": -1.4590882062911987, + "logits/rejected": -1.0051017999649048, + "logps/chosen": -617.4918823242188, + "logps/rejected": -654.9302978515625, + "loss": 0.5026, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.8475120067596436, + "rewards/margins": 1.0741074085235596, + "rewards/rejected": -3.9216198921203613, + "step": 4310 + }, + { + "epoch": 0.85, + "learning_rate": 3.4355400251421977e-07, + "logits/chosen": -1.5790798664093018, + "logits/rejected": -1.4507157802581787, + "logps/chosen": -529.469970703125, + "logps/rejected": -670.0650634765625, + "loss": 0.4777, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.685770273208618, + "rewards/margins": 1.1587498188018799, + "rewards/rejected": -3.844520092010498, + "step": 4320 + }, + { + "epoch": 0.85, + "learning_rate": 3.3493649053890325e-07, + "logits/chosen": -1.507348656654358, + "logits/rejected": -1.4271577596664429, + "logps/chosen": -546.6571655273438, + "logps/rejected": -621.9240112304688, + "loss": 0.4806, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.7799060344696045, + "rewards/margins": 0.9522191882133484, + "rewards/rejected": -3.732125759124756, + "step": 4330 + }, + { + "epoch": 0.85, + "learning_rate": 3.2642066884940064e-07, + "logits/chosen": -1.4219774007797241, + "logits/rejected": -1.054166316986084, + "logps/chosen": -609.1405639648438, + "logps/rejected": -705.8851928710938, + "loss": 0.6196, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.914445161819458, + "rewards/margins": 1.4480302333831787, + "rewards/rejected": -4.362475395202637, + "step": 4340 + }, + { + "epoch": 0.85, + "learning_rate": 3.1800693742305074e-07, + "logits/chosen": -1.2960752248764038, + "logits/rejected": -1.182204008102417, + "logps/chosen": -497.4002990722656, + "logps/rejected": -645.9159545898438, + "loss": 0.4492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8490395545959473, + "rewards/margins": 1.3345952033996582, + "rewards/rejected": -4.1836347579956055, + "step": 4350 + }, + { + "epoch": 0.86, + "learning_rate": 3.0969569144214147e-07, + "logits/chosen": -1.3932464122772217, + "logits/rejected": -1.5933971405029297, + "logps/chosen": -510.855712890625, + "logps/rejected": -639.6701049804688, + "loss": 0.5679, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6335511207580566, + "rewards/margins": 1.0062687397003174, + "rewards/rejected": -3.639819383621216, + "step": 4360 + }, + { + "epoch": 0.86, + "learning_rate": 3.014873212753516e-07, + "logits/chosen": -1.4604628086090088, + "logits/rejected": -1.1423250436782837, + "logps/chosen": -519.5737915039062, + "logps/rejected": -592.70458984375, + "loss": 0.4433, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.1641898155212402, + "rewards/margins": 0.8947893381118774, + "rewards/rejected": -4.0589799880981445, + "step": 4370 + }, + { + "epoch": 0.86, + "learning_rate": 2.933822124594124e-07, + "logits/chosen": -1.4781441688537598, + "logits/rejected": -1.0364634990692139, + "logps/chosen": -474.93829345703125, + "logps/rejected": -620.2733154296875, + "loss": 0.4233, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6123297214508057, + "rewards/margins": 1.690799355506897, + "rewards/rejected": -4.303129196166992, + "step": 4380 + }, + { + "epoch": 0.86, + "learning_rate": 2.8538074568099954e-07, + "logits/chosen": -1.3508070707321167, + "logits/rejected": -0.8607912063598633, + "logps/chosen": -572.3214721679688, + "logps/rejected": -780.388671875, + "loss": 0.5175, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.1330060958862305, + "rewards/margins": 1.9722713232040405, + "rewards/rejected": -5.1052775382995605, + "step": 4390 + }, + { + "epoch": 0.86, + "learning_rate": 2.774832967588556e-07, + "logits/chosen": -1.3934319019317627, + "logits/rejected": -1.3498982191085815, + "logps/chosen": -520.5974731445312, + "logps/rejected": -616.978271484375, + "loss": 0.532, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.889716625213623, + "rewards/margins": 0.6850132942199707, + "rewards/rejected": -3.5747299194335938, + "step": 4400 + }, + { + "epoch": 0.86, + "eval_logits/chosen": 0.7732266187667847, + "eval_logits/rejected": 1.3435120582580566, + "eval_logps/chosen": -549.9609985351562, + "eval_logps/rejected": -627.1785278320312, + "eval_loss": 0.48813939094543457, + "eval_rewards/accuracies": 0.7276785969734192, + "eval_rewards/chosen": -2.8672022819519043, + "eval_rewards/margins": 1.073399543762207, + "eval_rewards/rejected": -3.9406018257141113, + "eval_runtime": 470.3533, + "eval_samples_per_second": 4.252, + "eval_steps_per_second": 0.179, + "step": 4400 + }, + { + "epoch": 0.87, + "learning_rate": 2.6969023662613473e-07, + "logits/chosen": -1.2395694255828857, + "logits/rejected": -1.2353589534759521, + "logps/chosen": -604.3405151367188, + "logps/rejected": -702.7529296875, + "loss": 0.4027, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2787222862243652, + "rewards/margins": 1.1157318353652954, + "rewards/rejected": -4.394454002380371, + "step": 4410 + }, + { + "epoch": 0.87, + "learning_rate": 2.6200193131298376e-07, + "logits/chosen": -1.449593186378479, + "logits/rejected": -1.4640024900436401, + "logps/chosen": -543.8770751953125, + "logps/rejected": -664.721923828125, + "loss": 0.4854, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.8391997814178467, + "rewards/margins": 1.157238245010376, + "rewards/rejected": -3.9964377880096436, + "step": 4420 + }, + { + "epoch": 0.87, + "learning_rate": 2.544187419293462e-07, + "logits/chosen": -1.5657931566238403, + "logits/rejected": -1.2472331523895264, + "logps/chosen": -567.7420654296875, + "logps/rejected": -664.3908081054688, + "loss": 0.5379, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.110095500946045, + "rewards/margins": 1.1092383861541748, + "rewards/rejected": -4.219334125518799, + "step": 4430 + }, + { + "epoch": 0.87, + "learning_rate": 2.469410246480067e-07, + "logits/chosen": -1.2664387226104736, + "logits/rejected": -1.014687180519104, + "logps/chosen": -577.5288696289062, + "logps/rejected": -627.8881225585938, + "loss": 0.505, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1275877952575684, + "rewards/margins": 1.2311753034591675, + "rewards/rejected": -4.358763217926025, + "step": 4440 + }, + { + "epoch": 0.87, + "learning_rate": 2.3956913068785697e-07, + "logits/chosen": -1.8588111400604248, + "logits/rejected": -1.5784744024276733, + "logps/chosen": -606.2607421875, + "logps/rejected": -602.3804931640625, + "loss": 0.5029, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.689060926437378, + "rewards/margins": 0.9545005559921265, + "rewards/rejected": -3.643561601638794, + "step": 4450 + }, + { + "epoch": 0.88, + "learning_rate": 2.3230340629740166e-07, + "logits/chosen": -1.300377607345581, + "logits/rejected": -1.225534200668335, + "logps/chosen": -591.2781372070312, + "logps/rejected": -612.3079223632812, + "loss": 0.5624, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.157484769821167, + "rewards/margins": 0.8397257924079895, + "rewards/rejected": -3.9972102642059326, + "step": 4460 + }, + { + "epoch": 0.88, + "learning_rate": 2.2514419273849674e-07, + "logits/chosen": -1.5014206171035767, + "logits/rejected": -1.5819671154022217, + "logps/chosen": -518.7572021484375, + "logps/rejected": -617.6094360351562, + "loss": 0.5182, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.928231954574585, + "rewards/margins": 0.8245126605033875, + "rewards/rejected": -3.752744197845459, + "step": 4470 + }, + { + "epoch": 0.88, + "learning_rate": 2.1809182627031883e-07, + "logits/chosen": -1.4660804271697998, + "logits/rejected": -1.550597071647644, + "logps/chosen": -489.97802734375, + "logps/rejected": -606.5343017578125, + "loss": 0.4504, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.7580807209014893, + "rewards/margins": 1.096167802810669, + "rewards/rejected": -3.854248523712158, + "step": 4480 + }, + { + "epoch": 0.88, + "learning_rate": 2.111466381335714e-07, + "logits/chosen": -1.643610954284668, + "logits/rejected": -1.324254035949707, + "logps/chosen": -560.3173828125, + "logps/rejected": -619.2115478515625, + "loss": 0.5088, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.0471415519714355, + "rewards/margins": 1.023773193359375, + "rewards/rejected": -4.070915222167969, + "step": 4490 + }, + { + "epoch": 0.88, + "learning_rate": 2.0430895453492944e-07, + "logits/chosen": -1.7354183197021484, + "logits/rejected": -1.5083003044128418, + "logps/chosen": -547.43798828125, + "logps/rejected": -576.88330078125, + "loss": 0.4558, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6389975547790527, + "rewards/margins": 0.869195818901062, + "rewards/rejected": -3.5081934928894043, + "step": 4500 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 0.7711246013641357, + "eval_logits/rejected": 1.3411438465118408, + "eval_logps/chosen": -548.8391723632812, + "eval_logps/rejected": -625.7067260742188, + "eval_loss": 0.4879080057144165, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -2.855985403060913, + "eval_rewards/margins": 1.0698983669281006, + "eval_rewards/rejected": -3.9258840084075928, + "eval_runtime": 468.3034, + "eval_samples_per_second": 4.271, + "eval_steps_per_second": 0.179, + "step": 4500 + }, + { + "epoch": 0.89, + "learning_rate": 1.9757909663171508e-07, + "logits/chosen": -1.497092604637146, + "logits/rejected": -1.3735014200210571, + "logps/chosen": -544.7706909179688, + "logps/rejected": -599.2404174804688, + "loss": 0.4727, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.603917360305786, + "rewards/margins": 1.5174095630645752, + "rewards/rejected": -4.121326446533203, + "step": 4510 + }, + { + "epoch": 0.89, + "learning_rate": 1.9095738051681412e-07, + "logits/chosen": -1.1327025890350342, + "logits/rejected": -1.2740120887756348, + "logps/chosen": -515.599609375, + "logps/rejected": -598.8855590820312, + "loss": 0.5864, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.005495309829712, + "rewards/margins": 0.9244306683540344, + "rewards/rejected": -3.9299259185791016, + "step": 4520 + }, + { + "epoch": 0.89, + "learning_rate": 1.844441172038311e-07, + "logits/chosen": -1.7061221599578857, + "logits/rejected": -1.4002907276153564, + "logps/chosen": -560.3880615234375, + "logps/rejected": -678.8825073242188, + "loss": 0.3871, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8229405879974365, + "rewards/margins": 1.5331408977508545, + "rewards/rejected": -4.356081485748291, + "step": 4530 + }, + { + "epoch": 0.89, + "learning_rate": 1.7803961261247864e-07, + "logits/chosen": -1.7175147533416748, + "logits/rejected": -1.5015738010406494, + "logps/chosen": -564.5352783203125, + "logps/rejected": -642.4002075195312, + "loss": 0.5053, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.874131441116333, + "rewards/margins": 1.224721908569336, + "rewards/rejected": -4.09885311126709, + "step": 4540 + }, + { + "epoch": 0.89, + "learning_rate": 1.717441675542106e-07, + "logits/chosen": -1.5754218101501465, + "logits/rejected": -1.1890827417373657, + "logps/chosen": -534.8074340820312, + "logps/rejected": -595.2333374023438, + "loss": 0.5414, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6678903102874756, + "rewards/margins": 1.0343869924545288, + "rewards/rejected": -3.7022769451141357, + "step": 4550 + }, + { + "epoch": 0.9, + "learning_rate": 1.6555807771809375e-07, + "logits/chosen": -1.4952316284179688, + "logits/rejected": -1.4921852350234985, + "logps/chosen": -543.2953491210938, + "logps/rejected": -644.8867797851562, + "loss": 0.449, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.997863531112671, + "rewards/margins": 0.9223406910896301, + "rewards/rejected": -3.9202046394348145, + "step": 4560 + }, + { + "epoch": 0.9, + "learning_rate": 1.5948163365691798e-07, + "logits/chosen": -1.542854905128479, + "logits/rejected": -1.1436667442321777, + "logps/chosen": -567.2286376953125, + "logps/rejected": -647.6072387695312, + "loss": 0.4908, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.061537027359009, + "rewards/margins": 0.9950931668281555, + "rewards/rejected": -4.0566301345825195, + "step": 4570 + }, + { + "epoch": 0.9, + "learning_rate": 1.5351512077355024e-07, + "logits/chosen": -1.57053542137146, + "logits/rejected": -1.3311067819595337, + "logps/chosen": -661.517578125, + "logps/rejected": -673.8138427734375, + "loss": 0.4326, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0921597480773926, + "rewards/margins": 1.1435569524765015, + "rewards/rejected": -4.235716819763184, + "step": 4580 + }, + { + "epoch": 0.9, + "learning_rate": 1.4765881930752983e-07, + "logits/chosen": -1.4779008626937866, + "logits/rejected": -1.4968267679214478, + "logps/chosen": -565.7744750976562, + "logps/rejected": -658.5062255859375, + "loss": 0.491, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.344609498977661, + "rewards/margins": 0.8472648859024048, + "rewards/rejected": -4.1918745040893555, + "step": 4590 + }, + { + "epoch": 0.9, + "learning_rate": 1.4191300432190634e-07, + "logits/chosen": -1.478987455368042, + "logits/rejected": -1.0707805156707764, + "logps/chosen": -554.4490966796875, + "logps/rejected": -645.3145751953125, + "loss": 0.5541, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8828506469726562, + "rewards/margins": 1.1082347631454468, + "rewards/rejected": -3.9910855293273926, + "step": 4600 + }, + { + "epoch": 0.9, + "eval_logits/chosen": 0.7729079127311707, + "eval_logits/rejected": 1.342787742614746, + "eval_logps/chosen": -549.2481079101562, + "eval_logps/rejected": -626.0703735351562, + "eval_loss": 0.4881538152694702, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -2.8600735664367676, + "eval_rewards/margins": 1.0694462060928345, + "eval_rewards/rejected": -3.9295201301574707, + "eval_runtime": 470.2505, + "eval_samples_per_second": 4.253, + "eval_steps_per_second": 0.179, + "step": 4600 + }, + { + "epoch": 0.9, + "learning_rate": 1.362779456903182e-07, + "logits/chosen": -1.3034653663635254, + "logits/rejected": -1.1462644338607788, + "logps/chosen": -512.8648681640625, + "logps/rejected": -576.4937133789062, + "loss": 0.4791, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.9724960327148438, + "rewards/margins": 0.8336421251296997, + "rewards/rejected": -3.806138277053833, + "step": 4610 + }, + { + "epoch": 0.91, + "learning_rate": 1.3075390808431897e-07, + "logits/chosen": -1.6049964427947998, + "logits/rejected": -1.38681960105896, + "logps/chosen": -504.82086181640625, + "logps/rejected": -620.4871826171875, + "loss": 0.4517, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6851613521575928, + "rewards/margins": 1.6643813848495483, + "rewards/rejected": -4.34954309463501, + "step": 4620 + }, + { + "epoch": 0.91, + "learning_rate": 1.253411509609459e-07, + "logits/chosen": -1.4405021667480469, + "logits/rejected": -1.1606242656707764, + "logps/chosen": -493.46160888671875, + "logps/rejected": -642.3789672851562, + "loss": 0.534, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7728381156921387, + "rewards/margins": 1.5827041864395142, + "rewards/rejected": -4.355542182922363, + "step": 4630 + }, + { + "epoch": 0.91, + "learning_rate": 1.2003992855053326e-07, + "logits/chosen": -1.223035454750061, + "logits/rejected": -0.8624464273452759, + "logps/chosen": -497.0065002441406, + "logps/rejected": -650.3905639648438, + "loss": 0.44, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5264394283294678, + "rewards/margins": 1.526515245437622, + "rewards/rejected": -4.05295467376709, + "step": 4640 + }, + { + "epoch": 0.91, + "learning_rate": 1.1485048984476998e-07, + "logits/chosen": -1.4365018606185913, + "logits/rejected": -1.4451005458831787, + "logps/chosen": -546.9738159179688, + "logps/rejected": -685.6676025390625, + "loss": 0.4365, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.7223660945892334, + "rewards/margins": 1.5035502910614014, + "rewards/rejected": -4.225916385650635, + "step": 4650 + }, + { + "epoch": 0.91, + "learning_rate": 1.0977307858500818e-07, + "logits/chosen": -1.23599112033844, + "logits/rejected": -1.4569393396377563, + "logps/chosen": -502.11016845703125, + "logps/rejected": -630.4044189453125, + "loss": 0.4859, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0557899475097656, + "rewards/margins": 1.1950440406799316, + "rewards/rejected": -4.250833511352539, + "step": 4660 + }, + { + "epoch": 0.92, + "learning_rate": 1.0480793325081174e-07, + "logits/chosen": -1.729148507118225, + "logits/rejected": -1.2992174625396729, + "logps/chosen": -502.6327209472656, + "logps/rejected": -604.9547119140625, + "loss": 0.4806, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1948771476745605, + "rewards/margins": 1.5025956630706787, + "rewards/rejected": -3.6974730491638184, + "step": 4670 + }, + { + "epoch": 0.92, + "learning_rate": 9.995528704875635e-08, + "logits/chosen": -1.5218055248260498, + "logits/rejected": -1.5403430461883545, + "logps/chosen": -569.9359130859375, + "logps/rejected": -700.4747314453125, + "loss": 0.5107, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7318825721740723, + "rewards/margins": 1.2422831058502197, + "rewards/rejected": -3.974165678024292, + "step": 4680 + }, + { + "epoch": 0.92, + "learning_rate": 9.521536790147722e-08, + "logits/chosen": -1.601231336593628, + "logits/rejected": -1.397159457206726, + "logps/chosen": -591.1676025390625, + "logps/rejected": -694.2679443359375, + "loss": 0.4968, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.1166365146636963, + "rewards/margins": 1.142465353012085, + "rewards/rejected": -4.259101867675781, + "step": 4690 + }, + { + "epoch": 0.92, + "learning_rate": 9.058839843696237e-08, + "logits/chosen": -1.6953855752944946, + "logits/rejected": -1.4888617992401123, + "logps/chosen": -580.9130859375, + "logps/rejected": -659.9354248046875, + "loss": 0.5743, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.9675400257110596, + "rewards/margins": 1.20027494430542, + "rewards/rejected": -4.1678147315979, + "step": 4700 + }, + { + "epoch": 0.92, + "eval_logits/chosen": 0.7755272388458252, + "eval_logits/rejected": 1.3445079326629639, + "eval_logps/chosen": -549.6526489257812, + "eval_logps/rejected": -626.5550537109375, + "eval_loss": 0.4878641664981842, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -2.8641197681427, + "eval_rewards/margins": 1.0702478885650635, + "eval_rewards/rejected": -3.9343676567077637, + "eval_runtime": 471.0127, + "eval_samples_per_second": 4.246, + "eval_steps_per_second": 0.178, + "step": 4700 + }, + { + "epoch": 0.92, + "learning_rate": 8.607459597809565e-08, + "logits/chosen": -1.82345712184906, + "logits/rejected": -1.6927080154418945, + "logps/chosen": -487.48687744140625, + "logps/rejected": -665.341796875, + "loss": 0.429, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.701889991760254, + "rewards/margins": 1.1798779964447021, + "rewards/rejected": -3.881767749786377, + "step": 4710 + }, + { + "epoch": 0.93, + "learning_rate": 8.167417253245213e-08, + "logits/chosen": -1.5453943014144897, + "logits/rejected": -1.4755821228027344, + "logps/chosen": -578.2698974609375, + "logps/rejected": -590.1826171875, + "loss": 0.5955, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.737913131713867, + "rewards/margins": 0.9511528015136719, + "rewards/rejected": -3.6890664100646973, + "step": 4720 + }, + { + "epoch": 0.93, + "learning_rate": 7.738733478233673e-08, + "logits/chosen": -1.5097318887710571, + "logits/rejected": -1.0241183042526245, + "logps/chosen": -550.3403930664062, + "logps/rejected": -640.8431396484375, + "loss": 0.5177, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.93200945854187, + "rewards/margins": 1.2537683248519897, + "rewards/rejected": -4.185778617858887, + "step": 4730 + }, + { + "epoch": 0.93, + "learning_rate": 7.32142840750788e-08, + "logits/chosen": -1.4939231872558594, + "logits/rejected": -1.254494309425354, + "logps/chosen": -608.97119140625, + "logps/rejected": -694.7242431640625, + "loss": 0.4677, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.08970308303833, + "rewards/margins": 0.9346927404403687, + "rewards/rejected": -4.024395942687988, + "step": 4740 + }, + { + "epoch": 0.93, + "learning_rate": 6.915521641357504e-08, + "logits/chosen": -1.3975909948349, + "logits/rejected": -1.4619452953338623, + "logps/chosen": -559.6480712890625, + "logps/rejected": -671.4445190429688, + "loss": 0.4622, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.970442295074463, + "rewards/margins": 1.1618378162384033, + "rewards/rejected": -4.132280349731445, + "step": 4750 + }, + { + "epoch": 0.93, + "learning_rate": 6.521032244708375e-08, + "logits/chosen": -1.5189892053604126, + "logits/rejected": -1.4619439840316772, + "logps/chosen": -530.2403564453125, + "logps/rejected": -606.9569091796875, + "loss": 0.5584, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.9585671424865723, + "rewards/margins": 0.770112156867981, + "rewards/rejected": -3.728679656982422, + "step": 4760 + }, + { + "epoch": 0.94, + "learning_rate": 6.137978746226848e-08, + "logits/chosen": -1.2548673152923584, + "logits/rejected": -1.4840552806854248, + "logps/chosen": -565.2719116210938, + "logps/rejected": -694.0643920898438, + "loss": 0.4721, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.204113721847534, + "rewards/margins": 1.1963703632354736, + "rewards/rejected": -4.400484085083008, + "step": 4770 + }, + { + "epoch": 0.94, + "learning_rate": 5.766379137449624e-08, + "logits/chosen": -1.4481416940689087, + "logits/rejected": -1.0816242694854736, + "logps/chosen": -640.5113525390625, + "logps/rejected": -766.0667724609375, + "loss": 0.4272, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9921982288360596, + "rewards/margins": 1.9512176513671875, + "rewards/rejected": -4.943415641784668, + "step": 4780 + }, + { + "epoch": 0.94, + "learning_rate": 5.406250871938912e-08, + "logits/chosen": -1.626961350440979, + "logits/rejected": -1.5448499917984009, + "logps/chosen": -551.1680908203125, + "logps/rejected": -574.9620971679688, + "loss": 0.5034, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.941655397415161, + "rewards/margins": 0.6056486964225769, + "rewards/rejected": -3.547304153442383, + "step": 4790 + }, + { + "epoch": 0.94, + "learning_rate": 5.0576108644623536e-08, + "logits/chosen": -1.4706220626831055, + "logits/rejected": -1.495298981666565, + "logps/chosen": -528.7744140625, + "logps/rejected": -635.4783935546875, + "loss": 0.4657, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6470999717712402, + "rewards/margins": 1.1267454624176025, + "rewards/rejected": -3.7738451957702637, + "step": 4800 + }, + { + "epoch": 0.94, + "eval_logits/chosen": 0.7749411463737488, + "eval_logits/rejected": 1.3436989784240723, + "eval_logps/chosen": -549.4993286132812, + "eval_logps/rejected": -626.3385620117188, + "eval_loss": 0.4880455732345581, + "eval_rewards/accuracies": 0.7291666865348816, + "eval_rewards/chosen": -2.8625855445861816, + "eval_rewards/margins": 1.069616436958313, + "eval_rewards/rejected": -3.9322023391723633, + "eval_runtime": 471.6194, + "eval_samples_per_second": 4.241, + "eval_steps_per_second": 0.178, + "step": 4800 + }, + { + "epoch": 0.94, + "learning_rate": 4.720475490198634e-08, + "logits/chosen": -1.3117965459823608, + "logits/rejected": -1.2122046947479248, + "logps/chosen": -583.239990234375, + "logps/rejected": -767.133544921875, + "loss": 0.446, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2251923084259033, + "rewards/margins": 1.360032558441162, + "rewards/rejected": -4.5852251052856445, + "step": 4810 + }, + { + "epoch": 0.95, + "learning_rate": 4.394860583968624e-08, + "logits/chosen": -1.5553256273269653, + "logits/rejected": -1.46771240234375, + "logps/chosen": -525.3215942382812, + "logps/rejected": -673.70849609375, + "loss": 0.4399, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5879640579223633, + "rewards/margins": 1.598061203956604, + "rewards/rejected": -4.186025142669678, + "step": 4820 + }, + { + "epoch": 0.95, + "learning_rate": 4.0807814394911996e-08, + "logits/chosen": -1.505833387374878, + "logits/rejected": -1.222219467163086, + "logps/chosen": -514.3837280273438, + "logps/rejected": -615.126220703125, + "loss": 0.524, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7017548084259033, + "rewards/margins": 1.5507588386535645, + "rewards/rejected": -4.252513408660889, + "step": 4830 + }, + { + "epoch": 0.95, + "learning_rate": 3.778252808665284e-08, + "logits/chosen": -1.4378811120986938, + "logits/rejected": -1.2144570350646973, + "logps/chosen": -524.7705078125, + "logps/rejected": -648.2017822265625, + "loss": 0.4667, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9504950046539307, + "rewards/margins": 1.5240049362182617, + "rewards/rejected": -4.474499702453613, + "step": 4840 + }, + { + "epoch": 0.95, + "learning_rate": 3.4872889008767954e-08, + "logits/chosen": -1.5345133543014526, + "logits/rejected": -1.3139573335647583, + "logps/chosen": -548.332275390625, + "logps/rejected": -588.4432373046875, + "loss": 0.5319, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.756354808807373, + "rewards/margins": 0.9404181241989136, + "rewards/rejected": -3.696773052215576, + "step": 4850 + }, + { + "epoch": 0.95, + "learning_rate": 3.207903382331262e-08, + "logits/chosen": -1.3094028234481812, + "logits/rejected": -0.9478843808174133, + "logps/chosen": -526.9425659179688, + "logps/rejected": -672.6736450195312, + "loss": 0.4164, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.8847177028656006, + "rewards/margins": 1.873437523841858, + "rewards/rejected": -4.758155345916748, + "step": 4860 + }, + { + "epoch": 0.96, + "learning_rate": 2.940109375411976e-08, + "logits/chosen": -1.6699939966201782, + "logits/rejected": -1.1226723194122314, + "logps/chosen": -588.523193359375, + "logps/rejected": -711.6766357421875, + "loss": 0.5051, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0758986473083496, + "rewards/margins": 1.7069766521453857, + "rewards/rejected": -4.782876014709473, + "step": 4870 + }, + { + "epoch": 0.96, + "learning_rate": 2.683919458063705e-08, + "logits/chosen": -1.4859200716018677, + "logits/rejected": -1.0420989990234375, + "logps/chosen": -515.8519897460938, + "logps/rejected": -623.1054077148438, + "loss": 0.4612, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6491692066192627, + "rewards/margins": 1.424944519996643, + "rewards/rejected": -4.074113368988037, + "step": 4880 + }, + { + "epoch": 0.96, + "learning_rate": 2.4393456632016977e-08, + "logits/chosen": -1.6544864177703857, + "logits/rejected": -1.0563756227493286, + "logps/chosen": -636.07568359375, + "logps/rejected": -661.3756103515625, + "loss": 0.4991, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.177685260772705, + "rewards/margins": 1.1920883655548096, + "rewards/rejected": -4.369773864746094, + "step": 4890 + }, + { + "epoch": 0.96, + "learning_rate": 2.2063994781468256e-08, + "logits/chosen": -1.73834228515625, + "logits/rejected": -1.2774205207824707, + "logps/chosen": -620.8421020507812, + "logps/rejected": -687.5242919921875, + "loss": 0.5126, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.0707154273986816, + "rewards/margins": 1.0745410919189453, + "rewards/rejected": -4.145256519317627, + "step": 4900 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 0.7748141288757324, + "eval_logits/rejected": 1.3439706563949585, + "eval_logps/chosen": -549.604248046875, + "eval_logps/rejected": -626.5125732421875, + "eval_loss": 0.4879511594772339, + "eval_rewards/accuracies": 0.7276785969734192, + "eval_rewards/chosen": -2.863635540008545, + "eval_rewards/margins": 1.0703070163726807, + "eval_rewards/rejected": -3.9339427947998047, + "eval_runtime": 474.6902, + "eval_samples_per_second": 4.213, + "eval_steps_per_second": 0.177, + "step": 4900 + }, + { + "epoch": 0.96, + "learning_rate": 1.985091844085796e-08, + "logits/chosen": -1.444320559501648, + "logits/rejected": -1.05115807056427, + "logps/chosen": -531.5235595703125, + "logps/rejected": -714.3778076171875, + "loss": 0.4811, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1124281883239746, + "rewards/margins": 1.223809003829956, + "rewards/rejected": -4.33623743057251, + "step": 4910 + }, + { + "epoch": 0.97, + "learning_rate": 1.7754331555573656e-08, + "logits/chosen": -1.3569526672363281, + "logits/rejected": -1.4361093044281006, + "logps/chosen": -560.3422241210938, + "logps/rejected": -637.2498168945312, + "loss": 0.4647, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.958449602127075, + "rewards/margins": 1.0478699207305908, + "rewards/rejected": -4.006319046020508, + "step": 4920 + }, + { + "epoch": 0.97, + "learning_rate": 1.5774332599641228e-08, + "logits/chosen": -1.3948105573654175, + "logits/rejected": -1.3578966856002808, + "logps/chosen": -559.6775512695312, + "logps/rejected": -650.9224243164062, + "loss": 0.5038, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.05662202835083, + "rewards/margins": 0.9975637197494507, + "rewards/rejected": -4.0541863441467285, + "step": 4930 + }, + { + "epoch": 0.97, + "learning_rate": 1.3911014571098835e-08, + "logits/chosen": -1.3849412202835083, + "logits/rejected": -1.3447059392929077, + "logps/chosen": -466.18218994140625, + "logps/rejected": -612.2434692382812, + "loss": 0.5102, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.5744361877441406, + "rewards/margins": 1.5182634592056274, + "rewards/rejected": -4.0926995277404785, + "step": 4940 + }, + { + "epoch": 0.97, + "learning_rate": 1.2164464987630131e-08, + "logits/chosen": -1.3624558448791504, + "logits/rejected": -1.4019831418991089, + "logps/chosen": -501.2599182128906, + "logps/rejected": -611.615966796875, + "loss": 0.4275, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9041037559509277, + "rewards/margins": 0.9574500918388367, + "rewards/rejected": -3.86155366897583, + "step": 4950 + }, + { + "epoch": 0.97, + "learning_rate": 1.0534765882453113e-08, + "logits/chosen": -1.3500089645385742, + "logits/rejected": -1.2703732252120972, + "logps/chosen": -562.4613037109375, + "logps/rejected": -663.7322387695312, + "loss": 0.4457, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9094297885894775, + "rewards/margins": 1.388083577156067, + "rewards/rejected": -4.297513008117676, + "step": 4960 + }, + { + "epoch": 0.98, + "learning_rate": 9.021993800466256e-09, + "logits/chosen": -1.2833272218704224, + "logits/rejected": -1.3957890272140503, + "logps/chosen": -558.2420654296875, + "logps/rejected": -692.7595825195312, + "loss": 0.5062, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.962353467941284, + "rewards/margins": 1.226319670677185, + "rewards/rejected": -4.188673496246338, + "step": 4970 + }, + { + "epoch": 0.98, + "learning_rate": 7.626219794655553e-09, + "logits/chosen": -1.369972825050354, + "logits/rejected": -1.2614896297454834, + "logps/chosen": -567.4708251953125, + "logps/rejected": -585.6512451171875, + "loss": 0.5181, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.9253811836242676, + "rewards/margins": 0.8255603909492493, + "rewards/rejected": -3.750941514968872, + "step": 4980 + }, + { + "epoch": 0.98, + "learning_rate": 6.347509422754139e-09, + "logits/chosen": -1.561457633972168, + "logits/rejected": -1.3234002590179443, + "logps/chosen": -571.8287353515625, + "logps/rejected": -671.9757690429688, + "loss": 0.5035, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9173166751861572, + "rewards/margins": 1.3114066123962402, + "rewards/rejected": -4.228723049163818, + "step": 4990 + }, + { + "epoch": 0.98, + "learning_rate": 5.185922744166128e-09, + "logits/chosen": -1.435817003250122, + "logits/rejected": -1.3162662982940674, + "logps/chosen": -520.7555541992188, + "logps/rejected": -686.9046630859375, + "loss": 0.3967, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.884249448776245, + "rewards/margins": 1.8164691925048828, + "rewards/rejected": -4.700718879699707, + "step": 5000 + }, + { + "epoch": 0.98, + "eval_logits/chosen": 0.7736020088195801, + "eval_logits/rejected": 1.342397928237915, + "eval_logps/chosen": -549.6658325195312, + "eval_logps/rejected": -626.5614013671875, + "eval_loss": 0.48798322677612305, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -2.864250898361206, + "eval_rewards/margins": 1.0701801776885986, + "eval_rewards/rejected": -3.9344310760498047, + "eval_runtime": 476.236, + "eval_samples_per_second": 4.2, + "eval_steps_per_second": 0.176, + "step": 5000 + }, + { + "epoch": 0.98, + "learning_rate": 4.1415143171436026e-09, + "logits/chosen": -1.745918869972229, + "logits/rejected": -1.5587217807769775, + "logps/chosen": -586.3348388671875, + "logps/rejected": -672.9236450195312, + "loss": 0.4885, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.848877429962158, + "rewards/margins": 1.149807333946228, + "rewards/rejected": -3.998685121536255, + "step": 5010 + }, + { + "epoch": 0.99, + "learning_rate": 3.2143331962256053e-09, + "logits/chosen": -1.6162992715835571, + "logits/rejected": -1.0427744388580322, + "logps/chosen": -599.6419677734375, + "logps/rejected": -640.3694458007812, + "loss": 0.3889, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6251840591430664, + "rewards/margins": 1.4099795818328857, + "rewards/rejected": -4.035163879394531, + "step": 5020 + }, + { + "epoch": 0.99, + "learning_rate": 2.404422929932204e-09, + "logits/chosen": -1.2071707248687744, + "logits/rejected": -0.8376060724258423, + "logps/chosen": -549.0296630859375, + "logps/rejected": -699.3610229492188, + "loss": 0.5418, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2370543479919434, + "rewards/margins": 1.4369432926177979, + "rewards/rejected": -4.673997402191162, + "step": 5030 + }, + { + "epoch": 0.99, + "learning_rate": 1.711821558721405e-09, + "logits/chosen": -1.4277589321136475, + "logits/rejected": -1.4227768182754517, + "logps/chosen": -532.2887573242188, + "logps/rejected": -689.0106201171875, + "loss": 0.5386, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7164626121520996, + "rewards/margins": 1.2939804792404175, + "rewards/rejected": -4.010442733764648, + "step": 5040 + }, + { + "epoch": 0.99, + "learning_rate": 1.1365616132008595e-09, + "logits/chosen": -1.5623667240142822, + "logits/rejected": -1.4521827697753906, + "logps/chosen": -545.3251953125, + "logps/rejected": -688.9048461914062, + "loss": 0.5253, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.058255672454834, + "rewards/margins": 1.6055889129638672, + "rewards/rejected": -4.663844585418701, + "step": 5050 + }, + { + "epoch": 0.99, + "learning_rate": 6.786701125999218e-10, + "logits/chosen": -1.2991487979888916, + "logits/rejected": -1.0468859672546387, + "logps/chosen": -624.85302734375, + "logps/rejected": -665.943115234375, + "loss": 0.4871, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1303889751434326, + "rewards/margins": 0.9218120574951172, + "rewards/rejected": -4.052201271057129, + "step": 5060 + }, + { + "epoch": 1.0, + "learning_rate": 3.3816856350177284e-10, + "logits/chosen": -1.112547755241394, + "logits/rejected": -1.2372267246246338, + "logps/chosen": -536.0859985351562, + "logps/rejected": -682.0577392578125, + "loss": 0.526, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1959328651428223, + "rewards/margins": 1.218519926071167, + "rewards/rejected": -4.41445255279541, + "step": 5070 + }, + { + "epoch": 1.0, + "learning_rate": 1.1507295883145253e-10, + "logits/chosen": -1.5622161626815796, + "logits/rejected": -1.194392204284668, + "logps/chosen": -590.2625122070312, + "logps/rejected": -675.1867065429688, + "loss": 0.4111, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.098064422607422, + "rewards/margins": 0.7926515340805054, + "rewards/rejected": -3.8907153606414795, + "step": 5080 + }, + { + "epoch": 1.0, + "learning_rate": 9.393777107291614e-12, + "logits/chosen": -1.6349897384643555, + "logits/rejected": -1.5304569005966187, + "logps/chosen": -610.5177612304688, + "logps/rejected": -668.1381225585938, + "loss": 0.536, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0565178394317627, + "rewards/margins": 0.9504305124282837, + "rewards/rejected": -4.006947994232178, + "step": 5090 + }, + { + "epoch": 1.0, + "step": 5094, + "total_flos": 0.0, + "train_loss": 0.5211080308048501, + "train_runtime": 54683.4674, + "train_samples_per_second": 1.118, + "train_steps_per_second": 0.093 + } + ], + "logging_steps": 10, + "max_steps": 5094, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}