diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10868 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 7642, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 6.535947712418301e-09, + "logits/chosen": -2.771364688873291, + "logits/rejected": -2.0475902557373047, + "logps/chosen": -350.8045654296875, + "logps/rejected": -232.34600830078125, + "loss": 0.0246, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.535947712418302e-08, + "logits/chosen": -2.4147045612335205, + "logits/rejected": -2.214167356491089, + "logps/chosen": -275.91546630859375, + "logps/rejected": -221.3582763671875, + "loss": 0.0435, + "rewards/accuracies": 0.4166666567325592, + "rewards/chosen": -6.692952592857182e-05, + "rewards/margins": -4.810280370293185e-05, + "rewards/rejected": -1.8826718587661162e-05, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.3071895424836603e-07, + "logits/chosen": -2.3138043880462646, + "logits/rejected": -1.998386025428772, + "logps/chosen": -183.78628540039062, + "logps/rejected": -185.7834014892578, + "loss": 0.0581, + "rewards/accuracies": 0.375, + "rewards/chosen": 6.9359957706183195e-06, + "rewards/margins": -1.1799385902122594e-05, + "rewards/rejected": 1.873539076768793e-05, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 1.9607843137254904e-07, + "logits/chosen": -2.408761739730835, + "logits/rejected": -2.3481807708740234, + "logps/chosen": -221.2811737060547, + "logps/rejected": -207.22256469726562, + "loss": 0.0492, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.0001152217882918194, + "rewards/margins": 4.462666038307361e-05, + "rewards/rejected": 7.059513154672459e-05, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 2.6143790849673207e-07, + "logits/chosen": -2.2558865547180176, + "logits/rejected": -2.2246384620666504, + "logps/chosen": -189.4458770751953, + "logps/rejected": -171.353515625, + "loss": 0.0585, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0001129482188844122, + "rewards/margins": 0.0001205944427056238, + "rewards/rejected": -7.646233825653326e-06, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 3.267973856209151e-07, + "logits/chosen": -2.456951141357422, + "logits/rejected": -2.2887561321258545, + "logps/chosen": -278.3084411621094, + "logps/rejected": -233.02688598632812, + "loss": 0.0696, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.00021237425971776247, + "rewards/margins": 1.0886736163229216e-05, + "rewards/rejected": 0.00020148752082604915, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 3.921568627450981e-07, + "logits/chosen": -2.452667236328125, + "logits/rejected": -2.3211636543273926, + "logps/chosen": -299.87713623046875, + "logps/rejected": -243.0287322998047, + "loss": 0.0811, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0004027537943329662, + "rewards/margins": 0.00023468179279007018, + "rewards/rejected": 0.00016807201609481126, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 4.5751633986928105e-07, + "logits/chosen": -2.2944846153259277, + "logits/rejected": -2.1996867656707764, + "logps/chosen": -256.50616455078125, + "logps/rejected": -251.6355438232422, + "loss": 0.0398, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0004902628134004772, + "rewards/margins": 2.103743281622883e-05, + "rewards/rejected": 0.0004692253714893013, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 5.228758169934641e-07, + "logits/chosen": -2.4491403102874756, + "logits/rejected": -2.2229251861572266, + "logps/chosen": -251.4347381591797, + "logps/rejected": -224.22293090820312, + "loss": 0.0398, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0005534522933885455, + "rewards/margins": 0.00024414055224042386, + "rewards/rejected": 0.0003093117265962064, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 5.882352941176471e-07, + "logits/chosen": -2.3732728958129883, + "logits/rejected": -2.1753273010253906, + "logps/chosen": -209.80862426757812, + "logps/rejected": -172.71231079101562, + "loss": 0.0691, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0009153633145615458, + "rewards/margins": 0.0003029147337656468, + "rewards/rejected": 0.0006124485516920686, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 6.535947712418302e-07, + "logits/chosen": -2.309413433074951, + "logits/rejected": -2.144484281539917, + "logps/chosen": -208.6510467529297, + "logps/rejected": -211.52490234375, + "loss": 0.0532, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0008663847111165524, + "rewards/margins": 8.283840725198388e-05, + "rewards/rejected": 0.0007835463620722294, + "step": 100 + }, + { + "epoch": 0.01, + "eval_logits/chosen": -2.3285820484161377, + "eval_logits/rejected": -2.1099746227264404, + "eval_logps/chosen": -227.70619201660156, + "eval_logps/rejected": -195.34722900390625, + "eval_loss": 0.05353359878063202, + "eval_rewards/accuracies": 0.5529999732971191, + "eval_rewards/chosen": 0.0015157524030655622, + "eval_rewards/margins": 0.00045084880548529327, + "eval_rewards/rejected": 0.0010649036848917603, + "eval_runtime": 1442.2999, + "eval_samples_per_second": 1.387, + "eval_steps_per_second": 0.347, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 7.189542483660131e-07, + "logits/chosen": -2.199138641357422, + "logits/rejected": -2.3579840660095215, + "logps/chosen": -235.41043090820312, + "logps/rejected": -250.78732299804688, + "loss": 0.0456, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0016705368179827929, + "rewards/margins": -0.00019449429237283766, + "rewards/rejected": 0.0018650311976671219, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 7.843137254901962e-07, + "logits/chosen": -2.1194636821746826, + "logits/rejected": -2.130894660949707, + "logps/chosen": -208.68984985351562, + "logps/rejected": -208.67202758789062, + "loss": 0.0539, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.0016094299498945475, + "rewards/margins": -0.0002141711302101612, + "rewards/rejected": 0.0018236007308587432, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 8.496732026143792e-07, + "logits/chosen": -2.211329460144043, + "logits/rejected": -2.0572669506073, + "logps/chosen": -233.6269073486328, + "logps/rejected": -180.12249755859375, + "loss": 0.0587, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0022293583024293184, + "rewards/margins": 0.0006621202919632196, + "rewards/rejected": 0.0015672380104660988, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 9.150326797385621e-07, + "logits/chosen": -2.255624771118164, + "logits/rejected": -2.1116433143615723, + "logps/chosen": -176.47323608398438, + "logps/rejected": -158.57904052734375, + "loss": 0.0637, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0026094545610249043, + "rewards/margins": 0.0007819056627340615, + "rewards/rejected": 0.0018275491893291473, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -2.173933506011963, + "logits/rejected": -2.0644004344940186, + "logps/chosen": -193.16531372070312, + "logps/rejected": -217.70556640625, + "loss": 0.0643, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.002999431686475873, + "rewards/margins": 0.0004859448818024248, + "rewards/rejected": 0.0025134864263236523, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 1.0457516339869283e-06, + "logits/chosen": -2.0049774646759033, + "logits/rejected": -2.14156436920166, + "logps/chosen": -196.28433227539062, + "logps/rejected": -215.7510223388672, + "loss": 0.0621, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004374385345727205, + "rewards/margins": 0.0014288431266322732, + "rewards/rejected": 0.0029455421026796103, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 1.111111111111111e-06, + "logits/chosen": -2.4489009380340576, + "logits/rejected": -2.1065969467163086, + "logps/chosen": -238.6536407470703, + "logps/rejected": -195.6304168701172, + "loss": 0.0526, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00491450447589159, + "rewards/margins": 0.0018129239324480295, + "rewards/rejected": 0.0031015807762742043, + "step": 170 + }, + { + "epoch": 0.02, + "learning_rate": 1.1764705882352942e-06, + "logits/chosen": -2.4028241634368896, + "logits/rejected": -2.051755428314209, + "logps/chosen": -144.17684936523438, + "logps/rejected": -146.40402221679688, + "loss": 0.0638, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.003161213593557477, + "rewards/margins": 0.0006372106727212667, + "rewards/rejected": 0.0025240029208362103, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 1.2418300653594772e-06, + "logits/chosen": -2.391371011734009, + "logits/rejected": -2.1699657440185547, + "logps/chosen": -217.9038543701172, + "logps/rejected": -168.83421325683594, + "loss": 0.0444, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.004153423942625523, + "rewards/margins": 0.0016645189607515931, + "rewards/rejected": 0.002488905331119895, + "step": 190 + }, + { + "epoch": 0.03, + "learning_rate": 1.3071895424836604e-06, + "logits/chosen": -2.3824551105499268, + "logits/rejected": -2.099884510040283, + "logps/chosen": -203.17013549804688, + "logps/rejected": -164.29214477539062, + "loss": 0.0625, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004522558301687241, + "rewards/margins": 0.0029251843225210905, + "rewards/rejected": 0.001597374677658081, + "step": 200 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.330930233001709, + "eval_logits/rejected": -2.112360954284668, + "eval_logps/chosen": -225.17840576171875, + "eval_logps/rejected": -194.7161407470703, + "eval_loss": 0.05265128239989281, + "eval_rewards/accuracies": 0.6079999804496765, + "eval_rewards/chosen": 0.004043539520353079, + "eval_rewards/margins": 0.0023475452326238155, + "eval_rewards/rejected": 0.0016959939384832978, + "eval_runtime": 1442.7463, + "eval_samples_per_second": 1.386, + "eval_steps_per_second": 0.347, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 1.3725490196078434e-06, + "logits/chosen": -2.2161953449249268, + "logits/rejected": -2.2658512592315674, + "logps/chosen": -217.46646118164062, + "logps/rejected": -239.91824340820312, + "loss": 0.0344, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003468969836831093, + "rewards/margins": 0.0022426594514399767, + "rewards/rejected": 0.0012263102689757943, + "step": 210 + }, + { + "epoch": 0.03, + "learning_rate": 1.4379084967320261e-06, + "logits/chosen": -2.4367940425872803, + "logits/rejected": -2.205225706100464, + "logps/chosen": -179.8951873779297, + "logps/rejected": -179.15188598632812, + "loss": 0.0275, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.002089735586196184, + "rewards/margins": 0.0011267390800639987, + "rewards/rejected": 0.0009629965061321855, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 1.5032679738562091e-06, + "logits/chosen": -2.2942264080047607, + "logits/rejected": -2.230792999267578, + "logps/chosen": -210.7850799560547, + "logps/rejected": -194.7191619873047, + "loss": 0.0504, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0031732949428260326, + "rewards/margins": 0.003322675358504057, + "rewards/rejected": -0.00014938069216441363, + "step": 230 + }, + { + "epoch": 0.03, + "learning_rate": 1.5686274509803923e-06, + "logits/chosen": -2.321350336074829, + "logits/rejected": -2.1149497032165527, + "logps/chosen": -219.54190063476562, + "logps/rejected": -253.10812377929688, + "loss": 0.0465, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.004185300786048174, + "rewards/margins": 0.005275317933410406, + "rewards/rejected": -0.0010900170309469104, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 1.6339869281045753e-06, + "logits/chosen": -2.167945146560669, + "logits/rejected": -2.141470432281494, + "logps/chosen": -186.19515991210938, + "logps/rejected": -184.30873107910156, + "loss": 0.0607, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0037641176022589207, + "rewards/margins": 0.002698666648939252, + "rewards/rejected": 0.0010654507204890251, + "step": 250 + }, + { + "epoch": 0.03, + "learning_rate": 1.6993464052287585e-06, + "logits/chosen": -2.341299057006836, + "logits/rejected": -1.9840151071548462, + "logps/chosen": -269.43170166015625, + "logps/rejected": -203.65139770507812, + "loss": 0.0456, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.005779625847935677, + "rewards/margins": 0.0015972151886671782, + "rewards/rejected": 0.004182410426437855, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 1.7647058823529414e-06, + "logits/chosen": -2.3696370124816895, + "logits/rejected": -2.374230146408081, + "logps/chosen": -251.5572967529297, + "logps/rejected": -241.042724609375, + "loss": 0.0276, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.007571273948997259, + "rewards/margins": 0.0037990030832588673, + "rewards/rejected": 0.0037722710985690355, + "step": 270 + }, + { + "epoch": 0.04, + "learning_rate": 1.8300653594771242e-06, + "logits/chosen": -2.272646188735962, + "logits/rejected": -2.035445213317871, + "logps/chosen": -237.1612548828125, + "logps/rejected": -169.53341674804688, + "loss": 0.0596, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.007252591662108898, + "rewards/margins": 0.003395236562937498, + "rewards/rejected": 0.0038573560304939747, + "step": 280 + }, + { + "epoch": 0.04, + "learning_rate": 1.8954248366013072e-06, + "logits/chosen": -2.373527765274048, + "logits/rejected": -2.2914352416992188, + "logps/chosen": -213.3122100830078, + "logps/rejected": -226.66067504882812, + "loss": 0.0646, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.001583040109835565, + "rewards/margins": 0.006263392977416515, + "rewards/rejected": -0.004680351819843054, + "step": 290 + }, + { + "epoch": 0.04, + "learning_rate": 1.96078431372549e-06, + "logits/chosen": -2.220839262008667, + "logits/rejected": -2.360034704208374, + "logps/chosen": -267.57769775390625, + "logps/rejected": -283.5144958496094, + "loss": 0.0485, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.006979311816394329, + "rewards/margins": 0.00745069095864892, + "rewards/rejected": -0.014430004172027111, + "step": 300 + }, + { + "epoch": 0.04, + "eval_logits/chosen": -2.323570966720581, + "eval_logits/rejected": -2.1050167083740234, + "eval_logps/chosen": -237.7423553466797, + "eval_logps/rejected": -214.5470733642578, + "eval_loss": 0.04963809624314308, + "eval_rewards/accuracies": 0.5889999866485596, + "eval_rewards/chosen": -0.008520414121448994, + "eval_rewards/margins": 0.009614524431526661, + "eval_rewards/rejected": -0.018134936690330505, + "eval_runtime": 1442.5595, + "eval_samples_per_second": 1.386, + "eval_steps_per_second": 0.347, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 2.0261437908496734e-06, + "logits/chosen": -2.207132339477539, + "logits/rejected": -1.9097169637680054, + "logps/chosen": -252.839111328125, + "logps/rejected": -199.76547241210938, + "loss": 0.0452, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.014440001919865608, + "rewards/margins": 0.008956280536949635, + "rewards/rejected": -0.023396281525492668, + "step": 310 + }, + { + "epoch": 0.04, + "learning_rate": 2.0915032679738565e-06, + "logits/chosen": -2.26574969291687, + "logits/rejected": -2.1446757316589355, + "logps/chosen": -198.3191680908203, + "logps/rejected": -209.6562957763672, + "loss": 0.0531, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.008327952586114407, + "rewards/margins": 0.015999721363186836, + "rewards/rejected": -0.02432767115533352, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 2.1568627450980393e-06, + "logits/chosen": -2.3831756114959717, + "logits/rejected": -2.3971455097198486, + "logps/chosen": -276.6793518066406, + "logps/rejected": -228.6278076171875, + "loss": 0.0517, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0005464582936838269, + "rewards/margins": 0.01151005458086729, + "rewards/rejected": -0.01096359547227621, + "step": 330 + }, + { + "epoch": 0.04, + "learning_rate": 2.222222222222222e-06, + "logits/chosen": -2.4563076496124268, + "logits/rejected": -1.999681830406189, + "logps/chosen": -280.4342956542969, + "logps/rejected": -198.6059112548828, + "loss": 0.0388, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.005417247768491507, + "rewards/margins": 0.012641100212931633, + "rewards/rejected": -0.018058348447084427, + "step": 340 + }, + { + "epoch": 0.05, + "learning_rate": 2.2875816993464053e-06, + "logits/chosen": -2.3461554050445557, + "logits/rejected": -2.3123745918273926, + "logps/chosen": -206.0986328125, + "logps/rejected": -226.97412109375, + "loss": 0.0511, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.026498574763536453, + "rewards/margins": 0.02056313492357731, + "rewards/rejected": -0.04706170782446861, + "step": 350 + }, + { + "epoch": 0.05, + "learning_rate": 2.3529411764705885e-06, + "logits/chosen": -2.330589532852173, + "logits/rejected": -2.1479365825653076, + "logps/chosen": -273.8431396484375, + "logps/rejected": -248.7611541748047, + "loss": 0.0634, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.027227917686104774, + "rewards/margins": 0.02004820853471756, + "rewards/rejected": -0.047276128083467484, + "step": 360 + }, + { + "epoch": 0.05, + "learning_rate": 2.4183006535947716e-06, + "logits/chosen": -2.193765163421631, + "logits/rejected": -2.45023250579834, + "logps/chosen": -143.6634979248047, + "logps/rejected": -311.2171936035156, + "loss": 0.0394, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.012883084826171398, + "rewards/margins": 0.008494162932038307, + "rewards/rejected": -0.02137724682688713, + "step": 370 + }, + { + "epoch": 0.05, + "learning_rate": 2.4836601307189544e-06, + "logits/chosen": -2.365126609802246, + "logits/rejected": -2.4081246852874756, + "logps/chosen": -183.79946899414062, + "logps/rejected": -189.6509246826172, + "loss": 0.0491, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.015578614547848701, + "rewards/margins": 0.023884663358330727, + "rewards/rejected": -0.03946327418088913, + "step": 380 + }, + { + "epoch": 0.05, + "learning_rate": 2.549019607843137e-06, + "logits/chosen": -2.3143937587738037, + "logits/rejected": -1.9923683404922485, + "logps/chosen": -280.3426818847656, + "logps/rejected": -271.82958984375, + "loss": 0.0526, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0271341260522604, + "rewards/margins": 0.02181713469326496, + "rewards/rejected": -0.04895126074552536, + "step": 390 + }, + { + "epoch": 0.05, + "learning_rate": 2.6143790849673208e-06, + "logits/chosen": -2.2406041622161865, + "logits/rejected": -2.3530118465423584, + "logps/chosen": -267.77569580078125, + "logps/rejected": -281.50048828125, + "loss": 0.0361, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.012241894379258156, + "rewards/margins": 0.0220674779266119, + "rewards/rejected": -0.034309376031160355, + "step": 400 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.3719825744628906, + "eval_logits/rejected": -2.149343490600586, + "eval_logps/chosen": -251.4062957763672, + "eval_logps/rejected": -239.94168090820312, + "eval_loss": 0.04466630890965462, + "eval_rewards/accuracies": 0.5989999771118164, + "eval_rewards/chosen": -0.022184353321790695, + "eval_rewards/margins": 0.021345192566514015, + "eval_rewards/rejected": -0.04352954775094986, + "eval_runtime": 1443.3038, + "eval_samples_per_second": 1.386, + "eval_steps_per_second": 0.346, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 2.6797385620915036e-06, + "logits/chosen": -2.4779820442199707, + "logits/rejected": -2.141010284423828, + "logps/chosen": -242.4623260498047, + "logps/rejected": -268.7309265136719, + "loss": 0.018, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02627101168036461, + "rewards/margins": 0.028306175023317337, + "rewards/rejected": -0.054577190428972244, + "step": 410 + }, + { + "epoch": 0.05, + "learning_rate": 2.7450980392156867e-06, + "logits/chosen": -2.3489201068878174, + "logits/rejected": -2.229403018951416, + "logps/chosen": -345.72723388671875, + "logps/rejected": -308.21630859375, + "loss": 0.0527, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.045316558331251144, + "rewards/margins": 0.00618447782471776, + "rewards/rejected": -0.05150103569030762, + "step": 420 + }, + { + "epoch": 0.06, + "learning_rate": 2.8104575163398695e-06, + "logits/chosen": -2.3438632488250732, + "logits/rejected": -2.246393918991089, + "logps/chosen": -290.4280090332031, + "logps/rejected": -294.6549072265625, + "loss": 0.0546, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03901774808764458, + "rewards/margins": 0.0222849752753973, + "rewards/rejected": -0.06130272150039673, + "step": 430 + }, + { + "epoch": 0.06, + "learning_rate": 2.8758169934640523e-06, + "logits/chosen": -2.5370495319366455, + "logits/rejected": -2.246542453765869, + "logps/chosen": -333.0699462890625, + "logps/rejected": -308.8509216308594, + "loss": 0.0466, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05149116367101669, + "rewards/margins": 0.011986413970589638, + "rewards/rejected": -0.06347757577896118, + "step": 440 + }, + { + "epoch": 0.06, + "learning_rate": 2.9411764705882355e-06, + "logits/chosen": -2.256922721862793, + "logits/rejected": -2.279179096221924, + "logps/chosen": -222.6754608154297, + "logps/rejected": -259.6260681152344, + "loss": 0.041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03808627650141716, + "rewards/margins": 0.028404083102941513, + "rewards/rejected": -0.06649035960435867, + "step": 450 + }, + { + "epoch": 0.06, + "learning_rate": 3.0065359477124182e-06, + "logits/chosen": -2.428729295730591, + "logits/rejected": -1.8685014247894287, + "logps/chosen": -323.7979431152344, + "logps/rejected": -251.86865234375, + "loss": 0.0505, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.047601353377103806, + "rewards/margins": 0.028006980195641518, + "rewards/rejected": -0.07560833543539047, + "step": 460 + }, + { + "epoch": 0.06, + "learning_rate": 3.071895424836602e-06, + "logits/chosen": -2.2810492515563965, + "logits/rejected": -2.073118209838867, + "logps/chosen": -265.302001953125, + "logps/rejected": -265.73822021484375, + "loss": 0.0375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05743040516972542, + "rewards/margins": 0.024930477142333984, + "rewards/rejected": -0.0823608785867691, + "step": 470 + }, + { + "epoch": 0.06, + "learning_rate": 3.1372549019607846e-06, + "logits/chosen": -2.216215133666992, + "logits/rejected": -2.1602706909179688, + "logps/chosen": -319.3440856933594, + "logps/rejected": -367.47821044921875, + "loss": 0.0436, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05836561322212219, + "rewards/margins": 0.03132264316082001, + "rewards/rejected": -0.0896882563829422, + "step": 480 + }, + { + "epoch": 0.06, + "learning_rate": 3.2026143790849674e-06, + "logits/chosen": -2.3500797748565674, + "logits/rejected": -2.0155441761016846, + "logps/chosen": -270.86224365234375, + "logps/rejected": -262.1877746582031, + "loss": 0.0339, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.053871434181928635, + "rewards/margins": 0.03459259867668152, + "rewards/rejected": -0.08846403658390045, + "step": 490 + }, + { + "epoch": 0.07, + "learning_rate": 3.2679738562091506e-06, + "logits/chosen": -2.3256402015686035, + "logits/rejected": -2.2846150398254395, + "logps/chosen": -219.56997680664062, + "logps/rejected": -250.40908813476562, + "loss": 0.0375, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04319126158952713, + "rewards/margins": 0.045080628246068954, + "rewards/rejected": -0.08827189356088638, + "step": 500 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -2.1960113048553467, + "eval_logits/rejected": -1.9821337461471558, + "eval_logps/chosen": -282.6957702636719, + "eval_logps/rejected": -281.328857421875, + "eval_loss": 0.04168427363038063, + "eval_rewards/accuracies": 0.5889999866485596, + "eval_rewards/chosen": -0.053473833948373795, + "eval_rewards/margins": 0.03144287317991257, + "eval_rewards/rejected": -0.08491671830415726, + "eval_runtime": 1440.5515, + "eval_samples_per_second": 1.388, + "eval_steps_per_second": 0.347, + "step": 500 + }, + { + "epoch": 0.07, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -2.3372669219970703, + "logits/rejected": -2.0579018592834473, + "logps/chosen": -387.23651123046875, + "logps/rejected": -321.8766784667969, + "loss": 0.0369, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.060873858630657196, + "rewards/margins": 0.029842043295502663, + "rewards/rejected": -0.09071590006351471, + "step": 510 + }, + { + "epoch": 0.07, + "learning_rate": 3.398692810457517e-06, + "logits/chosen": -2.3675150871276855, + "logits/rejected": -2.1596837043762207, + "logps/chosen": -315.5011291503906, + "logps/rejected": -302.6841735839844, + "loss": 0.0343, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06973306834697723, + "rewards/margins": 0.030995529145002365, + "rewards/rejected": -0.1007285937666893, + "step": 520 + }, + { + "epoch": 0.07, + "learning_rate": 3.4640522875816997e-06, + "logits/chosen": -2.0667433738708496, + "logits/rejected": -1.9001293182373047, + "logps/chosen": -321.52752685546875, + "logps/rejected": -310.5598449707031, + "loss": 0.0567, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.05219079926609993, + "rewards/margins": 0.025245213881134987, + "rewards/rejected": -0.07743600755929947, + "step": 530 + }, + { + "epoch": 0.07, + "learning_rate": 3.529411764705883e-06, + "logits/chosen": -2.101945400238037, + "logits/rejected": -2.064239740371704, + "logps/chosen": -284.4571838378906, + "logps/rejected": -292.69781494140625, + "loss": 0.0239, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04057732969522476, + "rewards/margins": 0.016831254586577415, + "rewards/rejected": -0.05740858241915703, + "step": 540 + }, + { + "epoch": 0.07, + "learning_rate": 3.5947712418300657e-06, + "logits/chosen": -1.988926887512207, + "logits/rejected": -1.6984357833862305, + "logps/chosen": -287.50714111328125, + "logps/rejected": -260.623291015625, + "loss": 0.0384, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04455497860908508, + "rewards/margins": 0.03517274186015129, + "rewards/rejected": -0.07972772419452667, + "step": 550 + }, + { + "epoch": 0.07, + "learning_rate": 3.6601307189542484e-06, + "logits/chosen": -1.844098687171936, + "logits/rejected": -1.8456268310546875, + "logps/chosen": -269.31488037109375, + "logps/rejected": -283.35833740234375, + "loss": 0.0366, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04365937411785126, + "rewards/margins": 0.03285984694957733, + "rewards/rejected": -0.07651921361684799, + "step": 560 + }, + { + "epoch": 0.07, + "learning_rate": 3.7254901960784316e-06, + "logits/chosen": -1.7108196020126343, + "logits/rejected": -1.5092352628707886, + "logps/chosen": -300.17010498046875, + "logps/rejected": -268.1441650390625, + "loss": 0.0626, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.054984550923109055, + "rewards/margins": 0.05392267554998398, + "rewards/rejected": -0.10890723764896393, + "step": 570 + }, + { + "epoch": 0.08, + "learning_rate": 3.7908496732026144e-06, + "logits/chosen": -1.59710693359375, + "logits/rejected": -1.3931443691253662, + "logps/chosen": -384.5987548828125, + "logps/rejected": -411.75543212890625, + "loss": 0.0391, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1424575299024582, + "rewards/margins": 0.06394679844379425, + "rewards/rejected": -0.20640432834625244, + "step": 580 + }, + { + "epoch": 0.08, + "learning_rate": 3.856209150326798e-06, + "logits/chosen": -1.6260541677474976, + "logits/rejected": -1.5756090879440308, + "logps/chosen": -455.5960388183594, + "logps/rejected": -484.4208068847656, + "loss": 0.0187, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.22536692023277283, + "rewards/margins": 0.037786681205034256, + "rewards/rejected": -0.2631535530090332, + "step": 590 + }, + { + "epoch": 0.08, + "learning_rate": 3.92156862745098e-06, + "logits/chosen": -1.6431983709335327, + "logits/rejected": -1.5343234539031982, + "logps/chosen": -412.9248962402344, + "logps/rejected": -452.26123046875, + "loss": 0.0522, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18326084315776825, + "rewards/margins": 0.04902006313204765, + "rewards/rejected": -0.2322808802127838, + "step": 600 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -1.543282151222229, + "eval_logits/rejected": -1.3696963787078857, + "eval_logps/chosen": -398.6434326171875, + "eval_logps/rejected": -395.07135009765625, + "eval_loss": 0.04322844743728638, + "eval_rewards/accuracies": 0.5920000076293945, + "eval_rewards/chosen": -0.1694214940071106, + "eval_rewards/margins": 0.029237719252705574, + "eval_rewards/rejected": -0.19865919649600983, + "eval_runtime": 1442.1284, + "eval_samples_per_second": 1.387, + "eval_steps_per_second": 0.347, + "step": 600 + }, + { + "epoch": 0.08, + "learning_rate": 3.986928104575164e-06, + "logits/chosen": -1.5562020540237427, + "logits/rejected": -1.4817330837249756, + "logps/chosen": -438.84893798828125, + "logps/rejected": -425.55291748046875, + "loss": 0.0428, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15865842998027802, + "rewards/margins": 0.030415236949920654, + "rewards/rejected": -0.18907368183135986, + "step": 610 + }, + { + "epoch": 0.08, + "learning_rate": 4.052287581699347e-06, + "logits/chosen": -1.5809907913208008, + "logits/rejected": -1.4904913902282715, + "logps/chosen": -343.4562072753906, + "logps/rejected": -360.49688720703125, + "loss": 0.041, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.13616259396076202, + "rewards/margins": 0.044137731194496155, + "rewards/rejected": -0.18030032515525818, + "step": 620 + }, + { + "epoch": 0.08, + "learning_rate": 4.11764705882353e-06, + "logits/chosen": -1.6854896545410156, + "logits/rejected": -1.5019071102142334, + "logps/chosen": -343.8302001953125, + "logps/rejected": -372.08160400390625, + "loss": 0.0619, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1621018797159195, + "rewards/margins": 0.04577670991420746, + "rewards/rejected": -0.20787855982780457, + "step": 630 + }, + { + "epoch": 0.08, + "learning_rate": 4.183006535947713e-06, + "logits/chosen": -1.8720099925994873, + "logits/rejected": -1.6704416275024414, + "logps/chosen": -352.1206970214844, + "logps/rejected": -401.9758605957031, + "loss": 0.0299, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08925464749336243, + "rewards/margins": 0.047790177166461945, + "rewards/rejected": -0.13704481720924377, + "step": 640 + }, + { + "epoch": 0.09, + "learning_rate": 4.2483660130718954e-06, + "logits/chosen": -1.858128547668457, + "logits/rejected": -1.6746127605438232, + "logps/chosen": -382.6357727050781, + "logps/rejected": -382.96856689453125, + "loss": 0.0419, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11193374544382095, + "rewards/margins": 0.04092060774564743, + "rewards/rejected": -0.15285435318946838, + "step": 650 + }, + { + "epoch": 0.09, + "learning_rate": 4.313725490196079e-06, + "logits/chosen": -1.919390320777893, + "logits/rejected": -1.8492457866668701, + "logps/chosen": -251.84130859375, + "logps/rejected": -260.2295227050781, + "loss": 0.0455, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07586243003606796, + "rewards/margins": 0.030439767986536026, + "rewards/rejected": -0.1063021868467331, + "step": 660 + }, + { + "epoch": 0.09, + "learning_rate": 4.379084967320262e-06, + "logits/chosen": -1.8406785726547241, + "logits/rejected": -1.662712812423706, + "logps/chosen": -301.2870788574219, + "logps/rejected": -314.25634765625, + "loss": 0.0305, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08839339017868042, + "rewards/margins": 0.03664010763168335, + "rewards/rejected": -0.12503348290920258, + "step": 670 + }, + { + "epoch": 0.09, + "learning_rate": 4.444444444444444e-06, + "logits/chosen": -1.7664210796356201, + "logits/rejected": -1.671459436416626, + "logps/chosen": -286.22589111328125, + "logps/rejected": -287.08258056640625, + "loss": 0.0587, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08919215947389603, + "rewards/margins": 0.03164225071668625, + "rewards/rejected": -0.12083441019058228, + "step": 680 + }, + { + "epoch": 0.09, + "learning_rate": 4.509803921568628e-06, + "logits/chosen": -1.9861505031585693, + "logits/rejected": -1.7048215866088867, + "logps/chosen": -246.3960418701172, + "logps/rejected": -274.64788818359375, + "loss": 0.0582, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09032480418682098, + "rewards/margins": 0.04387999698519707, + "rewards/rejected": -0.13420480489730835, + "step": 690 + }, + { + "epoch": 0.09, + "learning_rate": 4.5751633986928105e-06, + "logits/chosen": -1.708458662033081, + "logits/rejected": -1.5508501529693604, + "logps/chosen": -397.4908447265625, + "logps/rejected": -426.00238037109375, + "loss": 0.0453, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13721361756324768, + "rewards/margins": 0.05807988718152046, + "rewards/rejected": -0.19529351592063904, + "step": 700 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -1.913664698600769, + "eval_logits/rejected": -1.720253825187683, + "eval_logps/chosen": -295.2010803222656, + "eval_logps/rejected": -297.8420104980469, + "eval_loss": 0.036652155220508575, + "eval_rewards/accuracies": 0.578000009059906, + "eval_rewards/chosen": -0.06597913801670074, + "eval_rewards/margins": 0.03545072674751282, + "eval_rewards/rejected": -0.10142985731363297, + "eval_runtime": 1439.0401, + "eval_samples_per_second": 1.39, + "eval_steps_per_second": 0.347, + "step": 700 + }, + { + "epoch": 0.09, + "learning_rate": 4.640522875816994e-06, + "logits/chosen": -1.906247854232788, + "logits/rejected": -1.518561601638794, + "logps/chosen": -363.5531311035156, + "logps/rejected": -340.15576171875, + "loss": 0.0493, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06743152439594269, + "rewards/margins": 0.04517130181193352, + "rewards/rejected": -0.1126028299331665, + "step": 710 + }, + { + "epoch": 0.09, + "learning_rate": 4.705882352941177e-06, + "logits/chosen": -1.8197141885757446, + "logits/rejected": -1.843785285949707, + "logps/chosen": -263.13616943359375, + "logps/rejected": -338.6583557128906, + "loss": 0.0308, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.053458016365766525, + "rewards/margins": 0.059612225741147995, + "rewards/rejected": -0.11307024955749512, + "step": 720 + }, + { + "epoch": 0.1, + "learning_rate": 4.77124183006536e-06, + "logits/chosen": -2.073086977005005, + "logits/rejected": -1.8402042388916016, + "logps/chosen": -275.48406982421875, + "logps/rejected": -269.55206298828125, + "loss": 0.0303, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05122733116149902, + "rewards/margins": 0.03283599764108658, + "rewards/rejected": -0.0840633362531662, + "step": 730 + }, + { + "epoch": 0.1, + "learning_rate": 4.836601307189543e-06, + "logits/chosen": -2.1057915687561035, + "logits/rejected": -1.833142876625061, + "logps/chosen": -252.7379913330078, + "logps/rejected": -284.5447692871094, + "loss": 0.0547, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04275885596871376, + "rewards/margins": 0.04886358603835106, + "rewards/rejected": -0.09162244200706482, + "step": 740 + }, + { + "epoch": 0.1, + "learning_rate": 4.901960784313726e-06, + "logits/chosen": -1.849066138267517, + "logits/rejected": -1.7588335275650024, + "logps/chosen": -321.33642578125, + "logps/rejected": -316.1219177246094, + "loss": 0.0409, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07632167637348175, + "rewards/margins": 0.045088060200214386, + "rewards/rejected": -0.12140975147485733, + "step": 750 + }, + { + "epoch": 0.1, + "learning_rate": 4.967320261437909e-06, + "logits/chosen": -1.7766132354736328, + "logits/rejected": -1.5358455181121826, + "logps/chosen": -321.90472412109375, + "logps/rejected": -320.8185729980469, + "loss": 0.0324, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.116396464407444, + "rewards/margins": 0.049995969980955124, + "rewards/rejected": -0.16639243066310883, + "step": 760 + }, + { + "epoch": 0.1, + "learning_rate": 4.99999347843947e-06, + "logits/chosen": -1.7069709300994873, + "logits/rejected": -1.5624696016311646, + "logps/chosen": -318.8409423828125, + "logps/rejected": -326.75592041015625, + "loss": 0.0303, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11583375930786133, + "rewards/margins": 0.03832734376192093, + "rewards/rejected": -0.15416111052036285, + "step": 770 + }, + { + "epoch": 0.1, + "learning_rate": 4.999941306159375e-06, + "logits/chosen": -1.9203029870986938, + "logits/rejected": -1.57107675075531, + "logps/chosen": -395.36517333984375, + "logps/rejected": -420.06805419921875, + "loss": 0.0375, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.13647016882896423, + "rewards/margins": 0.027954230085015297, + "rewards/rejected": -0.16442438960075378, + "step": 780 + }, + { + "epoch": 0.1, + "learning_rate": 4.999836962687967e-06, + "logits/chosen": -1.6315686702728271, + "logits/rejected": -1.6286077499389648, + "logps/chosen": -376.17401123046875, + "logps/rejected": -505.39215087890625, + "loss": 0.0463, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15708838403224945, + "rewards/margins": 0.09491723030805588, + "rewards/rejected": -0.25200560688972473, + "step": 790 + }, + { + "epoch": 0.1, + "learning_rate": 4.999680450202786e-06, + "logits/chosen": -1.5963799953460693, + "logits/rejected": -1.3815653324127197, + "logps/chosen": -450.90875244140625, + "logps/rejected": -442.9236755371094, + "loss": 0.0293, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1555173099040985, + "rewards/margins": 0.05233161896467209, + "rewards/rejected": -0.2078489363193512, + "step": 800 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -1.6149731874465942, + "eval_logits/rejected": -1.4339385032653809, + "eval_logps/chosen": -400.8042297363281, + "eval_logps/rejected": -417.92388916015625, + "eval_loss": 0.036732107400894165, + "eval_rewards/accuracies": 0.5929999947547913, + "eval_rewards/chosen": -0.17158228158950806, + "eval_rewards/margins": 0.04992944374680519, + "eval_rewards/rejected": -0.22151173651218414, + "eval_runtime": 1440.7969, + "eval_samples_per_second": 1.388, + "eval_steps_per_second": 0.347, + "step": 800 + }, + { + "epoch": 0.11, + "learning_rate": 4.999471771970087e-06, + "logits/chosen": -1.705596923828125, + "logits/rejected": -1.6261297464370728, + "logps/chosen": -342.60693359375, + "logps/rejected": -366.328857421875, + "loss": 0.0526, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18038082122802734, + "rewards/margins": 0.04092780128121376, + "rewards/rejected": -0.2213086187839508, + "step": 810 + }, + { + "epoch": 0.11, + "learning_rate": 4.999210932344767e-06, + "logits/chosen": -1.823772668838501, + "logits/rejected": -1.6082426309585571, + "logps/chosen": -384.11773681640625, + "logps/rejected": -382.7273254394531, + "loss": 0.0207, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.13001203536987305, + "rewards/margins": 0.03671548515558243, + "rewards/rejected": -0.16672751307487488, + "step": 820 + }, + { + "epoch": 0.11, + "learning_rate": 4.998897936770281e-06, + "logits/chosen": -1.5681250095367432, + "logits/rejected": -1.2824045419692993, + "logps/chosen": -402.26470947265625, + "logps/rejected": -407.77325439453125, + "loss": 0.0417, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18893824517726898, + "rewards/margins": 0.027542661875486374, + "rewards/rejected": -0.21648092567920685, + "step": 830 + }, + { + "epoch": 0.11, + "learning_rate": 4.998532791778521e-06, + "logits/chosen": -1.6705471277236938, + "logits/rejected": -1.455125093460083, + "logps/chosen": -416.44256591796875, + "logps/rejected": -387.32745361328125, + "loss": 0.0257, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18690225481987, + "rewards/margins": 0.033063314855098724, + "rewards/rejected": -0.21996554732322693, + "step": 840 + }, + { + "epoch": 0.11, + "learning_rate": 4.9981155049896885e-06, + "logits/chosen": -1.6054198741912842, + "logits/rejected": -1.5043563842773438, + "logps/chosen": -479.97418212890625, + "logps/rejected": -479.62744140625, + "loss": 0.0373, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22980158030986786, + "rewards/margins": 0.019741864874958992, + "rewards/rejected": -0.2495434284210205, + "step": 850 + }, + { + "epoch": 0.11, + "learning_rate": 4.997646085112126e-06, + "logits/chosen": -1.805079698562622, + "logits/rejected": -1.661866545677185, + "logps/chosen": -368.40509033203125, + "logps/rejected": -385.2732238769531, + "loss": 0.0196, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.14316768944263458, + "rewards/margins": 0.04519253969192505, + "rewards/rejected": -0.18836024403572083, + "step": 860 + }, + { + "epoch": 0.11, + "learning_rate": 4.997124541942141e-06, + "logits/chosen": -1.7849347591400146, + "logits/rejected": -1.5895960330963135, + "logps/chosen": -354.07965087890625, + "logps/rejected": -437.007080078125, + "loss": 0.0246, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11094705015420914, + "rewards/margins": 0.07083548605442047, + "rewards/rejected": -0.181782528758049, + "step": 870 + }, + { + "epoch": 0.12, + "learning_rate": 4.996550886363801e-06, + "logits/chosen": -2.149895191192627, + "logits/rejected": -1.9984849691390991, + "logps/chosen": -333.11785888671875, + "logps/rejected": -338.1785888671875, + "loss": 0.0345, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.07574061304330826, + "rewards/margins": 0.024002335965633392, + "rewards/rejected": -0.09974294900894165, + "step": 880 + }, + { + "epoch": 0.12, + "learning_rate": 4.995925130348706e-06, + "logits/chosen": -1.9792038202285767, + "logits/rejected": -1.534693956375122, + "logps/chosen": -342.51165771484375, + "logps/rejected": -287.411376953125, + "loss": 0.0586, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0806887149810791, + "rewards/margins": 0.02763846516609192, + "rewards/rejected": -0.10832718759775162, + "step": 890 + }, + { + "epoch": 0.12, + "learning_rate": 4.995247286955734e-06, + "logits/chosen": -1.6908349990844727, + "logits/rejected": -1.4713640213012695, + "logps/chosen": -348.95599365234375, + "logps/rejected": -420.2156677246094, + "loss": 0.0241, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11906653642654419, + "rewards/margins": 0.07349709421396255, + "rewards/rejected": -0.19256362318992615, + "step": 900 + }, + { + "epoch": 0.12, + "learning_rate": 4.994517370330779e-06, + "logits/chosen": -1.4035985469818115, + "logits/rejected": -1.4106028079986572, + "logps/chosen": -345.45013427734375, + "logps/rejected": -391.02325439453125, + "loss": 0.0404, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.15998545289039612, + "rewards/margins": 0.01910357177257538, + "rewards/rejected": -0.1790890246629715, + "step": 910 + }, + { + "epoch": 0.12, + "learning_rate": 4.993735395706446e-06, + "logits/chosen": -1.485642671585083, + "logits/rejected": -1.1201026439666748, + "logps/chosen": -413.4599609375, + "logps/rejected": -430.8916931152344, + "loss": 0.036, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.12864863872528076, + "rewards/margins": 0.05424485355615616, + "rewards/rejected": -0.18289348483085632, + "step": 920 + }, + { + "epoch": 0.12, + "learning_rate": 4.992901379401737e-06, + "logits/chosen": -1.4833259582519531, + "logits/rejected": -1.2941524982452393, + "logps/chosen": -289.5825500488281, + "logps/rejected": -385.73004150390625, + "loss": 0.0387, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11567602306604385, + "rewards/margins": 0.0963902622461319, + "rewards/rejected": -0.21206626296043396, + "step": 930 + }, + { + "epoch": 0.12, + "learning_rate": 4.992015338821711e-06, + "logits/chosen": -1.4885437488555908, + "logits/rejected": -1.1388311386108398, + "logps/chosen": -402.7807312011719, + "logps/rejected": -380.0301208496094, + "loss": 0.0407, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18832194805145264, + "rewards/margins": 0.04915202781558037, + "rewards/rejected": -0.2374739646911621, + "step": 940 + }, + { + "epoch": 0.12, + "learning_rate": 4.991077292457117e-06, + "logits/chosen": -1.579649806022644, + "logits/rejected": -1.3794090747833252, + "logps/chosen": -329.3354797363281, + "logps/rejected": -379.41485595703125, + "loss": 0.0326, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1469370424747467, + "rewards/margins": 0.0816231220960617, + "rewards/rejected": -0.2285601794719696, + "step": 950 + }, + { + "epoch": 0.13, + "learning_rate": 4.990087259884016e-06, + "logits/chosen": -1.5824581384658813, + "logits/rejected": -1.4533617496490479, + "logps/chosen": -362.3585510253906, + "logps/rejected": -411.91851806640625, + "loss": 0.029, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13144941627979279, + "rewards/margins": 0.04948444664478302, + "rewards/rejected": -0.1809338480234146, + "step": 960 + }, + { + "epoch": 0.13, + "learning_rate": 4.989045261763362e-06, + "logits/chosen": -1.6257492303848267, + "logits/rejected": -1.4243541955947876, + "logps/chosen": -384.20977783203125, + "logps/rejected": -451.828369140625, + "loss": 0.0237, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1589529812335968, + "rewards/margins": 0.07473161071538925, + "rewards/rejected": -0.23368458449840546, + "step": 970 + }, + { + "epoch": 0.13, + "learning_rate": 4.98795131984058e-06, + "logits/chosen": -1.3598047494888306, + "logits/rejected": -1.2197717428207397, + "logps/chosen": -463.22418212890625, + "logps/rejected": -492.2627868652344, + "loss": 0.0335, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21916159987449646, + "rewards/margins": 0.07268805801868439, + "rewards/rejected": -0.29184961318969727, + "step": 980 + }, + { + "epoch": 0.13, + "learning_rate": 4.986805456945107e-06, + "logits/chosen": -1.4510526657104492, + "logits/rejected": -1.3373464345932007, + "logps/chosen": -411.4930725097656, + "logps/rejected": -502.29949951171875, + "loss": 0.0464, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17068354785442352, + "rewards/margins": 0.10880188643932343, + "rewards/rejected": -0.27948540449142456, + "step": 990 + }, + { + "epoch": 0.13, + "learning_rate": 4.985607696989919e-06, + "logits/chosen": -1.6246265172958374, + "logits/rejected": -1.4347529411315918, + "logps/chosen": -413.3321228027344, + "logps/rejected": -440.0856018066406, + "loss": 0.0315, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13195686042308807, + "rewards/margins": 0.04154813662171364, + "rewards/rejected": -0.17350497841835022, + "step": 1000 + }, + { + "epoch": 0.13, + "learning_rate": 4.984358064971026e-06, + "logits/chosen": -1.6722373962402344, + "logits/rejected": -1.495236873626709, + "logps/chosen": -372.73748779296875, + "logps/rejected": -423.34722900390625, + "loss": 0.0223, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11866787821054459, + "rewards/margins": 0.07718131691217422, + "rewards/rejected": -0.19584921002388, + "step": 1010 + }, + { + "epoch": 0.13, + "learning_rate": 4.983056586966958e-06, + "logits/chosen": -1.5427746772766113, + "logits/rejected": -1.3719583749771118, + "logps/chosen": -477.5443420410156, + "logps/rejected": -532.8753051757812, + "loss": 0.0258, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21644172072410583, + "rewards/margins": 0.0712512657046318, + "rewards/rejected": -0.28769299387931824, + "step": 1020 + }, + { + "epoch": 0.13, + "learning_rate": 4.981703290138215e-06, + "logits/chosen": -1.6714264154434204, + "logits/rejected": -1.6751312017440796, + "logps/chosen": -413.89111328125, + "logps/rejected": -477.0470275878906, + "loss": 0.0294, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1675490289926529, + "rewards/margins": 0.07263518124818802, + "rewards/rejected": -0.2401842325925827, + "step": 1030 + }, + { + "epoch": 0.14, + "learning_rate": 4.980298202726706e-06, + "logits/chosen": -1.7971988916397095, + "logits/rejected": -1.7383168935775757, + "logps/chosen": -371.4641418457031, + "logps/rejected": -371.2574768066406, + "loss": 0.0383, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.11729536950588226, + "rewards/margins": 0.010803603567183018, + "rewards/rejected": -0.1280989795923233, + "step": 1040 + }, + { + "epoch": 0.14, + "learning_rate": 4.978841354055148e-06, + "logits/chosen": -1.7846040725708008, + "logits/rejected": -1.6594231128692627, + "logps/chosen": -304.92608642578125, + "logps/rejected": -339.9076232910156, + "loss": 0.0473, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11667434871196747, + "rewards/margins": 0.052279382944107056, + "rewards/rejected": -0.16895373165607452, + "step": 1050 + }, + { + "epoch": 0.14, + "learning_rate": 4.977332774526471e-06, + "logits/chosen": -1.429678201675415, + "logits/rejected": -1.5469611883163452, + "logps/chosen": -360.56683349609375, + "logps/rejected": -457.0309143066406, + "loss": 0.0376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14282606542110443, + "rewards/margins": 0.07488436251878738, + "rewards/rejected": -0.21771040558815002, + "step": 1060 + }, + { + "epoch": 0.14, + "learning_rate": 4.97577249562317e-06, + "logits/chosen": -1.576290488243103, + "logits/rejected": -1.553577184677124, + "logps/chosen": -473.0867614746094, + "logps/rejected": -547.1744995117188, + "loss": 0.0359, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.23830334842205048, + "rewards/margins": 0.049412429332733154, + "rewards/rejected": -0.28771576285362244, + "step": 1070 + }, + { + "epoch": 0.14, + "learning_rate": 4.974160549906652e-06, + "logits/chosen": -1.705106496810913, + "logits/rejected": -1.5367138385772705, + "logps/chosen": -476.2462463378906, + "logps/rejected": -533.9549560546875, + "loss": 0.0264, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18861953914165497, + "rewards/margins": 0.09369265288114548, + "rewards/rejected": -0.28231221437454224, + "step": 1080 + }, + { + "epoch": 0.14, + "learning_rate": 4.972496971016559e-06, + "logits/chosen": -1.363030195236206, + "logits/rejected": -1.2015846967697144, + "logps/chosen": -455.76190185546875, + "logps/rejected": -489.392578125, + "loss": 0.0184, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23590807616710663, + "rewards/margins": 0.06948628276586533, + "rewards/rejected": -0.30539435148239136, + "step": 1090 + }, + { + "epoch": 0.14, + "learning_rate": 4.9707817936700635e-06, + "logits/chosen": -1.6423311233520508, + "logits/rejected": -1.3507311344146729, + "logps/chosen": -496.7262268066406, + "logps/rejected": -511.8069763183594, + "loss": 0.0264, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.26126766204833984, + "rewards/margins": 0.049921147525310516, + "rewards/rejected": -0.31118884682655334, + "step": 1100 + }, + { + "epoch": 0.15, + "learning_rate": 4.969015053661142e-06, + "logits/chosen": -1.4872214794158936, + "logits/rejected": -1.2768076658248901, + "logps/chosen": -513.9466552734375, + "logps/rejected": -542.5833129882812, + "loss": 0.0264, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2550693154335022, + "rewards/margins": 0.0754098892211914, + "rewards/rejected": -0.3304792046546936, + "step": 1110 + }, + { + "epoch": 0.15, + "learning_rate": 4.967196787859835e-06, + "logits/chosen": -1.291974663734436, + "logits/rejected": -1.210644006729126, + "logps/chosen": -585.2589111328125, + "logps/rejected": -605.524169921875, + "loss": 0.0302, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2933773994445801, + "rewards/margins": 0.0462600514292717, + "rewards/rejected": -0.3396374583244324, + "step": 1120 + }, + { + "epoch": 0.15, + "learning_rate": 4.965327034211469e-06, + "logits/chosen": -1.5257681608200073, + "logits/rejected": -1.2594645023345947, + "logps/chosen": -451.5184631347656, + "logps/rejected": -486.90106201171875, + "loss": 0.0249, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.23999419808387756, + "rewards/margins": 0.07712169736623764, + "rewards/rejected": -0.3171158730983734, + "step": 1130 + }, + { + "epoch": 0.15, + "learning_rate": 4.96340583173587e-06, + "logits/chosen": -1.6421066522598267, + "logits/rejected": -1.514484167098999, + "logps/chosen": -499.04534912109375, + "logps/rejected": -516.4204711914062, + "loss": 0.0295, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2360762655735016, + "rewards/margins": 0.044597141444683075, + "rewards/rejected": -0.2806733548641205, + "step": 1140 + }, + { + "epoch": 0.15, + "learning_rate": 4.96143322052655e-06, + "logits/chosen": -1.6334667205810547, + "logits/rejected": -1.4992587566375732, + "logps/chosen": -452.73309326171875, + "logps/rejected": -560.7843627929688, + "loss": 0.0387, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23054762184619904, + "rewards/margins": 0.08870735764503479, + "rewards/rejected": -0.31925496459007263, + "step": 1150 + }, + { + "epoch": 0.15, + "learning_rate": 4.959409241749864e-06, + "logits/chosen": -1.4871851205825806, + "logits/rejected": -1.441433072090149, + "logps/chosen": -430.04681396484375, + "logps/rejected": -512.469970703125, + "loss": 0.0234, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20715589821338654, + "rewards/margins": 0.08203965425491333, + "rewards/rejected": -0.28919556736946106, + "step": 1160 + }, + { + "epoch": 0.15, + "learning_rate": 4.957333937644159e-06, + "logits/chosen": -1.6897249221801758, + "logits/rejected": -1.5073974132537842, + "logps/chosen": -444.21759033203125, + "logps/rejected": -495.2613220214844, + "loss": 0.024, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.24953778088092804, + "rewards/margins": 0.06193218380212784, + "rewards/rejected": -0.3114699721336365, + "step": 1170 + }, + { + "epoch": 0.15, + "learning_rate": 4.955207351518885e-06, + "logits/chosen": -1.5480334758758545, + "logits/rejected": -1.4697473049163818, + "logps/chosen": -444.0738220214844, + "logps/rejected": -484.5289001464844, + "loss": 0.0415, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24453409016132355, + "rewards/margins": 0.05837539955973625, + "rewards/rejected": -0.3029094934463501, + "step": 1180 + }, + { + "epoch": 0.16, + "learning_rate": 4.953029527753699e-06, + "logits/chosen": -1.5584051609039307, + "logits/rejected": -1.3973197937011719, + "logps/chosen": -479.6776428222656, + "logps/rejected": -499.9449157714844, + "loss": 0.0258, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.23742356896400452, + "rewards/margins": 0.03735958784818649, + "rewards/rejected": -0.274783194065094, + "step": 1190 + }, + { + "epoch": 0.16, + "learning_rate": 4.95080051179753e-06, + "logits/chosen": -1.706897497177124, + "logits/rejected": -1.5994467735290527, + "logps/chosen": -428.048828125, + "logps/rejected": -485.77484130859375, + "loss": 0.0433, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22842660546302795, + "rewards/margins": 0.05509146302938461, + "rewards/rejected": -0.28351807594299316, + "step": 1200 + }, + { + "epoch": 0.16, + "learning_rate": 4.948520350167637e-06, + "logits/chosen": -1.6655988693237305, + "logits/rejected": -1.3038753271102905, + "logps/chosen": -553.6268920898438, + "logps/rejected": -579.7947998046875, + "loss": 0.0286, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25176820158958435, + "rewards/margins": 0.08280982077121735, + "rewards/rejected": -0.3345780074596405, + "step": 1210 + }, + { + "epoch": 0.16, + "learning_rate": 4.946189090448639e-06, + "logits/chosen": -1.7785139083862305, + "logits/rejected": -1.4575735330581665, + "logps/chosen": -575.2974853515625, + "logps/rejected": -596.218994140625, + "loss": 0.0325, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.267650842666626, + "rewards/margins": 0.060673050582408905, + "rewards/rejected": -0.3283239006996155, + "step": 1220 + }, + { + "epoch": 0.16, + "learning_rate": 4.943806781291515e-06, + "logits/chosen": -1.4108306169509888, + "logits/rejected": -1.3590025901794434, + "logps/chosen": -536.73974609375, + "logps/rejected": -646.03564453125, + "loss": 0.0208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28589382767677307, + "rewards/margins": 0.08323682844638824, + "rewards/rejected": -0.3691306710243225, + "step": 1230 + }, + { + "epoch": 0.16, + "learning_rate": 4.941373472412595e-06, + "logits/chosen": -1.4176275730133057, + "logits/rejected": -1.099277138710022, + "logps/chosen": -666.0406494140625, + "logps/rejected": -683.25732421875, + "loss": 0.0249, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3972935080528259, + "rewards/margins": 0.09708867967128754, + "rewards/rejected": -0.49438220262527466, + "step": 1240 + }, + { + "epoch": 0.16, + "learning_rate": 4.938889214592521e-06, + "logits/chosen": -0.8007619976997375, + "logits/rejected": -0.8756643533706665, + "logps/chosen": -701.1500854492188, + "logps/rejected": -785.47509765625, + "loss": 0.0421, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5342551469802856, + "rewards/margins": 0.05736679583787918, + "rewards/rejected": -0.5916219353675842, + "step": 1250 + }, + { + "epoch": 0.16, + "learning_rate": 4.936354059675186e-06, + "logits/chosen": -0.9256356954574585, + "logits/rejected": -0.7004662752151489, + "logps/chosen": -707.5337524414062, + "logps/rejected": -772.3132934570312, + "loss": 0.0261, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.49646010994911194, + "rewards/margins": 0.06648706644773483, + "rewards/rejected": -0.562947154045105, + "step": 1260 + }, + { + "epoch": 0.17, + "learning_rate": 4.933768060566654e-06, + "logits/chosen": -0.9167349934577942, + "logits/rejected": -0.5076829195022583, + "logps/chosen": -597.2662353515625, + "logps/rejected": -646.2260131835938, + "loss": 0.0286, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.37925904989242554, + "rewards/margins": 0.08603726327419281, + "rewards/rejected": -0.46529620885849, + "step": 1270 + }, + { + "epoch": 0.17, + "learning_rate": 4.931131271234052e-06, + "logits/chosen": -1.0448763370513916, + "logits/rejected": -0.955971896648407, + "logps/chosen": -576.7291259765625, + "logps/rejected": -602.1259155273438, + "loss": 0.0272, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3331785798072815, + "rewards/margins": 0.03749316558241844, + "rewards/rejected": -0.37067174911499023, + "step": 1280 + }, + { + "epoch": 0.17, + "learning_rate": 4.928443746704448e-06, + "logits/chosen": -1.3860580921173096, + "logits/rejected": -0.961715817451477, + "logps/chosen": -480.794189453125, + "logps/rejected": -501.71533203125, + "loss": 0.037, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2662775218486786, + "rewards/margins": 0.06484885513782501, + "rewards/rejected": -0.3311263918876648, + "step": 1290 + }, + { + "epoch": 0.17, + "learning_rate": 4.925705543063703e-06, + "logits/chosen": -1.3405392169952393, + "logits/rejected": -1.417043924331665, + "logps/chosen": -486.39019775390625, + "logps/rejected": -610.2715454101562, + "loss": 0.0271, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2749415338039398, + "rewards/margins": 0.08056138455867767, + "rewards/rejected": -0.3555029332637787, + "step": 1300 + }, + { + "epoch": 0.17, + "learning_rate": 4.922916717455297e-06, + "logits/chosen": -1.174640417098999, + "logits/rejected": -1.264804482460022, + "logps/chosen": -490.49859619140625, + "logps/rejected": -569.15576171875, + "loss": 0.0467, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2704632580280304, + "rewards/margins": 0.07853202521800995, + "rewards/rejected": -0.34899526834487915, + "step": 1310 + }, + { + "epoch": 0.17, + "learning_rate": 4.920077328079136e-06, + "logits/chosen": -1.4887348413467407, + "logits/rejected": -1.2705087661743164, + "logps/chosen": -557.512939453125, + "logps/rejected": -601.0181884765625, + "loss": 0.0241, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2937076985836029, + "rewards/margins": 0.08439463376998901, + "rewards/rejected": -0.37810230255126953, + "step": 1320 + }, + { + "epoch": 0.17, + "learning_rate": 4.9171874341903445e-06, + "logits/chosen": -1.3629595041275024, + "logits/rejected": -1.1199499368667603, + "logps/chosen": -478.30938720703125, + "logps/rejected": -541.1429443359375, + "loss": 0.0545, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.27040910720825195, + "rewards/margins": 0.08212222903966904, + "rewards/rejected": -0.352531373500824, + "step": 1330 + }, + { + "epoch": 0.18, + "learning_rate": 4.914247096098019e-06, + "logits/chosen": -1.4378149509429932, + "logits/rejected": -1.3526177406311035, + "logps/chosen": -428.2542419433594, + "logps/rejected": -478.46258544921875, + "loss": 0.0311, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21419191360473633, + "rewards/margins": 0.06993551552295685, + "rewards/rejected": -0.284127414226532, + "step": 1340 + }, + { + "epoch": 0.18, + "learning_rate": 4.911256375163977e-06, + "logits/chosen": -1.6823108196258545, + "logits/rejected": -1.413735032081604, + "logps/chosen": -429.1358337402344, + "logps/rejected": -421.41314697265625, + "loss": 0.0326, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17538395524024963, + "rewards/margins": 0.0550597682595253, + "rewards/rejected": -0.23044368624687195, + "step": 1350 + }, + { + "epoch": 0.18, + "learning_rate": 4.908215333801474e-06, + "logits/chosen": -2.0725080966949463, + "logits/rejected": -1.7535841464996338, + "logps/chosen": -449.07574462890625, + "logps/rejected": -431.6366271972656, + "loss": 0.0289, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16928282380104065, + "rewards/margins": 0.05664173886179924, + "rewards/rejected": -0.22592458128929138, + "step": 1360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9051240354739004e-06, + "logits/chosen": -1.919226050376892, + "logits/rejected": -1.7065900564193726, + "logps/chosen": -440.53509521484375, + "logps/rejected": -461.86859130859375, + "loss": 0.0229, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1572817862033844, + "rewards/margins": 0.03301681950688362, + "rewards/rejected": -0.19029861688613892, + "step": 1370 + }, + { + "epoch": 0.18, + "learning_rate": 4.901982544693457e-06, + "logits/chosen": -1.7934865951538086, + "logits/rejected": -1.782326102256775, + "logps/chosen": -299.3521423339844, + "logps/rejected": -408.4831237792969, + "loss": 0.0475, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.15632550418376923, + "rewards/margins": 0.054883696138858795, + "rewards/rejected": -0.21120920777320862, + "step": 1380 + }, + { + "epoch": 0.18, + "learning_rate": 4.898790927019809e-06, + "logits/chosen": -1.8103210926055908, + "logits/rejected": -1.8971868753433228, + "logps/chosen": -365.04541015625, + "logps/rejected": -415.27081298828125, + "loss": 0.0301, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1553061306476593, + "rewards/margins": 0.0542365200817585, + "rewards/rejected": -0.2095426619052887, + "step": 1390 + }, + { + "epoch": 0.18, + "learning_rate": 4.895549249058718e-06, + "logits/chosen": -1.8186776638031006, + "logits/rejected": -1.4331042766571045, + "logps/chosen": -465.67742919921875, + "logps/rejected": -492.672119140625, + "loss": 0.0242, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.19692638516426086, + "rewards/margins": 0.09286610037088394, + "rewards/rejected": -0.2897924780845642, + "step": 1400 + }, + { + "epoch": 0.18, + "learning_rate": 4.892257578460656e-06, + "logits/chosen": -1.6929572820663452, + "logits/rejected": -1.64870285987854, + "logps/chosen": -414.07781982421875, + "logps/rejected": -508.06951904296875, + "loss": 0.0281, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.25049352645874023, + "rewards/margins": 0.09370686113834381, + "rewards/rejected": -0.34420040249824524, + "step": 1410 + }, + { + "epoch": 0.19, + "learning_rate": 4.888915983919383e-06, + "logits/chosen": -1.7767932415008545, + "logits/rejected": -1.725423812866211, + "logps/chosen": -399.81024169921875, + "logps/rejected": -501.4336853027344, + "loss": 0.0237, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2222074717283249, + "rewards/margins": 0.09464211016893387, + "rewards/rejected": -0.31684961915016174, + "step": 1420 + }, + { + "epoch": 0.19, + "learning_rate": 4.885524535170525e-06, + "logits/chosen": -1.7155097723007202, + "logits/rejected": -1.3688064813613892, + "logps/chosen": -456.03289794921875, + "logps/rejected": -557.8484497070312, + "loss": 0.0305, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2546646296977997, + "rewards/margins": 0.1224946603178978, + "rewards/rejected": -0.3771592676639557, + "step": 1430 + }, + { + "epoch": 0.19, + "learning_rate": 4.882083302990113e-06, + "logits/chosen": -1.7398130893707275, + "logits/rejected": -1.6435096263885498, + "logps/chosen": -479.67724609375, + "logps/rejected": -534.8831176757812, + "loss": 0.0193, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.26064735651016235, + "rewards/margins": 0.06385330110788345, + "rewards/rejected": -0.3245006203651428, + "step": 1440 + }, + { + "epoch": 0.19, + "learning_rate": 4.878592359193104e-06, + "logits/chosen": -1.9220349788665771, + "logits/rejected": -1.4295719861984253, + "logps/chosen": -535.4910278320312, + "logps/rejected": -584.61083984375, + "loss": 0.0351, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3067534863948822, + "rewards/margins": 0.10750001668930054, + "rewards/rejected": -0.4142535328865051, + "step": 1450 + }, + { + "epoch": 0.19, + "learning_rate": 4.875051776631888e-06, + "logits/chosen": -1.6607071161270142, + "logits/rejected": -1.691383719444275, + "logps/chosen": -569.18310546875, + "logps/rejected": -715.18896484375, + "loss": 0.0263, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.317788302898407, + "rewards/margins": 0.0920090302824974, + "rewards/rejected": -0.409797340631485, + "step": 1460 + }, + { + "epoch": 0.19, + "learning_rate": 4.871461629194764e-06, + "logits/chosen": -1.9914582967758179, + "logits/rejected": -1.7897332906723022, + "logps/chosen": -530.7254638671875, + "logps/rejected": -543.0486450195312, + "loss": 0.0248, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23011811077594757, + "rewards/margins": 0.05637926980853081, + "rewards/rejected": -0.28649741411209106, + "step": 1470 + }, + { + "epoch": 0.19, + "learning_rate": 4.8678219918043984e-06, + "logits/chosen": -1.9362154006958008, + "logits/rejected": -1.8457800149917603, + "logps/chosen": -456.870849609375, + "logps/rejected": -537.7033081054688, + "loss": 0.0182, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.217064768075943, + "rewards/margins": 0.058478035032749176, + "rewards/rejected": -0.2755427956581116, + "step": 1480 + }, + { + "epoch": 0.19, + "learning_rate": 4.864132940416262e-06, + "logits/chosen": -1.9576715230941772, + "logits/rejected": -1.652612328529358, + "logps/chosen": -404.54388427734375, + "logps/rejected": -444.6311950683594, + "loss": 0.024, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.23168055713176727, + "rewards/margins": 0.046421170234680176, + "rewards/rejected": -0.27810171246528625, + "step": 1490 + }, + { + "epoch": 0.2, + "learning_rate": 4.860394552017044e-06, + "logits/chosen": -1.8545684814453125, + "logits/rejected": -1.750740647315979, + "logps/chosen": -519.2760009765625, + "logps/rejected": -541.0926513671875, + "loss": 0.0208, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.24759700894355774, + "rewards/margins": 0.05812455341219902, + "rewards/rejected": -0.30572155117988586, + "step": 1500 + }, + { + "epoch": 0.2, + "learning_rate": 4.856606904623047e-06, + "logits/chosen": -1.9752925634384155, + "logits/rejected": -1.880743384361267, + "logps/chosen": -484.6067810058594, + "logps/rejected": -532.9691162109375, + "loss": 0.0309, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.26403746008872986, + "rewards/margins": 0.08272302895784378, + "rewards/rejected": -0.34676045179367065, + "step": 1510 + }, + { + "epoch": 0.2, + "learning_rate": 4.852770077278557e-06, + "logits/chosen": -1.7194467782974243, + "logits/rejected": -1.3660582304000854, + "logps/chosen": -500.27288818359375, + "logps/rejected": -488.1220703125, + "loss": 0.0299, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2599658668041229, + "rewards/margins": 0.049370136111974716, + "rewards/rejected": -0.30933600664138794, + "step": 1520 + }, + { + "epoch": 0.2, + "learning_rate": 4.848884150054196e-06, + "logits/chosen": -1.3870841264724731, + "logits/rejected": -1.4338102340698242, + "logps/chosen": -483.5819396972656, + "logps/rejected": -598.3069458007812, + "loss": 0.0304, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2766098380088806, + "rewards/margins": 0.07799626886844635, + "rewards/rejected": -0.35460609197616577, + "step": 1530 + }, + { + "epoch": 0.2, + "learning_rate": 4.8449492040452495e-06, + "logits/chosen": -1.6025880575180054, + "logits/rejected": -1.4526028633117676, + "logps/chosen": -582.9497680664062, + "logps/rejected": -625.2781982421875, + "loss": 0.027, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3081502318382263, + "rewards/margins": 0.07421085983514786, + "rewards/rejected": -0.38236111402511597, + "step": 1540 + }, + { + "epoch": 0.2, + "learning_rate": 4.840965321369973e-06, + "logits/chosen": -1.6838009357452393, + "logits/rejected": -1.7545547485351562, + "logps/chosen": -545.9749145507812, + "logps/rejected": -636.2731323242188, + "loss": 0.041, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.28291061520576477, + "rewards/margins": 0.08299050480127335, + "rewards/rejected": -0.3659011125564575, + "step": 1550 + }, + { + "epoch": 0.2, + "learning_rate": 4.8369325851678795e-06, + "logits/chosen": -1.7689971923828125, + "logits/rejected": -1.4405544996261597, + "logps/chosen": -497.4356384277344, + "logps/rejected": -542.1776123046875, + "loss": 0.018, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25298741459846497, + "rewards/margins": 0.09371381998062134, + "rewards/rejected": -0.3467012047767639, + "step": 1560 + }, + { + "epoch": 0.21, + "learning_rate": 4.832851079598007e-06, + "logits/chosen": -1.7749683856964111, + "logits/rejected": -1.641271948814392, + "logps/chosen": -448.66693115234375, + "logps/rejected": -474.647705078125, + "loss": 0.0347, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.24436303973197937, + "rewards/margins": 0.04413959011435509, + "rewards/rejected": -0.28850263357162476, + "step": 1570 + }, + { + "epoch": 0.21, + "learning_rate": 4.828720889837158e-06, + "logits/chosen": -1.956264853477478, + "logits/rejected": -1.7235233783721924, + "logps/chosen": -536.6216430664062, + "logps/rejected": -541.3743286132812, + "loss": 0.0425, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.25714707374572754, + "rewards/margins": 0.03778282552957535, + "rewards/rejected": -0.2949298620223999, + "step": 1580 + }, + { + "epoch": 0.21, + "learning_rate": 4.824542102078125e-06, + "logits/chosen": -1.8579609394073486, + "logits/rejected": -1.6114164590835571, + "logps/chosen": -391.9680480957031, + "logps/rejected": -420.4110412597656, + "loss": 0.031, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16827771067619324, + "rewards/margins": 0.08321143686771393, + "rewards/rejected": -0.25148916244506836, + "step": 1590 + }, + { + "epoch": 0.21, + "learning_rate": 4.820314803527888e-06, + "logits/chosen": -1.9021852016448975, + "logits/rejected": -1.8077495098114014, + "logps/chosen": -357.4749755859375, + "logps/rejected": -429.97650146484375, + "loss": 0.0314, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1697140634059906, + "rewards/margins": 0.061640460044145584, + "rewards/rejected": -0.2313545197248459, + "step": 1600 + }, + { + "epoch": 0.21, + "learning_rate": 4.816039082405799e-06, + "logits/chosen": -1.6327106952667236, + "logits/rejected": -1.6976954936981201, + "logps/chosen": -423.43048095703125, + "logps/rejected": -547.1001586914062, + "loss": 0.0311, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.24598722159862518, + "rewards/margins": 0.09015451371669769, + "rewards/rejected": -0.33614176511764526, + "step": 1610 + }, + { + "epoch": 0.21, + "learning_rate": 4.81171502794174e-06, + "logits/chosen": -1.66916024684906, + "logits/rejected": -1.5378539562225342, + "logps/chosen": -506.19635009765625, + "logps/rejected": -581.7791748046875, + "loss": 0.0208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27827128767967224, + "rewards/margins": 0.09809643775224686, + "rewards/rejected": -0.3763677477836609, + "step": 1620 + }, + { + "epoch": 0.21, + "learning_rate": 4.8073427303742584e-06, + "logits/chosen": -1.7626116275787354, + "logits/rejected": -1.553881287574768, + "logps/chosen": -516.6483154296875, + "logps/rejected": -498.420166015625, + "loss": 0.0344, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.27453574538230896, + "rewards/margins": 0.05228264257311821, + "rewards/rejected": -0.32681840658187866, + "step": 1630 + }, + { + "epoch": 0.21, + "learning_rate": 4.802922280948685e-06, + "logits/chosen": -1.7148334980010986, + "logits/rejected": -1.5481488704681396, + "logps/chosen": -483.62457275390625, + "logps/rejected": -519.8236083984375, + "loss": 0.0229, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2525627911090851, + "rewards/margins": 0.06651215255260468, + "rewards/rejected": -0.31907492876052856, + "step": 1640 + }, + { + "epoch": 0.22, + "learning_rate": 4.798453771915231e-06, + "logits/chosen": -1.9586117267608643, + "logits/rejected": -1.7584108114242554, + "logps/chosen": -392.68023681640625, + "logps/rejected": -416.2002868652344, + "loss": 0.0379, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1924244910478592, + "rewards/margins": 0.055614493787288666, + "rewards/rejected": -0.24803900718688965, + "step": 1650 + }, + { + "epoch": 0.22, + "learning_rate": 4.793937296527062e-06, + "logits/chosen": -2.0236639976501465, + "logits/rejected": -1.7089784145355225, + "logps/chosen": -432.085693359375, + "logps/rejected": -505.3924255371094, + "loss": 0.0283, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16688159108161926, + "rewards/margins": 0.10469367355108261, + "rewards/rejected": -0.27157527208328247, + "step": 1660 + }, + { + "epoch": 0.22, + "learning_rate": 4.78937294903835e-06, + "logits/chosen": -2.0606894493103027, + "logits/rejected": -1.7672550678253174, + "logps/chosen": -512.7127075195312, + "logps/rejected": -511.38885498046875, + "loss": 0.0209, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19819924235343933, + "rewards/margins": 0.06016005203127861, + "rewards/rejected": -0.25835928320884705, + "step": 1670 + }, + { + "epoch": 0.22, + "learning_rate": 4.78476082470231e-06, + "logits/chosen": -1.9348901510238647, + "logits/rejected": -1.9231693744659424, + "logps/chosen": -417.9811096191406, + "logps/rejected": -467.3179626464844, + "loss": 0.0243, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19126436114311218, + "rewards/margins": 0.053105421364307404, + "rewards/rejected": -0.24436978995800018, + "step": 1680 + }, + { + "epoch": 0.22, + "learning_rate": 4.780101019769212e-06, + "logits/chosen": -1.9238088130950928, + "logits/rejected": -1.7408262491226196, + "logps/chosen": -373.9527587890625, + "logps/rejected": -410.34417724609375, + "loss": 0.0173, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.17050370573997498, + "rewards/margins": 0.051046222448349, + "rewards/rejected": -0.22154991328716278, + "step": 1690 + }, + { + "epoch": 0.22, + "learning_rate": 4.775393631484368e-06, + "logits/chosen": -2.0598666667938232, + "logits/rejected": -1.823992133140564, + "logps/chosen": -535.9193115234375, + "logps/rejected": -628.5404663085938, + "loss": 0.027, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21464088559150696, + "rewards/margins": 0.08915476500988007, + "rewards/rejected": -0.30379563570022583, + "step": 1700 + }, + { + "epoch": 0.22, + "learning_rate": 4.770638758086105e-06, + "logits/chosen": -2.0537681579589844, + "logits/rejected": -1.927835464477539, + "logps/chosen": -368.2241516113281, + "logps/rejected": -378.3135070800781, + "loss": 0.0323, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1430542767047882, + "rewards/margins": 0.03868260234594345, + "rewards/rejected": -0.18173687160015106, + "step": 1710 + }, + { + "epoch": 0.23, + "learning_rate": 4.7658364988037184e-06, + "logits/chosen": -2.0646414756774902, + "logits/rejected": -1.9343478679656982, + "logps/chosen": -310.01153564453125, + "logps/rejected": -428.88818359375, + "loss": 0.0283, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13848961889743805, + "rewards/margins": 0.09961990267038345, + "rewards/rejected": -0.2381094992160797, + "step": 1720 + }, + { + "epoch": 0.23, + "learning_rate": 4.760986953855395e-06, + "logits/chosen": -2.019463062286377, + "logits/rejected": -1.7855058908462524, + "logps/chosen": -395.25616455078125, + "logps/rejected": -376.6690368652344, + "loss": 0.0311, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.13688886165618896, + "rewards/margins": 0.013272324576973915, + "rewards/rejected": -0.15016120672225952, + "step": 1730 + }, + { + "epoch": 0.23, + "learning_rate": 4.756090224446127e-06, + "logits/chosen": -1.9321496486663818, + "logits/rejected": -1.702087163925171, + "logps/chosen": -372.3777770996094, + "logps/rejected": -402.9239807128906, + "loss": 0.036, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.13679590821266174, + "rewards/margins": 0.08134065568447113, + "rewards/rejected": -0.21813654899597168, + "step": 1740 + }, + { + "epoch": 0.23, + "learning_rate": 4.7511464127655945e-06, + "logits/chosen": -1.7360193729400635, + "logits/rejected": -1.7117149829864502, + "logps/chosen": -460.23626708984375, + "logps/rejected": -568.3236694335938, + "loss": 0.0285, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2555629014968872, + "rewards/margins": 0.08793105185031891, + "rewards/rejected": -0.34349390864372253, + "step": 1750 + }, + { + "epoch": 0.23, + "learning_rate": 4.74615562198604e-06, + "logits/chosen": -1.9758737087249756, + "logits/rejected": -1.7570078372955322, + "logps/chosen": -331.95977783203125, + "logps/rejected": -342.317626953125, + "loss": 0.0222, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1414758712053299, + "rewards/margins": 0.056294094771146774, + "rewards/rejected": -0.19776996970176697, + "step": 1760 + }, + { + "epoch": 0.23, + "learning_rate": 4.741117956260107e-06, + "logits/chosen": -2.1880033016204834, + "logits/rejected": -2.02624773979187, + "logps/chosen": -412.73516845703125, + "logps/rejected": -436.26715087890625, + "loss": 0.0234, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.12030048668384552, + "rewards/margins": 0.055233072489500046, + "rewards/rejected": -0.17553356289863586, + "step": 1770 + }, + { + "epoch": 0.23, + "learning_rate": 4.736033520718672e-06, + "logits/chosen": -2.026815891265869, + "logits/rejected": -1.8514766693115234, + "logps/chosen": -287.23846435546875, + "logps/rejected": -331.16925048828125, + "loss": 0.0234, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11404535919427872, + "rewards/margins": 0.07378792762756348, + "rewards/rejected": -0.1878332495689392, + "step": 1780 + }, + { + "epoch": 0.23, + "learning_rate": 4.730902421468652e-06, + "logits/chosen": -2.1646840572357178, + "logits/rejected": -2.077221155166626, + "logps/chosen": -350.36798095703125, + "logps/rejected": -382.8150939941406, + "loss": 0.0395, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.12857167422771454, + "rewards/margins": 0.0784262865781784, + "rewards/rejected": -0.20699796080589294, + "step": 1790 + }, + { + "epoch": 0.24, + "learning_rate": 4.7257247655907854e-06, + "logits/chosen": -2.1851401329040527, + "logits/rejected": -1.7716038227081299, + "logps/chosen": -369.41815185546875, + "logps/rejected": -374.6217346191406, + "loss": 0.0333, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11951597034931183, + "rewards/margins": 0.05485420301556587, + "rewards/rejected": -0.1743701696395874, + "step": 1800 + }, + { + "epoch": 0.24, + "learning_rate": 4.720500661137397e-06, + "logits/chosen": -1.9286584854125977, + "logits/rejected": -2.1050808429718018, + "logps/chosen": -266.3800354003906, + "logps/rejected": -395.1708984375, + "loss": 0.0261, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10980401188135147, + "rewards/margins": 0.10229452699422836, + "rewards/rejected": -0.21209852397441864, + "step": 1810 + }, + { + "epoch": 0.24, + "learning_rate": 4.71523021713015e-06, + "logits/chosen": -2.213164806365967, + "logits/rejected": -2.015493869781494, + "logps/chosen": -420.3340759277344, + "logps/rejected": -426.08673095703125, + "loss": 0.0297, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1583167314529419, + "rewards/margins": 0.049007922410964966, + "rewards/rejected": -0.20732466876506805, + "step": 1820 + }, + { + "epoch": 0.24, + "learning_rate": 4.709913543557761e-06, + "logits/chosen": -2.0623435974121094, + "logits/rejected": -1.8618663549423218, + "logps/chosen": -477.73114013671875, + "logps/rejected": -519.5567626953125, + "loss": 0.0231, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20417337119579315, + "rewards/margins": 0.0879688486456871, + "rewards/rejected": -0.29214224219322205, + "step": 1830 + }, + { + "epoch": 0.24, + "learning_rate": 4.704550751373715e-06, + "logits/chosen": -2.1619040966033936, + "logits/rejected": -2.092615842819214, + "logps/chosen": -377.7481994628906, + "logps/rejected": -447.01483154296875, + "loss": 0.0338, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.18411315977573395, + "rewards/margins": 0.0370350107550621, + "rewards/rejected": -0.22114817798137665, + "step": 1840 + }, + { + "epoch": 0.24, + "learning_rate": 4.699141952493941e-06, + "logits/chosen": -2.0571186542510986, + "logits/rejected": -2.0089094638824463, + "logps/chosen": -400.6955261230469, + "logps/rejected": -421.14239501953125, + "loss": 0.0286, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15329428017139435, + "rewards/margins": 0.08874006569385529, + "rewards/rejected": -0.24203434586524963, + "step": 1850 + }, + { + "epoch": 0.24, + "learning_rate": 4.6936872597944814e-06, + "logits/chosen": -1.934605598449707, + "logits/rejected": -1.6609560251235962, + "logps/chosen": -484.6827087402344, + "logps/rejected": -568.5869140625, + "loss": 0.0279, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2339555323123932, + "rewards/margins": 0.09535963833332062, + "rewards/rejected": -0.3293151557445526, + "step": 1860 + }, + { + "epoch": 0.24, + "learning_rate": 4.688186787109136e-06, + "logits/chosen": -1.7950502634048462, + "logits/rejected": -1.7118091583251953, + "logps/chosen": -497.66485595703125, + "logps/rejected": -588.157958984375, + "loss": 0.0291, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3197007477283478, + "rewards/margins": 0.10076627880334854, + "rewards/rejected": -0.4204670786857605, + "step": 1870 + }, + { + "epoch": 0.25, + "learning_rate": 4.682640649227085e-06, + "logits/chosen": -1.7749334573745728, + "logits/rejected": -1.3422480821609497, + "logps/chosen": -497.0650939941406, + "logps/rejected": -546.1441040039062, + "loss": 0.0325, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2521812915802002, + "rewards/margins": 0.07832489907741547, + "rewards/rejected": -0.33050617575645447, + "step": 1880 + }, + { + "epoch": 0.25, + "learning_rate": 4.677048961890492e-06, + "logits/chosen": -1.8528788089752197, + "logits/rejected": -1.7184594869613647, + "logps/chosen": -415.6058654785156, + "logps/rejected": -478.2289123535156, + "loss": 0.0405, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20940780639648438, + "rewards/margins": 0.07339660823345184, + "rewards/rejected": -0.2828044295310974, + "step": 1890 + }, + { + "epoch": 0.25, + "learning_rate": 4.671411841792096e-06, + "logits/chosen": -2.135364294052124, + "logits/rejected": -1.7172048091888428, + "logps/chosen": -539.20947265625, + "logps/rejected": -566.8685302734375, + "loss": 0.0218, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22130024433135986, + "rewards/margins": 0.10534927994012833, + "rewards/rejected": -0.3266495168209076, + "step": 1900 + }, + { + "epoch": 0.25, + "learning_rate": 4.665729406572764e-06, + "logits/chosen": -1.6798137426376343, + "logits/rejected": -1.5157406330108643, + "logps/chosen": -387.11602783203125, + "logps/rejected": -461.33233642578125, + "loss": 0.0323, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.22341910004615784, + "rewards/margins": 0.08191484212875366, + "rewards/rejected": -0.3053339421749115, + "step": 1910 + }, + { + "epoch": 0.25, + "learning_rate": 4.660001774819048e-06, + "logits/chosen": -1.7033799886703491, + "logits/rejected": -1.7451461553573608, + "logps/chosen": -341.09649658203125, + "logps/rejected": -428.5357971191406, + "loss": 0.0189, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19849085807800293, + "rewards/margins": 0.05713418126106262, + "rewards/rejected": -0.25562506914138794, + "step": 1920 + }, + { + "epoch": 0.25, + "learning_rate": 4.654229066060702e-06, + "logits/chosen": -1.5551412105560303, + "logits/rejected": -1.6741459369659424, + "logps/chosen": -441.34124755859375, + "logps/rejected": -620.9329833984375, + "loss": 0.0287, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2283930480480194, + "rewards/margins": 0.0852317065000534, + "rewards/rejected": -0.313624769449234, + "step": 1930 + }, + { + "epoch": 0.25, + "learning_rate": 4.648411400768193e-06, + "logits/chosen": -1.7304413318634033, + "logits/rejected": -1.6656568050384521, + "logps/chosen": -443.6297912597656, + "logps/rejected": -532.1062622070312, + "loss": 0.0171, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21100816130638123, + "rewards/margins": 0.061530113220214844, + "rewards/rejected": -0.27253827452659607, + "step": 1940 + }, + { + "epoch": 0.26, + "learning_rate": 4.642548900350182e-06, + "logits/chosen": -1.7242944240570068, + "logits/rejected": -1.59256112575531, + "logps/chosen": -377.0804138183594, + "logps/rejected": -457.90997314453125, + "loss": 0.0386, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.14849740266799927, + "rewards/margins": 0.10453041642904282, + "rewards/rejected": -0.2530278265476227, + "step": 1950 + }, + { + "epoch": 0.26, + "learning_rate": 4.636641687150994e-06, + "logits/chosen": -2.036111354827881, + "logits/rejected": -1.890380620956421, + "logps/chosen": -356.2049255371094, + "logps/rejected": -338.0189208984375, + "loss": 0.0203, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1230909675359726, + "rewards/margins": 0.023669157177209854, + "rewards/rejected": -0.14676013588905334, + "step": 1960 + }, + { + "epoch": 0.26, + "learning_rate": 4.6306898844480615e-06, + "logits/chosen": -1.8678182363510132, + "logits/rejected": -1.7673766613006592, + "logps/chosen": -297.96588134765625, + "logps/rejected": -384.05517578125, + "loss": 0.0215, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13255058228969574, + "rewards/margins": 0.0716322511434555, + "rewards/rejected": -0.20418281853199005, + "step": 1970 + }, + { + "epoch": 0.26, + "learning_rate": 4.624693616449358e-06, + "logits/chosen": -1.9337873458862305, + "logits/rejected": -1.5866873264312744, + "logps/chosen": -407.74786376953125, + "logps/rejected": -436.3265686035156, + "loss": 0.0285, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.15489144623279572, + "rewards/margins": 0.07904152572154999, + "rewards/rejected": -0.23393294215202332, + "step": 1980 + }, + { + "epoch": 0.26, + "learning_rate": 4.6186530082908e-06, + "logits/chosen": -1.6662366390228271, + "logits/rejected": -1.661988615989685, + "logps/chosen": -442.3397521972656, + "logps/rejected": -513.10400390625, + "loss": 0.0194, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2055976837873459, + "rewards/margins": 0.07577277719974518, + "rewards/rejected": -0.28137046098709106, + "step": 1990 + }, + { + "epoch": 0.26, + "learning_rate": 4.612568186033633e-06, + "logits/chosen": -1.5893526077270508, + "logits/rejected": -1.5049242973327637, + "logps/chosen": -458.06011962890625, + "logps/rejected": -476.4659118652344, + "loss": 0.041, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2440434992313385, + "rewards/margins": 0.08281457424163818, + "rewards/rejected": -0.3268580436706543, + "step": 2000 + }, + { + "epoch": 0.26, + "learning_rate": 4.6064392766618125e-06, + "logits/chosen": -1.651850938796997, + "logits/rejected": -1.6366875171661377, + "logps/chosen": -400.959716796875, + "logps/rejected": -483.428466796875, + "loss": 0.0471, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.213557630777359, + "rewards/margins": 0.08097215741872787, + "rewards/rejected": -0.2945297360420227, + "step": 2010 + }, + { + "epoch": 0.26, + "learning_rate": 4.60026640807934e-06, + "logits/chosen": -1.7265008687973022, + "logits/rejected": -1.6939365863800049, + "logps/chosen": -471.2493591308594, + "logps/rejected": -543.5922241210938, + "loss": 0.0404, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21301202476024628, + "rewards/margins": 0.07630138099193573, + "rewards/rejected": -0.289313405752182, + "step": 2020 + }, + { + "epoch": 0.27, + "learning_rate": 4.594049709107604e-06, + "logits/chosen": -1.7392613887786865, + "logits/rejected": -1.7632348537445068, + "logps/chosen": -470.91925048828125, + "logps/rejected": -563.3527221679688, + "loss": 0.0225, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25252705812454224, + "rewards/margins": 0.08756853640079498, + "rewards/rejected": -0.340095579624176, + "step": 2030 + }, + { + "epoch": 0.27, + "learning_rate": 4.587789309482687e-06, + "logits/chosen": -1.8418395519256592, + "logits/rejected": -1.7986156940460205, + "logps/chosen": -431.211181640625, + "logps/rejected": -576.7343139648438, + "loss": 0.029, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23480498790740967, + "rewards/margins": 0.12721502780914307, + "rewards/rejected": -0.3620200455188751, + "step": 2040 + }, + { + "epoch": 0.27, + "learning_rate": 4.581485339852659e-06, + "logits/chosen": -1.8456242084503174, + "logits/rejected": -1.393122673034668, + "logps/chosen": -443.29388427734375, + "logps/rejected": -463.262451171875, + "loss": 0.0325, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.256413996219635, + "rewards/margins": 0.0756414532661438, + "rewards/rejected": -0.3320554494857788, + "step": 2050 + }, + { + "epoch": 0.27, + "learning_rate": 4.5751379317748514e-06, + "logits/chosen": -1.776293158531189, + "logits/rejected": -1.5436232089996338, + "logps/chosen": -528.5411987304688, + "logps/rejected": -530.0343017578125, + "loss": 0.0339, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25058040022850037, + "rewards/margins": 0.08744814246892929, + "rewards/rejected": -0.33802855014801025, + "step": 2060 + }, + { + "epoch": 0.27, + "learning_rate": 4.56874721771311e-06, + "logits/chosen": -2.0413804054260254, + "logits/rejected": -1.6784255504608154, + "logps/chosen": -477.5301818847656, + "logps/rejected": -507.48114013671875, + "loss": 0.0261, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1695762276649475, + "rewards/margins": 0.07611775398254395, + "rewards/rejected": -0.24569399654865265, + "step": 2070 + }, + { + "epoch": 0.27, + "learning_rate": 4.562313331035032e-06, + "logits/chosen": -1.8038721084594727, + "logits/rejected": -1.6684455871582031, + "logps/chosen": -363.0096130371094, + "logps/rejected": -452.68707275390625, + "loss": 0.0247, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.17797474563121796, + "rewards/margins": 0.0909547433257103, + "rewards/rejected": -0.26892951130867004, + "step": 2080 + }, + { + "epoch": 0.27, + "learning_rate": 4.555836406009183e-06, + "logits/chosen": -1.9092241525650024, + "logits/rejected": -1.715597152709961, + "logps/chosen": -422.19610595703125, + "logps/rejected": -476.93792724609375, + "loss": 0.0259, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1975400447845459, + "rewards/margins": 0.09823472797870636, + "rewards/rejected": -0.29577475786209106, + "step": 2090 + }, + { + "epoch": 0.27, + "learning_rate": 4.5493165778022945e-06, + "logits/chosen": -1.5767942667007446, + "logits/rejected": -1.6466798782348633, + "logps/chosen": -441.18658447265625, + "logps/rejected": -488.8585510253906, + "loss": 0.0358, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22285763919353485, + "rewards/margins": 0.05790979415178299, + "rewards/rejected": -0.28076741099357605, + "step": 2100 + }, + { + "epoch": 0.28, + "learning_rate": 4.542753982476443e-06, + "logits/chosen": -1.8367239236831665, + "logits/rejected": -1.640899896621704, + "logps/chosen": -344.5665588378906, + "logps/rejected": -532.787841796875, + "loss": 0.0223, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18316304683685303, + "rewards/margins": 0.17273655533790588, + "rewards/rejected": -0.3558996021747589, + "step": 2110 + }, + { + "epoch": 0.28, + "learning_rate": 4.53614875698621e-06, + "logits/chosen": -2.014596462249756, + "logits/rejected": -1.665479063987732, + "logps/chosen": -449.51776123046875, + "logps/rejected": -545.7579345703125, + "loss": 0.0184, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22679471969604492, + "rewards/margins": 0.08117818832397461, + "rewards/rejected": -0.30797290802001953, + "step": 2120 + }, + { + "epoch": 0.28, + "learning_rate": 4.529501039175824e-06, + "logits/chosen": -1.8626056909561157, + "logits/rejected": -1.5862220525741577, + "logps/chosen": -426.05096435546875, + "logps/rejected": -422.3216857910156, + "loss": 0.0171, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.186855748295784, + "rewards/margins": 0.055360354483127594, + "rewards/rejected": -0.2422161102294922, + "step": 2130 + }, + { + "epoch": 0.28, + "learning_rate": 4.522810967776287e-06, + "logits/chosen": -1.802381157875061, + "logits/rejected": -1.7314026355743408, + "logps/chosen": -434.59930419921875, + "logps/rejected": -450.11834716796875, + "loss": 0.0378, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21993453800678253, + "rewards/margins": 0.04317254200577736, + "rewards/rejected": -0.2631070613861084, + "step": 2140 + }, + { + "epoch": 0.28, + "learning_rate": 4.516078682402473e-06, + "logits/chosen": -1.7470426559448242, + "logits/rejected": -1.7412372827529907, + "logps/chosen": -477.63922119140625, + "logps/rejected": -509.67266845703125, + "loss": 0.0222, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20522888004779816, + "rewards/margins": 0.05774085968732834, + "rewards/rejected": -0.2629697620868683, + "step": 2150 + }, + { + "epoch": 0.28, + "learning_rate": 4.509304323550221e-06, + "logits/chosen": -1.9978656768798828, + "logits/rejected": -1.8754138946533203, + "logps/chosen": -414.8675842285156, + "logps/rejected": -469.20684814453125, + "loss": 0.0182, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2199893444776535, + "rewards/margins": 0.059581078588962555, + "rewards/rejected": -0.27957040071487427, + "step": 2160 + }, + { + "epoch": 0.28, + "learning_rate": 4.502488032593398e-06, + "logits/chosen": -1.8610658645629883, + "logits/rejected": -1.6103994846343994, + "logps/chosen": -438.61956787109375, + "logps/rejected": -502.69952392578125, + "loss": 0.0189, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2141924798488617, + "rewards/margins": 0.08607035875320435, + "rewards/rejected": -0.30026283860206604, + "step": 2170 + }, + { + "epoch": 0.29, + "learning_rate": 4.495629951780951e-06, + "logits/chosen": -1.8735036849975586, + "logits/rejected": -1.7977253198623657, + "logps/chosen": -410.9207458496094, + "logps/rejected": -541.6500854492188, + "loss": 0.0274, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2333689033985138, + "rewards/margins": 0.10300314426422119, + "rewards/rejected": -0.336372047662735, + "step": 2180 + }, + { + "epoch": 0.29, + "learning_rate": 4.488730224233941e-06, + "logits/chosen": -1.5623286962509155, + "logits/rejected": -1.3899695873260498, + "logps/chosen": -454.39599609375, + "logps/rejected": -538.2545166015625, + "loss": 0.053, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2773645222187042, + "rewards/margins": 0.06378518044948578, + "rewards/rejected": -0.3411497175693512, + "step": 2190 + }, + { + "epoch": 0.29, + "learning_rate": 4.481788993942547e-06, + "logits/chosen": -1.7548974752426147, + "logits/rejected": -1.6386677026748657, + "logps/chosen": -432.77545166015625, + "logps/rejected": -536.7062377929688, + "loss": 0.0155, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2100006639957428, + "rewards/margins": 0.11079144477844238, + "rewards/rejected": -0.3207921087741852, + "step": 2200 + }, + { + "epoch": 0.29, + "learning_rate": 4.474806405763076e-06, + "logits/chosen": -1.6667753458023071, + "logits/rejected": -1.5982141494750977, + "logps/chosen": -451.031005859375, + "logps/rejected": -577.8094482421875, + "loss": 0.0241, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2053256779909134, + "rewards/margins": 0.09365935623645782, + "rewards/rejected": -0.2989850342273712, + "step": 2210 + }, + { + "epoch": 0.29, + "learning_rate": 4.4677826054149235e-06, + "logits/chosen": -1.7733885049819946, + "logits/rejected": -1.6308481693267822, + "logps/chosen": -475.9578552246094, + "logps/rejected": -536.2483520507812, + "loss": 0.0275, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2246747761964798, + "rewards/margins": 0.05896454304456711, + "rewards/rejected": -0.2836392819881439, + "step": 2220 + }, + { + "epoch": 0.29, + "learning_rate": 4.460717739477543e-06, + "logits/chosen": -1.9974079132080078, + "logits/rejected": -1.7053706645965576, + "logps/chosen": -463.2173767089844, + "logps/rejected": -466.25537109375, + "loss": 0.02, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2159685641527176, + "rewards/margins": 0.037511665374040604, + "rewards/rejected": -0.2534802556037903, + "step": 2230 + }, + { + "epoch": 0.29, + "learning_rate": 4.4536119553873866e-06, + "logits/chosen": -1.7966111898422241, + "logits/rejected": -1.7337009906768799, + "logps/chosen": -400.27728271484375, + "logps/rejected": -539.5341186523438, + "loss": 0.0334, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.20263293385505676, + "rewards/margins": 0.09285394102334976, + "rewards/rejected": -0.2954868674278259, + "step": 2240 + }, + { + "epoch": 0.29, + "learning_rate": 4.446465401434824e-06, + "logits/chosen": -2.0680744647979736, + "logits/rejected": -2.1116702556610107, + "logps/chosen": -447.0508728027344, + "logps/rejected": -494.86334228515625, + "loss": 0.0283, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18393829464912415, + "rewards/margins": 0.06616505235433578, + "rewards/rejected": -0.2501033842563629, + "step": 2250 + }, + { + "epoch": 0.3, + "learning_rate": 4.43927822676105e-06, + "logits/chosen": -2.0033762454986572, + "logits/rejected": -1.7921524047851562, + "logps/chosen": -449.127197265625, + "logps/rejected": -488.2967834472656, + "loss": 0.0289, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21589410305023193, + "rewards/margins": 0.07737429440021515, + "rewards/rejected": -0.2932683825492859, + "step": 2260 + }, + { + "epoch": 0.3, + "learning_rate": 4.432050581354972e-06, + "logits/chosen": -1.8215572834014893, + "logits/rejected": -1.5815279483795166, + "logps/chosen": -389.57464599609375, + "logps/rejected": -434.3033142089844, + "loss": 0.0214, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20798125863075256, + "rewards/margins": 0.07210813462734222, + "rewards/rejected": -0.2800893783569336, + "step": 2270 + }, + { + "epoch": 0.3, + "learning_rate": 4.424782616050078e-06, + "logits/chosen": -1.7252633571624756, + "logits/rejected": -1.6086317300796509, + "logps/chosen": -446.84552001953125, + "logps/rejected": -523.4686889648438, + "loss": 0.0237, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2335268259048462, + "rewards/margins": 0.09612774103879929, + "rewards/rejected": -0.3296545445919037, + "step": 2280 + }, + { + "epoch": 0.3, + "learning_rate": 4.4174744825212954e-06, + "logits/chosen": -1.9347927570343018, + "logits/rejected": -1.7474849224090576, + "logps/chosen": -506.05963134765625, + "logps/rejected": -575.9285888671875, + "loss": 0.022, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.24362215399742126, + "rewards/margins": 0.0921451598405838, + "rewards/rejected": -0.33576732873916626, + "step": 2290 + }, + { + "epoch": 0.3, + "learning_rate": 4.410126333281815e-06, + "logits/chosen": -1.7309291362762451, + "logits/rejected": -1.5879218578338623, + "logps/chosen": -512.5667114257812, + "logps/rejected": -527.5018310546875, + "loss": 0.033, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25268226861953735, + "rewards/margins": 0.09238190948963165, + "rewards/rejected": -0.3450641334056854, + "step": 2300 + }, + { + "epoch": 0.3, + "learning_rate": 4.402738321679918e-06, + "logits/chosen": -1.670026183128357, + "logits/rejected": -1.761228322982788, + "logps/chosen": -348.04974365234375, + "logps/rejected": -443.8878479003906, + "loss": 0.0283, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.17478026449680328, + "rewards/margins": 0.0867806226015091, + "rewards/rejected": -0.2615608870983124, + "step": 2310 + }, + { + "epoch": 0.3, + "learning_rate": 4.395310601895772e-06, + "logits/chosen": -1.9855763912200928, + "logits/rejected": -1.6330562829971313, + "logps/chosen": -406.68670654296875, + "logps/rejected": -417.7669982910156, + "loss": 0.0173, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.14078545570373535, + "rewards/margins": 0.07669827342033386, + "rewards/rejected": -0.21748371422290802, + "step": 2320 + }, + { + "epoch": 0.3, + "learning_rate": 4.38784332893821e-06, + "logits/chosen": -1.9733412265777588, + "logits/rejected": -1.8503671884536743, + "logps/chosen": -491.7334899902344, + "logps/rejected": -502.209716796875, + "loss": 0.0317, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22381806373596191, + "rewards/margins": 0.06034940481185913, + "rewards/rejected": -0.28416746854782104, + "step": 2330 + }, + { + "epoch": 0.31, + "learning_rate": 4.380336658641503e-06, + "logits/chosen": -1.9059604406356812, + "logits/rejected": -1.6476118564605713, + "logps/chosen": -435.6214294433594, + "logps/rejected": -533.3345947265625, + "loss": 0.0307, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2410147488117218, + "rewards/margins": 0.11430881172418594, + "rewards/rejected": -0.35532355308532715, + "step": 2340 + }, + { + "epoch": 0.31, + "learning_rate": 4.372790747662101e-06, + "logits/chosen": -1.8285706043243408, + "logits/rejected": -1.7125866413116455, + "logps/chosen": -471.718017578125, + "logps/rejected": -567.15869140625, + "loss": 0.019, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.24902674555778503, + "rewards/margins": 0.07921990007162094, + "rewards/rejected": -0.3282466530799866, + "step": 2350 + }, + { + "epoch": 0.31, + "learning_rate": 4.365205753475367e-06, + "logits/chosen": -2.015669822692871, + "logits/rejected": -1.7391248941421509, + "logps/chosen": -395.1049499511719, + "logps/rejected": -434.15570068359375, + "loss": 0.0269, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15337860584259033, + "rewards/margins": 0.0979638621211052, + "rewards/rejected": -0.2513424754142761, + "step": 2360 + }, + { + "epoch": 0.31, + "learning_rate": 4.35758183437229e-06, + "logits/chosen": -2.215122938156128, + "logits/rejected": -1.8627878427505493, + "logps/chosen": -394.71429443359375, + "logps/rejected": -400.0760192871094, + "loss": 0.0246, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.15606388449668884, + "rewards/margins": 0.048857349902391434, + "rewards/rejected": -0.20492124557495117, + "step": 2370 + }, + { + "epoch": 0.31, + "learning_rate": 4.3499191494561835e-06, + "logits/chosen": -1.851022720336914, + "logits/rejected": -1.783961296081543, + "logps/chosen": -426.6841735839844, + "logps/rejected": -496.6563415527344, + "loss": 0.0255, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1743614822626114, + "rewards/margins": 0.10733374208211899, + "rewards/rejected": -0.2816952168941498, + "step": 2380 + }, + { + "epoch": 0.31, + "learning_rate": 4.3422178586393615e-06, + "logits/chosen": -1.890279769897461, + "logits/rejected": -1.7761573791503906, + "logps/chosen": -464.600830078125, + "logps/rejected": -534.0846557617188, + "loss": 0.0165, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22872360050678253, + "rewards/margins": 0.10886083543300629, + "rewards/rejected": -0.3375844359397888, + "step": 2390 + }, + { + "epoch": 0.31, + "learning_rate": 4.334478122639804e-06, + "logits/chosen": -1.9677358865737915, + "logits/rejected": -1.6048587560653687, + "logps/chosen": -549.1671142578125, + "logps/rejected": -462.8221130371094, + "loss": 0.0419, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.28963416814804077, + "rewards/margins": 0.019320717081427574, + "rewards/rejected": -0.3089548945426941, + "step": 2400 + }, + { + "epoch": 0.32, + "learning_rate": 4.3267001029778015e-06, + "logits/chosen": -1.8243286609649658, + "logits/rejected": -1.7490968704223633, + "logps/chosen": -473.9647521972656, + "logps/rejected": -526.09033203125, + "loss": 0.037, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2106451541185379, + "rewards/margins": 0.10477238893508911, + "rewards/rejected": -0.3154175579547882, + "step": 2410 + }, + { + "epoch": 0.32, + "learning_rate": 4.318883961972585e-06, + "logits/chosen": -2.0151243209838867, + "logits/rejected": -1.893967866897583, + "logps/chosen": -356.76239013671875, + "logps/rejected": -394.2031555175781, + "loss": 0.0277, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.18662713468074799, + "rewards/margins": 0.046176038682460785, + "rewards/rejected": -0.23280318081378937, + "step": 2420 + }, + { + "epoch": 0.32, + "learning_rate": 4.311029862738942e-06, + "logits/chosen": -1.8836452960968018, + "logits/rejected": -1.717961072921753, + "logps/chosen": -386.47406005859375, + "logps/rejected": -504.1248474121094, + "loss": 0.0175, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21776695549488068, + "rewards/margins": 0.09177577495574951, + "rewards/rejected": -0.309542715549469, + "step": 2430 + }, + { + "epoch": 0.32, + "learning_rate": 4.303137969183804e-06, + "logits/chosen": -1.941748023033142, + "logits/rejected": -1.7688930034637451, + "logps/chosen": -442.46868896484375, + "logps/rejected": -561.3958740234375, + "loss": 0.0283, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18605726957321167, + "rewards/margins": 0.11203690618276596, + "rewards/rejected": -0.29809415340423584, + "step": 2440 + }, + { + "epoch": 0.32, + "learning_rate": 4.295208446002832e-06, + "logits/chosen": -2.0674407482147217, + "logits/rejected": -1.8563096523284912, + "logps/chosen": -388.8973693847656, + "logps/rejected": -459.39495849609375, + "loss": 0.033, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20003589987754822, + "rewards/margins": 0.09308059513568878, + "rewards/rejected": -0.2931164801120758, + "step": 2450 + }, + { + "epoch": 0.32, + "learning_rate": 4.287241458676981e-06, + "logits/chosen": -1.9666210412979126, + "logits/rejected": -1.6701295375823975, + "logps/chosen": -398.9001770019531, + "logps/rejected": -474.994384765625, + "loss": 0.0214, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1540244221687317, + "rewards/margins": 0.11552529036998749, + "rewards/rejected": -0.269549697637558, + "step": 2460 + }, + { + "epoch": 0.32, + "learning_rate": 4.279237173469043e-06, + "logits/chosen": -1.8836891651153564, + "logits/rejected": -1.6282813549041748, + "logps/chosen": -482.57147216796875, + "logps/rejected": -560.1978149414062, + "loss": 0.0158, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2128007709980011, + "rewards/margins": 0.1130170226097107, + "rewards/rejected": -0.3258178234100342, + "step": 2470 + }, + { + "epoch": 0.32, + "learning_rate": 4.271195757420177e-06, + "logits/chosen": -1.9587551355361938, + "logits/rejected": -1.7986056804656982, + "logps/chosen": -456.47161865234375, + "logps/rejected": -548.9180297851562, + "loss": 0.0221, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.23430156707763672, + "rewards/margins": 0.1185712069272995, + "rewards/rejected": -0.3528727889060974, + "step": 2480 + }, + { + "epoch": 0.33, + "learning_rate": 4.263117378346425e-06, + "logits/chosen": -1.819907784461975, + "logits/rejected": -1.6789543628692627, + "logps/chosen": -454.332275390625, + "logps/rejected": -502.52130126953125, + "loss": 0.015, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2230617254972458, + "rewards/margins": 0.07935784757137299, + "rewards/rejected": -0.3024195730686188, + "step": 2490 + }, + { + "epoch": 0.33, + "learning_rate": 4.255002204835208e-06, + "logits/chosen": -1.9170949459075928, + "logits/rejected": -1.673644781112671, + "logps/chosen": -503.7950134277344, + "logps/rejected": -573.0311279296875, + "loss": 0.0195, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2700875997543335, + "rewards/margins": 0.10103525966405869, + "rewards/rejected": -0.3711228370666504, + "step": 2500 + }, + { + "epoch": 0.33, + "learning_rate": 4.246850406241812e-06, + "logits/chosen": -1.8298994302749634, + "logits/rejected": -1.565895676612854, + "logps/chosen": -584.307861328125, + "logps/rejected": -674.6881103515625, + "loss": 0.0284, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.27168023586273193, + "rewards/margins": 0.09993793815374374, + "rewards/rejected": -0.3716181814670563, + "step": 2510 + }, + { + "epoch": 0.33, + "learning_rate": 4.2386621526858465e-06, + "logits/chosen": -1.7684259414672852, + "logits/rejected": -1.7138373851776123, + "logps/chosen": -439.82733154296875, + "logps/rejected": -488.1190490722656, + "loss": 0.0277, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.23590664565563202, + "rewards/margins": 0.05192587897181511, + "rewards/rejected": -0.2878325581550598, + "step": 2520 + }, + { + "epoch": 0.33, + "learning_rate": 4.2304376150477015e-06, + "logits/chosen": -1.867626428604126, + "logits/rejected": -1.6936473846435547, + "logps/chosen": -400.4134826660156, + "logps/rejected": -444.52447509765625, + "loss": 0.033, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19006365537643433, + "rewards/margins": 0.05945818871259689, + "rewards/rejected": -0.24952185153961182, + "step": 2530 + }, + { + "epoch": 0.33, + "learning_rate": 4.222176964964977e-06, + "logits/chosen": -1.8515548706054688, + "logits/rejected": -1.620330810546875, + "logps/chosen": -416.956298828125, + "logps/rejected": -489.68634033203125, + "loss": 0.0272, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1836603283882141, + "rewards/margins": 0.08848662674427032, + "rewards/rejected": -0.27214694023132324, + "step": 2540 + }, + { + "epoch": 0.33, + "learning_rate": 4.213880374828903e-06, + "logits/chosen": -1.8032068014144897, + "logits/rejected": -1.7320334911346436, + "logps/chosen": -474.12335205078125, + "logps/rejected": -495.7640075683594, + "loss": 0.0235, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21940049529075623, + "rewards/margins": 0.04905577376484871, + "rewards/rejected": -0.26845625042915344, + "step": 2550 + }, + { + "epoch": 0.33, + "learning_rate": 4.2055480177807406e-06, + "logits/chosen": -1.665143609046936, + "logits/rejected": -1.5036962032318115, + "logps/chosen": -434.1664123535156, + "logps/rejected": -500.8665466308594, + "loss": 0.0311, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.22254256904125214, + "rewards/margins": 0.10014110803604126, + "rewards/rejected": -0.3226836621761322, + "step": 2560 + }, + { + "epoch": 0.34, + "learning_rate": 4.1971800677081696e-06, + "logits/chosen": -1.6747596263885498, + "logits/rejected": -1.4174727201461792, + "logps/chosen": -421.354248046875, + "logps/rejected": -491.0271911621094, + "loss": 0.0342, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23044414818286896, + "rewards/margins": 0.08503113687038422, + "rewards/rejected": -0.3154752850532532, + "step": 2570 + }, + { + "epoch": 0.34, + "learning_rate": 4.188776699241661e-06, + "logits/chosen": -1.4612760543823242, + "logits/rejected": -1.4553321599960327, + "logps/chosen": -458.9996643066406, + "logps/rejected": -573.1510009765625, + "loss": 0.0323, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2871035933494568, + "rewards/margins": 0.07828949391841888, + "rewards/rejected": -0.36539310216903687, + "step": 2580 + }, + { + "epoch": 0.34, + "learning_rate": 4.180338087750827e-06, + "logits/chosen": -1.6774513721466064, + "logits/rejected": -1.3070108890533447, + "logps/chosen": -561.6486206054688, + "logps/rejected": -629.3992919921875, + "loss": 0.0264, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2835440933704376, + "rewards/margins": 0.10050519555807114, + "rewards/rejected": -0.38404932618141174, + "step": 2590 + }, + { + "epoch": 0.34, + "learning_rate": 4.1718644093407704e-06, + "logits/chosen": -1.70647394657135, + "logits/rejected": -1.6027628183364868, + "logps/chosen": -442.23583984375, + "logps/rejected": -534.0610961914062, + "loss": 0.0197, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.23742584884166718, + "rewards/margins": 0.08286769688129425, + "rewards/rejected": -0.3202935457229614, + "step": 2600 + }, + { + "epoch": 0.34, + "learning_rate": 4.163355840848401e-06, + "logits/chosen": -1.7119226455688477, + "logits/rejected": -1.6578342914581299, + "logps/chosen": -454.8663635253906, + "logps/rejected": -536.91943359375, + "loss": 0.0232, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.22818918526172638, + "rewards/margins": 0.0926157757639885, + "rewards/rejected": -0.3208049237728119, + "step": 2610 + }, + { + "epoch": 0.34, + "learning_rate": 4.154812559838748e-06, + "logits/chosen": -1.783673644065857, + "logits/rejected": -1.587224006652832, + "logps/chosen": -426.52020263671875, + "logps/rejected": -453.7071228027344, + "loss": 0.0178, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.17788371443748474, + "rewards/margins": 0.07864318788051605, + "rewards/rejected": -0.2565268874168396, + "step": 2620 + }, + { + "epoch": 0.34, + "learning_rate": 4.146234744601259e-06, + "logits/chosen": -1.6520817279815674, + "logits/rejected": -1.3121957778930664, + "logps/chosen": -498.17529296875, + "logps/rejected": -541.5598754882812, + "loss": 0.0449, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2449866533279419, + "rewards/margins": 0.0944221243262291, + "rewards/rejected": -0.3394087553024292, + "step": 2630 + }, + { + "epoch": 0.35, + "learning_rate": 4.137622574146071e-06, + "logits/chosen": -1.8784363269805908, + "logits/rejected": -1.7260299921035767, + "logps/chosen": -400.75726318359375, + "logps/rejected": -431.58782958984375, + "loss": 0.0383, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1871858537197113, + "rewards/margins": 0.06336455047130585, + "rewards/rejected": -0.25055041909217834, + "step": 2640 + }, + { + "epoch": 0.35, + "learning_rate": 4.12897622820028e-06, + "logits/chosen": -1.7684128284454346, + "logits/rejected": -1.816655158996582, + "logps/chosen": -439.8097229003906, + "logps/rejected": -501.459228515625, + "loss": 0.0234, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.15390966832637787, + "rewards/margins": 0.06625259667634964, + "rewards/rejected": -0.22016224265098572, + "step": 2650 + }, + { + "epoch": 0.35, + "learning_rate": 4.120295887204191e-06, + "logits/chosen": -1.5292326211929321, + "logits/rejected": -1.4526147842407227, + "logps/chosen": -459.69622802734375, + "logps/rejected": -443.5069885253906, + "loss": 0.0241, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20146799087524414, + "rewards/margins": 0.02798052504658699, + "rewards/rejected": -0.22944851219654083, + "step": 2660 + }, + { + "epoch": 0.35, + "learning_rate": 4.111581732307548e-06, + "logits/chosen": -1.6952488422393799, + "logits/rejected": -1.4364745616912842, + "logps/chosen": -520.1705932617188, + "logps/rejected": -526.3663330078125, + "loss": 0.0208, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.24517397582530975, + "rewards/margins": 0.07558010518550873, + "rewards/rejected": -0.3207540512084961, + "step": 2670 + }, + { + "epoch": 0.35, + "learning_rate": 4.1028339453657595e-06, + "logits/chosen": -1.5877403020858765, + "logits/rejected": -1.4068609476089478, + "logps/chosen": -478.41546630859375, + "logps/rejected": -542.8192138671875, + "loss": 0.0227, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24401327967643738, + "rewards/margins": 0.09986021369695663, + "rewards/rejected": -0.3438734710216522, + "step": 2680 + }, + { + "epoch": 0.35, + "learning_rate": 4.094052708936096e-06, + "logits/chosen": -1.764606237411499, + "logits/rejected": -1.5982327461242676, + "logps/chosen": -442.99664306640625, + "logps/rejected": -494.39892578125, + "loss": 0.0246, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.248723104596138, + "rewards/margins": 0.0774948000907898, + "rewards/rejected": -0.326217919588089, + "step": 2690 + }, + { + "epoch": 0.35, + "learning_rate": 4.0852382062738874e-06, + "logits/chosen": -1.3868590593338013, + "logits/rejected": -1.315887689590454, + "logps/chosen": -410.35186767578125, + "logps/rejected": -470.20556640625, + "loss": 0.0244, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24316677451133728, + "rewards/margins": 0.06980231404304504, + "rewards/rejected": -0.3129690885543823, + "step": 2700 + }, + { + "epoch": 0.35, + "learning_rate": 4.076390621328693e-06, + "logits/chosen": -1.4941400289535522, + "logits/rejected": -1.2799028158187866, + "logps/chosen": -583.5787353515625, + "logps/rejected": -585.2474365234375, + "loss": 0.0311, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.36681923270225525, + "rewards/margins": 0.04977092519402504, + "rewards/rejected": -0.41659015417099, + "step": 2710 + }, + { + "epoch": 0.36, + "learning_rate": 4.067510138740467e-06, + "logits/chosen": -1.2457807064056396, + "logits/rejected": -1.1327399015426636, + "logps/chosen": -576.9419555664062, + "logps/rejected": -656.8292236328125, + "loss": 0.019, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.36136874556541443, + "rewards/margins": 0.08126135170459747, + "rewards/rejected": -0.4426301419734955, + "step": 2720 + }, + { + "epoch": 0.36, + "learning_rate": 4.058596943835703e-06, + "logits/chosen": -1.3685424327850342, + "logits/rejected": -1.1996796131134033, + "logps/chosen": -542.9622192382812, + "logps/rejected": -635.43896484375, + "loss": 0.035, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.33202752470970154, + "rewards/margins": 0.08833317458629608, + "rewards/rejected": -0.42036065459251404, + "step": 2730 + }, + { + "epoch": 0.36, + "learning_rate": 4.049651222623568e-06, + "logits/chosen": -1.3233853578567505, + "logits/rejected": -1.1292164325714111, + "logps/chosen": -591.0294799804688, + "logps/rejected": -589.8373413085938, + "loss": 0.0291, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3421989381313324, + "rewards/margins": 0.06017078831791878, + "rewards/rejected": -0.4023696780204773, + "step": 2740 + }, + { + "epoch": 0.36, + "learning_rate": 4.040673161792014e-06, + "logits/chosen": -1.3762229681015015, + "logits/rejected": -1.3930509090423584, + "logps/chosen": -540.7244873046875, + "logps/rejected": -700.24658203125, + "loss": 0.0249, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.34776386618614197, + "rewards/margins": 0.1587856411933899, + "rewards/rejected": -0.5065494775772095, + "step": 2750 + }, + { + "epoch": 0.36, + "learning_rate": 4.031662948703896e-06, + "logits/chosen": -1.3948943614959717, + "logits/rejected": -1.2206356525421143, + "logps/chosen": -580.3343505859375, + "logps/rejected": -633.843017578125, + "loss": 0.0245, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3342733681201935, + "rewards/margins": 0.09769748896360397, + "rewards/rejected": -0.43197083473205566, + "step": 2760 + }, + { + "epoch": 0.36, + "learning_rate": 4.022620771393047e-06, + "logits/chosen": -1.2887545824050903, + "logits/rejected": -1.1180346012115479, + "logps/chosen": -455.7023010253906, + "logps/rejected": -522.9749755859375, + "loss": 0.0377, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.274341344833374, + "rewards/margins": 0.08734793961048126, + "rewards/rejected": -0.3616892695426941, + "step": 2770 + }, + { + "epoch": 0.36, + "learning_rate": 4.013546818560362e-06, + "logits/chosen": -1.277212142944336, + "logits/rejected": -1.143895149230957, + "logps/chosen": -429.0113220214844, + "logps/rejected": -407.8331298828125, + "loss": 0.0219, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.21917875111103058, + "rewards/margins": 0.05816282704472542, + "rewards/rejected": -0.2773415446281433, + "step": 2780 + }, + { + "epoch": 0.37, + "learning_rate": 4.00444127956986e-06, + "logits/chosen": -1.4734269380569458, + "logits/rejected": -1.2990522384643555, + "logps/chosen": -466.5022888183594, + "logps/rejected": -478.3050231933594, + "loss": 0.0315, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.2548750042915344, + "rewards/margins": 0.03684788569808006, + "rewards/rejected": -0.2917229235172272, + "step": 2790 + }, + { + "epoch": 0.37, + "learning_rate": 3.9953043444447255e-06, + "logits/chosen": -1.2454700469970703, + "logits/rejected": -1.2405586242675781, + "logps/chosen": -561.666015625, + "logps/rejected": -618.0407104492188, + "loss": 0.0287, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2790631949901581, + "rewards/margins": 0.03939159959554672, + "rewards/rejected": -0.3184548318386078, + "step": 2800 + }, + { + "epoch": 0.37, + "learning_rate": 3.986136203863355e-06, + "logits/chosen": -1.3038129806518555, + "logits/rejected": -1.367506742477417, + "logps/chosen": -515.1031494140625, + "logps/rejected": -544.586181640625, + "loss": 0.0224, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.276677668094635, + "rewards/margins": 0.03499032184481621, + "rewards/rejected": -0.3116679787635803, + "step": 2810 + }, + { + "epoch": 0.37, + "learning_rate": 3.976937049155365e-06, + "logits/chosen": -1.4343550205230713, + "logits/rejected": -1.3405755758285522, + "logps/chosen": -471.4578552246094, + "logps/rejected": -566.8021850585938, + "loss": 0.0267, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.2356349527835846, + "rewards/margins": 0.06042511388659477, + "rewards/rejected": -0.2960600256919861, + "step": 2820 + }, + { + "epoch": 0.37, + "learning_rate": 3.967707072297608e-06, + "logits/chosen": -1.3231853246688843, + "logits/rejected": -1.2007641792297363, + "logps/chosen": -500.8212890625, + "logps/rejected": -607.4046630859375, + "loss": 0.0192, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.29319027066230774, + "rewards/margins": 0.11779715120792389, + "rewards/rejected": -0.41098737716674805, + "step": 2830 + }, + { + "epoch": 0.37, + "learning_rate": 3.958446465910159e-06, + "logits/chosen": -1.3309224843978882, + "logits/rejected": -1.4673199653625488, + "logps/chosen": -482.66705322265625, + "logps/rejected": -582.6072998046875, + "loss": 0.0265, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2789314389228821, + "rewards/margins": 0.07025317847728729, + "rewards/rejected": -0.3491845726966858, + "step": 2840 + }, + { + "epoch": 0.37, + "learning_rate": 3.9491554232523066e-06, + "logits/chosen": -1.1617060899734497, + "logits/rejected": -1.1193522214889526, + "logps/chosen": -520.6838989257812, + "logps/rejected": -621.5158081054688, + "loss": 0.0226, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3350999057292938, + "rewards/margins": 0.11317549645900726, + "rewards/rejected": -0.4482753872871399, + "step": 2850 + }, + { + "epoch": 0.37, + "learning_rate": 3.939834138218505e-06, + "logits/chosen": -1.310098648071289, + "logits/rejected": -1.2673050165176392, + "logps/chosen": -478.2703552246094, + "logps/rejected": -547.5897827148438, + "loss": 0.022, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.27902334928512573, + "rewards/margins": 0.06650176644325256, + "rewards/rejected": -0.3455251157283783, + "step": 2860 + }, + { + "epoch": 0.38, + "learning_rate": 3.930482805334339e-06, + "logits/chosen": -1.5085766315460205, + "logits/rejected": -1.4009966850280762, + "logps/chosen": -403.2984313964844, + "logps/rejected": -477.52056884765625, + "loss": 0.0295, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.24465203285217285, + "rewards/margins": 0.07953041046857834, + "rewards/rejected": -0.3241824507713318, + "step": 2870 + }, + { + "epoch": 0.38, + "learning_rate": 3.921101619752464e-06, + "logits/chosen": -1.4011389017105103, + "logits/rejected": -1.5107542276382446, + "logps/chosen": -450.69342041015625, + "logps/rejected": -470.6482849121094, + "loss": 0.0408, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.23339705169200897, + "rewards/margins": 0.036589961498975754, + "rewards/rejected": -0.26998698711395264, + "step": 2880 + }, + { + "epoch": 0.38, + "learning_rate": 3.911690777248525e-06, + "logits/chosen": -1.512414574623108, + "logits/rejected": -1.5040571689605713, + "logps/chosen": -437.159423828125, + "logps/rejected": -494.37811279296875, + "loss": 0.0274, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.2510322332382202, + "rewards/margins": 0.05385801941156387, + "rewards/rejected": -0.3048902451992035, + "step": 2890 + }, + { + "epoch": 0.38, + "learning_rate": 3.902250474217079e-06, + "logits/chosen": -1.4019505977630615, + "logits/rejected": -1.314363956451416, + "logps/chosen": -383.010009765625, + "logps/rejected": -559.7865600585938, + "loss": 0.0438, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.250358521938324, + "rewards/margins": 0.13500240445137024, + "rewards/rejected": -0.3853609561920166, + "step": 2900 + }, + { + "epoch": 0.38, + "learning_rate": 3.892780907667495e-06, + "logits/chosen": -1.744741678237915, + "logits/rejected": -1.4366378784179688, + "logps/chosen": -484.03057861328125, + "logps/rejected": -522.0039672851562, + "loss": 0.029, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2130521535873413, + "rewards/margins": 0.060403358191251755, + "rewards/rejected": -0.27345550060272217, + "step": 2910 + }, + { + "epoch": 0.38, + "learning_rate": 3.883282275219837e-06, + "logits/chosen": -1.42691969871521, + "logits/rejected": -1.3678151369094849, + "logps/chosen": -447.2662658691406, + "logps/rejected": -526.8746337890625, + "loss": 0.0254, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25842684507369995, + "rewards/margins": 0.07644257694482803, + "rewards/rejected": -0.3348694443702698, + "step": 2920 + }, + { + "epoch": 0.38, + "learning_rate": 3.873754775100751e-06, + "logits/chosen": -1.4811619520187378, + "logits/rejected": -1.4468374252319336, + "logps/chosen": -435.4991149902344, + "logps/rejected": -573.3851318359375, + "loss": 0.018, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.25239571928977966, + "rewards/margins": 0.09442947804927826, + "rewards/rejected": -0.34682518243789673, + "step": 2930 + }, + { + "epoch": 0.38, + "learning_rate": 3.8641986061393145e-06, + "logits/chosen": -1.8592326641082764, + "logits/rejected": -1.5195543766021729, + "logps/chosen": -474.525146484375, + "logps/rejected": -491.6029357910156, + "loss": 0.0233, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2184789478778839, + "rewards/margins": 0.07690483331680298, + "rewards/rejected": -0.2953837513923645, + "step": 2940 + }, + { + "epoch": 0.39, + "learning_rate": 3.854613967762898e-06, + "logits/chosen": -1.5112121105194092, + "logits/rejected": -1.4672622680664062, + "logps/chosen": -457.9198303222656, + "logps/rejected": -558.0857543945312, + "loss": 0.019, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24388387799263, + "rewards/margins": 0.08239175379276276, + "rewards/rejected": -0.32627564668655396, + "step": 2950 + }, + { + "epoch": 0.39, + "learning_rate": 3.845001059992999e-06, + "logits/chosen": -1.5748988389968872, + "logits/rejected": -1.3156006336212158, + "logps/chosen": -547.559814453125, + "logps/rejected": -659.1094970703125, + "loss": 0.0149, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.27472323179244995, + "rewards/margins": 0.13340029120445251, + "rewards/rejected": -0.4081234931945801, + "step": 2960 + }, + { + "epoch": 0.39, + "learning_rate": 3.835360083441067e-06, + "logits/chosen": -1.7014272212982178, + "logits/rejected": -1.6057565212249756, + "logps/chosen": -526.9740600585938, + "logps/rejected": -604.8302001953125, + "loss": 0.0146, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.300466924905777, + "rewards/margins": 0.0719136968255043, + "rewards/rejected": -0.3723805546760559, + "step": 2970 + }, + { + "epoch": 0.39, + "learning_rate": 3.825691239304318e-06, + "logits/chosen": -1.6480176448822021, + "logits/rejected": -1.4222185611724854, + "logps/chosen": -486.6483459472656, + "logps/rejected": -619.3688354492188, + "loss": 0.0348, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2511606812477112, + "rewards/margins": 0.13785335421562195, + "rewards/rejected": -0.3890140652656555, + "step": 2980 + }, + { + "epoch": 0.39, + "learning_rate": 3.8159947293615385e-06, + "logits/chosen": -1.6466913223266602, + "logits/rejected": -1.3495409488677979, + "logps/chosen": -494.6392517089844, + "logps/rejected": -514.5824584960938, + "loss": 0.0163, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.23486237227916718, + "rewards/margins": 0.05418325588107109, + "rewards/rejected": -0.28904566168785095, + "step": 2990 + }, + { + "epoch": 0.39, + "learning_rate": 3.806270755968866e-06, + "logits/chosen": -1.5945298671722412, + "logits/rejected": -1.6403144598007202, + "logps/chosen": -357.9747619628906, + "logps/rejected": -462.56817626953125, + "loss": 0.0181, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20649829506874084, + "rewards/margins": 0.07846391201019287, + "rewards/rejected": -0.2849622070789337, + "step": 3000 + }, + { + "epoch": 0.39, + "learning_rate": 3.7965195220555784e-06, + "logits/chosen": -1.5004541873931885, + "logits/rejected": -1.4311333894729614, + "logps/chosen": -392.80352783203125, + "logps/rejected": -504.016357421875, + "loss": 0.0303, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.17788691818714142, + "rewards/margins": 0.09681157022714615, + "rewards/rejected": -0.27469852566719055, + "step": 3010 + }, + { + "epoch": 0.4, + "learning_rate": 3.786741231119847e-06, + "logits/chosen": -1.5867496728897095, + "logits/rejected": -1.5139662027359009, + "logps/chosen": -441.8441467285156, + "logps/rejected": -573.4630737304688, + "loss": 0.025, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24398484826087952, + "rewards/margins": 0.10547931492328644, + "rewards/rejected": -0.34946417808532715, + "step": 3020 + }, + { + "epoch": 0.4, + "learning_rate": 3.7769360872244992e-06, + "logits/chosen": -1.4943944215774536, + "logits/rejected": -1.5490363836288452, + "logps/chosen": -508.37335205078125, + "logps/rejected": -570.640625, + "loss": 0.0205, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2434680461883545, + "rewards/margins": 0.07497527450323105, + "rewards/rejected": -0.3184433579444885, + "step": 3030 + }, + { + "epoch": 0.4, + "learning_rate": 3.767104294992754e-06, + "logits/chosen": -1.6485742330551147, + "logits/rejected": -1.439035415649414, + "logps/chosen": -509.4322204589844, + "logps/rejected": -577.4638671875, + "loss": 0.0238, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3061564564704895, + "rewards/margins": 0.07664833962917328, + "rewards/rejected": -0.382804811000824, + "step": 3040 + }, + { + "epoch": 0.4, + "learning_rate": 3.7572460596039524e-06, + "logits/chosen": -1.4903205633163452, + "logits/rejected": -1.4283571243286133, + "logps/chosen": -551.2159423828125, + "logps/rejected": -646.5234375, + "loss": 0.0444, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.34560102224349976, + "rewards/margins": 0.08969350159168243, + "rewards/rejected": -0.4352944791316986, + "step": 3050 + }, + { + "epoch": 0.4, + "learning_rate": 3.74736158678928e-06, + "logits/chosen": -1.4189043045043945, + "logits/rejected": -1.264514684677124, + "logps/chosen": -380.6353454589844, + "logps/rejected": -457.75537109375, + "loss": 0.036, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22212731838226318, + "rewards/margins": 0.08710966259241104, + "rewards/rejected": -0.30923694372177124, + "step": 3060 + }, + { + "epoch": 0.4, + "learning_rate": 3.7374510828274673e-06, + "logits/chosen": -1.455428957939148, + "logits/rejected": -1.5315929651260376, + "logps/chosen": -361.54803466796875, + "logps/rejected": -505.62060546875, + "loss": 0.0343, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20948748290538788, + "rewards/margins": 0.09494461119174957, + "rewards/rejected": -0.30443209409713745, + "step": 3070 + }, + { + "epoch": 0.4, + "learning_rate": 3.72751475454049e-06, + "logits/chosen": -1.5700876712799072, + "logits/rejected": -1.3799183368682861, + "logps/chosen": -463.9649963378906, + "logps/rejected": -513.8907470703125, + "loss": 0.0281, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21600162982940674, + "rewards/margins": 0.08006380498409271, + "rewards/rejected": -0.29606547951698303, + "step": 3080 + }, + { + "epoch": 0.4, + "learning_rate": 3.7175528092892503e-06, + "logits/chosen": -1.6064115762710571, + "logits/rejected": -1.3781477212905884, + "logps/chosen": -387.56402587890625, + "logps/rejected": -408.89117431640625, + "loss": 0.0349, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19931992888450623, + "rewards/margins": 0.06138339638710022, + "rewards/rejected": -0.26070332527160645, + "step": 3090 + }, + { + "epoch": 0.41, + "learning_rate": 3.7075654549692498e-06, + "logits/chosen": -1.4929149150848389, + "logits/rejected": -1.313826560974121, + "logps/chosen": -473.07904052734375, + "logps/rejected": -517.1908569335938, + "loss": 0.0303, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.23562940955162048, + "rewards/margins": 0.05551863834261894, + "rewards/rejected": -0.29114800691604614, + "step": 3100 + }, + { + "epoch": 0.41, + "learning_rate": 3.697552900006249e-06, + "logits/chosen": -1.7409900426864624, + "logits/rejected": -1.6835981607437134, + "logps/chosen": -367.76690673828125, + "logps/rejected": -462.6327209472656, + "loss": 0.0171, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18092992901802063, + "rewards/margins": 0.06359394639730453, + "rewards/rejected": -0.24452385306358337, + "step": 3110 + }, + { + "epoch": 0.41, + "learning_rate": 3.6875153533519244e-06, + "logits/chosen": -1.8172905445098877, + "logits/rejected": -1.4347440004348755, + "logps/chosen": -450.7483825683594, + "logps/rejected": -505.62530517578125, + "loss": 0.0226, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20507276058197021, + "rewards/margins": 0.08976466953754425, + "rewards/rejected": -0.29483744502067566, + "step": 3120 + }, + { + "epoch": 0.41, + "learning_rate": 3.6774530244794992e-06, + "logits/chosen": -1.6782619953155518, + "logits/rejected": -1.558455467224121, + "logps/chosen": -513.4165649414062, + "logps/rejected": -545.1614379882812, + "loss": 0.0265, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2603208124637604, + "rewards/margins": 0.0918203815817833, + "rewards/rejected": -0.35214120149612427, + "step": 3130 + }, + { + "epoch": 0.41, + "learning_rate": 3.667366123379378e-06, + "logits/chosen": -1.3462660312652588, + "logits/rejected": -1.3467267751693726, + "logps/chosen": -433.2908630371094, + "logps/rejected": -553.0076293945312, + "loss": 0.0233, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22309264540672302, + "rewards/margins": 0.10725726187229156, + "rewards/rejected": -0.3303499221801758, + "step": 3140 + }, + { + "epoch": 0.41, + "learning_rate": 3.6572548605547607e-06, + "logits/chosen": -1.6897525787353516, + "logits/rejected": -1.2931480407714844, + "logps/chosen": -476.80987548828125, + "logps/rejected": -513.6519165039062, + "loss": 0.0156, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.23292949795722961, + "rewards/margins": 0.08971043676137924, + "rewards/rejected": -0.32263994216918945, + "step": 3150 + }, + { + "epoch": 0.41, + "learning_rate": 3.6471194470172538e-06, + "logits/chosen": -1.6105201244354248, + "logits/rejected": -1.409613013267517, + "logps/chosen": -582.4307861328125, + "logps/rejected": -648.6765747070312, + "loss": 0.0225, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.27965396642684937, + "rewards/margins": 0.10067589581012726, + "rewards/rejected": -0.38032984733581543, + "step": 3160 + }, + { + "epoch": 0.41, + "learning_rate": 3.636960094282461e-06, + "logits/chosen": -1.6837307214736938, + "logits/rejected": -1.5798778533935547, + "logps/chosen": -438.977783203125, + "logps/rejected": -540.5154418945312, + "loss": 0.0275, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.23820741474628448, + "rewards/margins": 0.09993481636047363, + "rewards/rejected": -0.3381422162055969, + "step": 3170 + }, + { + "epoch": 0.42, + "learning_rate": 3.6267770143655743e-06, + "logits/chosen": -1.7468640804290771, + "logits/rejected": -1.6485341787338257, + "logps/chosen": -426.53192138671875, + "logps/rejected": -463.8035583496094, + "loss": 0.0183, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20808923244476318, + "rewards/margins": 0.0463956817984581, + "rewards/rejected": -0.2544848918914795, + "step": 3180 + }, + { + "epoch": 0.42, + "learning_rate": 3.6165704197769484e-06, + "logits/chosen": -1.7614538669586182, + "logits/rejected": -1.7924140691757202, + "logps/chosen": -312.37261962890625, + "logps/rejected": -424.40203857421875, + "loss": 0.0221, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13243715465068817, + "rewards/margins": 0.11160604655742645, + "rewards/rejected": -0.24404320120811462, + "step": 3190 + }, + { + "epoch": 0.42, + "learning_rate": 3.606340523517663e-06, + "logits/chosen": -2.0063328742980957, + "logits/rejected": -1.8009140491485596, + "logps/chosen": -376.8255920410156, + "logps/rejected": -452.60699462890625, + "loss": 0.0213, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.144984170794487, + "rewards/margins": 0.08352089673280716, + "rewards/rejected": -0.22850506007671356, + "step": 3200 + }, + { + "epoch": 0.42, + "learning_rate": 3.5960875390750793e-06, + "logits/chosen": -1.8453142642974854, + "logits/rejected": -1.6005035638809204, + "logps/chosen": -446.85675048828125, + "logps/rejected": -556.7806396484375, + "loss": 0.0474, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21962828934192657, + "rewards/margins": 0.08978879451751709, + "rewards/rejected": -0.30941709876060486, + "step": 3210 + }, + { + "epoch": 0.42, + "learning_rate": 3.585811680418386e-06, + "logits/chosen": -1.7096410989761353, + "logits/rejected": -1.5049703121185303, + "logps/chosen": -413.1253356933594, + "logps/rejected": -426.44561767578125, + "loss": 0.0215, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18407495319843292, + "rewards/margins": 0.0769776925444603, + "rewards/rejected": -0.2610526382923126, + "step": 3220 + }, + { + "epoch": 0.42, + "learning_rate": 3.5755131619941347e-06, + "logits/chosen": -1.8558601140975952, + "logits/rejected": -1.671087622642517, + "logps/chosen": -536.2061157226562, + "logps/rejected": -596.7044067382812, + "loss": 0.0345, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22868017852306366, + "rewards/margins": 0.09786845743656158, + "rewards/rejected": -0.32654863595962524, + "step": 3230 + }, + { + "epoch": 0.42, + "learning_rate": 3.565192198721759e-06, + "logits/chosen": -1.7334047555923462, + "logits/rejected": -1.2770370244979858, + "logps/chosen": -447.2215270996094, + "logps/rejected": -460.78924560546875, + "loss": 0.0212, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21503373980522156, + "rewards/margins": 0.09746576100587845, + "rewards/rejected": -0.3124995231628418, + "step": 3240 + }, + { + "epoch": 0.43, + "learning_rate": 3.5548490059890965e-06, + "logits/chosen": -1.7922292947769165, + "logits/rejected": -1.681229829788208, + "logps/chosen": -414.5247497558594, + "logps/rejected": -453.2389221191406, + "loss": 0.0244, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19199016690254211, + "rewards/margins": 0.061304427683353424, + "rewards/rejected": -0.25329458713531494, + "step": 3250 + }, + { + "epoch": 0.43, + "learning_rate": 3.5444837996478903e-06, + "logits/chosen": -1.6504888534545898, + "logits/rejected": -1.5916399955749512, + "logps/chosen": -407.19537353515625, + "logps/rejected": -481.0166931152344, + "loss": 0.0381, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19847889244556427, + "rewards/margins": 0.05682044476270676, + "rewards/rejected": -0.25529932975769043, + "step": 3260 + }, + { + "epoch": 0.43, + "learning_rate": 3.534096796009282e-06, + "logits/chosen": -1.7954851388931274, + "logits/rejected": -1.8284984827041626, + "logps/chosen": -327.8110046386719, + "logps/rejected": -373.6348876953125, + "loss": 0.0306, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16501004993915558, + "rewards/margins": 0.03539814427495003, + "rewards/rejected": -0.2004081904888153, + "step": 3270 + }, + { + "epoch": 0.43, + "learning_rate": 3.5236882118393046e-06, + "logits/chosen": -1.7820394039154053, + "logits/rejected": -1.6226108074188232, + "logps/chosen": -422.36505126953125, + "logps/rejected": -514.1890869140625, + "loss": 0.0231, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19812330603599548, + "rewards/margins": 0.10547544062137604, + "rewards/rejected": -0.3035987317562103, + "step": 3280 + }, + { + "epoch": 0.43, + "learning_rate": 3.5132582643543513e-06, + "logits/chosen": -1.7644245624542236, + "logits/rejected": -1.3841527700424194, + "logps/chosen": -505.01629638671875, + "logps/rejected": -548.0068969726562, + "loss": 0.0285, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2590879797935486, + "rewards/margins": 0.0976809710264206, + "rewards/rejected": -0.35676896572113037, + "step": 3290 + }, + { + "epoch": 0.43, + "learning_rate": 3.5028071712166456e-06, + "logits/chosen": -1.6618616580963135, + "logits/rejected": -1.4068950414657593, + "logps/chosen": -468.48419189453125, + "logps/rejected": -525.5357666015625, + "loss": 0.0368, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22081026434898376, + "rewards/margins": 0.09167324751615524, + "rewards/rejected": -0.312483549118042, + "step": 3300 + }, + { + "epoch": 0.43, + "learning_rate": 3.4923351505297008e-06, + "logits/chosen": -1.805572509765625, + "logits/rejected": -1.5034650564193726, + "logps/chosen": -505.39398193359375, + "logps/rejected": -472.99041748046875, + "loss": 0.0461, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2428407371044159, + "rewards/margins": 0.05200044438242912, + "rewards/rejected": -0.2948412299156189, + "step": 3310 + }, + { + "epoch": 0.43, + "learning_rate": 3.481842420833766e-06, + "logits/chosen": -1.912627935409546, + "logits/rejected": -1.681044340133667, + "logps/chosen": -495.08544921875, + "logps/rejected": -584.6039428710938, + "loss": 0.0189, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.24733643233776093, + "rewards/margins": 0.109983429312706, + "rewards/rejected": -0.3573199212551117, + "step": 3320 + }, + { + "epoch": 0.44, + "learning_rate": 3.4713292011012645e-06, + "logits/chosen": -1.7308298349380493, + "logits/rejected": -1.6717321872711182, + "logps/chosen": -364.91351318359375, + "logps/rejected": -413.93768310546875, + "loss": 0.0197, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19749896228313446, + "rewards/margins": 0.07182800769805908, + "rewards/rejected": -0.26932698488235474, + "step": 3330 + }, + { + "epoch": 0.44, + "learning_rate": 3.4607957107322277e-06, + "logits/chosen": -1.5342094898223877, + "logits/rejected": -1.5572571754455566, + "logps/chosen": -384.7530212402344, + "logps/rejected": -516.9837646484375, + "loss": 0.0169, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2340409755706787, + "rewards/margins": 0.10289822518825531, + "rewards/rejected": -0.3369391858577728, + "step": 3340 + }, + { + "epoch": 0.44, + "learning_rate": 3.4502421695497112e-06, + "logits/chosen": -1.814679503440857, + "logits/rejected": -1.7517423629760742, + "logps/chosen": -478.49676513671875, + "logps/rejected": -526.2501220703125, + "loss": 0.0241, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23059411346912384, + "rewards/margins": 0.07972148805856705, + "rewards/rejected": -0.3103155493736267, + "step": 3350 + }, + { + "epoch": 0.44, + "learning_rate": 3.4396687977952137e-06, + "logits/chosen": -1.850203275680542, + "logits/rejected": -1.494214415550232, + "logps/chosen": -451.4457092285156, + "logps/rejected": -529.9976806640625, + "loss": 0.0188, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2504863142967224, + "rewards/margins": 0.09670194983482361, + "rewards/rejected": -0.347188264131546, + "step": 3360 + }, + { + "epoch": 0.44, + "learning_rate": 3.429075816124075e-06, + "logits/chosen": -1.939391851425171, + "logits/rejected": -1.4765293598175049, + "logps/chosen": -536.4401245117188, + "logps/rejected": -520.0018920898438, + "loss": 0.0188, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22204503417015076, + "rewards/margins": 0.08225461095571518, + "rewards/rejected": -0.30429965257644653, + "step": 3370 + }, + { + "epoch": 0.44, + "learning_rate": 3.418463445600874e-06, + "logits/chosen": -1.9319158792495728, + "logits/rejected": -1.568086862564087, + "logps/chosen": -537.13232421875, + "logps/rejected": -494.2122497558594, + "loss": 0.0276, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2699492871761322, + "rewards/margins": 0.038329653441905975, + "rewards/rejected": -0.3082789480686188, + "step": 3380 + }, + { + "epoch": 0.44, + "learning_rate": 3.4078319076948173e-06, + "logits/chosen": -1.8316503763198853, + "logits/rejected": -1.5290436744689941, + "logps/chosen": -471.3053283691406, + "logps/rejected": -508.49566650390625, + "loss": 0.0191, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.24652762711048126, + "rewards/margins": 0.06418335437774658, + "rewards/rejected": -0.31071096658706665, + "step": 3390 + }, + { + "epoch": 0.44, + "learning_rate": 3.3971814242751123e-06, + "logits/chosen": -1.6582746505737305, + "logits/rejected": -1.5354890823364258, + "logps/chosen": -440.11517333984375, + "logps/rejected": -526.5018310546875, + "loss": 0.0231, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2067064791917801, + "rewards/margins": 0.07915418595075607, + "rewards/rejected": -0.28586068749427795, + "step": 3400 + }, + { + "epoch": 0.45, + "learning_rate": 3.386512217606339e-06, + "logits/chosen": -1.8133357763290405, + "logits/rejected": -1.633396863937378, + "logps/chosen": -483.3251953125, + "logps/rejected": -544.7848510742188, + "loss": 0.0229, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2441612184047699, + "rewards/margins": 0.06827167421579361, + "rewards/rejected": -0.3124328553676605, + "step": 3410 + }, + { + "epoch": 0.45, + "learning_rate": 3.375824510343816e-06, + "logits/chosen": -1.6016031503677368, + "logits/rejected": -1.3881438970565796, + "logps/chosen": -462.1622009277344, + "logps/rejected": -522.3944091796875, + "loss": 0.034, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25817808508872986, + "rewards/margins": 0.09379847347736359, + "rewards/rejected": -0.35197657346725464, + "step": 3420 + }, + { + "epoch": 0.45, + "learning_rate": 3.3651185255289466e-06, + "logits/chosen": -1.8316404819488525, + "logits/rejected": -1.5879504680633545, + "logps/chosen": -467.69427490234375, + "logps/rejected": -491.70733642578125, + "loss": 0.0288, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21627011895179749, + "rewards/margins": 0.06159573048353195, + "rewards/rejected": -0.27786585688591003, + "step": 3430 + }, + { + "epoch": 0.45, + "learning_rate": 3.354394486584568e-06, + "logits/chosen": -1.3282498121261597, + "logits/rejected": -1.349848985671997, + "logps/chosen": -456.355712890625, + "logps/rejected": -520.9228515625, + "loss": 0.027, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2578386962413788, + "rewards/margins": 0.06478948146104813, + "rewards/rejected": -0.3226282000541687, + "step": 3440 + }, + { + "epoch": 0.45, + "learning_rate": 3.3436526173102913e-06, + "logits/chosen": -1.744478464126587, + "logits/rejected": -1.539102554321289, + "logps/chosen": -516.9254150390625, + "logps/rejected": -546.52197265625, + "loss": 0.0393, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.262891948223114, + "rewards/margins": 0.06755396723747253, + "rewards/rejected": -0.33044594526290894, + "step": 3450 + }, + { + "epoch": 0.45, + "learning_rate": 3.3328931418778254e-06, + "logits/chosen": -1.7492382526397705, + "logits/rejected": -1.425809621810913, + "logps/chosen": -463.23138427734375, + "logps/rejected": -501.52325439453125, + "loss": 0.0278, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27157261967658997, + "rewards/margins": 0.060871053487062454, + "rewards/rejected": -0.3324436843395233, + "step": 3460 + }, + { + "epoch": 0.45, + "learning_rate": 3.3221162848263028e-06, + "logits/chosen": -1.6769005060195923, + "logits/rejected": -1.568414330482483, + "logps/chosen": -493.3720703125, + "logps/rejected": -502.56390380859375, + "loss": 0.0269, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.27238377928733826, + "rewards/margins": 0.06392760574817657, + "rewards/rejected": -0.33631137013435364, + "step": 3470 + }, + { + "epoch": 0.46, + "learning_rate": 3.3113222710575914e-06, + "logits/chosen": -1.8256422281265259, + "logits/rejected": -1.6403229236602783, + "logps/chosen": -442.6636657714844, + "logps/rejected": -473.293212890625, + "loss": 0.0314, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2143537551164627, + "rewards/margins": 0.02805102802813053, + "rewards/rejected": -0.2424047738313675, + "step": 3480 + }, + { + "epoch": 0.46, + "learning_rate": 3.300511325831603e-06, + "logits/chosen": -1.8307218551635742, + "logits/rejected": -1.7159074544906616, + "logps/chosen": -517.7831420898438, + "logps/rejected": -533.2928466796875, + "loss": 0.0267, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.26110875606536865, + "rewards/margins": 0.06851954013109207, + "rewards/rejected": -0.3296282887458801, + "step": 3490 + }, + { + "epoch": 0.46, + "learning_rate": 3.289683674761592e-06, + "logits/chosen": -1.8645591735839844, + "logits/rejected": -1.7138818502426147, + "logps/chosen": -517.3194580078125, + "logps/rejected": -542.5574951171875, + "loss": 0.0307, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.26969239115715027, + "rewards/margins": 0.06960771977901459, + "rewards/rejected": -0.33930009603500366, + "step": 3500 + }, + { + "epoch": 0.46, + "learning_rate": 3.2788395438094444e-06, + "logits/chosen": -1.6032047271728516, + "logits/rejected": -1.4881914854049683, + "logps/chosen": -454.68963623046875, + "logps/rejected": -497.9525451660156, + "loss": 0.0186, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2439115345478058, + "rewards/margins": 0.034994177520275116, + "rewards/rejected": -0.2789056897163391, + "step": 3510 + }, + { + "epoch": 0.46, + "learning_rate": 3.2679791592809653e-06, + "logits/chosen": -1.7976545095443726, + "logits/rejected": -1.6705448627471924, + "logps/chosen": -444.757080078125, + "logps/rejected": -513.5965576171875, + "loss": 0.0241, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2211049497127533, + "rewards/margins": 0.06046595051884651, + "rewards/rejected": -0.2815709114074707, + "step": 3520 + }, + { + "epoch": 0.46, + "learning_rate": 3.257102747821157e-06, + "logits/chosen": -1.6195249557495117, + "logits/rejected": -1.4758068323135376, + "logps/chosen": -467.68194580078125, + "logps/rejected": -495.48980712890625, + "loss": 0.0281, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2174040526151657, + "rewards/margins": 0.0670911967754364, + "rewards/rejected": -0.2844952642917633, + "step": 3530 + }, + { + "epoch": 0.46, + "learning_rate": 3.246210536409484e-06, + "logits/chosen": -1.7049881219863892, + "logits/rejected": -1.6321277618408203, + "logps/chosen": -382.57464599609375, + "logps/rejected": -440.7767028808594, + "loss": 0.0274, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.22493591904640198, + "rewards/margins": 0.07922857254743576, + "rewards/rejected": -0.30416446924209595, + "step": 3540 + }, + { + "epoch": 0.46, + "learning_rate": 3.235302752355142e-06, + "logits/chosen": -1.3347362279891968, + "logits/rejected": -1.2995057106018066, + "logps/chosen": -357.0577392578125, + "logps/rejected": -402.48480224609375, + "loss": 0.037, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19129280745983124, + "rewards/margins": 0.047142066061496735, + "rewards/rejected": -0.23843488097190857, + "step": 3550 + }, + { + "epoch": 0.47, + "learning_rate": 3.2243796232923097e-06, + "logits/chosen": -1.768776297569275, + "logits/rejected": -1.7733237743377686, + "logps/chosen": -374.0507507324219, + "logps/rejected": -424.15679931640625, + "loss": 0.0148, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.21344757080078125, + "rewards/margins": 0.038538042455911636, + "rewards/rejected": -0.2519856095314026, + "step": 3560 + }, + { + "epoch": 0.47, + "learning_rate": 3.2134413771754037e-06, + "logits/chosen": -1.5099389553070068, + "logits/rejected": -1.3366999626159668, + "logps/chosen": -445.18927001953125, + "logps/rejected": -481.36187744140625, + "loss": 0.0187, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2308485060930252, + "rewards/margins": 0.07678955793380737, + "rewards/rejected": -0.3076380491256714, + "step": 3570 + }, + { + "epoch": 0.47, + "learning_rate": 3.2024882422743118e-06, + "logits/chosen": -1.684107780456543, + "logits/rejected": -1.4487898349761963, + "logps/chosen": -451.0792541503906, + "logps/rejected": -508.5023498535156, + "loss": 0.0307, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2644755244255066, + "rewards/margins": 0.07611201703548431, + "rewards/rejected": -0.3405875563621521, + "step": 3580 + }, + { + "epoch": 0.47, + "learning_rate": 3.1915204471696425e-06, + "logits/chosen": -1.6601011753082275, + "logits/rejected": -1.5120861530303955, + "logps/chosen": -581.9293823242188, + "logps/rejected": -621.4324951171875, + "loss": 0.0334, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.324591726064682, + "rewards/margins": 0.06923629343509674, + "rewards/rejected": -0.39382803440093994, + "step": 3590 + }, + { + "epoch": 0.47, + "learning_rate": 3.180538220747943e-06, + "logits/chosen": -1.5336710214614868, + "logits/rejected": -1.3869249820709229, + "logps/chosen": -501.361572265625, + "logps/rejected": -524.8993530273438, + "loss": 0.03, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.33136993646621704, + "rewards/margins": 0.0689396858215332, + "rewards/rejected": -0.40030962228775024, + "step": 3600 + }, + { + "epoch": 0.47, + "learning_rate": 3.1695417921969287e-06, + "logits/chosen": -1.6701656579971313, + "logits/rejected": -1.5971901416778564, + "logps/chosen": -521.0940551757812, + "logps/rejected": -608.44580078125, + "loss": 0.0225, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3268844187259674, + "rewards/margins": 0.10305018723011017, + "rewards/rejected": -0.4299345910549164, + "step": 3610 + }, + { + "epoch": 0.47, + "learning_rate": 3.158531391000697e-06, + "logits/chosen": -1.6981074810028076, + "logits/rejected": -1.5071054697036743, + "logps/chosen": -566.3934326171875, + "logps/rejected": -606.4476318359375, + "loss": 0.0176, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.28605085611343384, + "rewards/margins": 0.09975703060626984, + "rewards/rejected": -0.38580790162086487, + "step": 3620 + }, + { + "epoch": 0.48, + "learning_rate": 3.147507246934943e-06, + "logits/chosen": -1.927480697631836, + "logits/rejected": -1.7554702758789062, + "logps/chosen": -465.91607666015625, + "logps/rejected": -525.5009765625, + "loss": 0.0293, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2357490509748459, + "rewards/margins": 0.08434184640645981, + "rewards/rejected": -0.3200909197330475, + "step": 3630 + }, + { + "epoch": 0.48, + "learning_rate": 3.136469590062158e-06, + "logits/chosen": -1.7100019454956055, + "logits/rejected": -1.497292160987854, + "logps/chosen": -430.9881896972656, + "logps/rejected": -473.24285888671875, + "loss": 0.0278, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.24274256825447083, + "rewards/margins": 0.08607254922389984, + "rewards/rejected": -0.3288151025772095, + "step": 3640 + }, + { + "epoch": 0.48, + "learning_rate": 3.1254186507268354e-06, + "logits/chosen": -1.873304009437561, + "logits/rejected": -1.638899564743042, + "logps/chosen": -552.6870727539062, + "logps/rejected": -595.4931640625, + "loss": 0.0197, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2840507924556732, + "rewards/margins": 0.0846860259771347, + "rewards/rejected": -0.3687368333339691, + "step": 3650 + }, + { + "epoch": 0.48, + "learning_rate": 3.114354659550656e-06, + "logits/chosen": -1.782984972000122, + "logits/rejected": -1.8160291910171509, + "logps/chosen": -484.53802490234375, + "logps/rejected": -617.17138671875, + "loss": 0.0474, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25991731882095337, + "rewards/margins": 0.0726139098405838, + "rewards/rejected": -0.3325311541557312, + "step": 3660 + }, + { + "epoch": 0.48, + "learning_rate": 3.1032778474276816e-06, + "logits/chosen": -1.9407377243041992, + "logits/rejected": -1.5795496702194214, + "logps/chosen": -439.5630798339844, + "logps/rejected": -509.26953125, + "loss": 0.0264, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21727442741394043, + "rewards/margins": 0.0863426923751831, + "rewards/rejected": -0.30361711978912354, + "step": 3670 + }, + { + "epoch": 0.48, + "learning_rate": 3.092188445519532e-06, + "logits/chosen": -1.8346776962280273, + "logits/rejected": -1.7408864498138428, + "logps/chosen": -428.6434631347656, + "logps/rejected": -484.744140625, + "loss": 0.0237, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19090434908866882, + "rewards/margins": 0.08148179948329926, + "rewards/rejected": -0.2723861634731293, + "step": 3680 + }, + { + "epoch": 0.48, + "learning_rate": 3.081086685250565e-06, + "logits/chosen": -2.0212202072143555, + "logits/rejected": -1.9480583667755127, + "logps/chosen": -428.998779296875, + "logps/rejected": -475.50079345703125, + "loss": 0.0408, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20856909453868866, + "rewards/margins": 0.07064095884561539, + "rewards/rejected": -0.27921006083488464, + "step": 3690 + }, + { + "epoch": 0.48, + "learning_rate": 3.0699727983030434e-06, + "logits/chosen": -1.9696426391601562, + "logits/rejected": -1.8022600412368774, + "logps/chosen": -456.12640380859375, + "logps/rejected": -522.7034912109375, + "loss": 0.0336, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20959241688251495, + "rewards/margins": 0.08178629726171494, + "rewards/rejected": -0.2913787066936493, + "step": 3700 + }, + { + "epoch": 0.49, + "learning_rate": 3.058847016612301e-06, + "logits/chosen": -1.824035882949829, + "logits/rejected": -1.60663640499115, + "logps/chosen": -604.5587768554688, + "logps/rejected": -632.459716796875, + "loss": 0.023, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2821849584579468, + "rewards/margins": 0.09114620089530945, + "rewards/rejected": -0.3733311593532562, + "step": 3710 + }, + { + "epoch": 0.49, + "learning_rate": 3.0477095723619034e-06, + "logits/chosen": -1.8234065771102905, + "logits/rejected": -1.6101405620574951, + "logps/chosen": -513.341796875, + "logps/rejected": -645.3538818359375, + "loss": 0.0211, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.28968682885169983, + "rewards/margins": 0.12813854217529297, + "rewards/rejected": -0.4178254008293152, + "step": 3720 + }, + { + "epoch": 0.49, + "learning_rate": 3.0365606979788003e-06, + "logits/chosen": -1.5860755443572998, + "logits/rejected": -1.6987440586090088, + "logps/chosen": -454.96197509765625, + "logps/rejected": -528.0737915039062, + "loss": 0.0292, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.26307860016822815, + "rewards/margins": 0.05534951761364937, + "rewards/rejected": -0.318428099155426, + "step": 3730 + }, + { + "epoch": 0.49, + "learning_rate": 3.0254006261284786e-06, + "logits/chosen": -2.1023459434509277, + "logits/rejected": -1.9115941524505615, + "logps/chosen": -444.1775817871094, + "logps/rejected": -460.3389587402344, + "loss": 0.0194, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18086031079292297, + "rewards/margins": 0.04658311977982521, + "rewards/rejected": -0.2274434119462967, + "step": 3740 + }, + { + "epoch": 0.49, + "learning_rate": 3.0142295897101032e-06, + "logits/chosen": -1.849963903427124, + "logits/rejected": -1.6758321523666382, + "logps/chosen": -464.70941162109375, + "logps/rejected": -479.88824462890625, + "loss": 0.0314, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.22201187908649445, + "rewards/margins": 0.06475654244422913, + "rewards/rejected": -0.28676837682724, + "step": 3750 + }, + { + "epoch": 0.49, + "learning_rate": 3.0030478218516578e-06, + "logits/chosen": -1.8443362712860107, + "logits/rejected": -1.4853614568710327, + "logps/chosen": -517.84716796875, + "logps/rejected": -568.94091796875, + "loss": 0.0265, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2580469250679016, + "rewards/margins": 0.16223561763763428, + "rewards/rejected": -0.4202825129032135, + "step": 3760 + }, + { + "epoch": 0.49, + "learning_rate": 2.9918555559050826e-06, + "logits/chosen": -1.839974045753479, + "logits/rejected": -1.866651177406311, + "logps/chosen": -476.24395751953125, + "logps/rejected": -606.966552734375, + "loss": 0.0326, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2671697735786438, + "rewards/margins": 0.09309405833482742, + "rewards/rejected": -0.360263854265213, + "step": 3770 + }, + { + "epoch": 0.49, + "learning_rate": 2.980653025441399e-06, + "logits/chosen": -2.0247726440429688, + "logits/rejected": -2.0207698345184326, + "logps/chosen": -381.8411865234375, + "logps/rejected": -484.6162109375, + "loss": 0.0266, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20929519832134247, + "rewards/margins": 0.07083339989185333, + "rewards/rejected": -0.2801285684108734, + "step": 3780 + }, + { + "epoch": 0.5, + "learning_rate": 2.969440464245841e-06, + "logits/chosen": -1.819759726524353, + "logits/rejected": -1.9021791219711304, + "logps/chosen": -367.9759216308594, + "logps/rejected": -426.68646240234375, + "loss": 0.0233, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.20591673254966736, + "rewards/margins": 0.04149980470538139, + "rewards/rejected": -0.24741652607917786, + "step": 3790 + }, + { + "epoch": 0.5, + "learning_rate": 2.95821810631297e-06, + "logits/chosen": -1.8347660303115845, + "logits/rejected": -1.882627248764038, + "logps/chosen": -403.18524169921875, + "logps/rejected": -482.39599609375, + "loss": 0.0254, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2047998011112213, + "rewards/margins": 0.05898011848330498, + "rewards/rejected": -0.2637799382209778, + "step": 3800 + }, + { + "epoch": 0.5, + "learning_rate": 2.946986185841801e-06, + "logits/chosen": -1.923988938331604, + "logits/rejected": -1.7287712097167969, + "logps/chosen": -442.3211975097656, + "logps/rejected": -507.741455078125, + "loss": 0.0207, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19284003973007202, + "rewards/margins": 0.08564140647649765, + "rewards/rejected": -0.2784814238548279, + "step": 3810 + }, + { + "epoch": 0.5, + "learning_rate": 2.935744937230903e-06, + "logits/chosen": -1.9581371545791626, + "logits/rejected": -1.7871599197387695, + "logps/chosen": -452.6080627441406, + "logps/rejected": -473.64886474609375, + "loss": 0.0341, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22355417907238007, + "rewards/margins": 0.06519726663827896, + "rewards/rejected": -0.28875142335891724, + "step": 3820 + }, + { + "epoch": 0.5, + "learning_rate": 2.924494595073517e-06, + "logits/chosen": -1.7134824991226196, + "logits/rejected": -1.6709381341934204, + "logps/chosen": -408.0823974609375, + "logps/rejected": -472.41168212890625, + "loss": 0.0308, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21899013221263885, + "rewards/margins": 0.07624353468418121, + "rewards/rejected": -0.29523366689682007, + "step": 3830 + }, + { + "epoch": 0.5, + "learning_rate": 2.9132353941526575e-06, + "logits/chosen": -2.0410828590393066, + "logits/rejected": -1.753594994544983, + "logps/chosen": -463.1409606933594, + "logps/rejected": -499.73175048828125, + "loss": 0.0399, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17987537384033203, + "rewards/margins": 0.1088109239935875, + "rewards/rejected": -0.2886863052845001, + "step": 3840 + }, + { + "epoch": 0.5, + "learning_rate": 2.901967569436209e-06, + "logits/chosen": -1.8085641860961914, + "logits/rejected": -1.7082746028900146, + "logps/chosen": -412.5797424316406, + "logps/rejected": -463.7660217285156, + "loss": 0.0248, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1930643916130066, + "rewards/margins": 0.06069738790392876, + "rewards/rejected": -0.25376179814338684, + "step": 3850 + }, + { + "epoch": 0.51, + "learning_rate": 2.89069135607203e-06, + "logits/chosen": -1.862959623336792, + "logits/rejected": -1.5761888027191162, + "logps/chosen": -482.17529296875, + "logps/rejected": -534.7391967773438, + "loss": 0.029, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22728033363819122, + "rewards/margins": 0.09412405639886856, + "rewards/rejected": -0.3214043974876404, + "step": 3860 + }, + { + "epoch": 0.51, + "learning_rate": 2.8794069893830386e-06, + "logits/chosen": -1.577389121055603, + "logits/rejected": -1.6044118404388428, + "logps/chosen": -409.6336975097656, + "logps/rejected": -537.7545776367188, + "loss": 0.0231, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20168235898017883, + "rewards/margins": 0.08728457987308502, + "rewards/rejected": -0.28896695375442505, + "step": 3870 + }, + { + "epoch": 0.51, + "learning_rate": 2.8681147048623038e-06, + "logits/chosen": -1.872521162033081, + "logits/rejected": -1.8206942081451416, + "logps/chosen": -411.75714111328125, + "logps/rejected": -440.4037170410156, + "loss": 0.0254, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.20198896527290344, + "rewards/margins": 0.049792829900979996, + "rewards/rejected": -0.25178176164627075, + "step": 3880 + }, + { + "epoch": 0.51, + "learning_rate": 2.8568147381681333e-06, + "logits/chosen": -1.9868106842041016, + "logits/rejected": -1.529101848602295, + "logps/chosen": -437.37646484375, + "logps/rejected": -485.68011474609375, + "loss": 0.0129, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17643964290618896, + "rewards/margins": 0.10629074275493622, + "rewards/rejected": -0.2827304005622864, + "step": 3890 + }, + { + "epoch": 0.51, + "learning_rate": 2.8455073251191533e-06, + "logits/chosen": -1.9317123889923096, + "logits/rejected": -1.657210111618042, + "logps/chosen": -434.2438049316406, + "logps/rejected": -530.1087646484375, + "loss": 0.0172, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20733916759490967, + "rewards/margins": 0.10539014637470245, + "rewards/rejected": -0.3127292990684509, + "step": 3900 + }, + { + "epoch": 0.51, + "learning_rate": 2.8341927016893887e-06, + "logits/chosen": -1.8347209692001343, + "logits/rejected": -1.6418960094451904, + "logps/chosen": -421.93328857421875, + "logps/rejected": -514.156494140625, + "loss": 0.0267, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19104665517807007, + "rewards/margins": 0.10307135432958603, + "rewards/rejected": -0.2941179871559143, + "step": 3910 + }, + { + "epoch": 0.51, + "learning_rate": 2.822871104003335e-06, + "logits/chosen": -1.7689571380615234, + "logits/rejected": -1.7831192016601562, + "logps/chosen": -439.4349060058594, + "logps/rejected": -500.2786560058594, + "loss": 0.0187, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.23783540725708008, + "rewards/margins": 0.07082591950893402, + "rewards/rejected": -0.3086613714694977, + "step": 3920 + }, + { + "epoch": 0.51, + "learning_rate": 2.8115427683310355e-06, + "logits/chosen": -1.7057605981826782, + "logits/rejected": -1.811344861984253, + "logps/chosen": -486.764404296875, + "logps/rejected": -532.9198608398438, + "loss": 0.0239, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25383487343788147, + "rewards/margins": 0.05473923683166504, + "rewards/rejected": -0.3085741400718689, + "step": 3930 + }, + { + "epoch": 0.52, + "learning_rate": 2.8002079310831477e-06, + "logits/chosen": -1.8792072534561157, + "logits/rejected": -1.5005006790161133, + "logps/chosen": -429.49945068359375, + "logps/rejected": -489.650390625, + "loss": 0.0198, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21709856390953064, + "rewards/margins": 0.1154472827911377, + "rewards/rejected": -0.33254584670066833, + "step": 3940 + }, + { + "epoch": 0.52, + "learning_rate": 2.7888668288060095e-06, + "logits/chosen": -1.7971118688583374, + "logits/rejected": -1.7154830694198608, + "logps/chosen": -399.40338134765625, + "logps/rejected": -502.49090576171875, + "loss": 0.0304, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20770153403282166, + "rewards/margins": 0.08903868496417999, + "rewards/rejected": -0.29674020409584045, + "step": 3950 + }, + { + "epoch": 0.52, + "learning_rate": 2.7775196981767044e-06, + "logits/chosen": -1.9662151336669922, + "logits/rejected": -1.6054153442382812, + "logps/chosen": -392.71270751953125, + "logps/rejected": -463.1542053222656, + "loss": 0.0258, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2093251645565033, + "rewards/margins": 0.10495827347040176, + "rewards/rejected": -0.31428343057632446, + "step": 3960 + }, + { + "epoch": 0.52, + "learning_rate": 2.7661667759981213e-06, + "logits/chosen": -1.5571118593215942, + "logits/rejected": -1.6845260858535767, + "logps/chosen": -384.57293701171875, + "logps/rejected": -467.65606689453125, + "loss": 0.0195, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.24344149231910706, + "rewards/margins": 0.06183134391903877, + "rewards/rejected": -0.3052728474140167, + "step": 3970 + }, + { + "epoch": 0.52, + "learning_rate": 2.7548082991940137e-06, + "logits/chosen": -1.879895806312561, + "logits/rejected": -1.6696151494979858, + "logps/chosen": -470.91455078125, + "logps/rejected": -492.08135986328125, + "loss": 0.0218, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20541563630104065, + "rewards/margins": 0.09316332638263702, + "rewards/rejected": -0.2985789477825165, + "step": 3980 + }, + { + "epoch": 0.52, + "learning_rate": 2.743444504804051e-06, + "logits/chosen": -1.7343305349349976, + "logits/rejected": -1.5158665180206299, + "logps/chosen": -464.01715087890625, + "logps/rejected": -480.1866760253906, + "loss": 0.022, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.25514039397239685, + "rewards/margins": 0.06958375871181488, + "rewards/rejected": -0.3247241973876953, + "step": 3990 + }, + { + "epoch": 0.52, + "learning_rate": 2.7320756299788788e-06, + "logits/chosen": -1.6907501220703125, + "logits/rejected": -1.6015201807022095, + "logps/chosen": -474.698486328125, + "logps/rejected": -565.966064453125, + "loss": 0.0226, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24437180161476135, + "rewards/margins": 0.08970580995082855, + "rewards/rejected": -0.3340775966644287, + "step": 4000 + }, + { + "epoch": 0.52, + "learning_rate": 2.7207019119751644e-06, + "logits/chosen": -1.6006886959075928, + "logits/rejected": -1.4601367712020874, + "logps/chosen": -418.631103515625, + "logps/rejected": -462.9296875, + "loss": 0.0273, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20333543419837952, + "rewards/margins": 0.09906761348247528, + "rewards/rejected": -0.3024030327796936, + "step": 4010 + }, + { + "epoch": 0.53, + "learning_rate": 2.7093235881506474e-06, + "logits/chosen": -1.7386598587036133, + "logits/rejected": -1.631439208984375, + "logps/chosen": -464.40020751953125, + "logps/rejected": -537.2156372070312, + "loss": 0.0218, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21815907955169678, + "rewards/margins": 0.09343260526657104, + "rewards/rejected": -0.3115917146205902, + "step": 4020 + }, + { + "epoch": 0.53, + "learning_rate": 2.6979408959591863e-06, + "logits/chosen": -1.7486827373504639, + "logits/rejected": -1.508710265159607, + "logps/chosen": -461.79876708984375, + "logps/rejected": -527.3321533203125, + "loss": 0.0151, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19339005649089813, + "rewards/margins": 0.125021830201149, + "rewards/rejected": -0.3184118866920471, + "step": 4030 + }, + { + "epoch": 0.53, + "learning_rate": 2.6865540729458034e-06, + "logits/chosen": -1.742926836013794, + "logits/rejected": -1.6564674377441406, + "logps/chosen": -366.6788635253906, + "logps/rejected": -420.30780029296875, + "loss": 0.0338, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.14146068692207336, + "rewards/margins": 0.04987145587801933, + "rewards/rejected": -0.1913321316242218, + "step": 4040 + }, + { + "epoch": 0.53, + "learning_rate": 2.675163356741726e-06, + "logits/chosen": -1.7827094793319702, + "logits/rejected": -1.6591606140136719, + "logps/chosen": -372.85089111328125, + "logps/rejected": -396.1568603515625, + "loss": 0.0254, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14776013791561127, + "rewards/margins": 0.0655493214726448, + "rewards/rejected": -0.21330948173999786, + "step": 4050 + }, + { + "epoch": 0.53, + "learning_rate": 2.6637689850594285e-06, + "logits/chosen": -1.4729732275009155, + "logits/rejected": -1.2420446872711182, + "logps/chosen": -552.0250244140625, + "logps/rejected": -658.884765625, + "loss": 0.0321, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2546490728855133, + "rewards/margins": 0.10681060701608658, + "rewards/rejected": -0.3614596724510193, + "step": 4060 + }, + { + "epoch": 0.53, + "learning_rate": 2.652371195687671e-06, + "logits/chosen": -1.7226966619491577, + "logits/rejected": -1.6044594049453735, + "logps/chosen": -482.7759704589844, + "logps/rejected": -555.4845581054688, + "loss": 0.0393, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21687380969524384, + "rewards/margins": 0.08411475270986557, + "rewards/rejected": -0.3009885847568512, + "step": 4070 + }, + { + "epoch": 0.53, + "learning_rate": 2.64097022648654e-06, + "logits/chosen": -1.5568745136260986, + "logits/rejected": -1.2326549291610718, + "logps/chosen": -436.64923095703125, + "logps/rejected": -430.7582092285156, + "loss": 0.0167, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.234510138630867, + "rewards/margins": 0.030786100775003433, + "rewards/rejected": -0.26529622077941895, + "step": 4080 + }, + { + "epoch": 0.54, + "learning_rate": 2.6295663153824774e-06, + "logits/chosen": -1.7773542404174805, + "logits/rejected": -1.462713599205017, + "logps/chosen": -501.14630126953125, + "logps/rejected": -487.55841064453125, + "loss": 0.0271, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.23411674797534943, + "rewards/margins": 0.07479636371135712, + "rewards/rejected": -0.30891311168670654, + "step": 4090 + }, + { + "epoch": 0.54, + "learning_rate": 2.6181597003633218e-06, + "logits/chosen": -1.5608381032943726, + "logits/rejected": -1.4907993078231812, + "logps/chosen": -400.27740478515625, + "logps/rejected": -460.11773681640625, + "loss": 0.0197, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23818571865558624, + "rewards/margins": 0.07261364161968231, + "rewards/rejected": -0.31079936027526855, + "step": 4100 + }, + { + "epoch": 0.54, + "learning_rate": 2.606750619473342e-06, + "logits/chosen": -1.3995827436447144, + "logits/rejected": -1.3887090682983398, + "logps/chosen": -431.75531005859375, + "logps/rejected": -520.8568115234375, + "loss": 0.0318, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22923071682453156, + "rewards/margins": 0.07589320093393326, + "rewards/rejected": -0.3051239252090454, + "step": 4110 + }, + { + "epoch": 0.54, + "learning_rate": 2.595339310808262e-06, + "logits/chosen": -1.5573954582214355, + "logits/rejected": -1.250608205795288, + "logps/chosen": -493.65411376953125, + "logps/rejected": -508.92034912109375, + "loss": 0.023, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2405034601688385, + "rewards/margins": 0.08133454620838165, + "rewards/rejected": -0.32183799147605896, + "step": 4120 + }, + { + "epoch": 0.54, + "learning_rate": 2.5839260125103004e-06, + "logits/chosen": -1.2536404132843018, + "logits/rejected": -1.2843748331069946, + "logps/chosen": -428.8235778808594, + "logps/rejected": -518.3291625976562, + "loss": 0.0353, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2670718729496002, + "rewards/margins": 0.07417013496160507, + "rewards/rejected": -0.3412419855594635, + "step": 4130 + }, + { + "epoch": 0.54, + "learning_rate": 2.5725109627631984e-06, + "logits/chosen": -1.685089111328125, + "logits/rejected": -1.5624825954437256, + "logps/chosen": -454.55303955078125, + "logps/rejected": -452.01348876953125, + "loss": 0.0253, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19030128419399261, + "rewards/margins": 0.05792864412069321, + "rewards/rejected": -0.24822993576526642, + "step": 4140 + }, + { + "epoch": 0.54, + "learning_rate": 2.5610943997872443e-06, + "logits/chosen": -1.5616363286972046, + "logits/rejected": -1.4265216588974, + "logps/chosen": -380.36407470703125, + "logps/rejected": -440.4208984375, + "loss": 0.0426, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19018259644508362, + "rewards/margins": 0.059229712933301926, + "rewards/rejected": -0.24941229820251465, + "step": 4150 + }, + { + "epoch": 0.54, + "learning_rate": 2.5496765618343096e-06, + "logits/chosen": -1.6420665979385376, + "logits/rejected": -1.3375904560089111, + "logps/chosen": -434.129638671875, + "logps/rejected": -474.74896240234375, + "loss": 0.0228, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20237000286579132, + "rewards/margins": 0.06872677803039551, + "rewards/rejected": -0.27109676599502563, + "step": 4160 + }, + { + "epoch": 0.55, + "learning_rate": 2.538257687182871e-06, + "logits/chosen": -1.6899299621582031, + "logits/rejected": -1.5516376495361328, + "logps/chosen": -391.97076416015625, + "logps/rejected": -479.993896484375, + "loss": 0.0215, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1851247102022171, + "rewards/margins": 0.09358541667461395, + "rewards/rejected": -0.27871015667915344, + "step": 4170 + }, + { + "epoch": 0.55, + "learning_rate": 2.526838014133041e-06, + "logits/chosen": -1.5125585794448853, + "logits/rejected": -1.4651882648468018, + "logps/chosen": -460.04168701171875, + "logps/rejected": -533.2202758789062, + "loss": 0.02, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21069243550300598, + "rewards/margins": 0.07955484092235565, + "rewards/rejected": -0.29024726152420044, + "step": 4180 + }, + { + "epoch": 0.55, + "learning_rate": 2.515417781001594e-06, + "logits/chosen": -1.6838356256484985, + "logits/rejected": -1.5009909868240356, + "logps/chosen": -407.412353515625, + "logps/rejected": -432.92095947265625, + "loss": 0.016, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.15886293351650238, + "rewards/margins": 0.06116022914648056, + "rewards/rejected": -0.22002318501472473, + "step": 4190 + }, + { + "epoch": 0.55, + "learning_rate": 2.503997226116992e-06, + "logits/chosen": -1.540334701538086, + "logits/rejected": -1.31317937374115, + "logps/chosen": -507.1998596191406, + "logps/rejected": -586.198974609375, + "loss": 0.0379, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2449527233839035, + "rewards/margins": 0.1364816427230835, + "rewards/rejected": -0.3814343810081482, + "step": 4200 + }, + { + "epoch": 0.55, + "learning_rate": 2.4925765878144115e-06, + "logits/chosen": -1.5158265829086304, + "logits/rejected": -1.3035396337509155, + "logps/chosen": -396.3337707519531, + "logps/rejected": -544.6439208984375, + "loss": 0.0227, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20970037579536438, + "rewards/margins": 0.11930962651968002, + "rewards/rejected": -0.329010009765625, + "step": 4210 + }, + { + "epoch": 0.55, + "learning_rate": 2.4811561044307727e-06, + "logits/chosen": -1.694092035293579, + "logits/rejected": -1.4198188781738281, + "logps/chosen": -432.31689453125, + "logps/rejected": -528.2352905273438, + "loss": 0.0227, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1862529069185257, + "rewards/margins": 0.11565999686717987, + "rewards/rejected": -0.3019128739833832, + "step": 4220 + }, + { + "epoch": 0.55, + "learning_rate": 2.469736014299758e-06, + "logits/chosen": -1.8983396291732788, + "logits/rejected": -1.4605172872543335, + "logps/chosen": -442.47393798828125, + "logps/rejected": -453.75006103515625, + "loss": 0.0215, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16222409904003143, + "rewards/margins": 0.08797114342451096, + "rewards/rejected": -0.2501952648162842, + "step": 4230 + }, + { + "epoch": 0.55, + "learning_rate": 2.458316555746846e-06, + "logits/chosen": -1.7895050048828125, + "logits/rejected": -1.5404589176177979, + "logps/chosen": -400.5735778808594, + "logps/rejected": -468.3433532714844, + "loss": 0.0254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16227874159812927, + "rewards/margins": 0.08485525101423264, + "rewards/rejected": -0.24713397026062012, + "step": 4240 + }, + { + "epoch": 0.56, + "learning_rate": 2.446897967084334e-06, + "logits/chosen": -1.7049553394317627, + "logits/rejected": -1.6142890453338623, + "logps/chosen": -446.55078125, + "logps/rejected": -508.743408203125, + "loss": 0.022, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19081057608127594, + "rewards/margins": 0.07758510857820511, + "rewards/rejected": -0.26839572191238403, + "step": 4250 + }, + { + "epoch": 0.56, + "learning_rate": 2.4354804866063684e-06, + "logits/chosen": -1.788198471069336, + "logits/rejected": -1.5201570987701416, + "logps/chosen": -454.056640625, + "logps/rejected": -495.20367431640625, + "loss": 0.0413, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22993293404579163, + "rewards/margins": 0.051151882857084274, + "rewards/rejected": -0.281084805727005, + "step": 4260 + }, + { + "epoch": 0.56, + "learning_rate": 2.424064352583964e-06, + "logits/chosen": -1.4209312200546265, + "logits/rejected": -1.2128381729125977, + "logps/chosen": -418.61181640625, + "logps/rejected": -477.3053283691406, + "loss": 0.0209, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19294168055057526, + "rewards/margins": 0.10017367452383041, + "rewards/rejected": -0.29311537742614746, + "step": 4270 + }, + { + "epoch": 0.56, + "learning_rate": 2.4126498032600403e-06, + "logits/chosen": -1.5372164249420166, + "logits/rejected": -1.537858486175537, + "logps/chosen": -389.9130859375, + "logps/rejected": -496.35723876953125, + "loss": 0.0224, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19452248513698578, + "rewards/margins": 0.09162701666355133, + "rewards/rejected": -0.2861495316028595, + "step": 4280 + }, + { + "epoch": 0.56, + "learning_rate": 2.401237076844445e-06, + "logits/chosen": -1.3520879745483398, + "logits/rejected": -1.1993756294250488, + "logps/chosen": -426.945068359375, + "logps/rejected": -472.2967224121094, + "loss": 0.0281, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21869483590126038, + "rewards/margins": 0.04853721708059311, + "rewards/rejected": -0.26723209023475647, + "step": 4290 + }, + { + "epoch": 0.56, + "learning_rate": 2.38982641150898e-06, + "logits/chosen": -1.6982262134552002, + "logits/rejected": -1.3734503984451294, + "logps/chosen": -491.4610290527344, + "logps/rejected": -537.25, + "loss": 0.022, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2278437614440918, + "rewards/margins": 0.07478948682546616, + "rewards/rejected": -0.30263322591781616, + "step": 4300 + }, + { + "epoch": 0.56, + "learning_rate": 2.3784180453824414e-06, + "logits/chosen": -1.6156953573226929, + "logits/rejected": -1.5192720890045166, + "logps/chosen": -428.35693359375, + "logps/rejected": -511.0567321777344, + "loss": 0.0222, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18561260402202606, + "rewards/margins": 0.09769701957702637, + "rewards/rejected": -0.2833096385002136, + "step": 4310 + }, + { + "epoch": 0.57, + "learning_rate": 2.367012216545638e-06, + "logits/chosen": -1.7509912252426147, + "logits/rejected": -1.4216718673706055, + "logps/chosen": -443.70001220703125, + "logps/rejected": -451.0489196777344, + "loss": 0.0303, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1952689290046692, + "rewards/margins": 0.06825876235961914, + "rewards/rejected": -0.26352769136428833, + "step": 4320 + }, + { + "epoch": 0.57, + "learning_rate": 2.3556091630264294e-06, + "logits/chosen": -1.5806615352630615, + "logits/rejected": -1.4713026285171509, + "logps/chosen": -513.8214111328125, + "logps/rejected": -581.2058715820312, + "loss": 0.0282, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23938843607902527, + "rewards/margins": 0.06749831140041351, + "rewards/rejected": -0.3068867325782776, + "step": 4330 + }, + { + "epoch": 0.57, + "learning_rate": 2.344209122794757e-06, + "logits/chosen": -1.4990651607513428, + "logits/rejected": -1.3438913822174072, + "logps/chosen": -462.8631286621094, + "logps/rejected": -485.54412841796875, + "loss": 0.0249, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21740731596946716, + "rewards/margins": 0.07030390202999115, + "rewards/rejected": -0.2877112329006195, + "step": 4340 + }, + { + "epoch": 0.57, + "learning_rate": 2.3328123337576787e-06, + "logits/chosen": -1.5248218774795532, + "logits/rejected": -1.4321211576461792, + "logps/chosen": -511.95611572265625, + "logps/rejected": -601.123779296875, + "loss": 0.0274, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26373380422592163, + "rewards/margins": 0.07318225502967834, + "rewards/rejected": -0.3369160294532776, + "step": 4350 + }, + { + "epoch": 0.57, + "learning_rate": 2.3214190337544017e-06, + "logits/chosen": -1.6629645824432373, + "logits/rejected": -1.4849364757537842, + "logps/chosen": -424.27581787109375, + "logps/rejected": -464.45074462890625, + "loss": 0.0301, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22247524559497833, + "rewards/margins": 0.0703267902135849, + "rewards/rejected": -0.29280200600624084, + "step": 4360 + }, + { + "epoch": 0.57, + "learning_rate": 2.310029460551323e-06, + "logits/chosen": -1.8954870700836182, + "logits/rejected": -1.580891728401184, + "logps/chosen": -437.32275390625, + "logps/rejected": -496.6338806152344, + "loss": 0.0208, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.20409438014030457, + "rewards/margins": 0.08132465183734894, + "rewards/rejected": -0.2854190468788147, + "step": 4370 + }, + { + "epoch": 0.57, + "learning_rate": 2.2986438518370645e-06, + "logits/chosen": -1.6273301839828491, + "logits/rejected": -1.6872777938842773, + "logps/chosen": -431.04425048828125, + "logps/rejected": -482.32293701171875, + "loss": 0.0173, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22521671652793884, + "rewards/margins": 0.062195055186748505, + "rewards/rejected": -0.28741174936294556, + "step": 4380 + }, + { + "epoch": 0.57, + "learning_rate": 2.2872624452175123e-06, + "logits/chosen": -1.7207715511322021, + "logits/rejected": -1.5806845426559448, + "logps/chosen": -434.0201110839844, + "logps/rejected": -481.0724182128906, + "loss": 0.0279, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20164403319358826, + "rewards/margins": 0.05903003737330437, + "rewards/rejected": -0.2606740891933441, + "step": 4390 + }, + { + "epoch": 0.58, + "learning_rate": 2.2758854782108584e-06, + "logits/chosen": -1.548923134803772, + "logits/rejected": -1.5041494369506836, + "logps/chosen": -417.203857421875, + "logps/rejected": -507.90582275390625, + "loss": 0.027, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2142319232225418, + "rewards/margins": 0.08471399545669556, + "rewards/rejected": -0.29894593358039856, + "step": 4400 + }, + { + "epoch": 0.58, + "learning_rate": 2.2645131882426458e-06, + "logits/chosen": -1.7802507877349854, + "logits/rejected": -1.4019224643707275, + "logps/chosen": -468.563720703125, + "logps/rejected": -490.57708740234375, + "loss": 0.0216, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22909097373485565, + "rewards/margins": 0.1022447720170021, + "rewards/rejected": -0.33133575320243835, + "step": 4410 + }, + { + "epoch": 0.58, + "learning_rate": 2.2531458126408154e-06, + "logits/chosen": -1.6049638986587524, + "logits/rejected": -1.3213298320770264, + "logps/chosen": -379.1090393066406, + "logps/rejected": -478.8014221191406, + "loss": 0.0329, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1762361377477646, + "rewards/margins": 0.14133073389530182, + "rewards/rejected": -0.3175669014453888, + "step": 4420 + }, + { + "epoch": 0.58, + "learning_rate": 2.2417835886307452e-06, + "logits/chosen": -1.4309465885162354, + "logits/rejected": -1.2265818119049072, + "logps/chosen": -439.0750427246094, + "logps/rejected": -505.5755310058594, + "loss": 0.0303, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2087368220090866, + "rewards/margins": 0.1353384554386139, + "rewards/rejected": -0.3440752923488617, + "step": 4430 + }, + { + "epoch": 0.58, + "learning_rate": 2.2304267533303075e-06, + "logits/chosen": -1.3964816331863403, + "logits/rejected": -1.242711067199707, + "logps/chosen": -557.1407470703125, + "logps/rejected": -587.2622680664062, + "loss": 0.04, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.2703414857387543, + "rewards/margins": 0.06430923938751221, + "rewards/rejected": -0.33465075492858887, + "step": 4440 + }, + { + "epoch": 0.58, + "learning_rate": 2.219075543744918e-06, + "logits/chosen": -1.5047862529754639, + "logits/rejected": -1.2050893306732178, + "logps/chosen": -474.2640075683594, + "logps/rejected": -498.3106384277344, + "loss": 0.0165, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2143983393907547, + "rewards/margins": 0.07695254683494568, + "rewards/rejected": -0.2913508713245392, + "step": 4450 + }, + { + "epoch": 0.58, + "learning_rate": 2.207730196762589e-06, + "logits/chosen": -1.5303689241409302, + "logits/rejected": -1.2933293581008911, + "logps/chosen": -487.30743408203125, + "logps/rejected": -537.5446166992188, + "loss": 0.0413, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2514573037624359, + "rewards/margins": 0.08375079184770584, + "rewards/rejected": -0.33520805835723877, + "step": 4460 + }, + { + "epoch": 0.58, + "learning_rate": 2.1963909491489846e-06, + "logits/chosen": -1.6777637004852295, + "logits/rejected": -1.7184629440307617, + "logps/chosen": -413.19580078125, + "logps/rejected": -464.3396911621094, + "loss": 0.0392, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2414092719554901, + "rewards/margins": 0.06929554790258408, + "rewards/rejected": -0.3107048571109772, + "step": 4470 + }, + { + "epoch": 0.59, + "learning_rate": 2.185058037542486e-06, + "logits/chosen": -1.5463758707046509, + "logits/rejected": -1.2479265928268433, + "logps/chosen": -518.1327514648438, + "logps/rejected": -562.65380859375, + "loss": 0.0252, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.25115349888801575, + "rewards/margins": 0.11522521823644638, + "rewards/rejected": -0.3663787245750427, + "step": 4480 + }, + { + "epoch": 0.59, + "learning_rate": 2.173731698449244e-06, + "logits/chosen": -1.4406402111053467, + "logits/rejected": -1.059444785118103, + "logps/chosen": -539.0560302734375, + "logps/rejected": -574.791259765625, + "loss": 0.029, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.267414391040802, + "rewards/margins": 0.11316857486963272, + "rewards/rejected": -0.3805829882621765, + "step": 4490 + }, + { + "epoch": 0.59, + "learning_rate": 2.1624121682382495e-06, + "logits/chosen": -1.3759723901748657, + "logits/rejected": -1.2170668840408325, + "logps/chosen": -504.7023010253906, + "logps/rejected": -614.5403442382812, + "loss": 0.0278, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28002482652664185, + "rewards/margins": 0.10902640968561172, + "rewards/rejected": -0.38905128836631775, + "step": 4500 + }, + { + "epoch": 0.59, + "learning_rate": 2.1510996831363993e-06, + "logits/chosen": -1.644968032836914, + "logits/rejected": -1.3946508169174194, + "logps/chosen": -536.4176025390625, + "logps/rejected": -593.7776489257812, + "loss": 0.0264, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25945937633514404, + "rewards/margins": 0.0572463758289814, + "rewards/rejected": -0.31670576333999634, + "step": 4510 + }, + { + "epoch": 0.59, + "learning_rate": 2.139794479223565e-06, + "logits/chosen": -1.5176206827163696, + "logits/rejected": -1.252367377281189, + "logps/chosen": -457.44683837890625, + "logps/rejected": -522.0448608398438, + "loss": 0.0336, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.24093282222747803, + "rewards/margins": 0.07370787113904953, + "rewards/rejected": -0.31464070081710815, + "step": 4520 + }, + { + "epoch": 0.59, + "learning_rate": 2.128496792427669e-06, + "logits/chosen": -1.8894401788711548, + "logits/rejected": -1.7518160343170166, + "logps/chosen": -424.028076171875, + "logps/rejected": -528.4948120117188, + "loss": 0.0211, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1635584980249405, + "rewards/margins": 0.0953235924243927, + "rewards/rejected": -0.2588821053504944, + "step": 4530 + }, + { + "epoch": 0.59, + "learning_rate": 2.117206858519758e-06, + "logits/chosen": -1.793152093887329, + "logits/rejected": -1.6433145999908447, + "logps/chosen": -481.6566467285156, + "logps/rejected": -526.556640625, + "loss": 0.0238, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2006797045469284, + "rewards/margins": 0.07764464616775513, + "rewards/rejected": -0.27832433581352234, + "step": 4540 + }, + { + "epoch": 0.6, + "learning_rate": 2.1059249131090844e-06, + "logits/chosen": -1.4656394720077515, + "logits/rejected": -1.3645011186599731, + "logps/chosen": -448.1487731933594, + "logps/rejected": -506.11444091796875, + "loss": 0.0173, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22689059376716614, + "rewards/margins": 0.08436138927936554, + "rewards/rejected": -0.31125199794769287, + "step": 4550 + }, + { + "epoch": 0.6, + "learning_rate": 2.094651191638189e-06, + "logits/chosen": -1.6243807077407837, + "logits/rejected": -1.4395637512207031, + "logps/chosen": -427.7439880371094, + "logps/rejected": -471.2557678222656, + "loss": 0.0282, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1894945651292801, + "rewards/margins": 0.0754246711730957, + "rewards/rejected": -0.2649192214012146, + "step": 4560 + }, + { + "epoch": 0.6, + "learning_rate": 2.0833859293779867e-06, + "logits/chosen": -1.6144850254058838, + "logits/rejected": -1.5711697340011597, + "logps/chosen": -450.52642822265625, + "logps/rejected": -514.2506103515625, + "loss": 0.019, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2052551507949829, + "rewards/margins": 0.08380991965532303, + "rewards/rejected": -0.28906506299972534, + "step": 4570 + }, + { + "epoch": 0.6, + "learning_rate": 2.0721293614228568e-06, + "logits/chosen": -1.5417252779006958, + "logits/rejected": -1.3488690853118896, + "logps/chosen": -432.2200622558594, + "logps/rejected": -466.6581115722656, + "loss": 0.0235, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2289007157087326, + "rewards/margins": 0.05619456246495247, + "rewards/rejected": -0.2850952744483948, + "step": 4580 + }, + { + "epoch": 0.6, + "learning_rate": 2.060881722685742e-06, + "logits/chosen": -1.5724356174468994, + "logits/rejected": -1.2952988147735596, + "logps/chosen": -435.21075439453125, + "logps/rejected": -483.70526123046875, + "loss": 0.0294, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18673992156982422, + "rewards/margins": 0.1105538010597229, + "rewards/rejected": -0.2972937524318695, + "step": 4590 + }, + { + "epoch": 0.6, + "learning_rate": 2.049643247893235e-06, + "logits/chosen": -1.7986335754394531, + "logits/rejected": -1.4598957300186157, + "logps/chosen": -457.9772033691406, + "logps/rejected": -456.84033203125, + "loss": 0.025, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.17091234028339386, + "rewards/margins": 0.06817115843296051, + "rewards/rejected": -0.23908352851867676, + "step": 4600 + }, + { + "epoch": 0.6, + "learning_rate": 2.0384141715806903e-06, + "logits/chosen": -1.5617355108261108, + "logits/rejected": -1.5490385293960571, + "logps/chosen": -379.4814453125, + "logps/rejected": -440.04254150390625, + "loss": 0.0204, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.16015496850013733, + "rewards/margins": 0.045989301055669785, + "rewards/rejected": -0.20614425837993622, + "step": 4610 + }, + { + "epoch": 0.6, + "learning_rate": 2.0271947280873255e-06, + "logits/chosen": -1.6132984161376953, + "logits/rejected": -1.4674255847930908, + "logps/chosen": -348.9981689453125, + "logps/rejected": -502.19091796875, + "loss": 0.0185, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17968395352363586, + "rewards/margins": 0.12029703706502914, + "rewards/rejected": -0.2999809682369232, + "step": 4620 + }, + { + "epoch": 0.61, + "learning_rate": 2.0159851515513302e-06, + "logits/chosen": -1.3583838939666748, + "logits/rejected": -1.344406247138977, + "logps/chosen": -388.41876220703125, + "logps/rejected": -437.25811767578125, + "loss": 0.0202, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19703704118728638, + "rewards/margins": 0.0540815070271492, + "rewards/rejected": -0.2511185109615326, + "step": 4630 + }, + { + "epoch": 0.61, + "learning_rate": 2.004785675904982e-06, + "logits/chosen": -1.490714192390442, + "logits/rejected": -1.4939876794815063, + "logps/chosen": -345.9848937988281, + "logps/rejected": -453.45684814453125, + "loss": 0.0287, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17589840292930603, + "rewards/margins": 0.09860150516033173, + "rewards/rejected": -0.27449989318847656, + "step": 4640 + }, + { + "epoch": 0.61, + "learning_rate": 1.9935965348697624e-06, + "logits/chosen": -1.526310682296753, + "logits/rejected": -1.5772511959075928, + "logps/chosen": -408.08843994140625, + "logps/rejected": -526.1033935546875, + "loss": 0.0295, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.17842058837413788, + "rewards/margins": 0.09581470489501953, + "rewards/rejected": -0.2742353081703186, + "step": 4650 + }, + { + "epoch": 0.61, + "learning_rate": 1.9824179619514807e-06, + "logits/chosen": -1.3917975425720215, + "logits/rejected": -1.6485751867294312, + "logps/chosen": -324.16485595703125, + "logps/rejected": -428.88397216796875, + "loss": 0.029, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19104906916618347, + "rewards/margins": 0.07186642289161682, + "rewards/rejected": -0.2629155218601227, + "step": 4660 + }, + { + "epoch": 0.61, + "learning_rate": 1.9712501904354004e-06, + "logits/chosen": -1.708351731300354, + "logits/rejected": -1.3460270166397095, + "logps/chosen": -428.37371826171875, + "logps/rejected": -460.4834899902344, + "loss": 0.0289, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22374172508716583, + "rewards/margins": 0.06903581321239471, + "rewards/rejected": -0.29277750849723816, + "step": 4670 + }, + { + "epoch": 0.61, + "learning_rate": 1.960093453381369e-06, + "logits/chosen": -1.6553242206573486, + "logits/rejected": -1.6477683782577515, + "logps/chosen": -286.5013122558594, + "logps/rejected": -361.59619140625, + "loss": 0.0228, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.15842024981975555, + "rewards/margins": 0.06222587078809738, + "rewards/rejected": -0.22064614295959473, + "step": 4680 + }, + { + "epoch": 0.61, + "learning_rate": 1.948947983618962e-06, + "logits/chosen": -1.6140121221542358, + "logits/rejected": -1.4784574508666992, + "logps/chosen": -422.51165771484375, + "logps/rejected": -475.551513671875, + "loss": 0.0191, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1707344651222229, + "rewards/margins": 0.07597699016332626, + "rewards/rejected": -0.24671144783496857, + "step": 4690 + }, + { + "epoch": 0.62, + "learning_rate": 1.937814013742611e-06, + "logits/chosen": -1.7896366119384766, + "logits/rejected": -1.4367586374282837, + "logps/chosen": -499.7083435058594, + "logps/rejected": -533.1226806640625, + "loss": 0.0237, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.18204855918884277, + "rewards/margins": 0.09838353097438812, + "rewards/rejected": -0.2804320752620697, + "step": 4700 + }, + { + "epoch": 0.62, + "learning_rate": 1.9266917761067617e-06, + "logits/chosen": -1.4915893077850342, + "logits/rejected": -1.1812177896499634, + "logps/chosen": -387.476806640625, + "logps/rejected": -429.5155334472656, + "loss": 0.0278, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16393427550792694, + "rewards/margins": 0.0778748169541359, + "rewards/rejected": -0.24180908501148224, + "step": 4710 + }, + { + "epoch": 0.62, + "learning_rate": 1.915581502821017e-06, + "logits/chosen": -1.4793461561203003, + "logits/rejected": -1.3449960947036743, + "logps/chosen": -334.1965026855469, + "logps/rejected": -368.3381042480469, + "loss": 0.0536, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.14853033423423767, + "rewards/margins": 0.0649092048406601, + "rewards/rejected": -0.21343955397605896, + "step": 4720 + }, + { + "epoch": 0.62, + "learning_rate": 1.9044834257452997e-06, + "logits/chosen": -1.8198268413543701, + "logits/rejected": -1.5682449340820312, + "logps/chosen": -373.09259033203125, + "logps/rejected": -394.75177001953125, + "loss": 0.0205, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.15750274062156677, + "rewards/margins": 0.06791599839925766, + "rewards/rejected": -0.22541876137256622, + "step": 4730 + }, + { + "epoch": 0.62, + "learning_rate": 1.893397776485006e-06, + "logits/chosen": -1.7474422454833984, + "logits/rejected": -1.504546880722046, + "logps/chosen": -339.1938171386719, + "logps/rejected": -431.856689453125, + "loss": 0.0314, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14679184556007385, + "rewards/margins": 0.10970643907785416, + "rewards/rejected": -0.2564982771873474, + "step": 4740 + }, + { + "epoch": 0.62, + "learning_rate": 1.8823247863861804e-06, + "logits/chosen": -1.6589701175689697, + "logits/rejected": -1.621392011642456, + "logps/chosen": -404.7550964355469, + "logps/rejected": -460.41143798828125, + "loss": 0.0211, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.17128518223762512, + "rewards/margins": 0.07399027049541473, + "rewards/rejected": -0.24527546763420105, + "step": 4750 + }, + { + "epoch": 0.62, + "learning_rate": 1.8712646865306822e-06, + "logits/chosen": -1.5093224048614502, + "logits/rejected": -1.4263262748718262, + "logps/chosen": -462.394287109375, + "logps/rejected": -484.19415283203125, + "loss": 0.0351, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.16870293021202087, + "rewards/margins": 0.08714812994003296, + "rewards/rejected": -0.2558510899543762, + "step": 4760 + }, + { + "epoch": 0.62, + "learning_rate": 1.8602177077313631e-06, + "logits/chosen": -1.4509001970291138, + "logits/rejected": -1.4139875173568726, + "logps/chosen": -415.69921875, + "logps/rejected": -492.8407287597656, + "loss": 0.0176, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1922876238822937, + "rewards/margins": 0.07129409164190292, + "rewards/rejected": -0.2635817229747772, + "step": 4770 + }, + { + "epoch": 0.63, + "learning_rate": 1.8491840805272546e-06, + "logits/chosen": -1.5857927799224854, + "logits/rejected": -1.3974844217300415, + "logps/chosen": -418.29217529296875, + "logps/rejected": -474.0970764160156, + "loss": 0.0268, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1814996898174286, + "rewards/margins": 0.09302203357219696, + "rewards/rejected": -0.27452173829078674, + "step": 4780 + }, + { + "epoch": 0.63, + "learning_rate": 1.8381640351787516e-06, + "logits/chosen": -1.6603059768676758, + "logits/rejected": -1.5743963718414307, + "logps/chosen": -419.96563720703125, + "logps/rejected": -484.13177490234375, + "loss": 0.0292, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18491405248641968, + "rewards/margins": 0.06687320023775101, + "rewards/rejected": -0.2517872452735901, + "step": 4790 + }, + { + "epoch": 0.63, + "learning_rate": 1.8271578016628122e-06, + "logits/chosen": -1.4773046970367432, + "logits/rejected": -1.4587136507034302, + "logps/chosen": -371.9123229980469, + "logps/rejected": -448.15777587890625, + "loss": 0.0205, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1796925961971283, + "rewards/margins": 0.07964633405208588, + "rewards/rejected": -0.259338915348053, + "step": 4800 + }, + { + "epoch": 0.63, + "learning_rate": 1.8161656096681546e-06, + "logits/chosen": -1.5090692043304443, + "logits/rejected": -1.2741395235061646, + "logps/chosen": -350.93194580078125, + "logps/rejected": -480.82171630859375, + "loss": 0.0358, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17646178603172302, + "rewards/margins": 0.13238736987113953, + "rewards/rejected": -0.30884915590286255, + "step": 4810 + }, + { + "epoch": 0.63, + "learning_rate": 1.8051876885904645e-06, + "logits/chosen": -1.36701500415802, + "logits/rejected": -1.3117094039916992, + "logps/chosen": -421.5575256347656, + "logps/rejected": -501.64691162109375, + "loss": 0.0361, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1858542114496231, + "rewards/margins": 0.08887975662946701, + "rewards/rejected": -0.2747339606285095, + "step": 4820 + }, + { + "epoch": 0.63, + "learning_rate": 1.7942242675276098e-06, + "logits/chosen": -1.4270477294921875, + "logits/rejected": -1.5564621686935425, + "logps/chosen": -362.71478271484375, + "logps/rejected": -495.60162353515625, + "loss": 0.0208, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2082078754901886, + "rewards/margins": 0.08840437233448029, + "rewards/rejected": -0.2966122329235077, + "step": 4830 + }, + { + "epoch": 0.63, + "learning_rate": 1.783275575274856e-06, + "logits/chosen": -1.607736349105835, + "logits/rejected": -1.419236421585083, + "logps/chosen": -492.0704040527344, + "logps/rejected": -484.59344482421875, + "loss": 0.0165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20354600250720978, + "rewards/margins": 0.08249841630458832, + "rewards/rejected": -0.2860444188117981, + "step": 4840 + }, + { + "epoch": 0.63, + "learning_rate": 1.7723418403200943e-06, + "logits/chosen": -1.6304285526275635, + "logits/rejected": -1.2252190113067627, + "logps/chosen": -516.8814086914062, + "logps/rejected": -528.237060546875, + "loss": 0.0263, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.222385972738266, + "rewards/margins": 0.08664709329605103, + "rewards/rejected": -0.30903303623199463, + "step": 4850 + }, + { + "epoch": 0.64, + "learning_rate": 1.7614232908390748e-06, + "logits/chosen": -1.4483563899993896, + "logits/rejected": -1.3192527294158936, + "logps/chosen": -398.49603271484375, + "logps/rejected": -473.6939392089844, + "loss": 0.0244, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.16964933276176453, + "rewards/margins": 0.07871166616678238, + "rewards/rejected": -0.24836096167564392, + "step": 4860 + }, + { + "epoch": 0.64, + "learning_rate": 1.7505201546906398e-06, + "logits/chosen": -1.4854518175125122, + "logits/rejected": -1.2704298496246338, + "logps/chosen": -463.33636474609375, + "logps/rejected": -425.34796142578125, + "loss": 0.0333, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1838013082742691, + "rewards/margins": 0.04110676050186157, + "rewards/rejected": -0.22490806877613068, + "step": 4870 + }, + { + "epoch": 0.64, + "learning_rate": 1.7396326594119717e-06, + "logits/chosen": -1.7042782306671143, + "logits/rejected": -1.613613486289978, + "logps/chosen": -417.82550048828125, + "logps/rejected": -494.0665588378906, + "loss": 0.0364, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1753154844045639, + "rewards/margins": 0.10930249840021133, + "rewards/rejected": -0.28461796045303345, + "step": 4880 + }, + { + "epoch": 0.64, + "learning_rate": 1.7287610322138449e-06, + "logits/chosen": -1.5676114559173584, + "logits/rejected": -1.3253037929534912, + "logps/chosen": -419.602294921875, + "logps/rejected": -503.919677734375, + "loss": 0.0215, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18550562858581543, + "rewards/margins": 0.12120529264211655, + "rewards/rejected": -0.3067108988761902, + "step": 4890 + }, + { + "epoch": 0.64, + "learning_rate": 1.7179054999758817e-06, + "logits/chosen": -1.8627017736434937, + "logits/rejected": -1.529758334159851, + "logps/chosen": -439.87420654296875, + "logps/rejected": -486.83099365234375, + "loss": 0.0203, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20357398688793182, + "rewards/margins": 0.07500387728214264, + "rewards/rejected": -0.27857786417007446, + "step": 4900 + }, + { + "epoch": 0.64, + "learning_rate": 1.7070662892418225e-06, + "logits/chosen": -1.5751526355743408, + "logits/rejected": -1.45789635181427, + "logps/chosen": -357.0823669433594, + "logps/rejected": -422.3937072753906, + "loss": 0.042, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18973052501678467, + "rewards/margins": 0.07508702576160431, + "rewards/rejected": -0.26481756567955017, + "step": 4910 + }, + { + "epoch": 0.64, + "learning_rate": 1.6962436262147913e-06, + "logits/chosen": -1.4677342176437378, + "logits/rejected": -1.4380252361297607, + "logps/chosen": -410.12890625, + "logps/rejected": -524.4754638671875, + "loss": 0.0522, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2252027541399002, + "rewards/margins": 0.09915554523468018, + "rewards/rejected": -0.3243583142757416, + "step": 4920 + }, + { + "epoch": 0.65, + "learning_rate": 1.6854377367525814e-06, + "logits/chosen": -1.6399227380752563, + "logits/rejected": -1.5074961185455322, + "logps/chosen": -495.7207946777344, + "logps/rejected": -519.4228515625, + "loss": 0.029, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.212543323636055, + "rewards/margins": 0.08628206700086594, + "rewards/rejected": -0.29882535338401794, + "step": 4930 + }, + { + "epoch": 0.65, + "learning_rate": 1.6746488463629362e-06, + "logits/chosen": -1.7351045608520508, + "logits/rejected": -1.6972219944000244, + "logps/chosen": -488.1156311035156, + "logps/rejected": -578.7838745117188, + "loss": 0.0268, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21219182014465332, + "rewards/margins": 0.0840751901268959, + "rewards/rejected": -0.29626700282096863, + "step": 4940 + }, + { + "epoch": 0.65, + "learning_rate": 1.6638771801988483e-06, + "logits/chosen": -1.4572616815567017, + "logits/rejected": -1.4984290599822998, + "logps/chosen": -481.2704162597656, + "logps/rejected": -533.4957885742188, + "loss": 0.0236, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20552797615528107, + "rewards/margins": 0.06888245046138763, + "rewards/rejected": -0.2744104266166687, + "step": 4950 + }, + { + "epoch": 0.65, + "learning_rate": 1.653122963053857e-06, + "logits/chosen": -1.4090168476104736, + "logits/rejected": -1.22544264793396, + "logps/chosen": -391.96661376953125, + "logps/rejected": -467.94232177734375, + "loss": 0.033, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1839529573917389, + "rewards/margins": 0.07347510755062103, + "rewards/rejected": -0.25742802023887634, + "step": 4960 + }, + { + "epoch": 0.65, + "learning_rate": 1.6423864193573606e-06, + "logits/chosen": -1.399701476097107, + "logits/rejected": -1.2479718923568726, + "logps/chosen": -403.88641357421875, + "logps/rejected": -505.8567810058594, + "loss": 0.0297, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19615277647972107, + "rewards/margins": 0.10045097768306732, + "rewards/rejected": -0.2966037690639496, + "step": 4970 + }, + { + "epoch": 0.65, + "learning_rate": 1.6316677731699286e-06, + "logits/chosen": -1.6854263544082642, + "logits/rejected": -1.4302657842636108, + "logps/chosen": -468.148193359375, + "logps/rejected": -521.8663330078125, + "loss": 0.0408, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22639694809913635, + "rewards/margins": 0.09691077470779419, + "rewards/rejected": -0.32330775260925293, + "step": 4980 + }, + { + "epoch": 0.65, + "learning_rate": 1.6209672481786302e-06, + "logits/chosen": -1.4107683897018433, + "logits/rejected": -1.3036644458770752, + "logps/chosen": -397.0565185546875, + "logps/rejected": -478.081298828125, + "loss": 0.0232, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16772839426994324, + "rewards/margins": 0.1289522349834442, + "rewards/rejected": -0.29668062925338745, + "step": 4990 + }, + { + "epoch": 0.65, + "learning_rate": 1.6102850676923616e-06, + "logits/chosen": -1.5386617183685303, + "logits/rejected": -1.5371997356414795, + "logps/chosen": -410.09564208984375, + "logps/rejected": -492.2012634277344, + "loss": 0.018, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19269494712352753, + "rewards/margins": 0.0963241308927536, + "rewards/rejected": -0.2890191078186035, + "step": 5000 + }, + { + "epoch": 0.66, + "learning_rate": 1.5996214546371888e-06, + "logits/chosen": -1.8242378234863281, + "logits/rejected": -1.4758179187774658, + "logps/chosen": -380.41973876953125, + "logps/rejected": -392.49261474609375, + "loss": 0.02, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.16637901961803436, + "rewards/margins": 0.057704634964466095, + "rewards/rejected": -0.22408363223075867, + "step": 5010 + }, + { + "epoch": 0.66, + "learning_rate": 1.588976631551697e-06, + "logits/chosen": -1.6556527614593506, + "logits/rejected": -1.4670960903167725, + "logps/chosen": -387.4090881347656, + "logps/rejected": -469.97760009765625, + "loss": 0.029, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14826735854148865, + "rewards/margins": 0.09724099189043045, + "rewards/rejected": -0.2455083429813385, + "step": 5020 + }, + { + "epoch": 0.66, + "learning_rate": 1.5783508205823412e-06, + "logits/chosen": -1.5352144241333008, + "logits/rejected": -1.4616471529006958, + "logps/chosen": -400.0191955566406, + "logps/rejected": -461.9366760253906, + "loss": 0.0229, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18305253982543945, + "rewards/margins": 0.08117742836475372, + "rewards/rejected": -0.264229953289032, + "step": 5030 + }, + { + "epoch": 0.66, + "learning_rate": 1.5677442434788143e-06, + "logits/chosen": -1.7859392166137695, + "logits/rejected": -1.488284945487976, + "logps/chosen": -407.81317138671875, + "logps/rejected": -449.358154296875, + "loss": 0.0242, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.16020144522190094, + "rewards/margins": 0.08070135116577148, + "rewards/rejected": -0.24090275168418884, + "step": 5040 + }, + { + "epoch": 0.66, + "learning_rate": 1.5571571215894181e-06, + "logits/chosen": -1.7393693923950195, + "logits/rejected": -1.6400400400161743, + "logps/chosen": -395.0425720214844, + "logps/rejected": -480.1275329589844, + "loss": 0.0173, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1880541741847992, + "rewards/margins": 0.057797182351350784, + "rewards/rejected": -0.24585136771202087, + "step": 5050 + }, + { + "epoch": 0.66, + "learning_rate": 1.5465896758564452e-06, + "logits/chosen": -1.8525689840316772, + "logits/rejected": -1.6848907470703125, + "logps/chosen": -430.5870666503906, + "logps/rejected": -511.40728759765625, + "loss": 0.0364, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1730785071849823, + "rewards/margins": 0.07160644978284836, + "rewards/rejected": -0.24468496441841125, + "step": 5060 + }, + { + "epoch": 0.66, + "learning_rate": 1.5360421268115653e-06, + "logits/chosen": -1.8298113346099854, + "logits/rejected": -1.4987479448318481, + "logps/chosen": -458.18658447265625, + "logps/rejected": -492.65789794921875, + "loss": 0.0301, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19494257867336273, + "rewards/margins": 0.09265542030334473, + "rewards/rejected": -0.28759801387786865, + "step": 5070 + }, + { + "epoch": 0.66, + "learning_rate": 1.5255146945712267e-06, + "logits/chosen": -1.5927072763442993, + "logits/rejected": -1.3620411157608032, + "logps/chosen": -451.4266662597656, + "logps/rejected": -485.79571533203125, + "loss": 0.0289, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19076094031333923, + "rewards/margins": 0.09270092099905014, + "rewards/rejected": -0.28346189856529236, + "step": 5080 + }, + { + "epoch": 0.67, + "learning_rate": 1.5150075988320594e-06, + "logits/chosen": -1.6711536645889282, + "logits/rejected": -1.5017945766448975, + "logps/chosen": -375.4448547363281, + "logps/rejected": -468.86480712890625, + "loss": 0.034, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1778108775615692, + "rewards/margins": 0.09814232587814331, + "rewards/rejected": -0.2759532332420349, + "step": 5090 + }, + { + "epoch": 0.67, + "learning_rate": 1.5045210588662929e-06, + "logits/chosen": -1.5476630926132202, + "logits/rejected": -1.4029957056045532, + "logps/chosen": -377.67327880859375, + "logps/rejected": -451.4478454589844, + "loss": 0.0246, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1601109504699707, + "rewards/margins": 0.09873531013727188, + "rewards/rejected": -0.258846253156662, + "step": 5100 + }, + { + "epoch": 0.67, + "learning_rate": 1.4940552935171781e-06, + "logits/chosen": -1.680654764175415, + "logits/rejected": -1.5256980657577515, + "logps/chosen": -431.8291015625, + "logps/rejected": -547.5233154296875, + "loss": 0.0237, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20115765929222107, + "rewards/margins": 0.09922701865434647, + "rewards/rejected": -0.3003847002983093, + "step": 5110 + }, + { + "epoch": 0.67, + "learning_rate": 1.483610521194419e-06, + "logits/chosen": -1.598397970199585, + "logits/rejected": -1.1381256580352783, + "logps/chosen": -504.94744873046875, + "logps/rejected": -565.5025024414062, + "loss": 0.0293, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.216828852891922, + "rewards/margins": 0.14259278774261475, + "rewards/rejected": -0.35942161083221436, + "step": 5120 + }, + { + "epoch": 0.67, + "learning_rate": 1.4731869598696226e-06, + "logits/chosen": -1.4274873733520508, + "logits/rejected": -1.5127485990524292, + "logps/chosen": -427.0562438964844, + "logps/rejected": -522.8704833984375, + "loss": 0.0259, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22581632435321808, + "rewards/margins": 0.06540516763925552, + "rewards/rejected": -0.2912214994430542, + "step": 5130 + }, + { + "epoch": 0.67, + "learning_rate": 1.4627848270717387e-06, + "logits/chosen": -1.633857011795044, + "logits/rejected": -1.3590513467788696, + "logps/chosen": -425.37115478515625, + "logps/rejected": -518.7385864257812, + "loss": 0.0256, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20596785843372345, + "rewards/margins": 0.11591050773859024, + "rewards/rejected": -0.3218783438205719, + "step": 5140 + }, + { + "epoch": 0.67, + "learning_rate": 1.4524043398825277e-06, + "logits/chosen": -1.4400713443756104, + "logits/rejected": -1.1214375495910645, + "logps/chosen": -394.29144287109375, + "logps/rejected": -386.66168212890625, + "loss": 0.0196, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14006850123405457, + "rewards/margins": 0.0656704306602478, + "rewards/rejected": -0.20573893189430237, + "step": 5150 + }, + { + "epoch": 0.68, + "learning_rate": 1.4420457149320299e-06, + "logits/chosen": -1.5667462348937988, + "logits/rejected": -1.5604121685028076, + "logps/chosen": -375.1114196777344, + "logps/rejected": -447.48223876953125, + "loss": 0.019, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1627887487411499, + "rewards/margins": 0.049822621047496796, + "rewards/rejected": -0.2126113921403885, + "step": 5160 + }, + { + "epoch": 0.68, + "learning_rate": 1.431709168394042e-06, + "logits/chosen": -1.7921355962753296, + "logits/rejected": -1.5176485776901245, + "logps/chosen": -432.16754150390625, + "logps/rejected": -388.26953125, + "loss": 0.0245, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.14249520003795624, + "rewards/margins": 0.041745856404304504, + "rewards/rejected": -0.18424105644226074, + "step": 5170 + }, + { + "epoch": 0.68, + "learning_rate": 1.4213949159816059e-06, + "logits/chosen": -1.621297836303711, + "logits/rejected": -1.374531865119934, + "logps/chosen": -340.29486083984375, + "logps/rejected": -422.07275390625, + "loss": 0.0272, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12620839476585388, + "rewards/margins": 0.1090429425239563, + "rewards/rejected": -0.2352513074874878, + "step": 5180 + }, + { + "epoch": 0.68, + "learning_rate": 1.4111031729425103e-06, + "logits/chosen": -1.6758848428726196, + "logits/rejected": -1.4807888269424438, + "logps/chosen": -440.89263916015625, + "logps/rejected": -510.248046875, + "loss": 0.0202, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17388439178466797, + "rewards/margins": 0.11208424717187881, + "rewards/rejected": -0.2859686017036438, + "step": 5190 + }, + { + "epoch": 0.68, + "learning_rate": 1.4008341540547965e-06, + "logits/chosen": -1.525407075881958, + "logits/rejected": -1.5021107196807861, + "logps/chosen": -427.673828125, + "logps/rejected": -486.7311096191406, + "loss": 0.023, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19444535672664642, + "rewards/margins": 0.05377691239118576, + "rewards/rejected": -0.24822227656841278, + "step": 5200 + }, + { + "epoch": 0.68, + "learning_rate": 1.3905880736222737e-06, + "logits/chosen": -1.5743075609207153, + "logits/rejected": -1.6213937997817993, + "logps/chosen": -369.836181640625, + "logps/rejected": -453.7099609375, + "loss": 0.0361, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18054161965847015, + "rewards/margins": 0.06503504514694214, + "rewards/rejected": -0.2455766499042511, + "step": 5210 + }, + { + "epoch": 0.68, + "learning_rate": 1.3803651454700531e-06, + "logits/chosen": -1.5111993551254272, + "logits/rejected": -1.3026028871536255, + "logps/chosen": -458.8428649902344, + "logps/rejected": -492.923095703125, + "loss": 0.0259, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20958459377288818, + "rewards/margins": 0.0841468870639801, + "rewards/rejected": -0.2937314510345459, + "step": 5220 + }, + { + "epoch": 0.68, + "learning_rate": 1.3701655829400773e-06, + "logits/chosen": -1.6716458797454834, + "logits/rejected": -1.4183886051177979, + "logps/chosen": -525.5261840820312, + "logps/rejected": -505.20965576171875, + "loss": 0.022, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21298542618751526, + "rewards/margins": 0.0640968531370163, + "rewards/rejected": -0.27708229422569275, + "step": 5230 + }, + { + "epoch": 0.69, + "learning_rate": 1.3599895988866756e-06, + "logits/chosen": -1.4628162384033203, + "logits/rejected": -1.3058449029922485, + "logps/chosen": -388.9469909667969, + "logps/rejected": -414.37957763671875, + "loss": 0.0168, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18021324276924133, + "rewards/margins": 0.039642706513404846, + "rewards/rejected": -0.21985594928264618, + "step": 5240 + }, + { + "epoch": 0.69, + "learning_rate": 1.3498374056721198e-06, + "logits/chosen": -1.696189522743225, + "logits/rejected": -1.6554816961288452, + "logps/chosen": -349.71197509765625, + "logps/rejected": -488.245361328125, + "loss": 0.0165, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.16485311090946198, + "rewards/margins": 0.09726069867610931, + "rewards/rejected": -0.2621137797832489, + "step": 5250 + }, + { + "epoch": 0.69, + "learning_rate": 1.3397092151621883e-06, + "logits/chosen": -1.6548722982406616, + "logits/rejected": -1.3580074310302734, + "logps/chosen": -429.25, + "logps/rejected": -467.40155029296875, + "loss": 0.0365, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2029763013124466, + "rewards/margins": 0.08197561651468277, + "rewards/rejected": -0.28495192527770996, + "step": 5260 + }, + { + "epoch": 0.69, + "learning_rate": 1.3296052387217484e-06, + "logits/chosen": -1.462677240371704, + "logits/rejected": -1.449013352394104, + "logps/chosen": -362.41571044921875, + "logps/rejected": -415.59051513671875, + "loss": 0.0155, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1855693757534027, + "rewards/margins": 0.08781188726425171, + "rewards/rejected": -0.2733812630176544, + "step": 5270 + }, + { + "epoch": 0.69, + "learning_rate": 1.3195256872103476e-06, + "logits/chosen": -1.488548994064331, + "logits/rejected": -1.5530840158462524, + "logps/chosen": -410.396728515625, + "logps/rejected": -510.2646484375, + "loss": 0.0188, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.16777709126472473, + "rewards/margins": 0.06479448080062866, + "rewards/rejected": -0.2325715571641922, + "step": 5280 + }, + { + "epoch": 0.69, + "learning_rate": 1.3094707709778068e-06, + "logits/chosen": -1.3897769451141357, + "logits/rejected": -1.3290399312973022, + "logps/chosen": -398.8644714355469, + "logps/rejected": -455.2787170410156, + "loss": 0.0193, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20112545788288116, + "rewards/margins": 0.09017898887395859, + "rewards/rejected": -0.29130443930625916, + "step": 5290 + }, + { + "epoch": 0.69, + "learning_rate": 1.2994406998598364e-06, + "logits/chosen": -1.6049206256866455, + "logits/rejected": -1.458370566368103, + "logps/chosen": -367.20831298828125, + "logps/rejected": -526.95166015625, + "loss": 0.0272, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2139579802751541, + "rewards/margins": 0.16104961931705475, + "rewards/rejected": -0.3750075697898865, + "step": 5300 + }, + { + "epoch": 0.69, + "learning_rate": 1.2894356831736558e-06, + "logits/chosen": -1.5055642127990723, + "logits/rejected": -1.3894039392471313, + "logps/chosen": -393.8443298339844, + "logps/rejected": -445.658935546875, + "loss": 0.0301, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19631923735141754, + "rewards/margins": 0.08201192319393158, + "rewards/rejected": -0.2783311903476715, + "step": 5310 + }, + { + "epoch": 0.7, + "learning_rate": 1.2794559297136203e-06, + "logits/chosen": -1.3348758220672607, + "logits/rejected": -1.2198129892349243, + "logps/chosen": -498.6011657714844, + "logps/rejected": -608.5252075195312, + "loss": 0.0198, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.25773555040359497, + "rewards/margins": 0.09582929313182831, + "rewards/rejected": -0.3535648286342621, + "step": 5320 + }, + { + "epoch": 0.7, + "learning_rate": 1.2695016477468724e-06, + "logits/chosen": -1.562260389328003, + "logits/rejected": -1.5694024562835693, + "logps/chosen": -445.9269104003906, + "logps/rejected": -454.0824279785156, + "loss": 0.0368, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.20634634792804718, + "rewards/margins": 0.051405757665634155, + "rewards/rejected": -0.25775209069252014, + "step": 5330 + }, + { + "epoch": 0.7, + "learning_rate": 1.2595730450089874e-06, + "logits/chosen": -1.8299586772918701, + "logits/rejected": -1.5711266994476318, + "logps/chosen": -522.7340087890625, + "logps/rejected": -592.4933471679688, + "loss": 0.0245, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.24600999057292938, + "rewards/margins": 0.10581465065479279, + "rewards/rejected": -0.35182467103004456, + "step": 5340 + }, + { + "epoch": 0.7, + "learning_rate": 1.2496703286996433e-06, + "logits/chosen": -1.7908226251602173, + "logits/rejected": -1.3918564319610596, + "logps/chosen": -522.5734252929688, + "logps/rejected": -556.00439453125, + "loss": 0.015, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.217800572514534, + "rewards/margins": 0.1189211830496788, + "rewards/rejected": -0.3367217481136322, + "step": 5350 + }, + { + "epoch": 0.7, + "learning_rate": 1.2397937054782961e-06, + "logits/chosen": -1.5083521604537964, + "logits/rejected": -1.432520866394043, + "logps/chosen": -523.9688720703125, + "logps/rejected": -547.6058959960938, + "loss": 0.0206, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.23229670524597168, + "rewards/margins": 0.07556265592575073, + "rewards/rejected": -0.3078593611717224, + "step": 5360 + }, + { + "epoch": 0.7, + "learning_rate": 1.2299433814598635e-06, + "logits/chosen": -1.6189830303192139, + "logits/rejected": -1.5186855792999268, + "logps/chosen": -429.11181640625, + "logps/rejected": -455.6077575683594, + "loss": 0.0412, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19660572707653046, + "rewards/margins": 0.058229874819517136, + "rewards/rejected": -0.2548356056213379, + "step": 5370 + }, + { + "epoch": 0.7, + "learning_rate": 1.2201195622104265e-06, + "logits/chosen": -1.6549122333526611, + "logits/rejected": -1.5803096294403076, + "logps/chosen": -481.8655700683594, + "logps/rejected": -576.1634521484375, + "loss": 0.0196, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.23913542926311493, + "rewards/margins": 0.13378915190696716, + "rewards/rejected": -0.3729245662689209, + "step": 5380 + }, + { + "epoch": 0.71, + "learning_rate": 1.2103224527429417e-06, + "logits/chosen": -1.5333130359649658, + "logits/rejected": -1.2944916486740112, + "logps/chosen": -447.95330810546875, + "logps/rejected": -433.03582763671875, + "loss": 0.046, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2052149474620819, + "rewards/margins": 0.07002494484186172, + "rewards/rejected": -0.27523988485336304, + "step": 5390 + }, + { + "epoch": 0.71, + "learning_rate": 1.2005522575129559e-06, + "logits/chosen": -1.7821900844573975, + "logits/rejected": -1.429947018623352, + "logps/chosen": -551.6270751953125, + "logps/rejected": -515.8822021484375, + "loss": 0.0256, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.20926909148693085, + "rewards/margins": 0.08802861720323563, + "rewards/rejected": -0.2972976863384247, + "step": 5400 + }, + { + "epoch": 0.71, + "learning_rate": 1.1908091804143469e-06, + "logits/chosen": -1.4563171863555908, + "logits/rejected": -1.3154704570770264, + "logps/chosen": -382.73455810546875, + "logps/rejected": -472.9546813964844, + "loss": 0.0198, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20601502060890198, + "rewards/margins": 0.0962318703532219, + "rewards/rejected": -0.30224689841270447, + "step": 5410 + }, + { + "epoch": 0.71, + "learning_rate": 1.1810934247750649e-06, + "logits/chosen": -1.5964233875274658, + "logits/rejected": -1.1891409158706665, + "logps/chosen": -484.681640625, + "logps/rejected": -531.6754150390625, + "loss": 0.0209, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19162070751190186, + "rewards/margins": 0.12779280543327332, + "rewards/rejected": -0.3194134831428528, + "step": 5420 + }, + { + "epoch": 0.71, + "learning_rate": 1.1714051933528881e-06, + "logits/chosen": -1.791672945022583, + "logits/rejected": -1.4877759218215942, + "logps/chosen": -417.502685546875, + "logps/rejected": -434.6937561035156, + "loss": 0.0175, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.15925352275371552, + "rewards/margins": 0.06685891002416611, + "rewards/rejected": -0.22611241042613983, + "step": 5430 + }, + { + "epoch": 0.71, + "learning_rate": 1.161744688331192e-06, + "logits/chosen": -1.5497604608535767, + "logits/rejected": -1.3743586540222168, + "logps/chosen": -411.5484313964844, + "logps/rejected": -510.011962890625, + "loss": 0.0135, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19434738159179688, + "rewards/margins": 0.06442542374134064, + "rewards/rejected": -0.2587727904319763, + "step": 5440 + }, + { + "epoch": 0.71, + "learning_rate": 1.152112111314733e-06, + "logits/chosen": -1.5756428241729736, + "logits/rejected": -1.354800820350647, + "logps/chosen": -347.12139892578125, + "logps/rejected": -366.8716125488281, + "loss": 0.0183, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15612633526325226, + "rewards/margins": 0.054203521460294724, + "rewards/rejected": -0.2103298455476761, + "step": 5450 + }, + { + "epoch": 0.71, + "learning_rate": 1.142507663325439e-06, + "logits/chosen": -1.4416675567626953, + "logits/rejected": -1.3867011070251465, + "logps/chosen": -422.52947998046875, + "logps/rejected": -504.47296142578125, + "loss": 0.0354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1803435981273651, + "rewards/margins": 0.10444513708353043, + "rewards/rejected": -0.28478875756263733, + "step": 5460 + }, + { + "epoch": 0.72, + "learning_rate": 1.132931544798211e-06, + "logits/chosen": -1.5146865844726562, + "logits/rejected": -1.2480287551879883, + "logps/chosen": -467.4181213378906, + "logps/rejected": -529.2227783203125, + "loss": 0.019, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.20075412094593048, + "rewards/margins": 0.106613889336586, + "rewards/rejected": -0.3073680102825165, + "step": 5470 + }, + { + "epoch": 0.72, + "learning_rate": 1.1233839555767482e-06, + "logits/chosen": -1.466223955154419, + "logits/rejected": -1.2259441614151, + "logps/chosen": -426.2166442871094, + "logps/rejected": -446.619140625, + "loss": 0.0325, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18078255653381348, + "rewards/margins": 0.09439133107662201, + "rewards/rejected": -0.2751738727092743, + "step": 5480 + }, + { + "epoch": 0.72, + "learning_rate": 1.1138650949093668e-06, + "logits/chosen": -1.7423839569091797, + "logits/rejected": -1.4779974222183228, + "logps/chosen": -417.0690002441406, + "logps/rejected": -422.07049560546875, + "loss": 0.0251, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.20100681483745575, + "rewards/margins": 0.053999461233615875, + "rewards/rejected": -0.25500625371932983, + "step": 5490 + }, + { + "epoch": 0.72, + "learning_rate": 1.1043751614448543e-06, + "logits/chosen": -1.5876578092575073, + "logits/rejected": -1.2621053457260132, + "logps/chosen": -394.9156799316406, + "logps/rejected": -432.68328857421875, + "loss": 0.0269, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19612181186676025, + "rewards/margins": 0.0839340090751648, + "rewards/rejected": -0.28005582094192505, + "step": 5500 + }, + { + "epoch": 0.72, + "learning_rate": 1.0949143532283107e-06, + "logits/chosen": -1.6373828649520874, + "logits/rejected": -1.4211647510528564, + "logps/chosen": -440.989990234375, + "logps/rejected": -500.00396728515625, + "loss": 0.0249, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19455215334892273, + "rewards/margins": 0.08070588856935501, + "rewards/rejected": -0.27525803446769714, + "step": 5510 + }, + { + "epoch": 0.72, + "learning_rate": 1.0854828676970275e-06, + "logits/chosen": -1.4828943014144897, + "logits/rejected": -1.4317944049835205, + "logps/chosen": -427.9281311035156, + "logps/rejected": -534.10400390625, + "loss": 0.0218, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.221365287899971, + "rewards/margins": 0.12023405730724335, + "rewards/rejected": -0.34159931540489197, + "step": 5520 + }, + { + "epoch": 0.72, + "learning_rate": 1.076080901676361e-06, + "logits/chosen": -1.632274866104126, + "logits/rejected": -1.5328576564788818, + "logps/chosen": -380.6434020996094, + "logps/rejected": -445.2096252441406, + "loss": 0.021, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18433251976966858, + "rewards/margins": 0.08061722666025162, + "rewards/rejected": -0.2649497389793396, + "step": 5530 + }, + { + "epoch": 0.72, + "learning_rate": 1.0667086513756234e-06, + "logits/chosen": -1.6440751552581787, + "logits/rejected": -1.3497045040130615, + "logps/chosen": -420.45806884765625, + "logps/rejected": -460.8369140625, + "loss": 0.0245, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18254700303077698, + "rewards/margins": 0.09349583089351654, + "rewards/rejected": -0.2760428190231323, + "step": 5540 + }, + { + "epoch": 0.73, + "learning_rate": 1.0573663123839912e-06, + "logits/chosen": -1.7530996799468994, + "logits/rejected": -1.4101498126983643, + "logps/chosen": -465.41015625, + "logps/rejected": -476.9146423339844, + "loss": 0.0158, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.225990891456604, + "rewards/margins": 0.08158402889966965, + "rewards/rejected": -0.30757492780685425, + "step": 5550 + }, + { + "epoch": 0.73, + "learning_rate": 1.0480540796664251e-06, + "logits/chosen": -1.5448033809661865, + "logits/rejected": -1.368115782737732, + "logps/chosen": -503.0091247558594, + "logps/rejected": -597.8097534179688, + "loss": 0.0157, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.25012126564979553, + "rewards/margins": 0.08918876945972443, + "rewards/rejected": -0.3393099904060364, + "step": 5560 + }, + { + "epoch": 0.73, + "learning_rate": 1.0387721475595978e-06, + "logits/chosen": -1.4915783405303955, + "logits/rejected": -1.3935407400131226, + "logps/chosen": -372.75262451171875, + "logps/rejected": -428.7457580566406, + "loss": 0.0417, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21398456394672394, + "rewards/margins": 0.05937281250953674, + "rewards/rejected": -0.2733573615550995, + "step": 5570 + }, + { + "epoch": 0.73, + "learning_rate": 1.0295207097678378e-06, + "logits/chosen": -1.4784311056137085, + "logits/rejected": -1.4672610759735107, + "logps/chosen": -366.3025207519531, + "logps/rejected": -546.9857177734375, + "loss": 0.0361, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19802804291248322, + "rewards/margins": 0.11691056191921234, + "rewards/rejected": -0.31493860483169556, + "step": 5580 + }, + { + "epoch": 0.73, + "learning_rate": 1.0202999593590924e-06, + "logits/chosen": -1.523376226425171, + "logits/rejected": -1.3750369548797607, + "logps/chosen": -420.4945373535156, + "logps/rejected": -550.7835693359375, + "loss": 0.0296, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2096870243549347, + "rewards/margins": 0.1410565823316574, + "rewards/rejected": -0.3507435917854309, + "step": 5590 + }, + { + "epoch": 0.73, + "learning_rate": 1.011110088760891e-06, + "logits/chosen": -1.4990277290344238, + "logits/rejected": -1.411941409111023, + "logps/chosen": -419.41851806640625, + "logps/rejected": -543.7576904296875, + "loss": 0.0296, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.23063719272613525, + "rewards/margins": 0.12043901532888412, + "rewards/rejected": -0.3510761857032776, + "step": 5600 + }, + { + "epoch": 0.73, + "learning_rate": 1.0019512897563347e-06, + "logits/chosen": -1.6697190999984741, + "logits/rejected": -1.3446718454360962, + "logps/chosen": -505.65313720703125, + "logps/rejected": -560.1145629882812, + "loss": 0.0325, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2524965703487396, + "rewards/margins": 0.09622282534837723, + "rewards/rejected": -0.34871941804885864, + "step": 5610 + }, + { + "epoch": 0.74, + "learning_rate": 9.928237534800935e-07, + "logits/chosen": -1.6215341091156006, + "logits/rejected": -1.5561894178390503, + "logps/chosen": -405.2314453125, + "logps/rejected": -478.72247314453125, + "loss": 0.0217, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20122945308685303, + "rewards/margins": 0.09568652510643005, + "rewards/rejected": -0.2969159781932831, + "step": 5620 + }, + { + "epoch": 0.74, + "learning_rate": 9.837276704144174e-07, + "logits/chosen": -1.5506407022476196, + "logits/rejected": -1.3838660717010498, + "logps/chosen": -423.6056213378906, + "logps/rejected": -450.9071350097656, + "loss": 0.0273, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2144496887922287, + "rewards/margins": 0.07349663227796555, + "rewards/rejected": -0.28794634342193604, + "step": 5630 + }, + { + "epoch": 0.74, + "learning_rate": 9.746632303851569e-07, + "logits/chosen": -1.7482116222381592, + "logits/rejected": -1.3927713632583618, + "logps/chosen": -425.24053955078125, + "logps/rejected": -488.3824768066406, + "loss": 0.0194, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20241156220436096, + "rewards/margins": 0.10810136795043945, + "rewards/rejected": -0.3105129301548004, + "step": 5640 + }, + { + "epoch": 0.74, + "learning_rate": 9.65630622557809e-07, + "logits/chosen": -1.3741400241851807, + "logits/rejected": -1.094609022140503, + "logps/chosen": -459.25537109375, + "logps/rejected": -542.5413818359375, + "loss": 0.0318, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2083849161863327, + "rewards/margins": 0.09790316969156265, + "rewards/rejected": -0.30628806352615356, + "step": 5650 + }, + { + "epoch": 0.74, + "learning_rate": 9.56630035433561e-07, + "logits/chosen": -1.7187912464141846, + "logits/rejected": -1.4406051635742188, + "logps/chosen": -451.6206970214844, + "logps/rejected": -473.7144470214844, + "loss": 0.03, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1990150511264801, + "rewards/margins": 0.08933638036251068, + "rewards/rejected": -0.2883513867855072, + "step": 5660 + }, + { + "epoch": 0.74, + "learning_rate": 9.476616568453659e-07, + "logits/chosen": -1.617304801940918, + "logits/rejected": -1.2837506532669067, + "logps/chosen": -501.55615234375, + "logps/rejected": -495.8614196777344, + "loss": 0.0241, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23391124606132507, + "rewards/margins": 0.058008432388305664, + "rewards/rejected": -0.29191964864730835, + "step": 5670 + }, + { + "epoch": 0.74, + "learning_rate": 9.387256739540162e-07, + "logits/chosen": -1.5672932863235474, + "logits/rejected": -1.3743022680282593, + "logps/chosen": -403.78558349609375, + "logps/rejected": -425.3617248535156, + "loss": 0.025, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1623322069644928, + "rewards/margins": 0.05934957414865494, + "rewards/rejected": -0.22168178856372833, + "step": 5680 + }, + { + "epoch": 0.74, + "learning_rate": 9.298222732442377e-07, + "logits/chosen": -1.8293750286102295, + "logits/rejected": -1.4852283000946045, + "logps/chosen": -422.79888916015625, + "logps/rejected": -495.72381591796875, + "loss": 0.0247, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1667879819869995, + "rewards/margins": 0.12689343094825745, + "rewards/rejected": -0.29368144273757935, + "step": 5690 + }, + { + "epoch": 0.75, + "learning_rate": 9.20951640520803e-07, + "logits/chosen": -1.6043968200683594, + "logits/rejected": -1.4415172338485718, + "logps/chosen": -422.46337890625, + "logps/rejected": -482.3023376464844, + "loss": 0.0219, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20696571469306946, + "rewards/margins": 0.08833520114421844, + "rewards/rejected": -0.2953009009361267, + "step": 5700 + }, + { + "epoch": 0.75, + "learning_rate": 9.121139609046484e-07, + "logits/chosen": -1.5958956480026245, + "logits/rejected": -1.451616883277893, + "logps/chosen": -351.499755859375, + "logps/rejected": -456.39324951171875, + "loss": 0.0204, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1530417501926422, + "rewards/margins": 0.120171919465065, + "rewards/rejected": -0.273213654756546, + "step": 5710 + }, + { + "epoch": 0.75, + "learning_rate": 9.033094188290121e-07, + "logits/chosen": -1.4508789777755737, + "logits/rejected": -1.4076393842697144, + "logps/chosen": -400.70465087890625, + "logps/rejected": -447.3128967285156, + "loss": 0.0188, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.17952290177345276, + "rewards/margins": 0.08391942083835602, + "rewards/rejected": -0.2634423077106476, + "step": 5720 + }, + { + "epoch": 0.75, + "learning_rate": 8.945381980355889e-07, + "logits/chosen": -1.697596788406372, + "logits/rejected": -1.5488923788070679, + "logps/chosen": -357.16522216796875, + "logps/rejected": -423.3173828125, + "loss": 0.0284, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1615738719701767, + "rewards/margins": 0.06780795753002167, + "rewards/rejected": -0.22938182950019836, + "step": 5730 + }, + { + "epoch": 0.75, + "learning_rate": 8.858004815706919e-07, + "logits/chosen": -1.5258272886276245, + "logits/rejected": -1.4233239889144897, + "logps/chosen": -403.95599365234375, + "logps/rejected": -502.761474609375, + "loss": 0.0237, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2067147195339203, + "rewards/margins": 0.10103969275951385, + "rewards/rejected": -0.30775442719459534, + "step": 5740 + }, + { + "epoch": 0.75, + "learning_rate": 8.77096451781432e-07, + "logits/chosen": -1.431740164756775, + "logits/rejected": -1.1918470859527588, + "logps/chosen": -425.783203125, + "logps/rejected": -505.8761291503906, + "loss": 0.0361, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23915116488933563, + "rewards/margins": 0.1044241189956665, + "rewards/rejected": -0.34357529878616333, + "step": 5750 + }, + { + "epoch": 0.75, + "learning_rate": 8.684262903119165e-07, + "logits/chosen": -1.505095362663269, + "logits/rejected": -1.574194312095642, + "logps/chosen": -411.2688903808594, + "logps/rejected": -521.2644653320312, + "loss": 0.0195, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18665559589862823, + "rewards/margins": 0.09668277949094772, + "rewards/rejected": -0.28333839774131775, + "step": 5760 + }, + { + "epoch": 0.76, + "learning_rate": 8.597901780994525e-07, + "logits/chosen": -1.814569115638733, + "logits/rejected": -1.554092526435852, + "logps/chosen": -465.39495849609375, + "logps/rejected": -522.8751831054688, + "loss": 0.0175, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2231772392988205, + "rewards/margins": 0.11268983036279678, + "rewards/rejected": -0.33586710691452026, + "step": 5770 + }, + { + "epoch": 0.76, + "learning_rate": 8.511882953707773e-07, + "logits/chosen": -1.669830322265625, + "logits/rejected": -1.4222438335418701, + "logps/chosen": -502.70806884765625, + "logps/rejected": -509.10498046875, + "loss": 0.0165, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22161705791950226, + "rewards/margins": 0.05758953094482422, + "rewards/rejected": -0.27920660376548767, + "step": 5780 + }, + { + "epoch": 0.76, + "learning_rate": 8.426208216382944e-07, + "logits/chosen": -1.5124791860580444, + "logits/rejected": -1.4276105165481567, + "logps/chosen": -387.97723388671875, + "logps/rejected": -436.249755859375, + "loss": 0.0323, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2093040645122528, + "rewards/margins": 0.06647123396396637, + "rewards/rejected": -0.27577531337738037, + "step": 5790 + }, + { + "epoch": 0.76, + "learning_rate": 8.340879356963245e-07, + "logits/chosen": -1.4797133207321167, + "logits/rejected": -1.360034465789795, + "logps/chosen": -422.455078125, + "logps/rejected": -495.0018005371094, + "loss": 0.0232, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19471265375614166, + "rewards/margins": 0.08114586770534515, + "rewards/rejected": -0.2758585512638092, + "step": 5800 + }, + { + "epoch": 0.76, + "learning_rate": 8.255898156173777e-07, + "logits/chosen": -1.696480393409729, + "logits/rejected": -1.5143259763717651, + "logps/chosen": -396.93707275390625, + "logps/rejected": -459.56396484375, + "loss": 0.0194, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2079300880432129, + "rewards/margins": 0.07973875105381012, + "rewards/rejected": -0.2876688838005066, + "step": 5810 + }, + { + "epoch": 0.76, + "learning_rate": 8.171266387484389e-07, + "logits/chosen": -1.74526047706604, + "logits/rejected": -1.593643069267273, + "logps/chosen": -409.3708190917969, + "logps/rejected": -455.1100158691406, + "loss": 0.019, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17666839063167572, + "rewards/margins": 0.09848718345165253, + "rewards/rejected": -0.27515554428100586, + "step": 5820 + }, + { + "epoch": 0.76, + "learning_rate": 8.086985817072604e-07, + "logits/chosen": -1.6020183563232422, + "logits/rejected": -1.3858546018600464, + "logps/chosen": -407.660400390625, + "logps/rejected": -462.2608337402344, + "loss": 0.0212, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1779371052980423, + "rewards/margins": 0.09084297716617584, + "rewards/rejected": -0.2687801122665405, + "step": 5830 + }, + { + "epoch": 0.76, + "learning_rate": 8.003058203786835e-07, + "logits/chosen": -1.6491711139678955, + "logits/rejected": -1.342855453491211, + "logps/chosen": -497.6109313964844, + "logps/rejected": -524.0183715820312, + "loss": 0.0152, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23766927421092987, + "rewards/margins": 0.07575170695781708, + "rewards/rejected": -0.31342101097106934, + "step": 5840 + }, + { + "epoch": 0.77, + "learning_rate": 7.91948529910963e-07, + "logits/chosen": -1.6227118968963623, + "logits/rejected": -1.4864352941513062, + "logps/chosen": -431.8373107910156, + "logps/rejected": -506.4502868652344, + "loss": 0.0212, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19953256845474243, + "rewards/margins": 0.10467538982629776, + "rewards/rejected": -0.3042079508304596, + "step": 5850 + }, + { + "epoch": 0.77, + "learning_rate": 7.836268847121126e-07, + "logits/chosen": -1.4191184043884277, + "logits/rejected": -1.314880132675171, + "logps/chosen": -410.31903076171875, + "logps/rejected": -478.3172302246094, + "loss": 0.0254, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18665172159671783, + "rewards/margins": 0.06229304522275925, + "rewards/rejected": -0.24894475936889648, + "step": 5860 + }, + { + "epoch": 0.77, + "learning_rate": 7.753410584462681e-07, + "logits/chosen": -1.6070563793182373, + "logits/rejected": -1.327772855758667, + "logps/chosen": -436.433349609375, + "logps/rejected": -533.3255615234375, + "loss": 0.0309, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.20444254577159882, + "rewards/margins": 0.144081711769104, + "rewards/rejected": -0.3485243022441864, + "step": 5870 + }, + { + "epoch": 0.77, + "learning_rate": 7.670912240300596e-07, + "logits/chosen": -1.6220155954360962, + "logits/rejected": -1.4926598072052002, + "logps/chosen": -392.23870849609375, + "logps/rejected": -450.91741943359375, + "loss": 0.0204, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.17535719275474548, + "rewards/margins": 0.08097521215677261, + "rewards/rejected": -0.2563324272632599, + "step": 5880 + }, + { + "epoch": 0.77, + "learning_rate": 7.588775536290035e-07, + "logits/chosen": -1.7251522541046143, + "logits/rejected": -1.3986337184906006, + "logps/chosen": -521.7364501953125, + "logps/rejected": -563.6209106445312, + "loss": 0.0187, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26620036363601685, + "rewards/margins": 0.08622214198112488, + "rewards/rejected": -0.35242247581481934, + "step": 5890 + }, + { + "epoch": 0.77, + "learning_rate": 7.507002186539147e-07, + "logits/chosen": -1.4981155395507812, + "logits/rejected": -1.524047613143921, + "logps/chosen": -369.7276306152344, + "logps/rejected": -434.8815002441406, + "loss": 0.0349, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18315860629081726, + "rewards/margins": 0.05764635652303696, + "rewards/rejected": -0.24080495536327362, + "step": 5900 + }, + { + "epoch": 0.77, + "learning_rate": 7.425593897573216e-07, + "logits/chosen": -1.459442377090454, + "logits/rejected": -1.323828101158142, + "logps/chosen": -468.89044189453125, + "logps/rejected": -514.7048950195312, + "loss": 0.0299, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20316457748413086, + "rewards/margins": 0.08581452071666718, + "rewards/rejected": -0.28897911310195923, + "step": 5910 + }, + { + "epoch": 0.77, + "learning_rate": 7.344552368299088e-07, + "logits/chosen": -1.554771900177002, + "logits/rejected": -1.4254438877105713, + "logps/chosen": -480.82427978515625, + "logps/rejected": -497.3819274902344, + "loss": 0.0306, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21433106064796448, + "rewards/margins": 0.033910952508449554, + "rewards/rejected": -0.24824202060699463, + "step": 5920 + }, + { + "epoch": 0.78, + "learning_rate": 7.26387928996973e-07, + "logits/chosen": -1.3242512941360474, + "logits/rejected": -1.1937670707702637, + "logps/chosen": -440.6521911621094, + "logps/rejected": -535.5496215820312, + "loss": 0.032, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21807590126991272, + "rewards/margins": 0.13068453967571259, + "rewards/rejected": -0.3487604558467865, + "step": 5930 + }, + { + "epoch": 0.78, + "learning_rate": 7.183576346148899e-07, + "logits/chosen": -1.731610894203186, + "logits/rejected": -1.6200708150863647, + "logps/chosen": -431.1158752441406, + "logps/rejected": -541.3234252929688, + "loss": 0.0172, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19807052612304688, + "rewards/margins": 0.11446140706539154, + "rewards/rejected": -0.31253188848495483, + "step": 5940 + }, + { + "epoch": 0.78, + "learning_rate": 7.103645212676044e-07, + "logits/chosen": -1.4820683002471924, + "logits/rejected": -1.403683066368103, + "logps/chosen": -437.85186767578125, + "logps/rejected": -493.6400451660156, + "loss": 0.0328, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2281392514705658, + "rewards/margins": 0.09585168957710266, + "rewards/rejected": -0.32399097084999084, + "step": 5950 + }, + { + "epoch": 0.78, + "learning_rate": 7.024087557631318e-07, + "logits/chosen": -1.226434350013733, + "logits/rejected": -1.1703420877456665, + "logps/chosen": -409.635498046875, + "logps/rejected": -446.46649169921875, + "loss": 0.0246, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2522982954978943, + "rewards/margins": 0.03306723013520241, + "rewards/rejected": -0.2853655517101288, + "step": 5960 + }, + { + "epoch": 0.78, + "learning_rate": 6.944905041300739e-07, + "logits/chosen": -1.5336253643035889, + "logits/rejected": -1.2892582416534424, + "logps/chosen": -508.7168884277344, + "logps/rejected": -546.019287109375, + "loss": 0.0253, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.234015554189682, + "rewards/margins": 0.10266806930303574, + "rewards/rejected": -0.3366836607456207, + "step": 5970 + }, + { + "epoch": 0.78, + "learning_rate": 6.866099316141606e-07, + "logits/chosen": -1.583622932434082, + "logits/rejected": -1.4463317394256592, + "logps/chosen": -449.3282775878906, + "logps/rejected": -516.5638427734375, + "loss": 0.0204, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.23067812621593475, + "rewards/margins": 0.06670000404119492, + "rewards/rejected": -0.2973781228065491, + "step": 5980 + }, + { + "epoch": 0.78, + "learning_rate": 6.787672026747946e-07, + "logits/chosen": -1.2984622716903687, + "logits/rejected": -1.5075492858886719, + "logps/chosen": -436.43182373046875, + "logps/rejected": -516.0398559570312, + "loss": 0.0235, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.22933240234851837, + "rewards/margins": 0.05532126501202583, + "rewards/rejected": -0.2846536338329315, + "step": 5990 + }, + { + "epoch": 0.79, + "learning_rate": 6.709624809816223e-07, + "logits/chosen": -1.6339048147201538, + "logits/rejected": -1.5467860698699951, + "logps/chosen": -497.2613830566406, + "logps/rejected": -573.0549926757812, + "loss": 0.0228, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22815433144569397, + "rewards/margins": 0.0838376060128212, + "rewards/rejected": -0.31199193000793457, + "step": 6000 + }, + { + "epoch": 0.79, + "learning_rate": 6.6319592941112e-07, + "logits/chosen": -1.5801079273223877, + "logits/rejected": -1.3842319250106812, + "logps/chosen": -549.0311889648438, + "logps/rejected": -553.92919921875, + "loss": 0.0259, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24139074981212616, + "rewards/margins": 0.0593680813908577, + "rewards/rejected": -0.30075883865356445, + "step": 6010 + }, + { + "epoch": 0.79, + "learning_rate": 6.554677100431927e-07, + "logits/chosen": -1.6321079730987549, + "logits/rejected": -1.3281104564666748, + "logps/chosen": -444.17315673828125, + "logps/rejected": -489.5956115722656, + "loss": 0.0236, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.20192117989063263, + "rewards/margins": 0.08705952018499374, + "rewards/rejected": -0.28898072242736816, + "step": 6020 + }, + { + "epoch": 0.79, + "learning_rate": 6.4777798415779e-07, + "logits/chosen": -1.4961564540863037, + "logits/rejected": -1.4459128379821777, + "logps/chosen": -424.9244689941406, + "logps/rejected": -489.59375, + "loss": 0.0188, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2145252227783203, + "rewards/margins": 0.0739036574959755, + "rewards/rejected": -0.2884288728237152, + "step": 6030 + }, + { + "epoch": 0.79, + "learning_rate": 6.401269122315451e-07, + "logits/chosen": -1.317834734916687, + "logits/rejected": -1.0378650426864624, + "logps/chosen": -363.69427490234375, + "logps/rejected": -459.9253845214844, + "loss": 0.0399, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20702803134918213, + "rewards/margins": 0.11154347658157349, + "rewards/rejected": -0.3185714781284332, + "step": 6040 + }, + { + "epoch": 0.79, + "learning_rate": 6.325146539344196e-07, + "logits/chosen": -1.5062226057052612, + "logits/rejected": -1.369683861732483, + "logps/chosen": -503.24725341796875, + "logps/rejected": -545.2264404296875, + "loss": 0.0213, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2399510145187378, + "rewards/margins": 0.07239175587892532, + "rewards/rejected": -0.3123427629470825, + "step": 6050 + }, + { + "epoch": 0.79, + "learning_rate": 6.249413681263782e-07, + "logits/chosen": -1.7546535730361938, + "logits/rejected": -1.7641979455947876, + "logps/chosen": -422.50714111328125, + "logps/rejected": -490.16949462890625, + "loss": 0.0251, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2074483335018158, + "rewards/margins": 0.07991499453783035, + "rewards/rejected": -0.28736335039138794, + "step": 6060 + }, + { + "epoch": 0.79, + "learning_rate": 6.174072128540686e-07, + "logits/chosen": -1.5013649463653564, + "logits/rejected": -1.378302812576294, + "logps/chosen": -438.6622009277344, + "logps/rejected": -487.98590087890625, + "loss": 0.0333, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2310091257095337, + "rewards/margins": 0.07826980948448181, + "rewards/rejected": -0.3092789351940155, + "step": 6070 + }, + { + "epoch": 0.8, + "learning_rate": 6.099123453475245e-07, + "logits/chosen": -1.586003065109253, + "logits/rejected": -1.3171608448028564, + "logps/chosen": -483.73797607421875, + "logps/rejected": -525.4342041015625, + "loss": 0.0275, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.24406981468200684, + "rewards/margins": 0.09159289300441742, + "rewards/rejected": -0.3356626629829407, + "step": 6080 + }, + { + "epoch": 0.8, + "learning_rate": 6.024569220168836e-07, + "logits/chosen": -1.653754472732544, + "logits/rejected": -1.596088171005249, + "logps/chosen": -432.75506591796875, + "logps/rejected": -490.79156494140625, + "loss": 0.0139, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.20881927013397217, + "rewards/margins": 0.08285851776599884, + "rewards/rejected": -0.2916777729988098, + "step": 6090 + }, + { + "epoch": 0.8, + "learning_rate": 5.950410984491268e-07, + "logits/chosen": -1.8049776554107666, + "logits/rejected": -1.6777689456939697, + "logps/chosen": -529.035888671875, + "logps/rejected": -556.14111328125, + "loss": 0.0267, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21315769851207733, + "rewards/margins": 0.07590596377849579, + "rewards/rejected": -0.2890636622905731, + "step": 6100 + }, + { + "epoch": 0.8, + "learning_rate": 5.876650294048262e-07, + "logits/chosen": -1.376012921333313, + "logits/rejected": -1.3910853862762451, + "logps/chosen": -446.28985595703125, + "logps/rejected": -570.401123046875, + "loss": 0.0286, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21063914895057678, + "rewards/margins": 0.11243287473917007, + "rewards/rejected": -0.32307201623916626, + "step": 6110 + }, + { + "epoch": 0.8, + "learning_rate": 5.8032886881492e-07, + "logits/chosen": -1.8189210891723633, + "logits/rejected": -1.6391546726226807, + "logps/chosen": -418.68084716796875, + "logps/rejected": -466.744873046875, + "loss": 0.023, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.18586881458759308, + "rewards/margins": 0.06752636283636093, + "rewards/rejected": -0.2533951699733734, + "step": 6120 + }, + { + "epoch": 0.8, + "learning_rate": 5.730327697774988e-07, + "logits/chosen": -1.5780233144760132, + "logits/rejected": -1.5865637063980103, + "logps/chosen": -437.9036560058594, + "logps/rejected": -494.39501953125, + "loss": 0.0268, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20080581307411194, + "rewards/margins": 0.06650952249765396, + "rewards/rejected": -0.2673153281211853, + "step": 6130 + }, + { + "epoch": 0.8, + "learning_rate": 5.657768845546068e-07, + "logits/chosen": -1.2442691326141357, + "logits/rejected": -1.501525640487671, + "logps/chosen": -462.56494140625, + "logps/rejected": -573.1702880859375, + "loss": 0.0177, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.21378342807292938, + "rewards/margins": 0.05830472707748413, + "rewards/rejected": -0.2720881700515747, + "step": 6140 + }, + { + "epoch": 0.8, + "learning_rate": 5.585613645690713e-07, + "logits/chosen": -1.5363595485687256, + "logits/rejected": -1.3994860649108887, + "logps/chosen": -445.9325256347656, + "logps/rejected": -556.3574829101562, + "loss": 0.0247, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.23880818486213684, + "rewards/margins": 0.1059606522321701, + "rewards/rejected": -0.34476882219314575, + "step": 6150 + }, + { + "epoch": 0.81, + "learning_rate": 5.513863604013355e-07, + "logits/chosen": -1.6287044286727905, + "logits/rejected": -1.1260802745819092, + "logps/chosen": -432.9693908691406, + "logps/rejected": -435.9732971191406, + "loss": 0.0187, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17502713203430176, + "rewards/margins": 0.07979585230350494, + "rewards/rejected": -0.2548229694366455, + "step": 6160 + }, + { + "epoch": 0.81, + "learning_rate": 5.442520217863215e-07, + "logits/chosen": -1.768366813659668, + "logits/rejected": -1.4605776071548462, + "logps/chosen": -414.16473388671875, + "logps/rejected": -461.45965576171875, + "loss": 0.0124, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.19283291697502136, + "rewards/margins": 0.07525144517421722, + "rewards/rejected": -0.2680843472480774, + "step": 6170 + }, + { + "epoch": 0.81, + "learning_rate": 5.371584976103034e-07, + "logits/chosen": -1.7616106271743774, + "logits/rejected": -1.3792388439178467, + "logps/chosen": -463.5787048339844, + "logps/rejected": -509.7967834472656, + "loss": 0.0191, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2153579294681549, + "rewards/margins": 0.08694064617156982, + "rewards/rejected": -0.30229857563972473, + "step": 6180 + }, + { + "epoch": 0.81, + "learning_rate": 5.301059359077987e-07, + "logits/chosen": -1.5925132036209106, + "logits/rejected": -1.495607614517212, + "logps/chosen": -379.4404602050781, + "logps/rejected": -434.323486328125, + "loss": 0.0307, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18784675002098083, + "rewards/margins": 0.06654896587133408, + "rewards/rejected": -0.2543957233428955, + "step": 6190 + }, + { + "epoch": 0.81, + "learning_rate": 5.230944838584806e-07, + "logits/chosen": -1.6353609561920166, + "logits/rejected": -1.5216481685638428, + "logps/chosen": -482.3740234375, + "logps/rejected": -572.2147216796875, + "loss": 0.0193, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.217765212059021, + "rewards/margins": 0.12146315723657608, + "rewards/rejected": -0.3392283320426941, + "step": 6200 + }, + { + "epoch": 0.81, + "learning_rate": 5.161242877841083e-07, + "logits/chosen": -1.6601533889770508, + "logits/rejected": -1.3861908912658691, + "logps/chosen": -473.46685791015625, + "logps/rejected": -509.176025390625, + "loss": 0.0211, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2064732313156128, + "rewards/margins": 0.07329900562763214, + "rewards/rejected": -0.27977222204208374, + "step": 6210 + }, + { + "epoch": 0.81, + "learning_rate": 5.091954931454682e-07, + "logits/chosen": -1.485788106918335, + "logits/rejected": -1.4958043098449707, + "logps/chosen": -372.6158142089844, + "logps/rejected": -446.5315856933594, + "loss": 0.0279, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1747020035982132, + "rewards/margins": 0.07266520708799362, + "rewards/rejected": -0.24736721813678741, + "step": 6220 + }, + { + "epoch": 0.82, + "learning_rate": 5.023082445393446e-07, + "logits/chosen": -1.6031299829483032, + "logits/rejected": -1.3771857023239136, + "logps/chosen": -426.8971252441406, + "logps/rejected": -490.07373046875, + "loss": 0.033, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21284392476081848, + "rewards/margins": 0.08204887807369232, + "rewards/rejected": -0.2948927879333496, + "step": 6230 + }, + { + "epoch": 0.82, + "learning_rate": 4.95462685695498e-07, + "logits/chosen": -1.5788943767547607, + "logits/rejected": -1.547645926475525, + "logps/chosen": -380.37451171875, + "logps/rejected": -482.9818420410156, + "loss": 0.033, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17835983633995056, + "rewards/margins": 0.08886045962572098, + "rewards/rejected": -0.26722028851509094, + "step": 6240 + }, + { + "epoch": 0.82, + "learning_rate": 4.88658959473666e-07, + "logits/chosen": -1.4899132251739502, + "logits/rejected": -1.4642289876937866, + "logps/chosen": -448.7754821777344, + "logps/rejected": -518.4555053710938, + "loss": 0.0164, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1989007145166397, + "rewards/margins": 0.07370701432228088, + "rewards/rejected": -0.272607684135437, + "step": 6250 + }, + { + "epoch": 0.82, + "learning_rate": 4.818972078605821e-07, + "logits/chosen": -1.478088617324829, + "logits/rejected": -1.3369762897491455, + "logps/chosen": -493.56195068359375, + "logps/rejected": -521.9366455078125, + "loss": 0.0234, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24244217574596405, + "rewards/margins": 0.06204763054847717, + "rewards/rejected": -0.30448979139328003, + "step": 6260 + }, + { + "epoch": 0.82, + "learning_rate": 4.7517757196701514e-07, + "logits/chosen": -1.6222178936004639, + "logits/rejected": -1.6087877750396729, + "logps/chosen": -391.06536865234375, + "logps/rejected": -491.53961181640625, + "loss": 0.0192, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22878026962280273, + "rewards/margins": 0.0832093134522438, + "rewards/rejected": -0.31198957562446594, + "step": 6270 + }, + { + "epoch": 0.82, + "learning_rate": 4.6850019202482193e-07, + "logits/chosen": -1.5310463905334473, + "logits/rejected": -1.6684789657592773, + "logps/chosen": -413.8614807128906, + "logps/rejected": -568.8370971679688, + "loss": 0.0167, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20737600326538086, + "rewards/margins": 0.11540031433105469, + "rewards/rejected": -0.32277631759643555, + "step": 6280 + }, + { + "epoch": 0.82, + "learning_rate": 4.618652073840188e-07, + "logits/chosen": -1.7567546367645264, + "logits/rejected": -1.4541746377944946, + "logps/chosen": -443.84698486328125, + "logps/rejected": -487.93255615234375, + "loss": 0.0194, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17903652787208557, + "rewards/margins": 0.06564389914274216, + "rewards/rejected": -0.24468043446540833, + "step": 6290 + }, + { + "epoch": 0.82, + "learning_rate": 4.5527275650987965e-07, + "logits/chosen": -1.5819406509399414, + "logits/rejected": -1.5654951333999634, + "logps/chosen": -420.19915771484375, + "logps/rejected": -540.7142333984375, + "loss": 0.0167, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19606199860572815, + "rewards/margins": 0.10539436340332031, + "rewards/rejected": -0.30145636200904846, + "step": 6300 + }, + { + "epoch": 0.83, + "learning_rate": 4.487229769800394e-07, + "logits/chosen": -1.520179271697998, + "logits/rejected": -1.4008495807647705, + "logps/chosen": -459.998046875, + "logps/rejected": -561.9788818359375, + "loss": 0.0343, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22858110070228577, + "rewards/margins": 0.10780701786279678, + "rewards/rejected": -0.33638817071914673, + "step": 6310 + }, + { + "epoch": 0.83, + "learning_rate": 4.422160054816285e-07, + "logits/chosen": -1.5911762714385986, + "logits/rejected": -1.3901503086090088, + "logps/chosen": -423.17327880859375, + "logps/rejected": -499.7627868652344, + "loss": 0.0233, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18377675116062164, + "rewards/margins": 0.10875866562128067, + "rewards/rejected": -0.2925353944301605, + "step": 6320 + }, + { + "epoch": 0.83, + "learning_rate": 4.35751977808416e-07, + "logits/chosen": -1.634953260421753, + "logits/rejected": -1.3580422401428223, + "logps/chosen": -472.6122131347656, + "logps/rejected": -592.571533203125, + "loss": 0.0157, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17626197636127472, + "rewards/margins": 0.13492093980312347, + "rewards/rejected": -0.3111829161643982, + "step": 6330 + }, + { + "epoch": 0.83, + "learning_rate": 4.293310288579794e-07, + "logits/chosen": -1.6954553127288818, + "logits/rejected": -1.4459668397903442, + "logps/chosen": -377.73321533203125, + "logps/rejected": -393.33935546875, + "loss": 0.0228, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16442880034446716, + "rewards/margins": 0.04790250584483147, + "rewards/rejected": -0.21233132481575012, + "step": 6340 + }, + { + "epoch": 0.83, + "learning_rate": 4.2295329262888733e-07, + "logits/chosen": -1.386487603187561, + "logits/rejected": -1.4061570167541504, + "logps/chosen": -367.64630126953125, + "logps/rejected": -486.550048828125, + "loss": 0.0237, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.17982135713100433, + "rewards/margins": 0.08798646926879883, + "rewards/rejected": -0.26780781149864197, + "step": 6350 + }, + { + "epoch": 0.83, + "learning_rate": 4.1661890221790316e-07, + "logits/chosen": -1.5857356786727905, + "logits/rejected": -1.472039818763733, + "logps/chosen": -412.272705078125, + "logps/rejected": -433.59014892578125, + "loss": 0.0134, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17613378167152405, + "rewards/margins": 0.06455505639314651, + "rewards/rejected": -0.24068883061408997, + "step": 6360 + }, + { + "epoch": 0.83, + "learning_rate": 4.103279898172072e-07, + "logits/chosen": -1.6553398370742798, + "logits/rejected": -1.4700231552124023, + "logps/chosen": -406.8631286621094, + "logps/rejected": -437.97943115234375, + "loss": 0.0176, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17570190131664276, + "rewards/margins": 0.09338720887899399, + "rewards/rejected": -0.26908910274505615, + "step": 6370 + }, + { + "epoch": 0.83, + "learning_rate": 4.040806867116401e-07, + "logits/chosen": -1.8268280029296875, + "logits/rejected": -1.6583318710327148, + "logps/chosen": -479.39544677734375, + "logps/rejected": -525.3782348632812, + "loss": 0.0394, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20694401860237122, + "rewards/margins": 0.06560108065605164, + "rewards/rejected": -0.27254509925842285, + "step": 6380 + }, + { + "epoch": 0.84, + "learning_rate": 3.978771232759615e-07, + "logits/chosen": -1.6186943054199219, + "logits/rejected": -1.7581040859222412, + "logps/chosen": -389.00115966796875, + "logps/rejected": -422.4768981933594, + "loss": 0.0385, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.196413055062294, + "rewards/margins": 0.051200706511735916, + "rewards/rejected": -0.247613787651062, + "step": 6390 + }, + { + "epoch": 0.84, + "learning_rate": 3.917174289721276e-07, + "logits/chosen": -1.4655689001083374, + "logits/rejected": -1.2271219491958618, + "logps/chosen": -406.14678955078125, + "logps/rejected": -524.12646484375, + "loss": 0.0225, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18133914470672607, + "rewards/margins": 0.14622202515602112, + "rewards/rejected": -0.3275611996650696, + "step": 6400 + }, + { + "epoch": 0.84, + "learning_rate": 3.856017323465938e-07, + "logits/chosen": -1.6540682315826416, + "logits/rejected": -1.288861632347107, + "logps/chosen": -414.47930908203125, + "logps/rejected": -440.4425354003906, + "loss": 0.0256, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19330580532550812, + "rewards/margins": 0.07383783161640167, + "rewards/rejected": -0.2671436369419098, + "step": 6410 + }, + { + "epoch": 0.84, + "learning_rate": 3.7953016102762695e-07, + "logits/chosen": -1.617789626121521, + "logits/rejected": -1.2891263961791992, + "logps/chosen": -440.29608154296875, + "logps/rejected": -442.4244689941406, + "loss": 0.0335, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17672431468963623, + "rewards/margins": 0.08600305020809174, + "rewards/rejected": -0.2627273499965668, + "step": 6420 + }, + { + "epoch": 0.84, + "learning_rate": 3.7350284172264493e-07, + "logits/chosen": -1.6066081523895264, + "logits/rejected": -1.3126106262207031, + "logps/chosen": -422.9794006347656, + "logps/rejected": -471.2784729003906, + "loss": 0.0225, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1922755241394043, + "rewards/margins": 0.10332103073596954, + "rewards/rejected": -0.29559653997421265, + "step": 6430 + }, + { + "epoch": 0.84, + "learning_rate": 3.67519900215573e-07, + "logits/chosen": -1.6866018772125244, + "logits/rejected": -1.497786283493042, + "logps/chosen": -432.73126220703125, + "logps/rejected": -473.71728515625, + "loss": 0.0249, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19587557017803192, + "rewards/margins": 0.08827323466539383, + "rewards/rejected": -0.2841488718986511, + "step": 6440 + }, + { + "epoch": 0.84, + "learning_rate": 3.615814613642174e-07, + "logits/chosen": -1.5590837001800537, + "logits/rejected": -1.405241847038269, + "logps/chosen": -453.08221435546875, + "logps/rejected": -543.8556518554688, + "loss": 0.0191, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2221020758152008, + "rewards/margins": 0.0864696279168129, + "rewards/rejected": -0.3085716664791107, + "step": 6450 + }, + { + "epoch": 0.85, + "learning_rate": 3.5568764909765795e-07, + "logits/chosen": -1.6449756622314453, + "logits/rejected": -1.4965819120407104, + "logps/chosen": -443.61346435546875, + "logps/rejected": -442.5621032714844, + "loss": 0.0181, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18187734484672546, + "rewards/margins": 0.0648706704378128, + "rewards/rejected": -0.24674801528453827, + "step": 6460 + }, + { + "epoch": 0.85, + "learning_rate": 3.498385864136672e-07, + "logits/chosen": -1.6285717487335205, + "logits/rejected": -1.4089804887771606, + "logps/chosen": -402.15045166015625, + "logps/rejected": -461.0576171875, + "loss": 0.0187, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.16795262694358826, + "rewards/margins": 0.08682878315448761, + "rewards/rejected": -0.2547813951969147, + "step": 6470 + }, + { + "epoch": 0.85, + "learning_rate": 3.440343953761363e-07, + "logits/chosen": -1.4728256464004517, + "logits/rejected": -1.2075586318969727, + "logps/chosen": -387.43408203125, + "logps/rejected": -426.7442932128906, + "loss": 0.0247, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17750981450080872, + "rewards/margins": 0.09418913722038269, + "rewards/rejected": -0.2716989517211914, + "step": 6480 + }, + { + "epoch": 0.85, + "learning_rate": 3.382751971125345e-07, + "logits/chosen": -1.6317641735076904, + "logits/rejected": -1.608093023300171, + "logps/chosen": -419.4491271972656, + "logps/rejected": -532.7672729492188, + "loss": 0.0142, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1842859834432602, + "rewards/margins": 0.09859196841716766, + "rewards/rejected": -0.28287798166275024, + "step": 6490 + }, + { + "epoch": 0.85, + "learning_rate": 3.3256111181137753e-07, + "logits/chosen": -1.5633186101913452, + "logits/rejected": -1.536211371421814, + "logps/chosen": -362.38861083984375, + "logps/rejected": -454.27459716796875, + "loss": 0.0185, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.18105900287628174, + "rewards/margins": 0.06541544198989868, + "rewards/rejected": -0.2464744597673416, + "step": 6500 + }, + { + "epoch": 0.85, + "learning_rate": 3.2689225871971905e-07, + "logits/chosen": -1.4011424779891968, + "logits/rejected": -1.3848562240600586, + "logps/chosen": -448.1383361816406, + "logps/rejected": -477.1025390625, + "loss": 0.0248, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2206324338912964, + "rewards/margins": 0.05015076324343681, + "rewards/rejected": -0.2707832157611847, + "step": 6510 + }, + { + "epoch": 0.85, + "learning_rate": 3.2126875614066523e-07, + "logits/chosen": -1.6511691808700562, + "logits/rejected": -1.4052765369415283, + "logps/chosen": -386.79364013671875, + "logps/rejected": -483.07855224609375, + "loss": 0.0164, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16033944487571716, + "rewards/margins": 0.12655356526374817, + "rewards/rejected": -0.2868930399417877, + "step": 6520 + }, + { + "epoch": 0.85, + "learning_rate": 3.156907214309024e-07, + "logits/chosen": -1.4176801443099976, + "logits/rejected": -1.4613978862762451, + "logps/chosen": -389.1749267578125, + "logps/rejected": -454.427001953125, + "loss": 0.0328, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20327451825141907, + "rewards/margins": 0.07178395241498947, + "rewards/rejected": -0.2750585079193115, + "step": 6530 + }, + { + "epoch": 0.86, + "learning_rate": 3.1015827099824923e-07, + "logits/chosen": -1.6079105138778687, + "logits/rejected": -1.4893054962158203, + "logps/chosen": -437.54998779296875, + "logps/rejected": -476.2743225097656, + "loss": 0.0159, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16807910799980164, + "rewards/margins": 0.05767925828695297, + "rewards/rejected": -0.2257583886384964, + "step": 6540 + }, + { + "epoch": 0.86, + "learning_rate": 3.0467152029922926e-07, + "logits/chosen": -1.4525275230407715, + "logits/rejected": -1.3955967426300049, + "logps/chosen": -413.958251953125, + "logps/rejected": -460.4651794433594, + "loss": 0.0283, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19624033570289612, + "rewards/margins": 0.07971431314945221, + "rewards/rejected": -0.2759546637535095, + "step": 6550 + }, + { + "epoch": 0.86, + "learning_rate": 2.992305838366591e-07, + "logits/chosen": -1.7086737155914307, + "logits/rejected": -1.487250566482544, + "logps/chosen": -346.0322265625, + "logps/rejected": -405.02288818359375, + "loss": 0.0179, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.18641534447669983, + "rewards/margins": 0.06775184720754623, + "rewards/rejected": -0.25416722893714905, + "step": 6560 + }, + { + "epoch": 0.86, + "learning_rate": 2.938355751572583e-07, + "logits/chosen": -1.563066840171814, + "logits/rejected": -1.3573358058929443, + "logps/chosen": -377.8486633300781, + "logps/rejected": -448.5704040527344, + "loss": 0.0256, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19250357151031494, + "rewards/margins": 0.08367094397544861, + "rewards/rejected": -0.27617448568344116, + "step": 6570 + }, + { + "epoch": 0.86, + "learning_rate": 2.8848660684928307e-07, + "logits/chosen": -1.5894094705581665, + "logits/rejected": -1.4691380262374878, + "logps/chosen": -450.65362548828125, + "logps/rejected": -520.8355712890625, + "loss": 0.0331, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.24818184971809387, + "rewards/margins": 0.04456399381160736, + "rewards/rejected": -0.2927458584308624, + "step": 6580 + }, + { + "epoch": 0.86, + "learning_rate": 2.8318379054017383e-07, + "logits/chosen": -1.5910918712615967, + "logits/rejected": -1.3723212480545044, + "logps/chosen": -463.10626220703125, + "logps/rejected": -579.238525390625, + "loss": 0.0279, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2031777799129486, + "rewards/margins": 0.08301732689142227, + "rewards/rejected": -0.2861950993537903, + "step": 6590 + }, + { + "epoch": 0.86, + "learning_rate": 2.779272368942246e-07, + "logits/chosen": -1.600366234779358, + "logits/rejected": -1.260945200920105, + "logps/chosen": -440.1941833496094, + "logps/rejected": -498.590576171875, + "loss": 0.0229, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21485555171966553, + "rewards/margins": 0.0972893014550209, + "rewards/rejected": -0.3121448755264282, + "step": 6600 + }, + { + "epoch": 0.86, + "learning_rate": 2.7271705561027986e-07, + "logits/chosen": -1.7694162130355835, + "logits/rejected": -1.7384599447250366, + "logps/chosen": -469.52288818359375, + "logps/rejected": -558.3441162109375, + "loss": 0.0209, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19003179669380188, + "rewards/margins": 0.09676986187696457, + "rewards/rejected": -0.28680163621902466, + "step": 6610 + }, + { + "epoch": 0.87, + "learning_rate": 2.6755335541943677e-07, + "logits/chosen": -1.4450665712356567, + "logits/rejected": -1.4400370121002197, + "logps/chosen": -432.9559020996094, + "logps/rejected": -513.572021484375, + "loss": 0.0355, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22987909615039825, + "rewards/margins": 0.0817764475941658, + "rewards/rejected": -0.31165555119514465, + "step": 6620 + }, + { + "epoch": 0.87, + "learning_rate": 2.62436244082781e-07, + "logits/chosen": -1.533942461013794, + "logits/rejected": -1.4904354810714722, + "logps/chosen": -390.4458923339844, + "logps/rejected": -407.4669189453125, + "loss": 0.021, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19528919458389282, + "rewards/margins": 0.04443785548210144, + "rewards/rejected": -0.23972702026367188, + "step": 6630 + }, + { + "epoch": 0.87, + "learning_rate": 2.5736582838913836e-07, + "logits/chosen": -1.6789026260375977, + "logits/rejected": -1.5278362035751343, + "logps/chosen": -457.1402282714844, + "logps/rejected": -579.2009887695312, + "loss": 0.0249, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.20158009231090546, + "rewards/margins": 0.14046378433704376, + "rewards/rejected": -0.3420438766479492, + "step": 6640 + }, + { + "epoch": 0.87, + "learning_rate": 2.5234221415284363e-07, + "logits/chosen": -1.5976431369781494, + "logits/rejected": -1.490487813949585, + "logps/chosen": -486.6568908691406, + "logps/rejected": -523.069091796875, + "loss": 0.0235, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22602316737174988, + "rewards/margins": 0.07399384677410126, + "rewards/rejected": -0.3000170588493347, + "step": 6650 + }, + { + "epoch": 0.87, + "learning_rate": 2.4736550621153375e-07, + "logits/chosen": -1.599687933921814, + "logits/rejected": -1.4537866115570068, + "logps/chosen": -510.0804748535156, + "logps/rejected": -571.4901123046875, + "loss": 0.0309, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22016914188861847, + "rewards/margins": 0.09033988416194916, + "rewards/rejected": -0.3105090260505676, + "step": 6660 + }, + { + "epoch": 0.87, + "learning_rate": 2.424358084239609e-07, + "logits/chosen": -1.657153844833374, + "logits/rejected": -1.5969769954681396, + "logps/chosen": -568.629150390625, + "logps/rejected": -559.4827880859375, + "loss": 0.0208, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20487694442272186, + "rewards/margins": 0.09001322090625763, + "rewards/rejected": -0.29489022493362427, + "step": 6670 + }, + { + "epoch": 0.87, + "learning_rate": 2.3755322366782158e-07, + "logits/chosen": -1.545088768005371, + "logits/rejected": -1.4435375928878784, + "logps/chosen": -465.56329345703125, + "logps/rejected": -519.2607421875, + "loss": 0.0189, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21758349239826202, + "rewards/margins": 0.09756701439619064, + "rewards/rejected": -0.31515049934387207, + "step": 6680 + }, + { + "epoch": 0.88, + "learning_rate": 2.3271785383761431e-07, + "logits/chosen": -1.6613857746124268, + "logits/rejected": -1.3951141834259033, + "logps/chosen": -481.0204162597656, + "logps/rejected": -505.3641662597656, + "loss": 0.0349, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.21575649082660675, + "rewards/margins": 0.057221878319978714, + "rewards/rejected": -0.2729783356189728, + "step": 6690 + }, + { + "epoch": 0.88, + "learning_rate": 2.2792979984250978e-07, + "logits/chosen": -1.5365406274795532, + "logits/rejected": -1.3203694820404053, + "logps/chosen": -432.1142578125, + "logps/rejected": -446.89166259765625, + "loss": 0.0278, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21476109325885773, + "rewards/margins": 0.06200176477432251, + "rewards/rejected": -0.27676287293434143, + "step": 6700 + }, + { + "epoch": 0.88, + "learning_rate": 2.231891616042453e-07, + "logits/chosen": -1.4919780492782593, + "logits/rejected": -1.5763249397277832, + "logps/chosen": -375.32763671875, + "logps/rejected": -487.92877197265625, + "loss": 0.0309, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19331511855125427, + "rewards/margins": 0.07069431990385056, + "rewards/rejected": -0.26400941610336304, + "step": 6710 + }, + { + "epoch": 0.88, + "learning_rate": 2.1849603805504328e-07, + "logits/chosen": -1.490634560585022, + "logits/rejected": -1.3624677658081055, + "logps/chosen": -448.1121520996094, + "logps/rejected": -482.7945251464844, + "loss": 0.0164, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19230246543884277, + "rewards/margins": 0.08405870199203491, + "rewards/rejected": -0.2763611674308777, + "step": 6720 + }, + { + "epoch": 0.88, + "learning_rate": 2.1385052713554066e-07, + "logits/chosen": -1.6195634603500366, + "logits/rejected": -1.5923945903778076, + "logps/chosen": -392.4721984863281, + "logps/rejected": -516.0805053710938, + "loss": 0.0323, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22640562057495117, + "rewards/margins": 0.09060530364513397, + "rewards/rejected": -0.31701093912124634, + "step": 6730 + }, + { + "epoch": 0.88, + "learning_rate": 2.0925272579274873e-07, + "logits/chosen": -1.567509412765503, + "logits/rejected": -1.4040048122406006, + "logps/chosen": -463.7437438964844, + "logps/rejected": -498.69232177734375, + "loss": 0.0221, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19629953801631927, + "rewards/margins": 0.04697941988706589, + "rewards/rejected": -0.24327898025512695, + "step": 6740 + }, + { + "epoch": 0.88, + "learning_rate": 2.047027299780302e-07, + "logits/chosen": -1.7769540548324585, + "logits/rejected": -1.7264270782470703, + "logps/chosen": -445.7354431152344, + "logps/rejected": -509.4366760253906, + "loss": 0.0163, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1864301860332489, + "rewards/margins": 0.07413545250892639, + "rewards/rejected": -0.2605656385421753, + "step": 6750 + }, + { + "epoch": 0.88, + "learning_rate": 2.0020063464509492e-07, + "logits/chosen": -1.6714115142822266, + "logits/rejected": -1.4131503105163574, + "logps/chosen": -438.7445373535156, + "logps/rejected": -464.24896240234375, + "loss": 0.0182, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1970597803592682, + "rewards/margins": 0.10940859466791153, + "rewards/rejected": -0.3064683973789215, + "step": 6760 + }, + { + "epoch": 0.89, + "learning_rate": 1.957465337480191e-07, + "logits/chosen": -1.6414451599121094, + "logits/rejected": -1.5069279670715332, + "logps/chosen": -452.56182861328125, + "logps/rejected": -525.6574096679688, + "loss": 0.0256, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21758043766021729, + "rewards/margins": 0.11510293185710907, + "rewards/rejected": -0.33268335461616516, + "step": 6770 + }, + { + "epoch": 0.89, + "learning_rate": 1.9134052023928622e-07, + "logits/chosen": -1.4718319177627563, + "logits/rejected": -1.4121534824371338, + "logps/chosen": -386.81781005859375, + "logps/rejected": -468.6263122558594, + "loss": 0.0364, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.23575875163078308, + "rewards/margins": 0.08290183544158936, + "rewards/rejected": -0.3186606168746948, + "step": 6780 + }, + { + "epoch": 0.89, + "learning_rate": 1.8698268606784392e-07, + "logits/chosen": -1.5553295612335205, + "logits/rejected": -1.3655126094818115, + "logps/chosen": -382.2786865234375, + "logps/rejected": -487.64190673828125, + "loss": 0.0152, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18006733059883118, + "rewards/margins": 0.09453471750020981, + "rewards/rejected": -0.2746020555496216, + "step": 6790 + }, + { + "epoch": 0.89, + "learning_rate": 1.826731221771866e-07, + "logits/chosen": -1.6402775049209595, + "logits/rejected": -1.5354700088500977, + "logps/chosen": -488.4889221191406, + "logps/rejected": -551.7385864257812, + "loss": 0.0279, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.23139257729053497, + "rewards/margins": 0.09490464627742767, + "rewards/rejected": -0.32629722356796265, + "step": 6800 + }, + { + "epoch": 0.89, + "learning_rate": 1.7841191850345967e-07, + "logits/chosen": -1.6131150722503662, + "logits/rejected": -1.555279016494751, + "logps/chosen": -468.02337646484375, + "logps/rejected": -504.90057373046875, + "loss": 0.0272, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21480710804462433, + "rewards/margins": 0.065223328769207, + "rewards/rejected": -0.28003042936325073, + "step": 6810 + }, + { + "epoch": 0.89, + "learning_rate": 1.7419916397357905e-07, + "logits/chosen": -1.6005849838256836, + "logits/rejected": -1.4733082056045532, + "logps/chosen": -443.8605041503906, + "logps/rejected": -516.1515502929688, + "loss": 0.0189, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24963009357452393, + "rewards/margins": 0.06734304130077362, + "rewards/rejected": -0.31697314977645874, + "step": 6820 + }, + { + "epoch": 0.89, + "learning_rate": 1.700349465033782e-07, + "logits/chosen": -1.6979738473892212, + "logits/rejected": -1.4578403234481812, + "logps/chosen": -476.15228271484375, + "logps/rejected": -494.53143310546875, + "loss": 0.0305, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22806552052497864, + "rewards/margins": 0.0760088786482811, + "rewards/rejected": -0.30407437682151794, + "step": 6830 + }, + { + "epoch": 0.9, + "learning_rate": 1.6591935299577227e-07, + "logits/chosen": -1.4906421899795532, + "logits/rejected": -1.3485311269760132, + "logps/chosen": -521.2755737304688, + "logps/rejected": -579.0121459960938, + "loss": 0.015, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23313705623149872, + "rewards/margins": 0.08736772835254669, + "rewards/rejected": -0.3205048143863678, + "step": 6840 + }, + { + "epoch": 0.9, + "learning_rate": 1.6185246933894338e-07, + "logits/chosen": -1.469611644744873, + "logits/rejected": -1.4063249826431274, + "logps/chosen": -421.91156005859375, + "logps/rejected": -511.09649658203125, + "loss": 0.0277, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19816365838050842, + "rewards/margins": 0.08645348250865936, + "rewards/rejected": -0.2846171259880066, + "step": 6850 + }, + { + "epoch": 0.9, + "learning_rate": 1.5783438040455097e-07, + "logits/chosen": -1.4442344903945923, + "logits/rejected": -1.5077521800994873, + "logps/chosen": -396.4654846191406, + "logps/rejected": -461.05987548828125, + "loss": 0.0138, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1838027685880661, + "rewards/margins": 0.07608558237552643, + "rewards/rejected": -0.25988835096359253, + "step": 6860 + }, + { + "epoch": 0.9, + "learning_rate": 1.538651700459576e-07, + "logits/chosen": -1.653641700744629, + "logits/rejected": -1.412660002708435, + "logps/chosen": -494.48052978515625, + "logps/rejected": -557.4576416015625, + "loss": 0.0247, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.23249347507953644, + "rewards/margins": 0.11307159811258316, + "rewards/rejected": -0.3455651104450226, + "step": 6870 + }, + { + "epoch": 0.9, + "learning_rate": 1.4994492109648151e-07, + "logits/chosen": -1.343875527381897, + "logits/rejected": -1.3370107412338257, + "logps/chosen": -377.4173889160156, + "logps/rejected": -463.6302795410156, + "loss": 0.0264, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2058386355638504, + "rewards/margins": 0.07137282937765121, + "rewards/rejected": -0.277211457490921, + "step": 6880 + }, + { + "epoch": 0.9, + "learning_rate": 1.4607371536766695e-07, + "logits/chosen": -1.5182969570159912, + "logits/rejected": -1.6659351587295532, + "logps/chosen": -428.3271484375, + "logps/rejected": -498.54742431640625, + "loss": 0.0131, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.238141730427742, + "rewards/margins": 0.04850930720567703, + "rewards/rejected": -0.28665101528167725, + "step": 6890 + }, + { + "epoch": 0.9, + "learning_rate": 1.4225163364757655e-07, + "logits/chosen": -1.699549913406372, + "logits/rejected": -1.3660337924957275, + "logps/chosen": -476.3060607910156, + "logps/rejected": -480.1463317871094, + "loss": 0.0284, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19354431331157684, + "rewards/margins": 0.07588140666484833, + "rewards/rejected": -0.2694256901741028, + "step": 6900 + }, + { + "epoch": 0.9, + "learning_rate": 1.3847875569910462e-07, + "logits/chosen": -1.5795233249664307, + "logits/rejected": -1.5374621152877808, + "logps/chosen": -381.10736083984375, + "logps/rejected": -439.749267578125, + "loss": 0.0169, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20478907227516174, + "rewards/margins": 0.06565740704536438, + "rewards/rejected": -0.2704464793205261, + "step": 6910 + }, + { + "epoch": 0.91, + "learning_rate": 1.3475516025831552e-07, + "logits/chosen": -1.602575659751892, + "logits/rejected": -1.544237732887268, + "logps/chosen": -419.572265625, + "logps/rejected": -511.387451171875, + "loss": 0.0191, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22955676913261414, + "rewards/margins": 0.07430396229028702, + "rewards/rejected": -0.30386072397232056, + "step": 6920 + }, + { + "epoch": 0.91, + "learning_rate": 1.310809250327974e-07, + "logits/chosen": -1.6102867126464844, + "logits/rejected": -1.5081886053085327, + "logps/chosen": -413.7611389160156, + "logps/rejected": -500.97564697265625, + "loss": 0.014, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2218031883239746, + "rewards/margins": 0.09341904520988464, + "rewards/rejected": -0.31522220373153687, + "step": 6930 + }, + { + "epoch": 0.91, + "learning_rate": 1.2745612670004153e-07, + "logits/chosen": -1.676377296447754, + "logits/rejected": -1.4056346416473389, + "logps/chosen": -471.49420166015625, + "logps/rejected": -564.6023559570312, + "loss": 0.0312, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2545742988586426, + "rewards/margins": 0.10350040346384048, + "rewards/rejected": -0.3580746650695801, + "step": 6940 + }, + { + "epoch": 0.91, + "learning_rate": 1.2388084090584395e-07, + "logits/chosen": -1.647878646850586, + "logits/rejected": -1.442943811416626, + "logps/chosen": -424.6617736816406, + "logps/rejected": -457.922119140625, + "loss": 0.0298, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21583659946918488, + "rewards/margins": 0.07665933668613434, + "rewards/rejected": -0.2924959361553192, + "step": 6950 + }, + { + "epoch": 0.91, + "learning_rate": 1.2035514226272305e-07, + "logits/chosen": -1.4394546747207642, + "logits/rejected": -1.2975504398345947, + "logps/chosen": -486.21923828125, + "logps/rejected": -533.73486328125, + "loss": 0.0277, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2445685863494873, + "rewards/margins": 0.0704784169793129, + "rewards/rejected": -0.315047025680542, + "step": 6960 + }, + { + "epoch": 0.91, + "learning_rate": 1.1687910434836607e-07, + "logits/chosen": -1.247804880142212, + "logits/rejected": -1.3159127235412598, + "logps/chosen": -384.3349609375, + "logps/rejected": -531.3505859375, + "loss": 0.0365, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19616396725177765, + "rewards/margins": 0.1056792140007019, + "rewards/rejected": -0.30184316635131836, + "step": 6970 + }, + { + "epoch": 0.91, + "learning_rate": 1.1345279970409128e-07, + "logits/chosen": -1.5758659839630127, + "logits/rejected": -1.5954402685165405, + "logps/chosen": -456.5048828125, + "logps/rejected": -535.1149291992188, + "loss": 0.0277, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2219972163438797, + "rewards/margins": 0.04516248777508736, + "rewards/rejected": -0.26715970039367676, + "step": 6980 + }, + { + "epoch": 0.91, + "learning_rate": 1.1007629983333629e-07, + "logits/chosen": -1.4976396560668945, + "logits/rejected": -1.319213628768921, + "logps/chosen": -416.6646423339844, + "logps/rejected": -548.4486083984375, + "loss": 0.0279, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.20675568282604218, + "rewards/margins": 0.1446656733751297, + "rewards/rejected": -0.35142138600349426, + "step": 6990 + }, + { + "epoch": 0.92, + "learning_rate": 1.067496752001626e-07, + "logits/chosen": -1.5586458444595337, + "logits/rejected": -1.448335886001587, + "logps/chosen": -362.5075378417969, + "logps/rejected": -442.5972595214844, + "loss": 0.0275, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19385865330696106, + "rewards/margins": 0.07538175582885742, + "rewards/rejected": -0.2692403793334961, + "step": 7000 + }, + { + "epoch": 0.92, + "learning_rate": 1.0347299522778909e-07, + "logits/chosen": -1.6143827438354492, + "logits/rejected": -1.6086311340332031, + "logps/chosen": -423.903076171875, + "logps/rejected": -492.069091796875, + "loss": 0.0321, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21144139766693115, + "rewards/margins": 0.05724687501788139, + "rewards/rejected": -0.26868826150894165, + "step": 7010 + }, + { + "epoch": 0.92, + "learning_rate": 1.0024632829713971e-07, + "logits/chosen": -1.5233447551727295, + "logits/rejected": -1.5699418783187866, + "logps/chosen": -413.32830810546875, + "logps/rejected": -533.0677490234375, + "loss": 0.0224, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.19785210490226746, + "rewards/margins": 0.0935153067111969, + "rewards/rejected": -0.29136738181114197, + "step": 7020 + }, + { + "epoch": 0.92, + "learning_rate": 9.706974174541889e-08, + "logits/chosen": -1.6802995204925537, + "logits/rejected": -1.4303064346313477, + "logps/chosen": -428.618896484375, + "logps/rejected": -478.03265380859375, + "loss": 0.0204, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20262300968170166, + "rewards/margins": 0.07504673302173615, + "rewards/rejected": -0.2776697278022766, + "step": 7030 + }, + { + "epoch": 0.92, + "learning_rate": 9.39433018647043e-08, + "logits/chosen": -1.6169646978378296, + "logits/rejected": -1.3585525751113892, + "logps/chosen": -412.26873779296875, + "logps/rejected": -450.7110900878906, + "loss": 0.0214, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21610252559185028, + "rewards/margins": 0.05788475275039673, + "rewards/rejected": -0.2739872634410858, + "step": 7040 + }, + { + "epoch": 0.92, + "learning_rate": 9.086707390056543e-08, + "logits/chosen": -1.4815367460250854, + "logits/rejected": -1.6197248697280884, + "logps/chosen": -449.9187927246094, + "logps/rejected": -502.67364501953125, + "loss": 0.0248, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2156413346529007, + "rewards/margins": 0.04413525387644768, + "rewards/rejected": -0.25977659225463867, + "step": 7050 + }, + { + "epoch": 0.92, + "learning_rate": 8.784112205070083e-08, + "logits/chosen": -1.6900125741958618, + "logits/rejected": -1.6549876928329468, + "logps/chosen": -351.97808837890625, + "logps/rejected": -422.29638671875, + "loss": 0.0208, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.16725996136665344, + "rewards/margins": 0.0809854120016098, + "rewards/rejected": -0.24824540317058563, + "step": 7060 + }, + { + "epoch": 0.93, + "learning_rate": 8.486550946359779e-08, + "logits/chosen": -1.7063417434692383, + "logits/rejected": -1.6383116245269775, + "logps/chosen": -350.2059326171875, + "logps/rejected": -401.1691589355469, + "loss": 0.0219, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.16188636422157288, + "rewards/margins": 0.04841512441635132, + "rewards/rejected": -0.21030151844024658, + "step": 7070 + }, + { + "epoch": 0.93, + "learning_rate": 8.194029823721556e-08, + "logits/chosen": -1.625353217124939, + "logits/rejected": -1.5545185804367065, + "logps/chosen": -335.5082092285156, + "logps/rejected": -378.410400390625, + "loss": 0.03, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.18089623749256134, + "rewards/margins": 0.05593543127179146, + "rewards/rejected": -0.2368316650390625, + "step": 7080 + }, + { + "epoch": 0.93, + "learning_rate": 7.906554941768896e-08, + "logits/chosen": -1.2641688585281372, + "logits/rejected": -1.3844702243804932, + "logps/chosen": -420.99462890625, + "logps/rejected": -521.883056640625, + "loss": 0.0258, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20448222756385803, + "rewards/margins": 0.06919074803590775, + "rewards/rejected": -0.27367299795150757, + "step": 7090 + }, + { + "epoch": 0.93, + "learning_rate": 7.624132299805575e-08, + "logits/chosen": -1.6793031692504883, + "logits/rejected": -1.448111891746521, + "logps/chosen": -469.1708984375, + "logps/rejected": -599.2205810546875, + "loss": 0.0217, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2051534652709961, + "rewards/margins": 0.14806172251701355, + "rewards/rejected": -0.35321518778800964, + "step": 7100 + }, + { + "epoch": 0.93, + "learning_rate": 7.346767791700127e-08, + "logits/chosen": -1.469233512878418, + "logits/rejected": -1.4481909275054932, + "logps/chosen": -398.59698486328125, + "logps/rejected": -483.1396484375, + "loss": 0.0314, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21537312865257263, + "rewards/margins": 0.09009166061878204, + "rewards/rejected": -0.30546480417251587, + "step": 7110 + }, + { + "epoch": 0.93, + "learning_rate": 7.07446720576327e-08, + "logits/chosen": -1.6200309991836548, + "logits/rejected": -1.4754276275634766, + "logps/chosen": -487.5052795410156, + "logps/rejected": -557.2413330078125, + "loss": 0.0261, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22672314941883087, + "rewards/margins": 0.09737074375152588, + "rewards/rejected": -0.32409390807151794, + "step": 7120 + }, + { + "epoch": 0.93, + "learning_rate": 6.807236224626701e-08, + "logits/chosen": -1.3291611671447754, + "logits/rejected": -1.3264832496643066, + "logps/chosen": -414.73614501953125, + "logps/rejected": -470.7920837402344, + "loss": 0.024, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.22658607363700867, + "rewards/margins": 0.05380987375974655, + "rewards/rejected": -0.2803959548473358, + "step": 7130 + }, + { + "epoch": 0.93, + "learning_rate": 6.545080425124888e-08, + "logits/chosen": -1.567453384399414, + "logits/rejected": -1.3527438640594482, + "logps/chosen": -431.8515625, + "logps/rejected": -490.6085510253906, + "loss": 0.0325, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2253638207912445, + "rewards/margins": 0.09049206227064133, + "rewards/rejected": -0.3158559203147888, + "step": 7140 + }, + { + "epoch": 0.94, + "learning_rate": 6.288005278178382e-08, + "logits/chosen": -1.6186761856079102, + "logits/rejected": -1.4696338176727295, + "logps/chosen": -378.1620788574219, + "logps/rejected": -439.23944091796875, + "loss": 0.0243, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.16763922572135925, + "rewards/margins": 0.08176124095916748, + "rewards/rejected": -0.24940045177936554, + "step": 7150 + }, + { + "epoch": 0.94, + "learning_rate": 6.036016148679825e-08, + "logits/chosen": -1.5543218851089478, + "logits/rejected": -1.387766718864441, + "logps/chosen": -433.41400146484375, + "logps/rejected": -518.6406860351562, + "loss": 0.0288, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21379122138023376, + "rewards/margins": 0.09945134073495865, + "rewards/rejected": -0.3132425844669342, + "step": 7160 + }, + { + "epoch": 0.94, + "learning_rate": 5.7891182953819235e-08, + "logits/chosen": -1.4900459051132202, + "logits/rejected": -1.3624264001846313, + "logps/chosen": -448.15716552734375, + "logps/rejected": -467.8421936035156, + "loss": 0.0206, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22588130831718445, + "rewards/margins": 0.050553254783153534, + "rewards/rejected": -0.27643460035324097, + "step": 7170 + }, + { + "epoch": 0.94, + "learning_rate": 5.547316870787689e-08, + "logits/chosen": -1.6980407238006592, + "logits/rejected": -1.614473581314087, + "logps/chosen": -476.68511962890625, + "logps/rejected": -538.3526611328125, + "loss": 0.0281, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.22638268768787384, + "rewards/margins": 0.0753302201628685, + "rewards/rejected": -0.30171290040016174, + "step": 7180 + }, + { + "epoch": 0.94, + "learning_rate": 5.310616921042927e-08, + "logits/chosen": -1.5047115087509155, + "logits/rejected": -1.2674446105957031, + "logps/chosen": -411.44085693359375, + "logps/rejected": -429.0818786621094, + "loss": 0.0242, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.19907724857330322, + "rewards/margins": 0.0715177059173584, + "rewards/rejected": -0.27059492468833923, + "step": 7190 + }, + { + "epoch": 0.94, + "learning_rate": 5.079023385830939e-08, + "logits/chosen": -1.571404218673706, + "logits/rejected": -1.4775793552398682, + "logps/chosen": -428.352783203125, + "logps/rejected": -470.31854248046875, + "loss": 0.0202, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1955043077468872, + "rewards/margins": 0.07978837192058563, + "rewards/rejected": -0.27529269456863403, + "step": 7200 + }, + { + "epoch": 0.94, + "learning_rate": 4.8525410982695476e-08, + "logits/chosen": -1.7943906784057617, + "logits/rejected": -1.4248347282409668, + "logps/chosen": -489.4849548339844, + "logps/rejected": -516.9371948242188, + "loss": 0.0131, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22439464926719666, + "rewards/margins": 0.07225723564624786, + "rewards/rejected": -0.2966518998146057, + "step": 7210 + }, + { + "epoch": 0.94, + "learning_rate": 4.6311747848099e-08, + "logits/chosen": -1.6961714029312134, + "logits/rejected": -1.386518120765686, + "logps/chosen": -448.45159912109375, + "logps/rejected": -472.0159606933594, + "loss": 0.0208, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19258160889148712, + "rewards/margins": 0.055770616978406906, + "rewards/rejected": -0.24835219979286194, + "step": 7220 + }, + { + "epoch": 0.95, + "learning_rate": 4.4149290651382405e-08, + "logits/chosen": -1.5325816869735718, + "logits/rejected": -1.555955410003662, + "logps/chosen": -445.8809509277344, + "logps/rejected": -574.5386962890625, + "loss": 0.0251, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2440822422504425, + "rewards/margins": 0.09757833182811737, + "rewards/rejected": -0.3416605591773987, + "step": 7230 + }, + { + "epoch": 0.95, + "learning_rate": 4.203808452079211e-08, + "logits/chosen": -1.6468013525009155, + "logits/rejected": -1.599022626876831, + "logps/chosen": -376.5355529785156, + "logps/rejected": -439.79803466796875, + "loss": 0.0259, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.210208460688591, + "rewards/margins": 0.05704888701438904, + "rewards/rejected": -0.26725736260414124, + "step": 7240 + }, + { + "epoch": 0.95, + "learning_rate": 3.9978173515018427e-08, + "logits/chosen": -1.6024280786514282, + "logits/rejected": -1.6201448440551758, + "logps/chosen": -396.4577331542969, + "logps/rejected": -474.0391540527344, + "loss": 0.0224, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20827671885490417, + "rewards/margins": 0.06074458360671997, + "rewards/rejected": -0.26902130246162415, + "step": 7250 + }, + { + "epoch": 0.95, + "learning_rate": 3.7969600622274614e-08, + "logits/chosen": -1.4462767839431763, + "logits/rejected": -0.992064356803894, + "logps/chosen": -533.9465942382812, + "logps/rejected": -521.5758056640625, + "loss": 0.0328, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24340076744556427, + "rewards/margins": 0.08966024219989777, + "rewards/rejected": -0.33306097984313965, + "step": 7260 + }, + { + "epoch": 0.95, + "learning_rate": 3.601240775940151e-08, + "logits/chosen": -1.681206464767456, + "logits/rejected": -1.5457440614700317, + "logps/chosen": -512.3326416015625, + "logps/rejected": -511.33551025390625, + "loss": 0.0199, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19923128187656403, + "rewards/margins": 0.04757522791624069, + "rewards/rejected": -0.2468065321445465, + "step": 7270 + }, + { + "epoch": 0.95, + "learning_rate": 3.410663577099071e-08, + "logits/chosen": -1.5055510997772217, + "logits/rejected": -1.2968648672103882, + "logps/chosen": -445.6414489746094, + "logps/rejected": -499.5888671875, + "loss": 0.0204, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.20774845778942108, + "rewards/margins": 0.08748999238014221, + "rewards/rejected": -0.2952384352684021, + "step": 7280 + }, + { + "epoch": 0.95, + "learning_rate": 3.2252324428534986e-08, + "logits/chosen": -1.642435073852539, + "logits/rejected": -1.3985035419464111, + "logps/chosen": -430.88116455078125, + "logps/rejected": -523.8838500976562, + "loss": 0.0211, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2069445550441742, + "rewards/margins": 0.13603080809116364, + "rewards/rejected": -0.34297531843185425, + "step": 7290 + }, + { + "epoch": 0.96, + "learning_rate": 3.0449512429594486e-08, + "logits/chosen": -1.618054986000061, + "logits/rejected": -1.4266889095306396, + "logps/chosen": -470.6953125, + "logps/rejected": -619.3988037109375, + "loss": 0.0358, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20840811729431152, + "rewards/margins": 0.14416466653347015, + "rewards/rejected": -0.3525727689266205, + "step": 7300 + }, + { + "epoch": 0.96, + "learning_rate": 2.8698237396992956e-08, + "logits/chosen": -1.5598580837249756, + "logits/rejected": -1.4614779949188232, + "logps/chosen": -452.614013671875, + "logps/rejected": -567.0684204101562, + "loss": 0.0177, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22577671706676483, + "rewards/margins": 0.13550271093845367, + "rewards/rejected": -0.3612794876098633, + "step": 7310 + }, + { + "epoch": 0.96, + "learning_rate": 2.6998535878030584e-08, + "logits/chosen": -1.6863279342651367, + "logits/rejected": -1.4476655721664429, + "logps/chosen": -444.57281494140625, + "logps/rejected": -477.07830810546875, + "loss": 0.0283, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20643453299999237, + "rewards/margins": 0.11491278558969498, + "rewards/rejected": -0.32134729623794556, + "step": 7320 + }, + { + "epoch": 0.96, + "learning_rate": 2.535044334372072e-08, + "logits/chosen": -1.5739470720291138, + "logits/rejected": -1.5450502634048462, + "logps/chosen": -487.28155517578125, + "logps/rejected": -542.0820922851562, + "loss": 0.0277, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22768394649028778, + "rewards/margins": 0.08231634646654129, + "rewards/rejected": -0.3100002706050873, + "step": 7330 + }, + { + "epoch": 0.96, + "learning_rate": 2.3753994188051853e-08, + "logits/chosen": -1.3884375095367432, + "logits/rejected": -1.0182054042816162, + "logps/chosen": -503.9326171875, + "logps/rejected": -588.8248291015625, + "loss": 0.0254, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2552822232246399, + "rewards/margins": 0.114377960562706, + "rewards/rejected": -0.3696601986885071, + "step": 7340 + }, + { + "epoch": 0.96, + "learning_rate": 2.220922172726764e-08, + "logits/chosen": -1.735327959060669, + "logits/rejected": -1.519585371017456, + "logps/chosen": -440.54803466796875, + "logps/rejected": -469.45599365234375, + "loss": 0.0218, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21124538779258728, + "rewards/margins": 0.05926515534520149, + "rewards/rejected": -0.27051058411598206, + "step": 7350 + }, + { + "epoch": 0.96, + "learning_rate": 2.071615819917244e-08, + "logits/chosen": -1.7590490579605103, + "logits/rejected": -1.500055193901062, + "logps/chosen": -447.4695739746094, + "logps/rejected": -540.8870849609375, + "loss": 0.024, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19787776470184326, + "rewards/margins": 0.11433287709951401, + "rewards/rejected": -0.3122106194496155, + "step": 7360 + }, + { + "epoch": 0.96, + "learning_rate": 1.9274834762459393e-08, + "logits/chosen": -1.4206875562667847, + "logits/rejected": -1.2328351736068726, + "logps/chosen": -481.37506103515625, + "logps/rejected": -497.41717529296875, + "loss": 0.0259, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.23238250613212585, + "rewards/margins": 0.06020934507250786, + "rewards/rejected": -0.2925918698310852, + "step": 7370 + }, + { + "epoch": 0.97, + "learning_rate": 1.7885281496058947e-08, + "logits/chosen": -1.5303130149841309, + "logits/rejected": -1.5012943744659424, + "logps/chosen": -420.4921875, + "logps/rejected": -477.54815673828125, + "loss": 0.0151, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.18356028199195862, + "rewards/margins": 0.08185829222202301, + "rewards/rejected": -0.2654185891151428, + "step": 7380 + }, + { + "epoch": 0.97, + "learning_rate": 1.654752739851134e-08, + "logits/chosen": -1.500623106956482, + "logits/rejected": -1.5592161417007446, + "logps/chosen": -385.9029541015625, + "logps/rejected": -537.6184692382812, + "loss": 0.0211, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2086178958415985, + "rewards/margins": 0.11499868333339691, + "rewards/rejected": -0.3236165940761566, + "step": 7390 + }, + { + "epoch": 0.97, + "learning_rate": 1.526160038736235e-08, + "logits/chosen": -1.6749951839447021, + "logits/rejected": -1.4805552959442139, + "logps/chosen": -433.3062438964844, + "logps/rejected": -517.5592041015625, + "loss": 0.0199, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2026170790195465, + "rewards/margins": 0.13305865228176117, + "rewards/rejected": -0.3356757164001465, + "step": 7400 + }, + { + "epoch": 0.97, + "learning_rate": 1.402752729857959e-08, + "logits/chosen": -1.5121300220489502, + "logits/rejected": -1.529975175857544, + "logps/chosen": -430.78106689453125, + "logps/rejected": -469.31854248046875, + "loss": 0.027, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.22110477089881897, + "rewards/margins": 0.037758342921733856, + "rewards/rejected": -0.25886309146881104, + "step": 7410 + }, + { + "epoch": 0.97, + "learning_rate": 1.2845333885992683e-08, + "logits/chosen": -1.4580018520355225, + "logits/rejected": -1.5121757984161377, + "logps/chosen": -416.81170654296875, + "logps/rejected": -495.95404052734375, + "loss": 0.0262, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21470603346824646, + "rewards/margins": 0.09575139731168747, + "rewards/rejected": -0.31045740842819214, + "step": 7420 + }, + { + "epoch": 0.97, + "learning_rate": 1.171504482075675e-08, + "logits/chosen": -1.6380068063735962, + "logits/rejected": -1.5240509510040283, + "logps/chosen": -446.06378173828125, + "logps/rejected": -507.699462890625, + "loss": 0.0276, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23224465548992157, + "rewards/margins": 0.09121891111135483, + "rewards/rejected": -0.3234635591506958, + "step": 7430 + }, + { + "epoch": 0.97, + "learning_rate": 1.0636683690836147e-08, + "logits/chosen": -1.5630649328231812, + "logits/rejected": -1.3046844005584717, + "logps/chosen": -436.377197265625, + "logps/rejected": -488.420654296875, + "loss": 0.0212, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21501310169696808, + "rewards/margins": 0.08404555171728134, + "rewards/rejected": -0.2990586459636688, + "step": 7440 + }, + { + "epoch": 0.97, + "learning_rate": 9.610273000513203e-09, + "logits/chosen": -1.679991364479065, + "logits/rejected": -1.5062822103500366, + "logps/chosen": -418.0048828125, + "logps/rejected": -482.8583068847656, + "loss": 0.032, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22746939957141876, + "rewards/margins": 0.04597032070159912, + "rewards/rejected": -0.2734397351741791, + "step": 7450 + }, + { + "epoch": 0.98, + "learning_rate": 8.635834169918312e-09, + "logits/chosen": -1.6890833377838135, + "logits/rejected": -1.4296929836273193, + "logps/chosen": -463.12799072265625, + "logps/rejected": -477.7140197753906, + "loss": 0.0244, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23178067803382874, + "rewards/margins": 0.04911506921052933, + "rewards/rejected": -0.2808957099914551, + "step": 7460 + }, + { + "epoch": 0.98, + "learning_rate": 7.713387534582506e-09, + "logits/chosen": -1.589603066444397, + "logits/rejected": -1.4947658777236938, + "logps/chosen": -501.105712890625, + "logps/rejected": -561.6420288085938, + "loss": 0.0186, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22854037582874298, + "rewards/margins": 0.07598869502544403, + "rewards/rejected": -0.304529070854187, + "step": 7470 + }, + { + "epoch": 0.98, + "learning_rate": 6.84295234501392e-09, + "logits/chosen": -1.5068248510360718, + "logits/rejected": -1.166831135749817, + "logps/chosen": -445.83587646484375, + "logps/rejected": -549.9104614257812, + "loss": 0.0195, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2248276174068451, + "rewards/margins": 0.15849019587039948, + "rewards/rejected": -0.38331782817840576, + "step": 7480 + }, + { + "epoch": 0.98, + "learning_rate": 6.024546766295325e-09, + "logits/chosen": -1.8001673221588135, + "logits/rejected": -1.413901925086975, + "logps/chosen": -467.92889404296875, + "logps/rejected": -552.5183715820312, + "loss": 0.0191, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22095003724098206, + "rewards/margins": 0.12218403816223145, + "rewards/rejected": -0.3431340754032135, + "step": 7490 + }, + { + "epoch": 0.98, + "learning_rate": 5.2581878777049895e-09, + "logits/chosen": -1.6722362041473389, + "logits/rejected": -1.4077531099319458, + "logps/chosen": -400.8117980957031, + "logps/rejected": -479.1786193847656, + "loss": 0.0211, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2000255584716797, + "rewards/margins": 0.10312291234731674, + "rewards/rejected": -0.30314844846725464, + "step": 7500 + }, + { + "epoch": 0.98, + "learning_rate": 4.543891672361411e-09, + "logits/chosen": -1.7269861698150635, + "logits/rejected": -1.66338312625885, + "logps/chosen": -412.36102294921875, + "logps/rejected": -441.78204345703125, + "loss": 0.0261, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18549641966819763, + "rewards/margins": 0.040950436145067215, + "rewards/rejected": -0.22644683718681335, + "step": 7510 + }, + { + "epoch": 0.98, + "learning_rate": 3.881673056887747e-09, + "logits/chosen": -1.596295714378357, + "logits/rejected": -1.443849802017212, + "logps/chosen": -378.5397644042969, + "logps/rejected": -499.32489013671875, + "loss": 0.0167, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20860883593559265, + "rewards/margins": 0.13590942323207855, + "rewards/rejected": -0.3445182740688324, + "step": 7520 + }, + { + "epoch": 0.99, + "learning_rate": 3.2715458511023425e-09, + "logits/chosen": -1.5938750505447388, + "logits/rejected": -1.442068099975586, + "logps/chosen": -444.24951171875, + "logps/rejected": -574.7880249023438, + "loss": 0.0216, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21984434127807617, + "rewards/margins": 0.13241305947303772, + "rewards/rejected": -0.3522574007511139, + "step": 7530 + }, + { + "epoch": 0.99, + "learning_rate": 2.7135227877289617e-09, + "logits/chosen": -1.2030110359191895, + "logits/rejected": -1.528421401977539, + "logps/chosen": -356.8739318847656, + "logps/rejected": -478.8106994628906, + "loss": 0.0243, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2077951729297638, + "rewards/margins": 0.06836696714162827, + "rewards/rejected": -0.27616217732429504, + "step": 7540 + }, + { + "epoch": 0.99, + "learning_rate": 2.2076155121328326e-09, + "logits/chosen": -1.6814384460449219, + "logits/rejected": -1.49794602394104, + "logps/chosen": -455.85760498046875, + "logps/rejected": -572.0209350585938, + "loss": 0.0215, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20171403884887695, + "rewards/margins": 0.11354148387908936, + "rewards/rejected": -0.3152554929256439, + "step": 7550 + }, + { + "epoch": 0.99, + "learning_rate": 1.7538345820755641e-09, + "logits/chosen": -1.3514631986618042, + "logits/rejected": -1.418828010559082, + "logps/chosen": -439.3419494628906, + "logps/rejected": -534.3173828125, + "loss": 0.0318, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.24673500657081604, + "rewards/margins": 0.07837876677513123, + "rewards/rejected": -0.32511377334594727, + "step": 7560 + }, + { + "epoch": 0.99, + "learning_rate": 1.3521894674961567e-09, + "logits/chosen": -1.5621706247329712, + "logits/rejected": -1.437835454940796, + "logps/chosen": -406.8969421386719, + "logps/rejected": -508.74749755859375, + "loss": 0.0209, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21497616171836853, + "rewards/margins": 0.08241794258356094, + "rewards/rejected": -0.2973940968513489, + "step": 7570 + }, + { + "epoch": 0.99, + "learning_rate": 1.0026885503131023e-09, + "logits/chosen": -1.4738019704818726, + "logits/rejected": -1.5091726779937744, + "logps/chosen": -468.597900390625, + "logps/rejected": -560.5963134765625, + "loss": 0.0261, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2344876229763031, + "rewards/margins": 0.10260176658630371, + "rewards/rejected": -0.3370893895626068, + "step": 7580 + }, + { + "epoch": 0.99, + "learning_rate": 7.053391242492491e-10, + "logits/chosen": -1.478180170059204, + "logits/rejected": -1.4109389781951904, + "logps/chosen": -378.4008483886719, + "logps/rejected": -479.33203125, + "loss": 0.0202, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1765948385000229, + "rewards/margins": 0.11604814231395721, + "rewards/rejected": -0.2926430106163025, + "step": 7590 + }, + { + "epoch": 0.99, + "learning_rate": 4.6014739467997725e-10, + "logits/chosen": -1.3927793502807617, + "logits/rejected": -1.36867356300354, + "logps/chosen": -445.24481201171875, + "logps/rejected": -572.3045654296875, + "loss": 0.0269, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2252415120601654, + "rewards/margins": 0.11578087508678436, + "rewards/rejected": -0.34102240204811096, + "step": 7600 + }, + { + "epoch": 1.0, + "learning_rate": 2.671184785033032e-10, + "logits/chosen": -1.7575019598007202, + "logits/rejected": -1.6783424615859985, + "logps/chosen": -482.564453125, + "logps/rejected": -552.7120361328125, + "loss": 0.0201, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20446915924549103, + "rewards/margins": 0.08912034332752228, + "rewards/rejected": -0.2935895025730133, + "step": 7610 + }, + { + "epoch": 1.0, + "learning_rate": 1.2625640403302054e-10, + "logits/chosen": -1.4191354513168335, + "logits/rejected": -1.4505354166030884, + "logps/chosen": -378.90972900390625, + "logps/rejected": -489.2767639160156, + "loss": 0.0248, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22058920562267303, + "rewards/margins": 0.07962705194950104, + "rewards/rejected": -0.3002162575721741, + "step": 7620 + }, + { + "epoch": 1.0, + "learning_rate": 3.756411091515588e-11, + "logits/chosen": -1.7268844842910767, + "logits/rejected": -1.4856466054916382, + "logps/chosen": -463.380126953125, + "logps/rejected": -495.91876220703125, + "loss": 0.0199, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19431455433368683, + "rewards/margins": 0.09503963589668274, + "rewards/rejected": -0.2893541753292084, + "step": 7630 + }, + { + "epoch": 1.0, + "learning_rate": 1.0434500657963143e-12, + "logits/chosen": -1.7444950342178345, + "logits/rejected": -1.388063669204712, + "logps/chosen": -367.97479248046875, + "logps/rejected": -386.3829650878906, + "loss": 0.0318, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.17011064291000366, + "rewards/margins": 0.06572172790765762, + "rewards/rejected": -0.2358323633670807, + "step": 7640 + }, + { + "epoch": 1.0, + "step": 7642, + "total_flos": 0.0, + "train_loss": 0.0048277108391146795, + "train_runtime": 13417.0015, + "train_samples_per_second": 4.557, + "train_steps_per_second": 0.57 + } + ], + "logging_steps": 10, + "max_steps": 7642, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}