diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6070 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 3873, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.2886597938144328e-09, + "logits/chosen": -4.2921271324157715, + "logits/rejected": -3.812117338180542, + "logps/chosen": -664.6867065429688, + "logps/rejected": -226.7833709716797, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.2886597938144328e-08, + "logits/chosen": -4.003667831420898, + "logits/rejected": -4.013306140899658, + "logps/chosen": -559.2938232421875, + "logps/rejected": -452.70074462890625, + "loss": 0.6948, + "rewards/accuracies": 0.2638888955116272, + "rewards/chosen": -0.007192640565335751, + "rewards/margins": -0.006332792341709137, + "rewards/rejected": -0.000859847932588309, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 2.5773195876288656e-08, + "logits/chosen": -4.353642463684082, + "logits/rejected": -4.292398929595947, + "logps/chosen": -554.0906982421875, + "logps/rejected": -500.97119140625, + "loss": 0.6937, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.001169868279248476, + "rewards/margins": 0.001462915213778615, + "rewards/rejected": -0.0026327825617045164, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 3.865979381443299e-08, + "logits/chosen": -4.102766513824463, + "logits/rejected": -4.200378894805908, + "logps/chosen": -617.0684204101562, + "logps/rejected": -476.2395935058594, + "loss": 0.695, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.001789045287296176, + "rewards/margins": -0.003806379158049822, + "rewards/rejected": 0.005595424212515354, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -4.30725622177124, + "logits/rejected": -4.225460052490234, + "logps/chosen": -497.7335510253906, + "logps/rejected": -415.4452209472656, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0006419029086828232, + "rewards/margins": 0.0031944490037858486, + "rewards/rejected": -0.0025525467935949564, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 6.443298969072164e-08, + "logits/chosen": -3.88063383102417, + "logits/rejected": -3.8105220794677734, + "logps/chosen": -627.067626953125, + "logps/rejected": -403.6964111328125, + "loss": 0.6976, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.00458294665440917, + "rewards/margins": -0.016261283308267593, + "rewards/rejected": 0.011678336188197136, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 7.731958762886598e-08, + "logits/chosen": -4.216163635253906, + "logits/rejected": -4.099843978881836, + "logps/chosen": -470.12115478515625, + "logps/rejected": -469.4156799316406, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0052711316384375095, + "rewards/margins": -0.0011110258055850863, + "rewards/rejected": 0.00638215895742178, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 9.02061855670103e-08, + "logits/chosen": -4.200804233551025, + "logits/rejected": -4.1986494064331055, + "logps/chosen": -648.3743896484375, + "logps/rejected": -488.0792541503906, + "loss": 0.6905, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0002953239600174129, + "rewards/margins": 0.00705097708851099, + "rewards/rejected": -0.006755652371793985, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -4.013070106506348, + "logits/rejected": -4.1909003257751465, + "logps/chosen": -538.8270263671875, + "logps/rejected": -391.4429931640625, + "loss": 0.6919, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.007235817611217499, + "rewards/margins": 0.009682310745120049, + "rewards/rejected": -0.002446494298055768, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 1.1597938144329897e-07, + "logits/chosen": -3.9400150775909424, + "logits/rejected": -3.9281005859375, + "logps/chosen": -588.8606567382812, + "logps/rejected": -484.28839111328125, + "loss": 0.692, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.004685616586357355, + "rewards/margins": 0.008750900626182556, + "rewards/rejected": -0.004065284971147776, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 1.2886597938144328e-07, + "logits/chosen": -4.089522361755371, + "logits/rejected": -4.070917129516602, + "logps/chosen": -573.93310546875, + "logps/rejected": -485.439697265625, + "loss": 0.6906, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.010096365585923195, + "rewards/margins": 0.0105238426476717, + "rewards/rejected": -0.00042747752740979195, + "step": 100 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -4.051466941833496, + "eval_logits/rejected": -4.089292526245117, + "eval_logps/chosen": -549.3683471679688, + "eval_logps/rejected": -437.9984130859375, + "eval_loss": 0.6931844353675842, + "eval_rewards/accuracies": 0.4860000014305115, + "eval_rewards/chosen": 0.0008278049062937498, + "eval_rewards/margins": 0.00017659256991464645, + "eval_rewards/rejected": 0.0006512125837616622, + "eval_runtime": 148.2369, + "eval_samples_per_second": 13.492, + "eval_steps_per_second": 1.686, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 1.417525773195876e-07, + "logits/chosen": -4.135636329650879, + "logits/rejected": -4.231348991394043, + "logps/chosen": -458.62255859375, + "logps/rejected": -379.28094482421875, + "loss": 0.691, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.003453848185017705, + "rewards/margins": 0.0036365636624395847, + "rewards/rejected": -0.00018271691806148738, + "step": 110 + }, + { + "epoch": 0.03, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -4.357504844665527, + "logits/rejected": -4.165073871612549, + "logps/chosen": -392.82891845703125, + "logps/rejected": -405.0232849121094, + "loss": 0.6946, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0029750962276011705, + "rewards/margins": -0.0063691637478768826, + "rewards/rejected": 0.009344260208308697, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 1.6752577319587627e-07, + "logits/chosen": -4.224671840667725, + "logits/rejected": -4.147946357727051, + "logps/chosen": -530.8834228515625, + "logps/rejected": -379.1323547363281, + "loss": 0.6934, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.001731688855215907, + "rewards/margins": -0.0059483470395207405, + "rewards/rejected": 0.004216659348458052, + "step": 130 + }, + { + "epoch": 0.04, + "learning_rate": 1.804123711340206e-07, + "logits/chosen": -4.243564128875732, + "logits/rejected": -4.247513771057129, + "logps/chosen": -555.7782592773438, + "logps/rejected": -475.36474609375, + "loss": 0.6932, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.0019786651246249676, + "rewards/margins": -0.0012128886301070452, + "rewards/rejected": 0.0031915525905787945, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 1.9329896907216494e-07, + "logits/chosen": -4.274221897125244, + "logits/rejected": -4.187704086303711, + "logps/chosen": -537.5848388671875, + "logps/rejected": -444.8301696777344, + "loss": 0.6938, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0053375037387013435, + "rewards/margins": 1.6005151337594725e-05, + "rewards/rejected": 0.005321498028934002, + "step": 150 + }, + { + "epoch": 0.04, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -4.137946128845215, + "logits/rejected": -4.2239580154418945, + "logps/chosen": -473.9889221191406, + "logps/rejected": -406.7872619628906, + "loss": 0.6876, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.015802519395947456, + "rewards/margins": 0.01578442193567753, + "rewards/rejected": 1.8098298824043013e-05, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 2.190721649484536e-07, + "logits/chosen": -4.182999134063721, + "logits/rejected": -4.228874683380127, + "logps/chosen": -527.0224609375, + "logps/rejected": -448.3179626464844, + "loss": 0.6961, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.019780535250902176, + "rewards/margins": -0.006507801823318005, + "rewards/rejected": 0.026288334280252457, + "step": 170 + }, + { + "epoch": 0.05, + "learning_rate": 2.3195876288659794e-07, + "logits/chosen": -4.098742485046387, + "logits/rejected": -4.176650524139404, + "logps/chosen": -594.6082763671875, + "logps/rejected": -453.4469299316406, + "loss": 0.6877, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03402475267648697, + "rewards/margins": 0.01976330205798149, + "rewards/rejected": 0.014261451549828053, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 2.4484536082474224e-07, + "logits/chosen": -4.383849143981934, + "logits/rejected": -4.319648742675781, + "logps/chosen": -584.6770629882812, + "logps/rejected": -408.61370849609375, + "loss": 0.6902, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.025487428531050682, + "rewards/margins": 0.011225923895835876, + "rewards/rejected": 0.014261503703892231, + "step": 190 + }, + { + "epoch": 0.05, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -4.132022857666016, + "logits/rejected": -4.150428295135498, + "logps/chosen": -518.2391357421875, + "logps/rejected": -388.0254821777344, + "loss": 0.6844, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.02871655486524105, + "rewards/margins": 0.022094249725341797, + "rewards/rejected": 0.0066223046742379665, + "step": 200 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -4.050084114074707, + "eval_logits/rejected": -4.087072849273682, + "eval_logps/chosen": -549.053955078125, + "eval_logps/rejected": -437.8319396972656, + "eval_loss": 0.6855266094207764, + "eval_rewards/accuracies": 0.5640000104904175, + "eval_rewards/chosen": 0.032268982380628586, + "eval_rewards/margins": 0.014963901601731777, + "eval_rewards/rejected": 0.017305083572864532, + "eval_runtime": 146.4759, + "eval_samples_per_second": 13.654, + "eval_steps_per_second": 1.707, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 2.706185567010309e-07, + "logits/chosen": -4.013279438018799, + "logits/rejected": -4.023941516876221, + "logps/chosen": -581.2147827148438, + "logps/rejected": -522.2059936523438, + "loss": 0.6897, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.030506515875458717, + "rewards/margins": -0.003913003019988537, + "rewards/rejected": 0.03441951796412468, + "step": 210 + }, + { + "epoch": 0.06, + "learning_rate": 2.835051546391752e-07, + "logits/chosen": -4.057482719421387, + "logits/rejected": -4.15061092376709, + "logps/chosen": -469.19769287109375, + "logps/rejected": -427.91595458984375, + "loss": 0.6862, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03468897193670273, + "rewards/margins": 0.013076464645564556, + "rewards/rejected": 0.021612513810396194, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 2.963917525773196e-07, + "logits/chosen": -4.064208507537842, + "logits/rejected": -4.0749077796936035, + "logps/chosen": -530.9828491210938, + "logps/rejected": -439.2674865722656, + "loss": 0.684, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.04473203793168068, + "rewards/margins": 0.025556465610861778, + "rewards/rejected": 0.01917557418346405, + "step": 230 + }, + { + "epoch": 0.06, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -4.100975036621094, + "logits/rejected": -4.096819877624512, + "logps/chosen": -526.16748046875, + "logps/rejected": -439.20452880859375, + "loss": 0.6816, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.059132933616638184, + "rewards/margins": 0.019664695486426353, + "rewards/rejected": 0.03946823999285698, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 3.2216494845360824e-07, + "logits/chosen": -4.139791488647461, + "logits/rejected": -4.0367560386657715, + "logps/chosen": -521.2025146484375, + "logps/rejected": -388.7520751953125, + "loss": 0.678, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.06324665248394012, + "rewards/margins": 0.04009511321783066, + "rewards/rejected": 0.02315153181552887, + "step": 250 + }, + { + "epoch": 0.07, + "learning_rate": 3.3505154639175255e-07, + "logits/chosen": -4.206658363342285, + "logits/rejected": -4.1859846115112305, + "logps/chosen": -668.1943969726562, + "logps/rejected": -461.34259033203125, + "loss": 0.6769, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.07276210933923721, + "rewards/margins": 0.03734602779150009, + "rewards/rejected": 0.03541607782244682, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 3.4793814432989685e-07, + "logits/chosen": -3.937157392501831, + "logits/rejected": -4.101494312286377, + "logps/chosen": -664.857666015625, + "logps/rejected": -487.4693908691406, + "loss": 0.6737, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10912873595952988, + "rewards/margins": 0.05263194441795349, + "rewards/rejected": 0.056496791541576385, + "step": 270 + }, + { + "epoch": 0.07, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -4.2088494300842285, + "logits/rejected": -4.2679290771484375, + "logps/chosen": -711.7024536132812, + "logps/rejected": -427.2392578125, + "loss": 0.6648, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.12602953612804413, + "rewards/margins": 0.07799698412418365, + "rewards/rejected": 0.048032552003860474, + "step": 280 + }, + { + "epoch": 0.07, + "learning_rate": 3.737113402061856e-07, + "logits/chosen": -4.126033782958984, + "logits/rejected": -4.118724346160889, + "logps/chosen": -527.9533081054688, + "logps/rejected": -442.7091369628906, + "loss": 0.6779, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.06928877532482147, + "rewards/margins": 0.021227989345788956, + "rewards/rejected": 0.04806078225374222, + "step": 290 + }, + { + "epoch": 0.08, + "learning_rate": 3.865979381443299e-07, + "logits/chosen": -4.21649169921875, + "logits/rejected": -4.306222438812256, + "logps/chosen": -558.1029663085938, + "logps/rejected": -426.37646484375, + "loss": 0.6685, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.10165311396121979, + "rewards/margins": 0.0537085235118866, + "rewards/rejected": 0.047944579273462296, + "step": 300 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -4.043172359466553, + "eval_logits/rejected": -4.078800201416016, + "eval_logps/chosen": -548.3015747070312, + "eval_logps/rejected": -437.4681701660156, + "eval_loss": 0.6674865484237671, + "eval_rewards/accuracies": 0.6159999966621399, + "eval_rewards/chosen": 0.10750828683376312, + "eval_rewards/margins": 0.05382777377963066, + "eval_rewards/rejected": 0.053680501878261566, + "eval_runtime": 146.1324, + "eval_samples_per_second": 13.686, + "eval_steps_per_second": 1.711, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 3.9948453608247424e-07, + "logits/chosen": -4.439688205718994, + "logits/rejected": -4.406257629394531, + "logps/chosen": -576.0067138671875, + "logps/rejected": -442.0852966308594, + "loss": 0.6703, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.11767254769802094, + "rewards/margins": 0.05607324838638306, + "rewards/rejected": 0.06159929558634758, + "step": 310 + }, + { + "epoch": 0.08, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -4.040421962738037, + "logits/rejected": -3.995241165161133, + "logps/chosen": -634.211181640625, + "logps/rejected": -444.74945068359375, + "loss": 0.6634, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.14802002906799316, + "rewards/margins": 0.08729343116283417, + "rewards/rejected": 0.0607265941798687, + "step": 320 + }, + { + "epoch": 0.09, + "learning_rate": 4.2525773195876285e-07, + "logits/chosen": -4.013192176818848, + "logits/rejected": -3.9118850231170654, + "logps/chosen": -531.2618408203125, + "logps/rejected": -369.8399963378906, + "loss": 0.6573, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.14214035868644714, + "rewards/margins": 0.0905950665473938, + "rewards/rejected": 0.051545269787311554, + "step": 330 + }, + { + "epoch": 0.09, + "learning_rate": 4.381443298969072e-07, + "logits/chosen": -4.299261569976807, + "logits/rejected": -4.219182968139648, + "logps/chosen": -580.9090576171875, + "logps/rejected": -442.6720275878906, + "loss": 0.6589, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.16068990528583527, + "rewards/margins": 0.07925193011760712, + "rewards/rejected": 0.08143799006938934, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 4.510309278350515e-07, + "logits/chosen": -4.036250114440918, + "logits/rejected": -3.9510204792022705, + "logps/chosen": -485.1849670410156, + "logps/rejected": -423.96746826171875, + "loss": 0.6691, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.16000322997570038, + "rewards/margins": 0.0713193342089653, + "rewards/rejected": 0.08868391811847687, + "step": 350 + }, + { + "epoch": 0.09, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -3.97419810295105, + "logits/rejected": -3.945896863937378, + "logps/chosen": -588.8265380859375, + "logps/rejected": -500.585205078125, + "loss": 0.664, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.16587187349796295, + "rewards/margins": 0.048247091472148895, + "rewards/rejected": 0.11762477457523346, + "step": 360 + }, + { + "epoch": 0.1, + "learning_rate": 4.7680412371134024e-07, + "logits/chosen": -4.279057502746582, + "logits/rejected": -4.3186540603637695, + "logps/chosen": -577.9805908203125, + "logps/rejected": -508.83880615234375, + "loss": 0.6621, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1850723773241043, + "rewards/margins": 0.04405021667480469, + "rewards/rejected": 0.14102217555046082, + "step": 370 + }, + { + "epoch": 0.1, + "learning_rate": 4.896907216494845e-07, + "logits/chosen": -4.560557842254639, + "logits/rejected": -4.472795009613037, + "logps/chosen": -585.3865966796875, + "logps/rejected": -427.63092041015625, + "loss": 0.6453, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.22210833430290222, + "rewards/margins": 0.14162525534629822, + "rewards/rejected": 0.0804830864071846, + "step": 380 + }, + { + "epoch": 0.1, + "learning_rate": 4.997130559540889e-07, + "logits/chosen": -4.149146556854248, + "logits/rejected": -4.130012035369873, + "logps/chosen": -458.86334228515625, + "logps/rejected": -402.4290466308594, + "loss": 0.6574, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.155195951461792, + "rewards/margins": 0.0719287320971489, + "rewards/rejected": 0.0832671970129013, + "step": 390 + }, + { + "epoch": 0.1, + "learning_rate": 4.982783357245337e-07, + "logits/chosen": -4.101078987121582, + "logits/rejected": -3.9474518299102783, + "logps/chosen": -594.5633544921875, + "logps/rejected": -459.3837890625, + "loss": 0.6579, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.22732532024383545, + "rewards/margins": 0.15504160523414612, + "rewards/rejected": 0.07228370010852814, + "step": 400 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -4.030913829803467, + "eval_logits/rejected": -4.064504623413086, + "eval_logps/chosen": -547.223388671875, + "eval_logps/rejected": -437.06365966796875, + "eval_loss": 0.6425994038581848, + "eval_rewards/accuracies": 0.6430000066757202, + "eval_rewards/chosen": 0.21532239019870758, + "eval_rewards/margins": 0.12119224667549133, + "eval_rewards/rejected": 0.09413015842437744, + "eval_runtime": 146.406, + "eval_samples_per_second": 13.661, + "eval_steps_per_second": 1.708, + "step": 400 + }, + { + "epoch": 0.11, + "learning_rate": 4.968436154949784e-07, + "logits/chosen": -4.3761677742004395, + "logits/rejected": -4.4744062423706055, + "logps/chosen": -486.56976318359375, + "logps/rejected": -388.5422058105469, + "loss": 0.6246, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2251121997833252, + "rewards/margins": 0.1718236207962036, + "rewards/rejected": 0.05328858643770218, + "step": 410 + }, + { + "epoch": 0.11, + "learning_rate": 4.954088952654232e-07, + "logits/chosen": -3.916259288787842, + "logits/rejected": -4.022424221038818, + "logps/chosen": -609.468017578125, + "logps/rejected": -489.47503662109375, + "loss": 0.6397, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.2788829207420349, + "rewards/margins": 0.13126085698604584, + "rewards/rejected": 0.14762204885482788, + "step": 420 + }, + { + "epoch": 0.11, + "learning_rate": 4.93974175035868e-07, + "logits/chosen": -4.073642253875732, + "logits/rejected": -3.992410182952881, + "logps/chosen": -589.1423950195312, + "logps/rejected": -387.6160583496094, + "loss": 0.625, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.27620893716812134, + "rewards/margins": 0.2046860158443451, + "rewards/rejected": 0.07152291387319565, + "step": 430 + }, + { + "epoch": 0.11, + "learning_rate": 4.925394548063128e-07, + "logits/chosen": -4.047796249389648, + "logits/rejected": -4.099135875701904, + "logps/chosen": -556.1654663085938, + "logps/rejected": -459.1832580566406, + "loss": 0.6249, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.26381629705429077, + "rewards/margins": 0.18791969120502472, + "rewards/rejected": 0.07589660584926605, + "step": 440 + }, + { + "epoch": 0.12, + "learning_rate": 4.911047345767575e-07, + "logits/chosen": -3.9531607627868652, + "logits/rejected": -3.9822494983673096, + "logps/chosen": -603.289306640625, + "logps/rejected": -452.654541015625, + "loss": 0.6246, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.24003124237060547, + "rewards/margins": 0.1340305507183075, + "rewards/rejected": 0.10600068420171738, + "step": 450 + }, + { + "epoch": 0.12, + "learning_rate": 4.896700143472023e-07, + "logits/chosen": -4.046868801116943, + "logits/rejected": -3.973362684249878, + "logps/chosen": -543.8755493164062, + "logps/rejected": -415.2347106933594, + "loss": 0.6243, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.27269724011421204, + "rewards/margins": 0.13904382288455963, + "rewards/rejected": 0.1336534321308136, + "step": 460 + }, + { + "epoch": 0.12, + "learning_rate": 4.88235294117647e-07, + "logits/chosen": -3.9586892127990723, + "logits/rejected": -3.949618101119995, + "logps/chosen": -521.1124267578125, + "logps/rejected": -476.64599609375, + "loss": 0.6414, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.26220518350601196, + "rewards/margins": 0.10140831768512726, + "rewards/rejected": 0.1607969105243683, + "step": 470 + }, + { + "epoch": 0.12, + "learning_rate": 4.868005738880918e-07, + "logits/chosen": -4.340029716491699, + "logits/rejected": -4.296602249145508, + "logps/chosen": -498.50628662109375, + "logps/rejected": -436.04193115234375, + "loss": 0.6262, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.23516115546226501, + "rewards/margins": 0.14310702681541443, + "rewards/rejected": 0.09205415844917297, + "step": 480 + }, + { + "epoch": 0.13, + "learning_rate": 4.853658536585365e-07, + "logits/chosen": -3.9785568714141846, + "logits/rejected": -3.9936375617980957, + "logps/chosen": -535.5206298828125, + "logps/rejected": -418.255126953125, + "loss": 0.6359, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.26155534386634827, + "rewards/margins": 0.19521105289459229, + "rewards/rejected": 0.06634429097175598, + "step": 490 + }, + { + "epoch": 0.13, + "learning_rate": 4.839311334289813e-07, + "logits/chosen": -4.171419620513916, + "logits/rejected": -4.2884111404418945, + "logps/chosen": -497.77874755859375, + "logps/rejected": -401.29046630859375, + "loss": 0.6331, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.2628328502178192, + "rewards/margins": 0.19560939073562622, + "rewards/rejected": 0.06722346693277359, + "step": 500 + }, + { + "epoch": 0.13, + "eval_logits/chosen": -4.0221147537231445, + "eval_logits/rejected": -4.052542686462402, + "eval_logps/chosen": -546.3970336914062, + "eval_logps/rejected": -436.89892578125, + "eval_loss": 0.6240983605384827, + "eval_rewards/accuracies": 0.6430000066757202, + "eval_rewards/chosen": 0.2979632318019867, + "eval_rewards/margins": 0.18736404180526733, + "eval_rewards/rejected": 0.11059919744729996, + "eval_runtime": 146.1671, + "eval_samples_per_second": 13.683, + "eval_steps_per_second": 1.71, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 4.824964131994261e-07, + "logits/chosen": -4.073412895202637, + "logits/rejected": -4.001163959503174, + "logps/chosen": -588.8052978515625, + "logps/rejected": -525.47314453125, + "loss": 0.6532, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.2637428641319275, + "rewards/margins": 0.08834028244018555, + "rewards/rejected": 0.17540256679058075, + "step": 510 + }, + { + "epoch": 0.13, + "learning_rate": 4.810616929698708e-07, + "logits/chosen": -4.054637908935547, + "logits/rejected": -4.115445613861084, + "logps/chosen": -586.9202270507812, + "logps/rejected": -401.8949890136719, + "loss": 0.6252, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.2919687032699585, + "rewards/margins": 0.22300024330615997, + "rewards/rejected": 0.06896845996379852, + "step": 520 + }, + { + "epoch": 0.14, + "learning_rate": 4.796269727403156e-07, + "logits/chosen": -4.366249084472656, + "logits/rejected": -4.296690940856934, + "logps/chosen": -501.8008728027344, + "logps/rejected": -414.6390686035156, + "loss": 0.6275, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.2946510314941406, + "rewards/margins": 0.20942220091819763, + "rewards/rejected": 0.0852288231253624, + "step": 530 + }, + { + "epoch": 0.14, + "learning_rate": 4.781922525107604e-07, + "logits/chosen": -4.214944362640381, + "logits/rejected": -4.242516040802002, + "logps/chosen": -582.1668701171875, + "logps/rejected": -438.54376220703125, + "loss": 0.6205, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3577159643173218, + "rewards/margins": 0.19794291257858276, + "rewards/rejected": 0.159773051738739, + "step": 540 + }, + { + "epoch": 0.14, + "learning_rate": 4.7675753228120513e-07, + "logits/chosen": -4.113412380218506, + "logits/rejected": -3.993567705154419, + "logps/chosen": -564.5824584960938, + "logps/rejected": -398.8680419921875, + "loss": 0.6193, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.316133588552475, + "rewards/margins": 0.27710846066474915, + "rewards/rejected": 0.039025187492370605, + "step": 550 + }, + { + "epoch": 0.14, + "learning_rate": 4.7532281205164993e-07, + "logits/chosen": -4.085113048553467, + "logits/rejected": -4.045032024383545, + "logps/chosen": -643.7376708984375, + "logps/rejected": -498.99859619140625, + "loss": 0.6274, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.34889236092567444, + "rewards/margins": 0.22896642982959747, + "rewards/rejected": 0.11992595344781876, + "step": 560 + }, + { + "epoch": 0.15, + "learning_rate": 4.738880918220947e-07, + "logits/chosen": -4.168662071228027, + "logits/rejected": -4.141668319702148, + "logps/chosen": -560.7593994140625, + "logps/rejected": -406.78143310546875, + "loss": 0.6173, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.331714928150177, + "rewards/margins": 0.245022252202034, + "rewards/rejected": 0.0866926982998848, + "step": 570 + }, + { + "epoch": 0.15, + "learning_rate": 4.7245337159253943e-07, + "logits/chosen": -4.329155445098877, + "logits/rejected": -4.298244476318359, + "logps/chosen": -563.4876708984375, + "logps/rejected": -376.99725341796875, + "loss": 0.6147, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.2851625382900238, + "rewards/margins": 0.19439759850502014, + "rewards/rejected": 0.09076493978500366, + "step": 580 + }, + { + "epoch": 0.15, + "learning_rate": 4.710186513629842e-07, + "logits/chosen": -4.025614261627197, + "logits/rejected": -3.995368242263794, + "logps/chosen": -570.0155029296875, + "logps/rejected": -456.23223876953125, + "loss": 0.6397, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.38556593656539917, + "rewards/margins": 0.24411602318286896, + "rewards/rejected": 0.1414499133825302, + "step": 590 + }, + { + "epoch": 0.15, + "learning_rate": 4.69583931133429e-07, + "logits/chosen": -4.11724853515625, + "logits/rejected": -4.225184440612793, + "logps/chosen": -600.27685546875, + "logps/rejected": -416.496826171875, + "loss": 0.6229, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.3451527953147888, + "rewards/margins": 0.26950401067733765, + "rewards/rejected": 0.07564878463745117, + "step": 600 + }, + { + "epoch": 0.15, + "eval_logits/chosen": -4.0116496086120605, + "eval_logits/rejected": -4.040153503417969, + "eval_logps/chosen": -545.94873046875, + "eval_logps/rejected": -436.90228271484375, + "eval_loss": 0.6138368844985962, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": 0.3427916169166565, + "eval_rewards/margins": 0.2325276881456375, + "eval_rewards/rejected": 0.11026395857334137, + "eval_runtime": 145.937, + "eval_samples_per_second": 13.705, + "eval_steps_per_second": 1.713, + "step": 600 + }, + { + "epoch": 0.16, + "learning_rate": 4.681492109038737e-07, + "logits/chosen": -4.138489723205566, + "logits/rejected": -4.042520046234131, + "logps/chosen": -544.0598754882812, + "logps/rejected": -387.63031005859375, + "loss": 0.5897, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.3607966899871826, + "rewards/margins": 0.3171598017215729, + "rewards/rejected": 0.043636929243803024, + "step": 610 + }, + { + "epoch": 0.16, + "learning_rate": 4.667144906743185e-07, + "logits/chosen": -4.025771617889404, + "logits/rejected": -3.9127840995788574, + "logps/chosen": -517.0219116210938, + "logps/rejected": -439.63800048828125, + "loss": 0.5769, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3815905749797821, + "rewards/margins": 0.3481997549533844, + "rewards/rejected": 0.033390797674655914, + "step": 620 + }, + { + "epoch": 0.16, + "learning_rate": 4.6527977044476324e-07, + "logits/chosen": -4.107082843780518, + "logits/rejected": -4.197465419769287, + "logps/chosen": -576.8883056640625, + "logps/rejected": -426.826904296875, + "loss": 0.5992, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.4150086045265198, + "rewards/margins": 0.3171616792678833, + "rewards/rejected": 0.09784691035747528, + "step": 630 + }, + { + "epoch": 0.17, + "learning_rate": 4.6384505021520805e-07, + "logits/chosen": -4.232905864715576, + "logits/rejected": -4.251595497131348, + "logps/chosen": -526.0496215820312, + "logps/rejected": -378.66778564453125, + "loss": 0.6053, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.36744990944862366, + "rewards/margins": 0.237229585647583, + "rewards/rejected": 0.13022030889987946, + "step": 640 + }, + { + "epoch": 0.17, + "learning_rate": 4.6241032998565275e-07, + "logits/chosen": -3.990309953689575, + "logits/rejected": -3.9665799140930176, + "logps/chosen": -535.3065795898438, + "logps/rejected": -371.23883056640625, + "loss": 0.5688, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.35981225967407227, + "rewards/margins": 0.3149539828300476, + "rewards/rejected": 0.044858284294605255, + "step": 650 + }, + { + "epoch": 0.17, + "learning_rate": 4.6097560975609755e-07, + "logits/chosen": -4.290364742279053, + "logits/rejected": -4.3908371925354, + "logps/chosen": -602.7872314453125, + "logps/rejected": -467.87841796875, + "loss": 0.6302, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.4449954628944397, + "rewards/margins": 0.33234038949012756, + "rewards/rejected": 0.11265511810779572, + "step": 660 + }, + { + "epoch": 0.17, + "learning_rate": 4.595408895265423e-07, + "logits/chosen": -4.181097507476807, + "logits/rejected": -4.184117317199707, + "logps/chosen": -562.30908203125, + "logps/rejected": -419.0519104003906, + "loss": 0.6057, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.41988906264305115, + "rewards/margins": 0.33613476157188416, + "rewards/rejected": 0.08375430852174759, + "step": 670 + }, + { + "epoch": 0.18, + "learning_rate": 4.581061692969871e-07, + "logits/chosen": -3.9935073852539062, + "logits/rejected": -4.078420162200928, + "logps/chosen": -594.1588134765625, + "logps/rejected": -442.93218994140625, + "loss": 0.5912, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.3990897536277771, + "rewards/margins": 0.3166866898536682, + "rewards/rejected": 0.08240304887294769, + "step": 680 + }, + { + "epoch": 0.18, + "learning_rate": 4.566714490674318e-07, + "logits/chosen": -3.8547301292419434, + "logits/rejected": -3.8780627250671387, + "logps/chosen": -467.4917907714844, + "logps/rejected": -409.6250915527344, + "loss": 0.5982, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3181690573692322, + "rewards/margins": 0.30037710070610046, + "rewards/rejected": 0.01779193803668022, + "step": 690 + }, + { + "epoch": 0.18, + "learning_rate": 4.552367288378766e-07, + "logits/chosen": -3.856755018234253, + "logits/rejected": -3.7566399574279785, + "logps/chosen": -496.44580078125, + "logps/rejected": -416.92791748046875, + "loss": 0.6008, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.3064565062522888, + "rewards/margins": 0.19347265362739563, + "rewards/rejected": 0.1129838228225708, + "step": 700 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -4.004153728485107, + "eval_logits/rejected": -4.03006649017334, + "eval_logps/chosen": -545.5549926757812, + "eval_logps/rejected": -437.035400390625, + "eval_loss": 0.6053361892700195, + "eval_rewards/accuracies": 0.656000018119812, + "eval_rewards/chosen": 0.3821641206741333, + "eval_rewards/margins": 0.28520864248275757, + "eval_rewards/rejected": 0.09695547074079514, + "eval_runtime": 146.9276, + "eval_samples_per_second": 13.612, + "eval_steps_per_second": 1.702, + "step": 700 + }, + { + "epoch": 0.18, + "learning_rate": 4.5380200860832136e-07, + "logits/chosen": -4.1166276931762695, + "logits/rejected": -4.0413994789123535, + "logps/chosen": -559.1090087890625, + "logps/rejected": -445.1997985839844, + "loss": 0.6028, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.37777018547058105, + "rewards/margins": 0.23465311527252197, + "rewards/rejected": 0.1431170552968979, + "step": 710 + }, + { + "epoch": 0.19, + "learning_rate": 4.5236728837876616e-07, + "logits/chosen": -4.048049449920654, + "logits/rejected": -3.983046293258667, + "logps/chosen": -521.6533813476562, + "logps/rejected": -423.5769958496094, + "loss": 0.6113, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.3277244567871094, + "rewards/margins": 0.251709520816803, + "rewards/rejected": 0.07601495087146759, + "step": 720 + }, + { + "epoch": 0.19, + "learning_rate": 4.5093256814921086e-07, + "logits/chosen": -3.96891713142395, + "logits/rejected": -4.157193660736084, + "logps/chosen": -527.0986328125, + "logps/rejected": -350.09735107421875, + "loss": 0.6191, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.3868439793586731, + "rewards/margins": 0.27587562799453735, + "rewards/rejected": 0.11096830666065216, + "step": 730 + }, + { + "epoch": 0.19, + "learning_rate": 4.4949784791965567e-07, + "logits/chosen": -4.01112174987793, + "logits/rejected": -3.9385008811950684, + "logps/chosen": -575.333740234375, + "logps/rejected": -411.80865478515625, + "loss": 0.6002, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4110191762447357, + "rewards/margins": 0.3183595538139343, + "rewards/rejected": 0.0926596075296402, + "step": 740 + }, + { + "epoch": 0.19, + "learning_rate": 4.480631276901004e-07, + "logits/chosen": -3.8952746391296387, + "logits/rejected": -3.9051570892333984, + "logps/chosen": -587.7459716796875, + "logps/rejected": -426.0521545410156, + "loss": 0.6019, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4798852503299713, + "rewards/margins": 0.3581300377845764, + "rewards/rejected": 0.12175522744655609, + "step": 750 + }, + { + "epoch": 0.2, + "learning_rate": 4.466284074605452e-07, + "logits/chosen": -4.128601551055908, + "logits/rejected": -4.192216396331787, + "logps/chosen": -555.259033203125, + "logps/rejected": -431.3056640625, + "loss": 0.5987, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4251033365726471, + "rewards/margins": 0.35620301961898804, + "rewards/rejected": 0.06890030205249786, + "step": 760 + }, + { + "epoch": 0.2, + "learning_rate": 4.451936872309899e-07, + "logits/chosen": -4.191853046417236, + "logits/rejected": -4.073651313781738, + "logps/chosen": -564.2633056640625, + "logps/rejected": -462.38232421875, + "loss": 0.5874, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.46583813428878784, + "rewards/margins": 0.4038007855415344, + "rewards/rejected": 0.0620373897254467, + "step": 770 + }, + { + "epoch": 0.2, + "learning_rate": 4.437589670014347e-07, + "logits/chosen": -3.9436306953430176, + "logits/rejected": -4.079471111297607, + "logps/chosen": -569.0813598632812, + "logps/rejected": -438.1226501464844, + "loss": 0.592, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.32991427183151245, + "rewards/margins": 0.34178251028060913, + "rewards/rejected": -0.011868256144225597, + "step": 780 + }, + { + "epoch": 0.2, + "learning_rate": 4.423242467718795e-07, + "logits/chosen": -4.243984699249268, + "logits/rejected": -4.39116907119751, + "logps/chosen": -674.5192260742188, + "logps/rejected": -492.4161682128906, + "loss": 0.5828, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5247339010238647, + "rewards/margins": 0.3629537522792816, + "rewards/rejected": 0.1617802083492279, + "step": 790 + }, + { + "epoch": 0.21, + "learning_rate": 4.4088952654232423e-07, + "logits/chosen": -3.945283889770508, + "logits/rejected": -3.931304454803467, + "logps/chosen": -520.6378173828125, + "logps/rejected": -340.75103759765625, + "loss": 0.5751, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.41377678513526917, + "rewards/margins": 0.4200451374053955, + "rewards/rejected": -0.006268366239964962, + "step": 800 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -4.009909629821777, + "eval_logits/rejected": -4.035899639129639, + "eval_logps/chosen": -545.2993774414062, + "eval_logps/rejected": -437.1260070800781, + "eval_loss": 0.5998407006263733, + "eval_rewards/accuracies": 0.6539999842643738, + "eval_rewards/chosen": 0.4077303409576416, + "eval_rewards/margins": 0.3198363780975342, + "eval_rewards/rejected": 0.0878940224647522, + "eval_runtime": 145.3508, + "eval_samples_per_second": 13.76, + "eval_steps_per_second": 1.72, + "step": 800 + }, + { + "epoch": 0.21, + "learning_rate": 4.39454806312769e-07, + "logits/chosen": -3.9220452308654785, + "logits/rejected": -4.041108131408691, + "logps/chosen": -615.2744750976562, + "logps/rejected": -500.8890686035156, + "loss": 0.5732, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.49199342727661133, + "rewards/margins": 0.3499099612236023, + "rewards/rejected": 0.1420835256576538, + "step": 810 + }, + { + "epoch": 0.21, + "learning_rate": 4.380200860832138e-07, + "logits/chosen": -3.90093994140625, + "logits/rejected": -3.9337615966796875, + "logps/chosen": -616.523681640625, + "logps/rejected": -451.52996826171875, + "loss": 0.5575, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5319877862930298, + "rewards/margins": 0.5240375399589539, + "rewards/rejected": 0.007950320839881897, + "step": 820 + }, + { + "epoch": 0.21, + "learning_rate": 4.3658536585365853e-07, + "logits/chosen": -4.105984687805176, + "logits/rejected": -4.126413345336914, + "logps/chosen": -491.058349609375, + "logps/rejected": -472.4222106933594, + "loss": 0.6289, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.3649570047855377, + "rewards/margins": 0.19882622361183167, + "rewards/rejected": 0.16613081097602844, + "step": 830 + }, + { + "epoch": 0.22, + "learning_rate": 4.351506456241033e-07, + "logits/chosen": -4.366209983825684, + "logits/rejected": -4.29564905166626, + "logps/chosen": -573.9385375976562, + "logps/rejected": -327.5928649902344, + "loss": 0.5732, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.47058337926864624, + "rewards/margins": 0.47838321328163147, + "rewards/rejected": -0.007799782790243626, + "step": 840 + }, + { + "epoch": 0.22, + "learning_rate": 4.3371592539454804e-07, + "logits/chosen": -3.7974257469177246, + "logits/rejected": -3.734402894973755, + "logps/chosen": -471.0333557128906, + "logps/rejected": -374.65673828125, + "loss": 0.6266, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.34169143438339233, + "rewards/margins": 0.2484813630580902, + "rewards/rejected": 0.09321005642414093, + "step": 850 + }, + { + "epoch": 0.22, + "learning_rate": 4.322812051649928e-07, + "logits/chosen": -4.0287275314331055, + "logits/rejected": -4.05717134475708, + "logps/chosen": -469.2396545410156, + "logps/rejected": -434.5414123535156, + "loss": 0.5803, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.4505770206451416, + "rewards/margins": 0.4199690818786621, + "rewards/rejected": 0.03060789778828621, + "step": 860 + }, + { + "epoch": 0.22, + "learning_rate": 4.308464849354376e-07, + "logits/chosen": -3.7610325813293457, + "logits/rejected": -3.8557701110839844, + "logps/chosen": -529.0855712890625, + "logps/rejected": -426.6482849121094, + "loss": 0.5811, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.43691587448120117, + "rewards/margins": 0.37098073959350586, + "rewards/rejected": 0.0659351572394371, + "step": 870 + }, + { + "epoch": 0.23, + "learning_rate": 4.294117647058823e-07, + "logits/chosen": -4.040841102600098, + "logits/rejected": -4.051581382751465, + "logps/chosen": -590.5636596679688, + "logps/rejected": -456.1898498535156, + "loss": 0.582, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3900124430656433, + "rewards/margins": 0.41093358397483826, + "rewards/rejected": -0.020921092480421066, + "step": 880 + }, + { + "epoch": 0.23, + "learning_rate": 4.279770444763271e-07, + "logits/chosen": -4.3830671310424805, + "logits/rejected": -4.194474220275879, + "logps/chosen": -587.708251953125, + "logps/rejected": -454.063720703125, + "loss": 0.6117, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4363631308078766, + "rewards/margins": 0.3432873785495758, + "rewards/rejected": 0.09307573735713959, + "step": 890 + }, + { + "epoch": 0.23, + "learning_rate": 4.2654232424677185e-07, + "logits/chosen": -4.166562080383301, + "logits/rejected": -4.1289520263671875, + "logps/chosen": -507.2445373535156, + "logps/rejected": -396.4598083496094, + "loss": 0.6485, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.27186277508735657, + "rewards/margins": 0.16729417443275452, + "rewards/rejected": 0.10456860065460205, + "step": 900 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -3.9936437606811523, + "eval_logits/rejected": -4.016723155975342, + "eval_logps/chosen": -545.1683349609375, + "eval_logps/rejected": -437.3501281738281, + "eval_loss": 0.5922096371650696, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": 0.4208315312862396, + "eval_rewards/margins": 0.3553457260131836, + "eval_rewards/rejected": 0.06548583507537842, + "eval_runtime": 146.2261, + "eval_samples_per_second": 13.677, + "eval_steps_per_second": 1.71, + "step": 900 + }, + { + "epoch": 0.23, + "learning_rate": 4.2510760401721665e-07, + "logits/chosen": -4.098907947540283, + "logits/rejected": -4.098723411560059, + "logps/chosen": -650.6366577148438, + "logps/rejected": -495.94879150390625, + "loss": 0.5866, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5507170557975769, + "rewards/margins": 0.5398961305618286, + "rewards/rejected": 0.010820944793522358, + "step": 910 + }, + { + "epoch": 0.24, + "learning_rate": 4.2367288378766135e-07, + "logits/chosen": -4.1348772048950195, + "logits/rejected": -4.166952610015869, + "logps/chosen": -591.7069702148438, + "logps/rejected": -477.39776611328125, + "loss": 0.5992, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.47026628255844116, + "rewards/margins": 0.3723362982273102, + "rewards/rejected": 0.09792999923229218, + "step": 920 + }, + { + "epoch": 0.24, + "learning_rate": 4.2223816355810615e-07, + "logits/chosen": -4.0753397941589355, + "logits/rejected": -4.123549461364746, + "logps/chosen": -559.75634765625, + "logps/rejected": -458.8775329589844, + "loss": 0.5799, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.39656537771224976, + "rewards/margins": 0.3553692698478699, + "rewards/rejected": 0.04119610786437988, + "step": 930 + }, + { + "epoch": 0.24, + "learning_rate": 4.208034433285509e-07, + "logits/chosen": -4.269906520843506, + "logits/rejected": -4.303974628448486, + "logps/chosen": -593.5145874023438, + "logps/rejected": -494.8085021972656, + "loss": 0.5647, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5839260816574097, + "rewards/margins": 0.44306641817092896, + "rewards/rejected": 0.14085964858531952, + "step": 940 + }, + { + "epoch": 0.25, + "learning_rate": 4.1936872309899565e-07, + "logits/chosen": -3.792731523513794, + "logits/rejected": -3.8398139476776123, + "logps/chosen": -489.6437072753906, + "logps/rejected": -401.5340576171875, + "loss": 0.6287, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.2587449550628662, + "rewards/margins": 0.2045062780380249, + "rewards/rejected": 0.054238706827163696, + "step": 950 + }, + { + "epoch": 0.25, + "learning_rate": 4.179340028694404e-07, + "logits/chosen": -4.206066131591797, + "logits/rejected": -4.116007328033447, + "logps/chosen": -482.42816162109375, + "logps/rejected": -382.22845458984375, + "loss": 0.5962, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.3317318558692932, + "rewards/margins": 0.34027567505836487, + "rewards/rejected": -0.008543826639652252, + "step": 960 + }, + { + "epoch": 0.25, + "learning_rate": 4.164992826398852e-07, + "logits/chosen": -4.003951549530029, + "logits/rejected": -3.9982573986053467, + "logps/chosen": -494.906005859375, + "logps/rejected": -401.5001220703125, + "loss": 0.6299, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.266286164522171, + "rewards/margins": 0.27161869406700134, + "rewards/rejected": -0.005332520697265863, + "step": 970 + }, + { + "epoch": 0.25, + "learning_rate": 4.1506456241032996e-07, + "logits/chosen": -4.05717134475708, + "logits/rejected": -3.8585472106933594, + "logps/chosen": -563.6212768554688, + "logps/rejected": -387.9486999511719, + "loss": 0.5832, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3436369299888611, + "rewards/margins": 0.378772109746933, + "rewards/rejected": -0.035135164856910706, + "step": 980 + }, + { + "epoch": 0.26, + "learning_rate": 4.136298421807747e-07, + "logits/chosen": -4.064385890960693, + "logits/rejected": -4.122750759124756, + "logps/chosen": -587.16162109375, + "logps/rejected": -431.41796875, + "loss": 0.572, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.4710933566093445, + "rewards/margins": 0.4460281431674957, + "rewards/rejected": 0.02506522461771965, + "step": 990 + }, + { + "epoch": 0.26, + "learning_rate": 4.1219512195121946e-07, + "logits/chosen": -4.021462917327881, + "logits/rejected": -3.989718198776245, + "logps/chosen": -584.3521728515625, + "logps/rejected": -455.634033203125, + "loss": 0.6164, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.48522061109542847, + "rewards/margins": 0.3698544502258301, + "rewards/rejected": 0.11536619812250137, + "step": 1000 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -3.986903429031372, + "eval_logits/rejected": -4.009212017059326, + "eval_logps/chosen": -545.3309326171875, + "eval_logps/rejected": -437.7181701660156, + "eval_loss": 0.5879542827606201, + "eval_rewards/accuracies": 0.6620000004768372, + "eval_rewards/chosen": 0.40457141399383545, + "eval_rewards/margins": 0.37589016556739807, + "eval_rewards/rejected": 0.028681199997663498, + "eval_runtime": 146.6025, + "eval_samples_per_second": 13.642, + "eval_steps_per_second": 1.705, + "step": 1000 + }, + { + "epoch": 0.26, + "learning_rate": 4.1076040172166427e-07, + "logits/chosen": -4.139552116394043, + "logits/rejected": -3.9534621238708496, + "logps/chosen": -571.7590942382812, + "logps/rejected": -444.6793518066406, + "loss": 0.6451, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.3784537613391876, + "rewards/margins": 0.23798270523548126, + "rewards/rejected": 0.14047105610370636, + "step": 1010 + }, + { + "epoch": 0.26, + "learning_rate": 4.09325681492109e-07, + "logits/chosen": -4.049252510070801, + "logits/rejected": -4.108782768249512, + "logps/chosen": -644.1297607421875, + "logps/rejected": -546.4414672851562, + "loss": 0.6455, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.3904695510864258, + "rewards/margins": 0.23047880828380585, + "rewards/rejected": 0.15999077260494232, + "step": 1020 + }, + { + "epoch": 0.27, + "learning_rate": 4.0789096126255377e-07, + "logits/chosen": -4.110049247741699, + "logits/rejected": -4.13530969619751, + "logps/chosen": -601.107666015625, + "logps/rejected": -430.6465759277344, + "loss": 0.5972, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.39451712369918823, + "rewards/margins": 0.3030509948730469, + "rewards/rejected": 0.09146615862846375, + "step": 1030 + }, + { + "epoch": 0.27, + "learning_rate": 4.064562410329985e-07, + "logits/chosen": -4.069981575012207, + "logits/rejected": -4.104067802429199, + "logps/chosen": -562.4483642578125, + "logps/rejected": -496.1336975097656, + "loss": 0.5667, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.46653634309768677, + "rewards/margins": 0.4154808521270752, + "rewards/rejected": 0.051055438816547394, + "step": 1040 + }, + { + "epoch": 0.27, + "learning_rate": 4.050215208034433e-07, + "logits/chosen": -4.096522331237793, + "logits/rejected": -4.07404088973999, + "logps/chosen": -597.645751953125, + "logps/rejected": -389.2298889160156, + "loss": 0.566, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.43959060311317444, + "rewards/margins": 0.4319628179073334, + "rewards/rejected": 0.007627798710018396, + "step": 1050 + }, + { + "epoch": 0.27, + "learning_rate": 4.035868005738881e-07, + "logits/chosen": -4.101964950561523, + "logits/rejected": -3.971134901046753, + "logps/chosen": -654.4503784179688, + "logps/rejected": -446.3114318847656, + "loss": 0.5856, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5462326407432556, + "rewards/margins": 0.52418452501297, + "rewards/rejected": 0.022048136219382286, + "step": 1060 + }, + { + "epoch": 0.28, + "learning_rate": 4.0215208034433283e-07, + "logits/chosen": -4.149927139282227, + "logits/rejected": -4.159340858459473, + "logps/chosen": -569.7572631835938, + "logps/rejected": -407.02545166015625, + "loss": 0.575, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.25654059648513794, + "rewards/margins": 0.33243244886398315, + "rewards/rejected": -0.0758919045329094, + "step": 1070 + }, + { + "epoch": 0.28, + "learning_rate": 4.007173601147776e-07, + "logits/chosen": -4.003470420837402, + "logits/rejected": -3.9572086334228516, + "logps/chosen": -565.874267578125, + "logps/rejected": -392.24761962890625, + "loss": 0.6221, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.3559120297431946, + "rewards/margins": 0.3540252149105072, + "rewards/rejected": 0.0018868416082113981, + "step": 1080 + }, + { + "epoch": 0.28, + "learning_rate": 3.992826398852224e-07, + "logits/chosen": -4.156329154968262, + "logits/rejected": -4.075765132904053, + "logps/chosen": -503.8531188964844, + "logps/rejected": -443.68731689453125, + "loss": 0.5874, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.3798424005508423, + "rewards/margins": 0.3366612493991852, + "rewards/rejected": 0.04318114370107651, + "step": 1090 + }, + { + "epoch": 0.28, + "learning_rate": 3.978479196556671e-07, + "logits/chosen": -4.099778652191162, + "logits/rejected": -4.040897846221924, + "logps/chosen": -482.4664001464844, + "logps/rejected": -434.3218688964844, + "loss": 0.6225, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.2912788987159729, + "rewards/margins": 0.3869735598564148, + "rewards/rejected": -0.09569470584392548, + "step": 1100 + }, + { + "epoch": 0.28, + "eval_logits/chosen": -3.998389720916748, + "eval_logits/rejected": -4.024014949798584, + "eval_logps/chosen": -545.3189086914062, + "eval_logps/rejected": -437.8950500488281, + "eval_loss": 0.5851995944976807, + "eval_rewards/accuracies": 0.6679999828338623, + "eval_rewards/chosen": 0.40576791763305664, + "eval_rewards/margins": 0.3947778642177582, + "eval_rewards/rejected": 0.01099009346216917, + "eval_runtime": 145.9401, + "eval_samples_per_second": 13.704, + "eval_steps_per_second": 1.713, + "step": 1100 + }, + { + "epoch": 0.29, + "learning_rate": 3.964131994261119e-07, + "logits/chosen": -4.306766986846924, + "logits/rejected": -4.230467796325684, + "logps/chosen": -549.1437377929688, + "logps/rejected": -444.1548767089844, + "loss": 0.5957, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4746015965938568, + "rewards/margins": 0.5222647786140442, + "rewards/rejected": -0.047663114964962006, + "step": 1110 + }, + { + "epoch": 0.29, + "learning_rate": 3.9497847919655664e-07, + "logits/chosen": -4.1666669845581055, + "logits/rejected": -4.193212509155273, + "logps/chosen": -506.9803161621094, + "logps/rejected": -432.77783203125, + "loss": 0.6335, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3090699315071106, + "rewards/margins": 0.2045580893754959, + "rewards/rejected": 0.10451184213161469, + "step": 1120 + }, + { + "epoch": 0.29, + "learning_rate": 3.9354375896700144e-07, + "logits/chosen": -3.8870487213134766, + "logits/rejected": -3.9582340717315674, + "logps/chosen": -616.6710815429688, + "logps/rejected": -508.39813232421875, + "loss": 0.5705, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.48951154947280884, + "rewards/margins": 0.4596976637840271, + "rewards/rejected": 0.02981388568878174, + "step": 1130 + }, + { + "epoch": 0.29, + "learning_rate": 3.9210903873744614e-07, + "logits/chosen": -3.9056262969970703, + "logits/rejected": -3.7358765602111816, + "logps/chosen": -550.6695556640625, + "logps/rejected": -412.7212829589844, + "loss": 0.5673, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3575005829334259, + "rewards/margins": 0.3841975927352905, + "rewards/rejected": -0.02669701538980007, + "step": 1140 + }, + { + "epoch": 0.3, + "learning_rate": 3.9067431850789094e-07, + "logits/chosen": -3.873683452606201, + "logits/rejected": -3.945786714553833, + "logps/chosen": -598.2088623046875, + "logps/rejected": -395.6291198730469, + "loss": 0.6215, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.40675464272499084, + "rewards/margins": 0.4558509886264801, + "rewards/rejected": -0.04909630864858627, + "step": 1150 + }, + { + "epoch": 0.3, + "learning_rate": 3.892395982783357e-07, + "logits/chosen": -3.874563217163086, + "logits/rejected": -3.988626480102539, + "logps/chosen": -580.8389282226562, + "logps/rejected": -464.979248046875, + "loss": 0.5563, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4360577464103699, + "rewards/margins": 0.436431884765625, + "rewards/rejected": -0.0003741338732652366, + "step": 1160 + }, + { + "epoch": 0.3, + "learning_rate": 3.878048780487805e-07, + "logits/chosen": -3.8558075428009033, + "logits/rejected": -3.862384080886841, + "logps/chosen": -603.0067138671875, + "logps/rejected": -453.36126708984375, + "loss": 0.5751, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5401335954666138, + "rewards/margins": 0.4825173020362854, + "rewards/rejected": 0.05761627480387688, + "step": 1170 + }, + { + "epoch": 0.3, + "learning_rate": 3.863701578192252e-07, + "logits/chosen": -4.10439395904541, + "logits/rejected": -4.0709943771362305, + "logps/chosen": -562.9437255859375, + "logps/rejected": -468.41046142578125, + "loss": 0.5989, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5524980425834656, + "rewards/margins": 0.43577200174331665, + "rewards/rejected": 0.1167261153459549, + "step": 1180 + }, + { + "epoch": 0.31, + "learning_rate": 3.8493543758967e-07, + "logits/chosen": -3.9623591899871826, + "logits/rejected": -3.9735617637634277, + "logps/chosen": -496.47479248046875, + "logps/rejected": -354.4454040527344, + "loss": 0.639, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2600507140159607, + "rewards/margins": 0.2917958199977875, + "rewards/rejected": -0.031745124608278275, + "step": 1190 + }, + { + "epoch": 0.31, + "learning_rate": 3.8350071736011475e-07, + "logits/chosen": -3.906859874725342, + "logits/rejected": -3.9087185859680176, + "logps/chosen": -427.0773010253906, + "logps/rejected": -349.86077880859375, + "loss": 0.6289, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.2505320608615875, + "rewards/margins": 0.1847679316997528, + "rewards/rejected": 0.06576415151357651, + "step": 1200 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -3.9994406700134277, + "eval_logits/rejected": -4.025309085845947, + "eval_logps/chosen": -545.2498168945312, + "eval_logps/rejected": -437.9264831542969, + "eval_loss": 0.5823842287063599, + "eval_rewards/accuracies": 0.6669999957084656, + "eval_rewards/chosen": 0.41267773509025574, + "eval_rewards/margins": 0.4048316776752472, + "eval_rewards/rejected": 0.007846098393201828, + "eval_runtime": 147.2172, + "eval_samples_per_second": 13.585, + "eval_steps_per_second": 1.698, + "step": 1200 + }, + { + "epoch": 0.31, + "learning_rate": 3.8206599713055956e-07, + "logits/chosen": -4.042483329772949, + "logits/rejected": -3.88130259513855, + "logps/chosen": -579.4273681640625, + "logps/rejected": -485.6758728027344, + "loss": 0.6176, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.34652236104011536, + "rewards/margins": 0.2972865104675293, + "rewards/rejected": 0.049235861748456955, + "step": 1210 + }, + { + "epoch": 0.32, + "learning_rate": 3.8063127690100426e-07, + "logits/chosen": -4.063638210296631, + "logits/rejected": -4.126063346862793, + "logps/chosen": -599.88916015625, + "logps/rejected": -484.5962829589844, + "loss": 0.5938, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.32978740334510803, + "rewards/margins": 0.2084900587797165, + "rewards/rejected": 0.12129731476306915, + "step": 1220 + }, + { + "epoch": 0.32, + "learning_rate": 3.7919655667144906e-07, + "logits/chosen": -3.99627947807312, + "logits/rejected": -3.940380573272705, + "logps/chosen": -546.5948486328125, + "logps/rejected": -412.3846740722656, + "loss": 0.5958, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.44469523429870605, + "rewards/margins": 0.48819655179977417, + "rewards/rejected": -0.0435013584792614, + "step": 1230 + }, + { + "epoch": 0.32, + "learning_rate": 3.777618364418938e-07, + "logits/chosen": -3.9554569721221924, + "logits/rejected": -3.9587948322296143, + "logps/chosen": -467.00677490234375, + "logps/rejected": -398.10955810546875, + "loss": 0.5939, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.2916651964187622, + "rewards/margins": 0.2515104413032532, + "rewards/rejected": 0.040154773741960526, + "step": 1240 + }, + { + "epoch": 0.32, + "learning_rate": 3.763271162123386e-07, + "logits/chosen": -3.7756049633026123, + "logits/rejected": -3.755903720855713, + "logps/chosen": -591.2271728515625, + "logps/rejected": -444.3221130371094, + "loss": 0.5495, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.4893193244934082, + "rewards/margins": 0.5091069936752319, + "rewards/rejected": -0.019787678495049477, + "step": 1250 + }, + { + "epoch": 0.33, + "learning_rate": 3.748923959827833e-07, + "logits/chosen": -4.159283638000488, + "logits/rejected": -4.039699077606201, + "logps/chosen": -449.0978088378906, + "logps/rejected": -326.54791259765625, + "loss": 0.624, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.2927771806716919, + "rewards/margins": 0.25214409828186035, + "rewards/rejected": 0.040633104741573334, + "step": 1260 + }, + { + "epoch": 0.33, + "learning_rate": 3.734576757532281e-07, + "logits/chosen": -4.083529472351074, + "logits/rejected": -4.100892543792725, + "logps/chosen": -642.364501953125, + "logps/rejected": -476.1664123535156, + "loss": 0.5597, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5070067644119263, + "rewards/margins": 0.6494277715682983, + "rewards/rejected": -0.1424209624528885, + "step": 1270 + }, + { + "epoch": 0.33, + "learning_rate": 3.7202295552367287e-07, + "logits/chosen": -4.093569755554199, + "logits/rejected": -4.279056549072266, + "logps/chosen": -589.33642578125, + "logps/rejected": -452.7460021972656, + "loss": 0.5692, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5026015639305115, + "rewards/margins": 0.4538155198097229, + "rewards/rejected": 0.048786066472530365, + "step": 1280 + }, + { + "epoch": 0.33, + "learning_rate": 3.705882352941176e-07, + "logits/chosen": -4.209478855133057, + "logits/rejected": -4.320340633392334, + "logps/chosen": -601.5377197265625, + "logps/rejected": -405.8938293457031, + "loss": 0.5319, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5548459887504578, + "rewards/margins": 0.6283925771713257, + "rewards/rejected": -0.07354650646448135, + "step": 1290 + }, + { + "epoch": 0.34, + "learning_rate": 3.6915351506456237e-07, + "logits/chosen": -4.303310871124268, + "logits/rejected": -4.392244338989258, + "logps/chosen": -526.3382568359375, + "logps/rejected": -336.311279296875, + "loss": 0.5818, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2645590901374817, + "rewards/margins": 0.33163318037986755, + "rewards/rejected": -0.06707411259412766, + "step": 1300 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -3.9953150749206543, + "eval_logits/rejected": -4.021241188049316, + "eval_logps/chosen": -545.1544189453125, + "eval_logps/rejected": -437.90802001953125, + "eval_loss": 0.5818018913269043, + "eval_rewards/accuracies": 0.6679999828338623, + "eval_rewards/chosen": 0.42221859097480774, + "eval_rewards/margins": 0.41252991557121277, + "eval_rewards/rejected": 0.009688721038401127, + "eval_runtime": 146.2307, + "eval_samples_per_second": 13.677, + "eval_steps_per_second": 1.71, + "step": 1300 + }, + { + "epoch": 0.34, + "learning_rate": 3.677187948350072e-07, + "logits/chosen": -4.2785139083862305, + "logits/rejected": -4.281913757324219, + "logps/chosen": -631.8258056640625, + "logps/rejected": -432.3312072753906, + "loss": 0.5513, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.4580743908882141, + "rewards/margins": 0.5590990781784058, + "rewards/rejected": -0.10102470219135284, + "step": 1310 + }, + { + "epoch": 0.34, + "learning_rate": 3.6628407460545193e-07, + "logits/chosen": -4.18049955368042, + "logits/rejected": -4.1783599853515625, + "logps/chosen": -482.9546813964844, + "logps/rejected": -441.1940002441406, + "loss": 0.6002, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.4040532112121582, + "rewards/margins": 0.2903508245944977, + "rewards/rejected": 0.11370239406824112, + "step": 1320 + }, + { + "epoch": 0.34, + "learning_rate": 3.648493543758967e-07, + "logits/chosen": -4.047448635101318, + "logits/rejected": -4.031399726867676, + "logps/chosen": -513.3343505859375, + "logps/rejected": -439.59857177734375, + "loss": 0.5949, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.5155481696128845, + "rewards/margins": 0.5262617468833923, + "rewards/rejected": -0.01071359496563673, + "step": 1330 + }, + { + "epoch": 0.35, + "learning_rate": 3.6341463414634143e-07, + "logits/chosen": -4.256237983703613, + "logits/rejected": -4.140265941619873, + "logps/chosen": -586.2674560546875, + "logps/rejected": -513.4707641601562, + "loss": 0.6051, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4771571159362793, + "rewards/margins": 0.3665952682495117, + "rewards/rejected": 0.11056187003850937, + "step": 1340 + }, + { + "epoch": 0.35, + "learning_rate": 3.6197991391678623e-07, + "logits/chosen": -4.259045600891113, + "logits/rejected": -4.169145584106445, + "logps/chosen": -492.68572998046875, + "logps/rejected": -336.1100769042969, + "loss": 0.5918, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.3294292390346527, + "rewards/margins": 0.42583298683166504, + "rewards/rejected": -0.09640369564294815, + "step": 1350 + }, + { + "epoch": 0.35, + "learning_rate": 3.60545193687231e-07, + "logits/chosen": -4.042055130004883, + "logits/rejected": -4.034060478210449, + "logps/chosen": -437.0550231933594, + "logps/rejected": -344.05828857421875, + "loss": 0.5862, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3367193341255188, + "rewards/margins": 0.3423077464103699, + "rewards/rejected": -0.0055884262546896935, + "step": 1360 + }, + { + "epoch": 0.35, + "learning_rate": 3.5911047345767574e-07, + "logits/chosen": -3.9995014667510986, + "logits/rejected": -4.026850700378418, + "logps/chosen": -576.7128295898438, + "logps/rejected": -466.55010986328125, + "loss": 0.6112, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4717404246330261, + "rewards/margins": 0.5018633604049683, + "rewards/rejected": -0.030122917145490646, + "step": 1370 + }, + { + "epoch": 0.36, + "learning_rate": 3.576757532281205e-07, + "logits/chosen": -3.9444518089294434, + "logits/rejected": -3.91229510307312, + "logps/chosen": -586.0652465820312, + "logps/rejected": -464.03271484375, + "loss": 0.5891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.3900233507156372, + "rewards/margins": 0.42400288581848145, + "rewards/rejected": -0.03397948667407036, + "step": 1380 + }, + { + "epoch": 0.36, + "learning_rate": 3.562410329985653e-07, + "logits/chosen": -3.970731735229492, + "logits/rejected": -4.1245927810668945, + "logps/chosen": -567.556884765625, + "logps/rejected": -479.55841064453125, + "loss": 0.6725, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.331743061542511, + "rewards/margins": 0.251522034406662, + "rewards/rejected": 0.08022100478410721, + "step": 1390 + }, + { + "epoch": 0.36, + "learning_rate": 3.5480631276901004e-07, + "logits/chosen": -4.056004524230957, + "logits/rejected": -4.051678657531738, + "logps/chosen": -536.676513671875, + "logps/rejected": -385.55767822265625, + "loss": 0.567, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.28222841024398804, + "rewards/margins": 0.38999465107917786, + "rewards/rejected": -0.10776624828577042, + "step": 1400 + }, + { + "epoch": 0.36, + "eval_logits/chosen": -4.006156921386719, + "eval_logits/rejected": -4.033264636993408, + "eval_logps/chosen": -545.279052734375, + "eval_logps/rejected": -438.14556884765625, + "eval_loss": 0.5797023773193359, + "eval_rewards/accuracies": 0.6729999780654907, + "eval_rewards/chosen": 0.409759521484375, + "eval_rewards/margins": 0.42382344603538513, + "eval_rewards/rejected": -0.014063959941267967, + "eval_runtime": 147.9054, + "eval_samples_per_second": 13.522, + "eval_steps_per_second": 1.69, + "step": 1400 + }, + { + "epoch": 0.36, + "learning_rate": 3.533715925394548e-07, + "logits/chosen": -3.957362413406372, + "logits/rejected": -3.8286430835723877, + "logps/chosen": -518.3594970703125, + "logps/rejected": -345.4431457519531, + "loss": 0.5558, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.4510959982872009, + "rewards/margins": 0.5645402669906616, + "rewards/rejected": -0.11344428360462189, + "step": 1410 + }, + { + "epoch": 0.37, + "learning_rate": 3.5193687230989955e-07, + "logits/chosen": -4.1850199699401855, + "logits/rejected": -4.076201915740967, + "logps/chosen": -611.8565673828125, + "logps/rejected": -568.882080078125, + "loss": 0.6438, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.4028521478176117, + "rewards/margins": 0.12559688091278076, + "rewards/rejected": 0.2772553265094757, + "step": 1420 + }, + { + "epoch": 0.37, + "learning_rate": 3.5050215208034435e-07, + "logits/chosen": -4.432595252990723, + "logits/rejected": -4.352065086364746, + "logps/chosen": -611.6333618164062, + "logps/rejected": -451.0577697753906, + "loss": 0.5624, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.40066614747047424, + "rewards/margins": 0.4248287081718445, + "rewards/rejected": -0.024162566289305687, + "step": 1430 + }, + { + "epoch": 0.37, + "learning_rate": 3.4906743185078905e-07, + "logits/chosen": -4.167354106903076, + "logits/rejected": -4.119304656982422, + "logps/chosen": -597.4181518554688, + "logps/rejected": -468.30126953125, + "loss": 0.5355, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5246739983558655, + "rewards/margins": 0.6675662994384766, + "rewards/rejected": -0.1428922414779663, + "step": 1440 + }, + { + "epoch": 0.37, + "learning_rate": 3.4763271162123385e-07, + "logits/chosen": -3.9422059059143066, + "logits/rejected": -4.009974002838135, + "logps/chosen": -456.62103271484375, + "logps/rejected": -456.80462646484375, + "loss": 0.6016, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.46443605422973633, + "rewards/margins": 0.3656379282474518, + "rewards/rejected": 0.09879810363054276, + "step": 1450 + }, + { + "epoch": 0.38, + "learning_rate": 3.461979913916786e-07, + "logits/chosen": -4.015919208526611, + "logits/rejected": -4.085513114929199, + "logps/chosen": -490.6377868652344, + "logps/rejected": -374.94866943359375, + "loss": 0.5377, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.44695836305618286, + "rewards/margins": 0.5198525190353394, + "rewards/rejected": -0.07289411872625351, + "step": 1460 + }, + { + "epoch": 0.38, + "learning_rate": 3.447632711621234e-07, + "logits/chosen": -4.100437164306641, + "logits/rejected": -4.218926906585693, + "logps/chosen": -555.1058349609375, + "logps/rejected": -427.87139892578125, + "loss": 0.6011, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3257507383823395, + "rewards/margins": 0.39021044969558716, + "rewards/rejected": -0.06445976346731186, + "step": 1470 + }, + { + "epoch": 0.38, + "learning_rate": 3.433285509325681e-07, + "logits/chosen": -3.8781065940856934, + "logits/rejected": -3.8362109661102295, + "logps/chosen": -423.455078125, + "logps/rejected": -366.01617431640625, + "loss": 0.609, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.23626752197742462, + "rewards/margins": 0.28217631578445435, + "rewards/rejected": -0.04590878635644913, + "step": 1480 + }, + { + "epoch": 0.38, + "learning_rate": 3.418938307030129e-07, + "logits/chosen": -4.141830921173096, + "logits/rejected": -4.139374256134033, + "logps/chosen": -490.91473388671875, + "logps/rejected": -431.1683654785156, + "loss": 0.5698, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3411320149898529, + "rewards/margins": 0.3740822374820709, + "rewards/rejected": -0.032950229942798615, + "step": 1490 + }, + { + "epoch": 0.39, + "learning_rate": 3.4045911047345766e-07, + "logits/chosen": -4.246241569519043, + "logits/rejected": -4.1093244552612305, + "logps/chosen": -595.197509765625, + "logps/rejected": -455.29656982421875, + "loss": 0.5659, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.3736713230609894, + "rewards/margins": 0.5091021060943604, + "rewards/rejected": -0.13543078303337097, + "step": 1500 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -3.996328353881836, + "eval_logits/rejected": -4.024491786956787, + "eval_logps/chosen": -545.1725463867188, + "eval_logps/rejected": -438.1591491699219, + "eval_loss": 0.5790306925773621, + "eval_rewards/accuracies": 0.6779999732971191, + "eval_rewards/chosen": 0.4204104244709015, + "eval_rewards/margins": 0.43583211302757263, + "eval_rewards/rejected": -0.015421712771058083, + "eval_runtime": 145.84, + "eval_samples_per_second": 13.714, + "eval_steps_per_second": 1.714, + "step": 1500 + }, + { + "epoch": 0.39, + "learning_rate": 3.3902439024390247e-07, + "logits/chosen": -3.910076141357422, + "logits/rejected": -4.025428295135498, + "logps/chosen": -489.580322265625, + "logps/rejected": -334.91131591796875, + "loss": 0.5546, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.26078709959983826, + "rewards/margins": 0.4329034686088562, + "rewards/rejected": -0.17211636900901794, + "step": 1510 + }, + { + "epoch": 0.39, + "learning_rate": 3.3758967001434716e-07, + "logits/chosen": -4.3428544998168945, + "logits/rejected": -4.32183837890625, + "logps/chosen": -733.9967041015625, + "logps/rejected": -545.9852905273438, + "loss": 0.5303, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6733923554420471, + "rewards/margins": 0.6076704263687134, + "rewards/rejected": 0.06572196632623672, + "step": 1520 + }, + { + "epoch": 0.4, + "learning_rate": 3.3615494978479197e-07, + "logits/chosen": -4.124747276306152, + "logits/rejected": -4.166211128234863, + "logps/chosen": -608.829345703125, + "logps/rejected": -383.62518310546875, + "loss": 0.5576, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.39822494983673096, + "rewards/margins": 0.5036035776138306, + "rewards/rejected": -0.1053786501288414, + "step": 1530 + }, + { + "epoch": 0.4, + "learning_rate": 3.347202295552367e-07, + "logits/chosen": -4.033061981201172, + "logits/rejected": -4.068852424621582, + "logps/chosen": -511.05682373046875, + "logps/rejected": -465.43475341796875, + "loss": 0.6175, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.35540980100631714, + "rewards/margins": 0.31782034039497375, + "rewards/rejected": 0.03758946806192398, + "step": 1540 + }, + { + "epoch": 0.4, + "learning_rate": 3.332855093256815e-07, + "logits/chosen": -4.0989203453063965, + "logits/rejected": -4.1430792808532715, + "logps/chosen": -612.7939453125, + "logps/rejected": -483.2908630371094, + "loss": 0.5781, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.567456841468811, + "rewards/margins": 0.6063565611839294, + "rewards/rejected": -0.03889976069331169, + "step": 1550 + }, + { + "epoch": 0.4, + "learning_rate": 3.318507890961262e-07, + "logits/chosen": -4.184214115142822, + "logits/rejected": -4.299299716949463, + "logps/chosen": -540.6097412109375, + "logps/rejected": -396.2356872558594, + "loss": 0.5515, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.401826947927475, + "rewards/margins": 0.48910683393478394, + "rewards/rejected": -0.08727996051311493, + "step": 1560 + }, + { + "epoch": 0.41, + "learning_rate": 3.3041606886657103e-07, + "logits/chosen": -3.9090118408203125, + "logits/rejected": -3.9316658973693848, + "logps/chosen": -574.2691650390625, + "logps/rejected": -485.7064514160156, + "loss": 0.674, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.4122130274772644, + "rewards/margins": 0.2545969486236572, + "rewards/rejected": 0.1576160490512848, + "step": 1570 + }, + { + "epoch": 0.41, + "learning_rate": 3.289813486370158e-07, + "logits/chosen": -4.144872665405273, + "logits/rejected": -4.0689921379089355, + "logps/chosen": -513.11181640625, + "logps/rejected": -471.02606201171875, + "loss": 0.6057, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34228435158729553, + "rewards/margins": 0.4427550733089447, + "rewards/rejected": -0.10047070682048798, + "step": 1580 + }, + { + "epoch": 0.41, + "learning_rate": 3.275466284074606e-07, + "logits/chosen": -3.9695823192596436, + "logits/rejected": -4.070342063903809, + "logps/chosen": -653.9967651367188, + "logps/rejected": -471.898193359375, + "loss": 0.5594, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.57194983959198, + "rewards/margins": 0.6099370121955872, + "rewards/rejected": -0.03798716515302658, + "step": 1590 + }, + { + "epoch": 0.41, + "learning_rate": 3.261119081779053e-07, + "logits/chosen": -4.089110851287842, + "logits/rejected": -4.0619401931762695, + "logps/chosen": -515.8906860351562, + "logps/rejected": -470.29541015625, + "loss": 0.5993, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.42164483666419983, + "rewards/margins": 0.3623473346233368, + "rewards/rejected": 0.05929745361208916, + "step": 1600 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -3.990658760070801, + "eval_logits/rejected": -4.0185322761535645, + "eval_logps/chosen": -545.216064453125, + "eval_logps/rejected": -438.2904052734375, + "eval_loss": 0.5782522559165955, + "eval_rewards/accuracies": 0.671999990940094, + "eval_rewards/chosen": 0.41605862975120544, + "eval_rewards/margins": 0.4446040093898773, + "eval_rewards/rejected": -0.02854539081454277, + "eval_runtime": 147.5337, + "eval_samples_per_second": 13.556, + "eval_steps_per_second": 1.695, + "step": 1600 + }, + { + "epoch": 0.42, + "learning_rate": 3.246771879483501e-07, + "logits/chosen": -3.944901704788208, + "logits/rejected": -3.9903030395507812, + "logps/chosen": -475.91363525390625, + "logps/rejected": -396.0389099121094, + "loss": 0.5996, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2757423520088196, + "rewards/margins": 0.3251574635505676, + "rewards/rejected": -0.049415141344070435, + "step": 1610 + }, + { + "epoch": 0.42, + "learning_rate": 3.2324246771879484e-07, + "logits/chosen": -4.066908359527588, + "logits/rejected": -3.8957467079162598, + "logps/chosen": -538.4827880859375, + "logps/rejected": -386.1225280761719, + "loss": 0.5916, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.40634027123451233, + "rewards/margins": 0.44673413038253784, + "rewards/rejected": -0.04039386659860611, + "step": 1620 + }, + { + "epoch": 0.42, + "learning_rate": 3.2180774748923953e-07, + "logits/chosen": -4.119419097900391, + "logits/rejected": -3.880350112915039, + "logps/chosen": -571.5280151367188, + "logps/rejected": -467.98321533203125, + "loss": 0.5709, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.3607470393180847, + "rewards/margins": 0.40588730573654175, + "rewards/rejected": -0.04514027386903763, + "step": 1630 + }, + { + "epoch": 0.42, + "learning_rate": 3.2037302725968434e-07, + "logits/chosen": -3.995079517364502, + "logits/rejected": -3.9660801887512207, + "logps/chosen": -528.3262939453125, + "logps/rejected": -391.5002746582031, + "loss": 0.6413, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3550952970981598, + "rewards/margins": 0.4048345685005188, + "rewards/rejected": -0.049739234149456024, + "step": 1640 + }, + { + "epoch": 0.43, + "learning_rate": 3.189383070301291e-07, + "logits/chosen": -4.0606889724731445, + "logits/rejected": -4.020025253295898, + "logps/chosen": -606.38330078125, + "logps/rejected": -492.71759033203125, + "loss": 0.5607, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.392439067363739, + "rewards/margins": 0.29129353165626526, + "rewards/rejected": 0.10114555060863495, + "step": 1650 + }, + { + "epoch": 0.43, + "learning_rate": 3.175035868005739e-07, + "logits/chosen": -4.366388320922852, + "logits/rejected": -4.3169779777526855, + "logps/chosen": -572.692626953125, + "logps/rejected": -431.1947326660156, + "loss": 0.6249, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.4136829972267151, + "rewards/margins": 0.31379351019859314, + "rewards/rejected": 0.09988941252231598, + "step": 1660 + }, + { + "epoch": 0.43, + "learning_rate": 3.160688665710186e-07, + "logits/chosen": -4.261553764343262, + "logits/rejected": -4.20203971862793, + "logps/chosen": -548.4271240234375, + "logps/rejected": -461.83563232421875, + "loss": 0.5295, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4961729943752289, + "rewards/margins": 0.5303093194961548, + "rewards/rejected": -0.034136295318603516, + "step": 1670 + }, + { + "epoch": 0.43, + "learning_rate": 3.146341463414634e-07, + "logits/chosen": -4.172554016113281, + "logits/rejected": -4.170234680175781, + "logps/chosen": -538.4212036132812, + "logps/rejected": -511.212890625, + "loss": 0.5634, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.45014914870262146, + "rewards/margins": 0.44471946358680725, + "rewards/rejected": 0.005429693963378668, + "step": 1680 + }, + { + "epoch": 0.44, + "learning_rate": 3.1319942611190815e-07, + "logits/chosen": -3.915037155151367, + "logits/rejected": -3.8585174083709717, + "logps/chosen": -497.04229736328125, + "logps/rejected": -471.8094787597656, + "loss": 0.5919, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2910478711128235, + "rewards/margins": 0.37008222937583923, + "rewards/rejected": -0.07903440296649933, + "step": 1690 + }, + { + "epoch": 0.44, + "learning_rate": 3.1176470588235295e-07, + "logits/chosen": -3.864201307296753, + "logits/rejected": -3.8585095405578613, + "logps/chosen": -542.01953125, + "logps/rejected": -397.53424072265625, + "loss": 0.5999, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17917154729366302, + "rewards/margins": 0.28133153915405273, + "rewards/rejected": -0.10215996205806732, + "step": 1700 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -3.99351167678833, + "eval_logits/rejected": -4.020653247833252, + "eval_logps/chosen": -545.3095092773438, + "eval_logps/rejected": -438.4728698730469, + "eval_loss": 0.5767195820808411, + "eval_rewards/accuracies": 0.6840000152587891, + "eval_rewards/chosen": 0.40671002864837646, + "eval_rewards/margins": 0.4535037875175476, + "eval_rewards/rejected": -0.04679381474852562, + "eval_runtime": 147.2862, + "eval_samples_per_second": 13.579, + "eval_steps_per_second": 1.697, + "step": 1700 + }, + { + "epoch": 0.44, + "learning_rate": 3.1032998565279765e-07, + "logits/chosen": -4.243551254272461, + "logits/rejected": -4.064631938934326, + "logps/chosen": -478.11187744140625, + "logps/rejected": -458.78692626953125, + "loss": 0.5753, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.40352755784988403, + "rewards/margins": 0.4834938645362854, + "rewards/rejected": -0.07996630668640137, + "step": 1710 + }, + { + "epoch": 0.44, + "learning_rate": 3.0889526542324245e-07, + "logits/chosen": -4.1683268547058105, + "logits/rejected": -4.173158645629883, + "logps/chosen": -652.5173950195312, + "logps/rejected": -432.58428955078125, + "loss": 0.5737, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.4788149297237396, + "rewards/margins": 0.5799158811569214, + "rewards/rejected": -0.10110093653202057, + "step": 1720 + }, + { + "epoch": 0.45, + "learning_rate": 3.074605451936872e-07, + "logits/chosen": -4.137356758117676, + "logits/rejected": -4.176325798034668, + "logps/chosen": -576.1214599609375, + "logps/rejected": -380.2808837890625, + "loss": 0.5699, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.4571780562400818, + "rewards/margins": 0.49380144476890564, + "rewards/rejected": -0.03662336990237236, + "step": 1730 + }, + { + "epoch": 0.45, + "learning_rate": 3.06025824964132e-07, + "logits/chosen": -4.188223838806152, + "logits/rejected": -4.05302095413208, + "logps/chosen": -480.8373107910156, + "logps/rejected": -422.5328063964844, + "loss": 0.5799, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.2358378916978836, + "rewards/margins": 0.28218406438827515, + "rewards/rejected": -0.04634615033864975, + "step": 1740 + }, + { + "epoch": 0.45, + "learning_rate": 3.045911047345767e-07, + "logits/chosen": -4.110243797302246, + "logits/rejected": -4.0695366859436035, + "logps/chosen": -615.70263671875, + "logps/rejected": -426.46075439453125, + "loss": 0.5073, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.5083123445510864, + "rewards/margins": 0.7217694520950317, + "rewards/rejected": -0.2134571522474289, + "step": 1750 + }, + { + "epoch": 0.45, + "learning_rate": 3.031563845050215e-07, + "logits/chosen": -4.203267574310303, + "logits/rejected": -4.161170482635498, + "logps/chosen": -590.3410034179688, + "logps/rejected": -447.26715087890625, + "loss": 0.5185, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5431427955627441, + "rewards/margins": 0.68207848072052, + "rewards/rejected": -0.13893572986125946, + "step": 1760 + }, + { + "epoch": 0.46, + "learning_rate": 3.0172166427546626e-07, + "logits/chosen": -4.285967826843262, + "logits/rejected": -4.167950630187988, + "logps/chosen": -533.8848876953125, + "logps/rejected": -413.30975341796875, + "loss": 0.5936, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.39957305788993835, + "rewards/margins": 0.3795527517795563, + "rewards/rejected": 0.020020361989736557, + "step": 1770 + }, + { + "epoch": 0.46, + "learning_rate": 3.00286944045911e-07, + "logits/chosen": -4.027644634246826, + "logits/rejected": -3.9792587757110596, + "logps/chosen": -626.9630737304688, + "logps/rejected": -397.4438781738281, + "loss": 0.6014, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.45451897382736206, + "rewards/margins": 0.5369467735290527, + "rewards/rejected": -0.08242778480052948, + "step": 1780 + }, + { + "epoch": 0.46, + "learning_rate": 2.9885222381635577e-07, + "logits/chosen": -4.1345133781433105, + "logits/rejected": -4.244950771331787, + "logps/chosen": -562.5131225585938, + "logps/rejected": -422.6846618652344, + "loss": 0.5804, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.40697044134140015, + "rewards/margins": 0.48475074768066406, + "rewards/rejected": -0.07778030633926392, + "step": 1790 + }, + { + "epoch": 0.46, + "learning_rate": 2.9741750358680057e-07, + "logits/chosen": -4.032704830169678, + "logits/rejected": -3.9772307872772217, + "logps/chosen": -568.47802734375, + "logps/rejected": -502.3460998535156, + "loss": 0.6004, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.46621161699295044, + "rewards/margins": 0.4852283000946045, + "rewards/rejected": -0.019016731530427933, + "step": 1800 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -3.9943645000457764, + "eval_logits/rejected": -4.0218825340271, + "eval_logps/chosen": -545.1437377929688, + "eval_logps/rejected": -438.3991394042969, + "eval_loss": 0.5730865597724915, + "eval_rewards/accuracies": 0.6830000281333923, + "eval_rewards/chosen": 0.4232881963253021, + "eval_rewards/margins": 0.46270594000816345, + "eval_rewards/rejected": -0.03941771015524864, + "eval_runtime": 148.86, + "eval_samples_per_second": 13.435, + "eval_steps_per_second": 1.679, + "step": 1800 + }, + { + "epoch": 0.47, + "learning_rate": 2.959827833572453e-07, + "logits/chosen": -4.1379075050354, + "logits/rejected": -4.1423420906066895, + "logps/chosen": -620.6439819335938, + "logps/rejected": -438.18084716796875, + "loss": 0.5651, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5286334753036499, + "rewards/margins": 0.5047623515129089, + "rewards/rejected": 0.02387116476893425, + "step": 1810 + }, + { + "epoch": 0.47, + "learning_rate": 2.9454806312769007e-07, + "logits/chosen": -4.126761436462402, + "logits/rejected": -4.265500545501709, + "logps/chosen": -494.80206298828125, + "logps/rejected": -427.181640625, + "loss": 0.6087, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.4608178734779358, + "rewards/margins": 0.36958831548690796, + "rewards/rejected": 0.09122952073812485, + "step": 1820 + }, + { + "epoch": 0.47, + "learning_rate": 2.931133428981348e-07, + "logits/chosen": -4.138312339782715, + "logits/rejected": -4.2697319984436035, + "logps/chosen": -492.8348693847656, + "logps/rejected": -405.1728515625, + "loss": 0.6199, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.21532472968101501, + "rewards/margins": 0.19911542534828186, + "rewards/rejected": 0.016209278255701065, + "step": 1830 + }, + { + "epoch": 0.48, + "learning_rate": 2.9167862266857963e-07, + "logits/chosen": -4.26310396194458, + "logits/rejected": -4.242154121398926, + "logps/chosen": -562.9186401367188, + "logps/rejected": -377.67303466796875, + "loss": 0.5497, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5041324496269226, + "rewards/margins": 0.5989420413970947, + "rewards/rejected": -0.09480961412191391, + "step": 1840 + }, + { + "epoch": 0.48, + "learning_rate": 2.902439024390244e-07, + "logits/chosen": -4.286158561706543, + "logits/rejected": -4.289405345916748, + "logps/chosen": -607.9891357421875, + "logps/rejected": -496.7867126464844, + "loss": 0.5634, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5594509840011597, + "rewards/margins": 0.6072807908058167, + "rewards/rejected": -0.04782974720001221, + "step": 1850 + }, + { + "epoch": 0.48, + "learning_rate": 2.8880918220946913e-07, + "logits/chosen": -4.170632839202881, + "logits/rejected": -4.215968132019043, + "logps/chosen": -445.17559814453125, + "logps/rejected": -355.8191223144531, + "loss": 0.6043, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.31880825757980347, + "rewards/margins": 0.2609938383102417, + "rewards/rejected": 0.05781441926956177, + "step": 1860 + }, + { + "epoch": 0.48, + "learning_rate": 2.873744619799139e-07, + "logits/chosen": -4.265324592590332, + "logits/rejected": -4.255076885223389, + "logps/chosen": -590.837158203125, + "logps/rejected": -441.911376953125, + "loss": 0.6081, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.33519288897514343, + "rewards/margins": 0.3991120457649231, + "rewards/rejected": -0.06391920149326324, + "step": 1870 + }, + { + "epoch": 0.49, + "learning_rate": 2.859397417503587e-07, + "logits/chosen": -4.415879249572754, + "logits/rejected": -4.314742565155029, + "logps/chosen": -501.2169494628906, + "logps/rejected": -451.86553955078125, + "loss": 0.6035, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.33651891350746155, + "rewards/margins": 0.30277958512306213, + "rewards/rejected": 0.03373932093381882, + "step": 1880 + }, + { + "epoch": 0.49, + "learning_rate": 2.8450502152080344e-07, + "logits/chosen": -3.878053665161133, + "logits/rejected": -4.040474891662598, + "logps/chosen": -647.9942626953125, + "logps/rejected": -437.6617736816406, + "loss": 0.5379, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5844290852546692, + "rewards/margins": 0.661879301071167, + "rewards/rejected": -0.07745026051998138, + "step": 1890 + }, + { + "epoch": 0.49, + "learning_rate": 2.830703012912482e-07, + "logits/chosen": -4.195162296295166, + "logits/rejected": -4.2288408279418945, + "logps/chosen": -590.7817993164062, + "logps/rejected": -428.28387451171875, + "loss": 0.5349, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.510196328163147, + "rewards/margins": 0.6029139757156372, + "rewards/rejected": -0.09271766245365143, + "step": 1900 + }, + { + "epoch": 0.49, + "eval_logits/chosen": -4.0012006759643555, + "eval_logits/rejected": -4.029512405395508, + "eval_logps/chosen": -545.0914306640625, + "eval_logps/rejected": -438.4334716796875, + "eval_loss": 0.5719799995422363, + "eval_rewards/accuracies": 0.6830000281333923, + "eval_rewards/chosen": 0.4285166561603546, + "eval_rewards/margins": 0.4713680148124695, + "eval_rewards/rejected": -0.04285132512450218, + "eval_runtime": 148.6253, + "eval_samples_per_second": 13.457, + "eval_steps_per_second": 1.682, + "step": 1900 + }, + { + "epoch": 0.49, + "learning_rate": 2.8163558106169294e-07, + "logits/chosen": -4.198761940002441, + "logits/rejected": -4.1362786293029785, + "logps/chosen": -616.3384399414062, + "logps/rejected": -427.9803771972656, + "loss": 0.5013, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5358282327651978, + "rewards/margins": 0.5532472729682922, + "rewards/rejected": -0.017419060692191124, + "step": 1910 + }, + { + "epoch": 0.5, + "learning_rate": 2.8020086083213774e-07, + "logits/chosen": -4.3343377113342285, + "logits/rejected": -4.233187198638916, + "logps/chosen": -663.6807861328125, + "logps/rejected": -496.68121337890625, + "loss": 0.5346, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5831555128097534, + "rewards/margins": 0.7528368830680847, + "rewards/rejected": -0.16968131065368652, + "step": 1920 + }, + { + "epoch": 0.5, + "learning_rate": 2.7876614060258244e-07, + "logits/chosen": -4.208827018737793, + "logits/rejected": -4.191887378692627, + "logps/chosen": -546.9085693359375, + "logps/rejected": -454.7889099121094, + "loss": 0.6139, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4015926420688629, + "rewards/margins": 0.4776438772678375, + "rewards/rejected": -0.0760512501001358, + "step": 1930 + }, + { + "epoch": 0.5, + "learning_rate": 2.7733142037302725e-07, + "logits/chosen": -3.8217597007751465, + "logits/rejected": -3.925053119659424, + "logps/chosen": -661.263916015625, + "logps/rejected": -534.646728515625, + "loss": 0.5292, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.4541807770729065, + "rewards/margins": 0.5835798382759094, + "rewards/rejected": -0.12939909100532532, + "step": 1940 + }, + { + "epoch": 0.5, + "learning_rate": 2.75896700143472e-07, + "logits/chosen": -4.182621955871582, + "logits/rejected": -3.9824492931365967, + "logps/chosen": -570.587890625, + "logps/rejected": -394.5463562011719, + "loss": 0.5491, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5437583923339844, + "rewards/margins": 0.5803283452987671, + "rewards/rejected": -0.03656994178891182, + "step": 1950 + }, + { + "epoch": 0.51, + "learning_rate": 2.744619799139168e-07, + "logits/chosen": -4.0817694664001465, + "logits/rejected": -4.021645545959473, + "logps/chosen": -562.7216796875, + "logps/rejected": -408.1803894042969, + "loss": 0.6227, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.38751038908958435, + "rewards/margins": 0.412889301776886, + "rewards/rejected": -0.025378871709108353, + "step": 1960 + }, + { + "epoch": 0.51, + "learning_rate": 2.730272596843615e-07, + "logits/chosen": -4.210979461669922, + "logits/rejected": -4.233429908752441, + "logps/chosen": -531.6992797851562, + "logps/rejected": -415.34130859375, + "loss": 0.5544, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.4554630219936371, + "rewards/margins": 0.6501585841178894, + "rewards/rejected": -0.1946956068277359, + "step": 1970 + }, + { + "epoch": 0.51, + "learning_rate": 2.715925394548063e-07, + "logits/chosen": -3.991922378540039, + "logits/rejected": -3.861186981201172, + "logps/chosen": -516.5054931640625, + "logps/rejected": -480.88043212890625, + "loss": 0.6105, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.35484379529953003, + "rewards/margins": 0.21209315955638885, + "rewards/rejected": 0.14275071024894714, + "step": 1980 + }, + { + "epoch": 0.51, + "learning_rate": 2.7015781922525106e-07, + "logits/chosen": -3.956188678741455, + "logits/rejected": -3.9556357860565186, + "logps/chosen": -464.2613830566406, + "logps/rejected": -448.0741271972656, + "loss": 0.6277, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.34133443236351013, + "rewards/margins": 0.3067266345024109, + "rewards/rejected": 0.03460781276226044, + "step": 1990 + }, + { + "epoch": 0.52, + "learning_rate": 2.6872309899569586e-07, + "logits/chosen": -4.143117427825928, + "logits/rejected": -4.186631679534912, + "logps/chosen": -578.9446411132812, + "logps/rejected": -439.56658935546875, + "loss": 0.5377, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5403174161911011, + "rewards/margins": 0.633220911026001, + "rewards/rejected": -0.0929035171866417, + "step": 2000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -4.000906944274902, + "eval_logits/rejected": -4.028975009918213, + "eval_logps/chosen": -545.1220092773438, + "eval_logps/rejected": -438.54486083984375, + "eval_loss": 0.5702030062675476, + "eval_rewards/accuracies": 0.6850000023841858, + "eval_rewards/chosen": 0.4254603683948517, + "eval_rewards/margins": 0.4794518053531647, + "eval_rewards/rejected": -0.0539914108812809, + "eval_runtime": 147.8823, + "eval_samples_per_second": 13.524, + "eval_steps_per_second": 1.691, + "step": 2000 + }, + { + "epoch": 0.52, + "learning_rate": 2.6728837876614056e-07, + "logits/chosen": -4.1301422119140625, + "logits/rejected": -4.1415839195251465, + "logps/chosen": -582.57666015625, + "logps/rejected": -450.591552734375, + "loss": 0.5582, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5017611384391785, + "rewards/margins": 0.6365527510643005, + "rewards/rejected": -0.13479158282279968, + "step": 2010 + }, + { + "epoch": 0.52, + "learning_rate": 2.6585365853658536e-07, + "logits/chosen": -4.098201274871826, + "logits/rejected": -4.06491756439209, + "logps/chosen": -536.2640380859375, + "logps/rejected": -417.99481201171875, + "loss": 0.5795, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.4373628497123718, + "rewards/margins": 0.393180251121521, + "rewards/rejected": 0.044182561337947845, + "step": 2020 + }, + { + "epoch": 0.52, + "learning_rate": 2.644189383070301e-07, + "logits/chosen": -4.560007572174072, + "logits/rejected": -4.380262851715088, + "logps/chosen": -560.2337646484375, + "logps/rejected": -399.133056640625, + "loss": 0.5672, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5478136539459229, + "rewards/margins": 0.6600695252418518, + "rewards/rejected": -0.11225590854883194, + "step": 2030 + }, + { + "epoch": 0.53, + "learning_rate": 2.629842180774749e-07, + "logits/chosen": -4.084300518035889, + "logits/rejected": -4.194989204406738, + "logps/chosen": -615.1845703125, + "logps/rejected": -399.0810546875, + "loss": 0.5302, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5594775676727295, + "rewards/margins": 0.8519641160964966, + "rewards/rejected": -0.2924865782260895, + "step": 2040 + }, + { + "epoch": 0.53, + "learning_rate": 2.615494978479196e-07, + "logits/chosen": -3.948491334915161, + "logits/rejected": -3.790837049484253, + "logps/chosen": -506.5896911621094, + "logps/rejected": -390.9329528808594, + "loss": 0.5634, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.3724905848503113, + "rewards/margins": 0.4565068781375885, + "rewards/rejected": -0.08401624858379364, + "step": 2050 + }, + { + "epoch": 0.53, + "learning_rate": 2.601147776183644e-07, + "logits/chosen": -4.237751007080078, + "logits/rejected": -4.1675705909729, + "logps/chosen": -569.1129150390625, + "logps/rejected": -415.8509826660156, + "loss": 0.5397, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.3635835647583008, + "rewards/margins": 0.4130166471004486, + "rewards/rejected": -0.04943311959505081, + "step": 2060 + }, + { + "epoch": 0.53, + "learning_rate": 2.5868005738880917e-07, + "logits/chosen": -4.343452453613281, + "logits/rejected": -4.299803733825684, + "logps/chosen": -468.18768310546875, + "logps/rejected": -420.81256103515625, + "loss": 0.5592, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4336473047733307, + "rewards/margins": 0.4215630888938904, + "rewards/rejected": 0.012084214016795158, + "step": 2070 + }, + { + "epoch": 0.54, + "learning_rate": 2.57245337159254e-07, + "logits/chosen": -4.12381649017334, + "logits/rejected": -4.066357135772705, + "logps/chosen": -512.6734619140625, + "logps/rejected": -421.00872802734375, + "loss": 0.5649, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26631009578704834, + "rewards/margins": 0.49776148796081543, + "rewards/rejected": -0.23145142197608948, + "step": 2080 + }, + { + "epoch": 0.54, + "learning_rate": 2.558106169296987e-07, + "logits/chosen": -4.055663108825684, + "logits/rejected": -4.202220439910889, + "logps/chosen": -531.7757568359375, + "logps/rejected": -427.21051025390625, + "loss": 0.5916, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.35120105743408203, + "rewards/margins": 0.329306036233902, + "rewards/rejected": 0.021895062178373337, + "step": 2090 + }, + { + "epoch": 0.54, + "learning_rate": 2.543758967001435e-07, + "logits/chosen": -3.896604537963867, + "logits/rejected": -3.8615658283233643, + "logps/chosen": -546.3208618164062, + "logps/rejected": -435.53143310546875, + "loss": 0.4988, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.46676820516586304, + "rewards/margins": 0.7485499382019043, + "rewards/rejected": -0.28178170323371887, + "step": 2100 + }, + { + "epoch": 0.54, + "eval_logits/chosen": -4.003889560699463, + "eval_logits/rejected": -4.031704425811768, + "eval_logps/chosen": -545.0299072265625, + "eval_logps/rejected": -438.5533142089844, + "eval_loss": 0.5712563395500183, + "eval_rewards/accuracies": 0.6840000152587891, + "eval_rewards/chosen": 0.43467363715171814, + "eval_rewards/margins": 0.4895067512989044, + "eval_rewards/rejected": -0.05483310669660568, + "eval_runtime": 148.9309, + "eval_samples_per_second": 13.429, + "eval_steps_per_second": 1.679, + "step": 2100 + }, + { + "epoch": 0.54, + "learning_rate": 2.5294117647058823e-07, + "logits/chosen": -4.0738677978515625, + "logits/rejected": -4.02095890045166, + "logps/chosen": -547.7879638671875, + "logps/rejected": -461.1922302246094, + "loss": 0.5612, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4428838789463043, + "rewards/margins": 0.4759696424007416, + "rewards/rejected": -0.03308583423495293, + "step": 2110 + }, + { + "epoch": 0.55, + "learning_rate": 2.51506456241033e-07, + "logits/chosen": -4.027615547180176, + "logits/rejected": -4.137267112731934, + "logps/chosen": -519.4240112304688, + "logps/rejected": -418.49884033203125, + "loss": 0.5573, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.33291319012641907, + "rewards/margins": 0.5208204388618469, + "rewards/rejected": -0.18790724873542786, + "step": 2120 + }, + { + "epoch": 0.55, + "learning_rate": 2.5007173601147773e-07, + "logits/chosen": -3.9522101879119873, + "logits/rejected": -4.056872367858887, + "logps/chosen": -581.5064697265625, + "logps/rejected": -583.0844116210938, + "loss": 0.5702, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.40307894349098206, + "rewards/margins": 0.49982690811157227, + "rewards/rejected": -0.09674793481826782, + "step": 2130 + }, + { + "epoch": 0.55, + "learning_rate": 2.486370157819225e-07, + "logits/chosen": -4.075150966644287, + "logits/rejected": -3.9781277179718018, + "logps/chosen": -570.3604736328125, + "logps/rejected": -457.1639099121094, + "loss": 0.5703, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.4488401412963867, + "rewards/margins": 0.4194963574409485, + "rewards/rejected": 0.029343824833631516, + "step": 2140 + }, + { + "epoch": 0.56, + "learning_rate": 2.472022955523673e-07, + "logits/chosen": -3.9273715019226074, + "logits/rejected": -4.03403377532959, + "logps/chosen": -576.674072265625, + "logps/rejected": -480.82330322265625, + "loss": 0.5837, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.4813673496246338, + "rewards/margins": 0.5193904638290405, + "rewards/rejected": -0.038023076951503754, + "step": 2150 + }, + { + "epoch": 0.56, + "learning_rate": 2.4576757532281204e-07, + "logits/chosen": -4.104135513305664, + "logits/rejected": -4.141896724700928, + "logps/chosen": -570.6798095703125, + "logps/rejected": -480.5628967285156, + "loss": 0.6344, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4069296419620514, + "rewards/margins": 0.38408637046813965, + "rewards/rejected": 0.02284328266978264, + "step": 2160 + }, + { + "epoch": 0.56, + "learning_rate": 2.443328550932568e-07, + "logits/chosen": -3.959376811981201, + "logits/rejected": -3.8872084617614746, + "logps/chosen": -552.5087280273438, + "logps/rejected": -436.44189453125, + "loss": 0.568, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29828330874443054, + "rewards/margins": 0.3948608338832855, + "rewards/rejected": -0.09657756984233856, + "step": 2170 + }, + { + "epoch": 0.56, + "learning_rate": 2.4289813486370154e-07, + "logits/chosen": -4.081685543060303, + "logits/rejected": -4.045130729675293, + "logps/chosen": -545.3685302734375, + "logps/rejected": -373.71600341796875, + "loss": 0.5179, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.462582528591156, + "rewards/margins": 0.6623843908309937, + "rewards/rejected": -0.19980189204216003, + "step": 2180 + }, + { + "epoch": 0.57, + "learning_rate": 2.4146341463414635e-07, + "logits/chosen": -4.286005973815918, + "logits/rejected": -4.341670036315918, + "logps/chosen": -546.548828125, + "logps/rejected": -416.674072265625, + "loss": 0.5689, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4305512011051178, + "rewards/margins": 0.6499841213226318, + "rewards/rejected": -0.21943287551403046, + "step": 2190 + }, + { + "epoch": 0.57, + "learning_rate": 2.400286944045911e-07, + "logits/chosen": -3.9777417182922363, + "logits/rejected": -3.9910645484924316, + "logps/chosen": -470.6476135253906, + "logps/rejected": -453.6451721191406, + "loss": 0.6093, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.38502687215805054, + "rewards/margins": 0.30599719285964966, + "rewards/rejected": 0.07902970165014267, + "step": 2200 + }, + { + "epoch": 0.57, + "eval_logits/chosen": -4.001364231109619, + "eval_logits/rejected": -4.028832912445068, + "eval_logps/chosen": -544.912841796875, + "eval_logps/rejected": -438.4606628417969, + "eval_loss": 0.5706081986427307, + "eval_rewards/accuracies": 0.6809999942779541, + "eval_rewards/chosen": 0.44638243317604065, + "eval_rewards/margins": 0.49195748567581177, + "eval_rewards/rejected": -0.045575033873319626, + "eval_runtime": 146.1996, + "eval_samples_per_second": 13.68, + "eval_steps_per_second": 1.71, + "step": 2200 + }, + { + "epoch": 0.57, + "learning_rate": 2.3859397417503585e-07, + "logits/chosen": -4.030927658081055, + "logits/rejected": -3.9580256938934326, + "logps/chosen": -514.30712890625, + "logps/rejected": -354.2815246582031, + "loss": 0.5861, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.4988827705383301, + "rewards/margins": 0.5217560529708862, + "rewards/rejected": -0.0228732917457819, + "step": 2210 + }, + { + "epoch": 0.57, + "learning_rate": 2.3715925394548063e-07, + "logits/chosen": -4.007624626159668, + "logits/rejected": -4.2475457191467285, + "logps/chosen": -661.9398193359375, + "logps/rejected": -411.38848876953125, + "loss": 0.538, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.663671612739563, + "rewards/margins": 0.7176098227500916, + "rewards/rejected": -0.05393817275762558, + "step": 2220 + }, + { + "epoch": 0.58, + "learning_rate": 2.3572453371592538e-07, + "logits/chosen": -4.090206146240234, + "logits/rejected": -4.1433539390563965, + "logps/chosen": -504.1729431152344, + "logps/rejected": -387.82623291015625, + "loss": 0.5351, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.45734721422195435, + "rewards/margins": 0.4615212082862854, + "rewards/rejected": -0.0041740150190889835, + "step": 2230 + }, + { + "epoch": 0.58, + "learning_rate": 2.3428981348637013e-07, + "logits/chosen": -4.289696216583252, + "logits/rejected": -4.263758659362793, + "logps/chosen": -579.8231201171875, + "logps/rejected": -404.7850646972656, + "loss": 0.5307, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.6702554225921631, + "rewards/margins": 0.7904798984527588, + "rewards/rejected": -0.12022446095943451, + "step": 2240 + }, + { + "epoch": 0.58, + "learning_rate": 2.328550932568149e-07, + "logits/chosen": -3.835402727127075, + "logits/rejected": -3.8511269092559814, + "logps/chosen": -510.6192932128906, + "logps/rejected": -446.6246643066406, + "loss": 0.6048, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.38664764165878296, + "rewards/margins": 0.3122571110725403, + "rewards/rejected": 0.07439050823450089, + "step": 2250 + }, + { + "epoch": 0.58, + "learning_rate": 2.3142037302725966e-07, + "logits/chosen": -3.915778398513794, + "logits/rejected": -3.8879055976867676, + "logps/chosen": -513.9568481445312, + "logps/rejected": -410.12310791015625, + "loss": 0.5356, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4804447293281555, + "rewards/margins": 0.5258998274803162, + "rewards/rejected": -0.04545507952570915, + "step": 2260 + }, + { + "epoch": 0.59, + "learning_rate": 2.2998565279770444e-07, + "logits/chosen": -4.199291229248047, + "logits/rejected": -4.164752006530762, + "logps/chosen": -644.1868896484375, + "logps/rejected": -394.4111022949219, + "loss": 0.603, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.38507136702537537, + "rewards/margins": 0.504477858543396, + "rewards/rejected": -0.11940644681453705, + "step": 2270 + }, + { + "epoch": 0.59, + "learning_rate": 2.285509325681492e-07, + "logits/chosen": -4.0966033935546875, + "logits/rejected": -4.1276421546936035, + "logps/chosen": -525.5519409179688, + "logps/rejected": -452.3783264160156, + "loss": 0.5688, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.3376069664955139, + "rewards/margins": 0.48656487464904785, + "rewards/rejected": -0.14895787835121155, + "step": 2280 + }, + { + "epoch": 0.59, + "learning_rate": 2.2711621233859396e-07, + "logits/chosen": -3.9321861267089844, + "logits/rejected": -3.8997440338134766, + "logps/chosen": -524.1985473632812, + "logps/rejected": -368.1291198730469, + "loss": 0.5533, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.33177611231803894, + "rewards/margins": 0.49390387535095215, + "rewards/rejected": -0.1621277630329132, + "step": 2290 + }, + { + "epoch": 0.59, + "learning_rate": 2.2568149210903872e-07, + "logits/chosen": -4.461883544921875, + "logits/rejected": -4.530648708343506, + "logps/chosen": -619.8561401367188, + "logps/rejected": -468.6748046875, + "loss": 0.5356, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.5222944021224976, + "rewards/margins": 0.6014097929000854, + "rewards/rejected": -0.07911545038223267, + "step": 2300 + }, + { + "epoch": 0.59, + "eval_logits/chosen": -3.998574733734131, + "eval_logits/rejected": -4.025696277618408, + "eval_logps/chosen": -544.8922119140625, + "eval_logps/rejected": -438.4912109375, + "eval_loss": 0.5689104199409485, + "eval_rewards/accuracies": 0.6880000233650208, + "eval_rewards/chosen": 0.448445200920105, + "eval_rewards/margins": 0.49707192182540894, + "eval_rewards/rejected": -0.04862673580646515, + "eval_runtime": 148.9177, + "eval_samples_per_second": 13.43, + "eval_steps_per_second": 1.679, + "step": 2300 + }, + { + "epoch": 0.6, + "learning_rate": 2.242467718794835e-07, + "logits/chosen": -3.9245476722717285, + "logits/rejected": -4.00443696975708, + "logps/chosen": -561.0607299804688, + "logps/rejected": -444.2076110839844, + "loss": 0.542, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.41927021741867065, + "rewards/margins": 0.5216065049171448, + "rewards/rejected": -0.10233630239963531, + "step": 2310 + }, + { + "epoch": 0.6, + "learning_rate": 2.2281205164992824e-07, + "logits/chosen": -4.177279949188232, + "logits/rejected": -4.077963352203369, + "logps/chosen": -504.2713928222656, + "logps/rejected": -444.5032653808594, + "loss": 0.6156, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.43868088722229004, + "rewards/margins": 0.4138507843017578, + "rewards/rejected": 0.024830086156725883, + "step": 2320 + }, + { + "epoch": 0.6, + "learning_rate": 2.2137733142037302e-07, + "logits/chosen": -3.990638017654419, + "logits/rejected": -4.014552593231201, + "logps/chosen": -549.037353515625, + "logps/rejected": -442.3534240722656, + "loss": 0.5213, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5215023756027222, + "rewards/margins": 0.638957679271698, + "rewards/rejected": -0.11745530366897583, + "step": 2330 + }, + { + "epoch": 0.6, + "learning_rate": 2.1994261119081777e-07, + "logits/chosen": -4.201764106750488, + "logits/rejected": -4.19627046585083, + "logps/chosen": -564.8765258789062, + "logps/rejected": -433.4271545410156, + "loss": 0.5939, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4676602780818939, + "rewards/margins": 0.5004759430885315, + "rewards/rejected": -0.03281565010547638, + "step": 2340 + }, + { + "epoch": 0.61, + "learning_rate": 2.1850789096126255e-07, + "logits/chosen": -3.911675214767456, + "logits/rejected": -4.010054588317871, + "logps/chosen": -611.3627319335938, + "logps/rejected": -452.5043029785156, + "loss": 0.663, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40320587158203125, + "rewards/margins": 0.3427828252315521, + "rewards/rejected": 0.060423027724027634, + "step": 2350 + }, + { + "epoch": 0.61, + "learning_rate": 2.170731707317073e-07, + "logits/chosen": -4.167824745178223, + "logits/rejected": -4.243043422698975, + "logps/chosen": -556.890625, + "logps/rejected": -397.8431091308594, + "loss": 0.551, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.4790073037147522, + "rewards/margins": 0.5861107110977173, + "rewards/rejected": -0.10710340738296509, + "step": 2360 + }, + { + "epoch": 0.61, + "learning_rate": 2.1563845050215208e-07, + "logits/chosen": -4.110980033874512, + "logits/rejected": -4.188474655151367, + "logps/chosen": -569.0153198242188, + "logps/rejected": -404.994384765625, + "loss": 0.5716, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4905467927455902, + "rewards/margins": 0.5199242830276489, + "rewards/rejected": -0.029377540573477745, + "step": 2370 + }, + { + "epoch": 0.61, + "learning_rate": 2.1420373027259683e-07, + "logits/chosen": -4.269859313964844, + "logits/rejected": -4.332370758056641, + "logps/chosen": -543.8313598632812, + "logps/rejected": -437.123046875, + "loss": 0.5571, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.500403106212616, + "rewards/margins": 0.7072377800941467, + "rewards/rejected": -0.20683467388153076, + "step": 2380 + }, + { + "epoch": 0.62, + "learning_rate": 2.127690100430416e-07, + "logits/chosen": -3.926335096359253, + "logits/rejected": -3.9738330841064453, + "logps/chosen": -533.0458984375, + "logps/rejected": -424.850341796875, + "loss": 0.6198, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.35639142990112305, + "rewards/margins": 0.4018617570400238, + "rewards/rejected": -0.04547032713890076, + "step": 2390 + }, + { + "epoch": 0.62, + "learning_rate": 2.1133428981348636e-07, + "logits/chosen": -3.779186248779297, + "logits/rejected": -3.8913798332214355, + "logps/chosen": -617.508056640625, + "logps/rejected": -492.209228515625, + "loss": 0.5753, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5383332371711731, + "rewards/margins": 0.467332661151886, + "rewards/rejected": 0.07100055366754532, + "step": 2400 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -3.9845926761627197, + "eval_logits/rejected": -4.009966850280762, + "eval_logps/chosen": -544.7802124023438, + "eval_logps/rejected": -438.44573974609375, + "eval_loss": 0.5681360960006714, + "eval_rewards/accuracies": 0.6850000023841858, + "eval_rewards/chosen": 0.4596436619758606, + "eval_rewards/margins": 0.5037252306938171, + "eval_rewards/rejected": -0.04408155009150505, + "eval_runtime": 148.5709, + "eval_samples_per_second": 13.462, + "eval_steps_per_second": 1.683, + "step": 2400 + }, + { + "epoch": 0.62, + "learning_rate": 2.098995695839311e-07, + "logits/chosen": -3.978921890258789, + "logits/rejected": -3.8923873901367188, + "logps/chosen": -556.697998046875, + "logps/rejected": -416.08184814453125, + "loss": 0.534, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.4480956494808197, + "rewards/margins": 0.6271561980247498, + "rewards/rejected": -0.17906051874160767, + "step": 2410 + }, + { + "epoch": 0.62, + "learning_rate": 2.084648493543759e-07, + "logits/chosen": -4.281157493591309, + "logits/rejected": -4.271050453186035, + "logps/chosen": -673.8267211914062, + "logps/rejected": -463.4944763183594, + "loss": 0.5461, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5151551365852356, + "rewards/margins": 0.6002731919288635, + "rewards/rejected": -0.08511805534362793, + "step": 2420 + }, + { + "epoch": 0.63, + "learning_rate": 2.0703012912482064e-07, + "logits/chosen": -3.901240110397339, + "logits/rejected": -3.862910509109497, + "logps/chosen": -591.2846069335938, + "logps/rejected": -389.8904113769531, + "loss": 0.6189, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.2968345284461975, + "rewards/margins": 0.3309480547904968, + "rewards/rejected": -0.03411349281668663, + "step": 2430 + }, + { + "epoch": 0.63, + "learning_rate": 2.0559540889526542e-07, + "logits/chosen": -4.235989570617676, + "logits/rejected": -4.060244560241699, + "logps/chosen": -597.9428100585938, + "logps/rejected": -404.6048889160156, + "loss": 0.5864, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.46942558884620667, + "rewards/margins": 0.5440183877944946, + "rewards/rejected": -0.07459276914596558, + "step": 2440 + }, + { + "epoch": 0.63, + "learning_rate": 2.0416068866571017e-07, + "logits/chosen": -3.8396244049072266, + "logits/rejected": -3.752044677734375, + "logps/chosen": -601.25341796875, + "logps/rejected": -435.07647705078125, + "loss": 0.5917, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.49751096963882446, + "rewards/margins": 0.46320000290870667, + "rewards/rejected": 0.03431097790598869, + "step": 2450 + }, + { + "epoch": 0.64, + "learning_rate": 2.0272596843615495e-07, + "logits/chosen": -4.197469711303711, + "logits/rejected": -4.122381687164307, + "logps/chosen": -553.6739501953125, + "logps/rejected": -420.4598083496094, + "loss": 0.5932, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.453556627035141, + "rewards/margins": 0.4589596390724182, + "rewards/rejected": -0.005403043236583471, + "step": 2460 + }, + { + "epoch": 0.64, + "learning_rate": 2.012912482065997e-07, + "logits/chosen": -4.135566711425781, + "logits/rejected": -4.087862968444824, + "logps/chosen": -528.1041259765625, + "logps/rejected": -432.2552795410156, + "loss": 0.5474, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5243477821350098, + "rewards/margins": 0.636970043182373, + "rewards/rejected": -0.11262223869562149, + "step": 2470 + }, + { + "epoch": 0.64, + "learning_rate": 1.9985652797704448e-07, + "logits/chosen": -3.9298617839813232, + "logits/rejected": -3.9982573986053467, + "logps/chosen": -466.2574157714844, + "logps/rejected": -393.4660339355469, + "loss": 0.5623, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.469001442193985, + "rewards/margins": 0.48559433221817017, + "rewards/rejected": -0.016592923551797867, + "step": 2480 + }, + { + "epoch": 0.64, + "learning_rate": 1.9842180774748923e-07, + "logits/chosen": -3.9439053535461426, + "logits/rejected": -3.8986332416534424, + "logps/chosen": -559.919921875, + "logps/rejected": -430.98162841796875, + "loss": 0.5399, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4386838376522064, + "rewards/margins": 0.4418070912361145, + "rewards/rejected": -0.003123197006061673, + "step": 2490 + }, + { + "epoch": 0.65, + "learning_rate": 1.96987087517934e-07, + "logits/chosen": -4.274647235870361, + "logits/rejected": -4.253532409667969, + "logps/chosen": -593.2935791015625, + "logps/rejected": -445.554931640625, + "loss": 0.5709, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4176342487335205, + "rewards/margins": 0.4632183909416199, + "rewards/rejected": -0.04558416083455086, + "step": 2500 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -3.984861135482788, + "eval_logits/rejected": -4.0099897384643555, + "eval_logps/chosen": -544.6834716796875, + "eval_logps/rejected": -438.3924255371094, + "eval_loss": 0.5672796368598938, + "eval_rewards/accuracies": 0.6909999847412109, + "eval_rewards/chosen": 0.4693204462528229, + "eval_rewards/margins": 0.5080692172050476, + "eval_rewards/rejected": -0.03874876722693443, + "eval_runtime": 146.2016, + "eval_samples_per_second": 13.68, + "eval_steps_per_second": 1.71, + "step": 2500 + }, + { + "epoch": 0.65, + "learning_rate": 1.9555236728837876e-07, + "logits/chosen": -4.1506547927856445, + "logits/rejected": -4.121700286865234, + "logps/chosen": -559.0662231445312, + "logps/rejected": -428.6475524902344, + "loss": 0.5692, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5197519659996033, + "rewards/margins": 0.5856004953384399, + "rewards/rejected": -0.06584848463535309, + "step": 2510 + }, + { + "epoch": 0.65, + "learning_rate": 1.9411764705882353e-07, + "logits/chosen": -4.2824015617370605, + "logits/rejected": -4.196056365966797, + "logps/chosen": -598.8436279296875, + "logps/rejected": -525.2330322265625, + "loss": 0.5574, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4597892165184021, + "rewards/margins": 0.5178920030593872, + "rewards/rejected": -0.05810274928808212, + "step": 2520 + }, + { + "epoch": 0.65, + "learning_rate": 1.9268292682926829e-07, + "logits/chosen": -4.085073947906494, + "logits/rejected": -4.154143810272217, + "logps/chosen": -669.5693359375, + "logps/rejected": -449.7344665527344, + "loss": 0.5496, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.6506977081298828, + "rewards/margins": 0.7707425355911255, + "rewards/rejected": -0.12004482746124268, + "step": 2530 + }, + { + "epoch": 0.66, + "learning_rate": 1.9124820659971306e-07, + "logits/chosen": -3.931575059890747, + "logits/rejected": -3.969634532928467, + "logps/chosen": -670.7431640625, + "logps/rejected": -442.46405029296875, + "loss": 0.5956, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.47314882278442383, + "rewards/margins": 0.5320797562599182, + "rewards/rejected": -0.058930885046720505, + "step": 2540 + }, + { + "epoch": 0.66, + "learning_rate": 1.8981348637015781e-07, + "logits/chosen": -3.7605667114257812, + "logits/rejected": -3.7463626861572266, + "logps/chosen": -507.4091796875, + "logps/rejected": -419.73919677734375, + "loss": 0.6149, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.4424256384372711, + "rewards/margins": 0.36436089873313904, + "rewards/rejected": 0.07806471735239029, + "step": 2550 + }, + { + "epoch": 0.66, + "learning_rate": 1.883787661406026e-07, + "logits/chosen": -3.9714431762695312, + "logits/rejected": -4.080648899078369, + "logps/chosen": -589.9671630859375, + "logps/rejected": -402.8089904785156, + "loss": 0.5552, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5086973309516907, + "rewards/margins": 0.7329601645469666, + "rewards/rejected": -0.2242628037929535, + "step": 2560 + }, + { + "epoch": 0.66, + "learning_rate": 1.8694404591104734e-07, + "logits/chosen": -4.056425094604492, + "logits/rejected": -4.178628444671631, + "logps/chosen": -524.011962890625, + "logps/rejected": -415.20233154296875, + "loss": 0.6136, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.41316717863082886, + "rewards/margins": 0.47009795904159546, + "rewards/rejected": -0.056930772960186005, + "step": 2570 + }, + { + "epoch": 0.67, + "learning_rate": 1.855093256814921e-07, + "logits/chosen": -4.216281890869141, + "logits/rejected": -4.076776027679443, + "logps/chosen": -544.1904296875, + "logps/rejected": -466.5315856933594, + "loss": 0.575, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.41413965821266174, + "rewards/margins": 0.48670220375061035, + "rewards/rejected": -0.07256259769201279, + "step": 2580 + }, + { + "epoch": 0.67, + "learning_rate": 1.8407460545193687e-07, + "logits/chosen": -4.047214984893799, + "logits/rejected": -4.0597639083862305, + "logps/chosen": -594.7711181640625, + "logps/rejected": -453.0791015625, + "loss": 0.612, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6000150442123413, + "rewards/margins": 0.62732994556427, + "rewards/rejected": -0.027314912527799606, + "step": 2590 + }, + { + "epoch": 0.67, + "learning_rate": 1.8263988522238162e-07, + "logits/chosen": -3.8675410747528076, + "logits/rejected": -3.881988525390625, + "logps/chosen": -478.98443603515625, + "logps/rejected": -408.0418395996094, + "loss": 0.5565, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.34187614917755127, + "rewards/margins": 0.5348206162452698, + "rewards/rejected": -0.19294443726539612, + "step": 2600 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -3.984271764755249, + "eval_logits/rejected": -4.009637832641602, + "eval_logps/chosen": -544.6849975585938, + "eval_logps/rejected": -438.4054260253906, + "eval_loss": 0.5665393471717834, + "eval_rewards/accuracies": 0.6819999814033508, + "eval_rewards/chosen": 0.469163715839386, + "eval_rewards/margins": 0.5092154145240784, + "eval_rewards/rejected": -0.04005170986056328, + "eval_runtime": 145.7676, + "eval_samples_per_second": 13.72, + "eval_steps_per_second": 1.715, + "step": 2600 + }, + { + "epoch": 0.67, + "learning_rate": 1.812051649928264e-07, + "logits/chosen": -3.83473539352417, + "logits/rejected": -3.9073386192321777, + "logps/chosen": -604.2052612304688, + "logps/rejected": -456.9849548339844, + "loss": 0.5876, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5973328351974487, + "rewards/margins": 0.4590074121952057, + "rewards/rejected": 0.13832543790340424, + "step": 2610 + }, + { + "epoch": 0.68, + "learning_rate": 1.7977044476327115e-07, + "logits/chosen": -3.848345994949341, + "logits/rejected": -3.786773681640625, + "logps/chosen": -502.00653076171875, + "logps/rejected": -401.30877685546875, + "loss": 0.6026, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4152609407901764, + "rewards/margins": 0.3713647425174713, + "rewards/rejected": 0.04389624670147896, + "step": 2620 + }, + { + "epoch": 0.68, + "learning_rate": 1.7833572453371593e-07, + "logits/chosen": -4.126666069030762, + "logits/rejected": -4.045652389526367, + "logps/chosen": -495.5149841308594, + "logps/rejected": -428.750244140625, + "loss": 0.5802, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4449082911014557, + "rewards/margins": 0.48476019501686096, + "rewards/rejected": -0.03985190391540527, + "step": 2630 + }, + { + "epoch": 0.68, + "learning_rate": 1.7690100430416068e-07, + "logits/chosen": -4.006113529205322, + "logits/rejected": -4.05719518661499, + "logps/chosen": -564.3095092773438, + "logps/rejected": -465.112060546875, + "loss": 0.5478, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4402576982975006, + "rewards/margins": 0.5477269887924194, + "rewards/rejected": -0.10746929794549942, + "step": 2640 + }, + { + "epoch": 0.68, + "learning_rate": 1.7546628407460546e-07, + "logits/chosen": -4.016690254211426, + "logits/rejected": -4.137378692626953, + "logps/chosen": -540.1869506835938, + "logps/rejected": -384.2567443847656, + "loss": 0.5053, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.46003857254981995, + "rewards/margins": 0.587317168712616, + "rewards/rejected": -0.1272786259651184, + "step": 2650 + }, + { + "epoch": 0.69, + "learning_rate": 1.740315638450502e-07, + "logits/chosen": -4.14896297454834, + "logits/rejected": -4.022231101989746, + "logps/chosen": -551.8126831054688, + "logps/rejected": -429.8963317871094, + "loss": 0.553, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5274641513824463, + "rewards/margins": 0.4932268559932709, + "rewards/rejected": 0.034237295389175415, + "step": 2660 + }, + { + "epoch": 0.69, + "learning_rate": 1.72596843615495e-07, + "logits/chosen": -3.9441299438476562, + "logits/rejected": -3.7229416370391846, + "logps/chosen": -541.1564331054688, + "logps/rejected": -522.1112060546875, + "loss": 0.6686, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.36212533712387085, + "rewards/margins": 0.2563532590866089, + "rewards/rejected": 0.10577203333377838, + "step": 2670 + }, + { + "epoch": 0.69, + "learning_rate": 1.7116212338593974e-07, + "logits/chosen": -4.236396312713623, + "logits/rejected": -4.220719337463379, + "logps/chosen": -498.52874755859375, + "logps/rejected": -399.10699462890625, + "loss": 0.6321, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.4771362245082855, + "rewards/margins": 0.3947621285915375, + "rewards/rejected": 0.08237410336732864, + "step": 2680 + }, + { + "epoch": 0.69, + "learning_rate": 1.6972740315638452e-07, + "logits/chosen": -4.176735877990723, + "logits/rejected": -3.9685966968536377, + "logps/chosen": -628.45947265625, + "logps/rejected": -414.69329833984375, + "loss": 0.4936, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5496100783348083, + "rewards/margins": 0.7379701733589172, + "rewards/rejected": -0.1883600354194641, + "step": 2690 + }, + { + "epoch": 0.7, + "learning_rate": 1.6829268292682927e-07, + "logits/chosen": -4.031551361083984, + "logits/rejected": -4.172730445861816, + "logps/chosen": -528.1746215820312, + "logps/rejected": -456.8675231933594, + "loss": 0.585, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5029736757278442, + "rewards/margins": 0.41340795159339905, + "rewards/rejected": 0.08956580609083176, + "step": 2700 + }, + { + "epoch": 0.7, + "eval_logits/chosen": -3.981973171234131, + "eval_logits/rejected": -4.00735330581665, + "eval_logps/chosen": -544.59619140625, + "eval_logps/rejected": -438.3558044433594, + "eval_loss": 0.5649946331977844, + "eval_rewards/accuracies": 0.6940000057220459, + "eval_rewards/chosen": 0.47803932428359985, + "eval_rewards/margins": 0.5131266713142395, + "eval_rewards/rejected": -0.03508726879954338, + "eval_runtime": 146.3353, + "eval_samples_per_second": 13.667, + "eval_steps_per_second": 1.708, + "step": 2700 + }, + { + "epoch": 0.7, + "learning_rate": 1.6685796269727405e-07, + "logits/chosen": -4.223569393157959, + "logits/rejected": -4.211024284362793, + "logps/chosen": -586.1395874023438, + "logps/rejected": -488.5260314941406, + "loss": 0.5806, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.5072857141494751, + "rewards/margins": 0.45038923621177673, + "rewards/rejected": 0.05689648538827896, + "step": 2710 + }, + { + "epoch": 0.7, + "learning_rate": 1.654232424677188e-07, + "logits/chosen": -4.059657573699951, + "logits/rejected": -4.050175666809082, + "logps/chosen": -636.249755859375, + "logps/rejected": -445.1454162597656, + "loss": 0.5708, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6326101422309875, + "rewards/margins": 0.6772100925445557, + "rewards/rejected": -0.04459994286298752, + "step": 2720 + }, + { + "epoch": 0.7, + "learning_rate": 1.6398852223816355e-07, + "logits/chosen": -3.965324878692627, + "logits/rejected": -3.852470874786377, + "logps/chosen": -587.070556640625, + "logps/rejected": -468.05755615234375, + "loss": 0.5195, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.6876250505447388, + "rewards/margins": 0.6847888827323914, + "rewards/rejected": 0.0028361976146698, + "step": 2730 + }, + { + "epoch": 0.71, + "learning_rate": 1.6255380200860833e-07, + "logits/chosen": -3.9872021675109863, + "logits/rejected": -4.126004695892334, + "logps/chosen": -575.1105346679688, + "logps/rejected": -469.9864807128906, + "loss": 0.6969, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.39717382192611694, + "rewards/margins": 0.3177093267440796, + "rewards/rejected": 0.07946449518203735, + "step": 2740 + }, + { + "epoch": 0.71, + "learning_rate": 1.6111908177905308e-07, + "logits/chosen": -4.012079238891602, + "logits/rejected": -3.936004161834717, + "logps/chosen": -597.6954345703125, + "logps/rejected": -411.5677795410156, + "loss": 0.6023, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4575771391391754, + "rewards/margins": 0.4063941538333893, + "rewards/rejected": 0.05118294805288315, + "step": 2750 + }, + { + "epoch": 0.71, + "learning_rate": 1.5968436154949786e-07, + "logits/chosen": -4.271051406860352, + "logits/rejected": -4.011579990386963, + "logps/chosen": -563.8511962890625, + "logps/rejected": -387.9336853027344, + "loss": 0.5445, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.46211299300193787, + "rewards/margins": 0.5439087748527527, + "rewards/rejected": -0.0817958191037178, + "step": 2760 + }, + { + "epoch": 0.72, + "learning_rate": 1.582496413199426e-07, + "logits/chosen": -4.078734397888184, + "logits/rejected": -4.14528751373291, + "logps/chosen": -748.7713623046875, + "logps/rejected": -491.18206787109375, + "loss": 0.5499, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.6352272033691406, + "rewards/margins": 0.7783417701721191, + "rewards/rejected": -0.14311453700065613, + "step": 2770 + }, + { + "epoch": 0.72, + "learning_rate": 1.5681492109038739e-07, + "logits/chosen": -4.074445724487305, + "logits/rejected": -3.9905147552490234, + "logps/chosen": -471.805908203125, + "logps/rejected": -419.62109375, + "loss": 0.5468, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5179725885391235, + "rewards/margins": 0.5387195348739624, + "rewards/rejected": -0.020746838301420212, + "step": 2780 + }, + { + "epoch": 0.72, + "learning_rate": 1.553802008608321e-07, + "logits/chosen": -4.071807384490967, + "logits/rejected": -4.146918296813965, + "logps/chosen": -561.8287353515625, + "logps/rejected": -447.42034912109375, + "loss": 0.5752, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5403738021850586, + "rewards/margins": 0.5378284454345703, + "rewards/rejected": 0.0025453567504882812, + "step": 2790 + }, + { + "epoch": 0.72, + "learning_rate": 1.539454806312769e-07, + "logits/chosen": -3.9499289989471436, + "logits/rejected": -3.7599105834960938, + "logps/chosen": -573.884765625, + "logps/rejected": -480.6022033691406, + "loss": 0.5883, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.5067580938339233, + "rewards/margins": 0.46255749464035034, + "rewards/rejected": 0.044200599193573, + "step": 2800 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -3.966898202896118, + "eval_logits/rejected": -3.9893743991851807, + "eval_logps/chosen": -544.46240234375, + "eval_logps/rejected": -438.1562194824219, + "eval_loss": 0.5670157074928284, + "eval_rewards/accuracies": 0.6880000233650208, + "eval_rewards/chosen": 0.4914305508136749, + "eval_rewards/margins": 0.5065579414367676, + "eval_rewards/rejected": -0.015127355232834816, + "eval_runtime": 146.024, + "eval_samples_per_second": 13.696, + "eval_steps_per_second": 1.712, + "step": 2800 + }, + { + "epoch": 0.73, + "learning_rate": 1.5251076040172164e-07, + "logits/chosen": -4.044391632080078, + "logits/rejected": -4.076410293579102, + "logps/chosen": -598.0468139648438, + "logps/rejected": -476.67181396484375, + "loss": 0.5806, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5654221773147583, + "rewards/margins": 0.4769902229309082, + "rewards/rejected": 0.0884319394826889, + "step": 2810 + }, + { + "epoch": 0.73, + "learning_rate": 1.5107604017216642e-07, + "logits/chosen": -3.93943452835083, + "logits/rejected": -4.029221534729004, + "logps/chosen": -531.6763916015625, + "logps/rejected": -357.01910400390625, + "loss": 0.5708, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5427955389022827, + "rewards/margins": 0.6645749807357788, + "rewards/rejected": -0.12177946418523788, + "step": 2820 + }, + { + "epoch": 0.73, + "learning_rate": 1.4964131994261117e-07, + "logits/chosen": -4.1592698097229, + "logits/rejected": -4.196699142456055, + "logps/chosen": -555.0905151367188, + "logps/rejected": -400.6699523925781, + "loss": 0.5251, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.5090584754943848, + "rewards/margins": 0.5514262318611145, + "rewards/rejected": -0.04236777871847153, + "step": 2830 + }, + { + "epoch": 0.73, + "learning_rate": 1.4820659971305595e-07, + "logits/chosen": -3.996324062347412, + "logits/rejected": -3.8942997455596924, + "logps/chosen": -559.6539916992188, + "logps/rejected": -462.087890625, + "loss": 0.5617, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.48809123039245605, + "rewards/margins": 0.41656923294067383, + "rewards/rejected": 0.07152204215526581, + "step": 2840 + }, + { + "epoch": 0.74, + "learning_rate": 1.467718794835007e-07, + "logits/chosen": -3.6908695697784424, + "logits/rejected": -3.810857057571411, + "logps/chosen": -488.4864196777344, + "logps/rejected": -417.60400390625, + "loss": 0.5171, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4484861493110657, + "rewards/margins": 0.4763699173927307, + "rewards/rejected": -0.02788383699953556, + "step": 2850 + }, + { + "epoch": 0.74, + "learning_rate": 1.4533715925394547e-07, + "logits/chosen": -3.739753007888794, + "logits/rejected": -3.9605700969696045, + "logps/chosen": -507.19659423828125, + "logps/rejected": -371.3736572265625, + "loss": 0.5889, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5019505023956299, + "rewards/margins": 0.4500049650669098, + "rewards/rejected": 0.05194549635052681, + "step": 2860 + }, + { + "epoch": 0.74, + "learning_rate": 1.4390243902439023e-07, + "logits/chosen": -4.046762466430664, + "logits/rejected": -4.0279541015625, + "logps/chosen": -556.5122680664062, + "logps/rejected": -335.6501770019531, + "loss": 0.5436, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6151873469352722, + "rewards/margins": 0.7387471795082092, + "rewards/rejected": -0.12355981022119522, + "step": 2870 + }, + { + "epoch": 0.74, + "learning_rate": 1.4246771879483498e-07, + "logits/chosen": -3.7312331199645996, + "logits/rejected": -3.662278413772583, + "logps/chosen": -459.88525390625, + "logps/rejected": -383.14984130859375, + "loss": 0.582, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.3872886300086975, + "rewards/margins": 0.42437830567359924, + "rewards/rejected": -0.03708968311548233, + "step": 2880 + }, + { + "epoch": 0.75, + "learning_rate": 1.4103299856527975e-07, + "logits/chosen": -4.147296905517578, + "logits/rejected": -4.07787561416626, + "logps/chosen": -554.8704833984375, + "logps/rejected": -398.159912109375, + "loss": 0.6256, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5256353616714478, + "rewards/margins": 0.47513723373413086, + "rewards/rejected": 0.05049814656376839, + "step": 2890 + }, + { + "epoch": 0.75, + "learning_rate": 1.395982783357245e-07, + "logits/chosen": -3.914976119995117, + "logits/rejected": -3.905255079269409, + "logps/chosen": -563.65576171875, + "logps/rejected": -461.05389404296875, + "loss": 0.624, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.48630857467651367, + "rewards/margins": 0.4222942888736725, + "rewards/rejected": 0.0640142410993576, + "step": 2900 + }, + { + "epoch": 0.75, + "eval_logits/chosen": -3.970454692840576, + "eval_logits/rejected": -3.993534803390503, + "eval_logps/chosen": -544.4996948242188, + "eval_logps/rejected": -438.1958312988281, + "eval_loss": 0.5662667155265808, + "eval_rewards/accuracies": 0.6840000152587891, + "eval_rewards/chosen": 0.4876936674118042, + "eval_rewards/margins": 0.5067842602729797, + "eval_rewards/rejected": -0.019090561196208, + "eval_runtime": 146.1468, + "eval_samples_per_second": 13.685, + "eval_steps_per_second": 1.711, + "step": 2900 + }, + { + "epoch": 0.75, + "learning_rate": 1.3816355810616928e-07, + "logits/chosen": -3.885633945465088, + "logits/rejected": -3.992154598236084, + "logps/chosen": -606.8893432617188, + "logps/rejected": -488.0694885253906, + "loss": 0.6481, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.506466269493103, + "rewards/margins": 0.38042640686035156, + "rewards/rejected": 0.12603983283042908, + "step": 2910 + }, + { + "epoch": 0.75, + "learning_rate": 1.3672883787661404e-07, + "logits/chosen": -3.9882044792175293, + "logits/rejected": -4.012315273284912, + "logps/chosen": -598.3148193359375, + "logps/rejected": -423.91839599609375, + "loss": 0.5414, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4659012258052826, + "rewards/margins": 0.621524453163147, + "rewards/rejected": -0.15562327206134796, + "step": 2920 + }, + { + "epoch": 0.76, + "learning_rate": 1.352941176470588e-07, + "logits/chosen": -4.175354957580566, + "logits/rejected": -4.1613287925720215, + "logps/chosen": -553.1173706054688, + "logps/rejected": -457.8443298339844, + "loss": 0.5357, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.49676817655563354, + "rewards/margins": 0.5213097333908081, + "rewards/rejected": -0.024541499093174934, + "step": 2930 + }, + { + "epoch": 0.76, + "learning_rate": 1.3385939741750356e-07, + "logits/chosen": -4.05168342590332, + "logits/rejected": -4.1532673835754395, + "logps/chosen": -515.8355712890625, + "logps/rejected": -411.39788818359375, + "loss": 0.5648, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4736366271972656, + "rewards/margins": 0.5187323689460754, + "rewards/rejected": -0.045095693320035934, + "step": 2940 + }, + { + "epoch": 0.76, + "learning_rate": 1.3242467718794834e-07, + "logits/chosen": -4.0590500831604, + "logits/rejected": -4.028027534484863, + "logps/chosen": -478.349853515625, + "logps/rejected": -426.6962890625, + "loss": 0.5647, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4894079566001892, + "rewards/margins": 0.4928358197212219, + "rewards/rejected": -0.0034278512466698885, + "step": 2950 + }, + { + "epoch": 0.76, + "learning_rate": 1.309899569583931e-07, + "logits/chosen": -3.9538333415985107, + "logits/rejected": -4.088204383850098, + "logps/chosen": -615.9334106445312, + "logps/rejected": -433.25189208984375, + "loss": 0.5954, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5048609972000122, + "rewards/margins": 0.6196298599243164, + "rewards/rejected": -0.11476895958185196, + "step": 2960 + }, + { + "epoch": 0.77, + "learning_rate": 1.2955523672883787e-07, + "logits/chosen": -3.891871690750122, + "logits/rejected": -3.8232593536376953, + "logps/chosen": -516.1154174804688, + "logps/rejected": -465.57977294921875, + "loss": 0.55, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.6237145662307739, + "rewards/margins": 0.559754490852356, + "rewards/rejected": 0.06396011263132095, + "step": 2970 + }, + { + "epoch": 0.77, + "learning_rate": 1.2812051649928262e-07, + "logits/chosen": -4.138208866119385, + "logits/rejected": -4.1710052490234375, + "logps/chosen": -514.2755737304688, + "logps/rejected": -385.1489562988281, + "loss": 0.6169, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.442038357257843, + "rewards/margins": 0.44703513383865356, + "rewards/rejected": -0.004996694624423981, + "step": 2980 + }, + { + "epoch": 0.77, + "learning_rate": 1.266857962697274e-07, + "logits/chosen": -4.293547630310059, + "logits/rejected": -4.3495774269104, + "logps/chosen": -570.8736572265625, + "logps/rejected": -520.841796875, + "loss": 0.6195, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.49560147523880005, + "rewards/margins": 0.48620933294296265, + "rewards/rejected": 0.009392100386321545, + "step": 2990 + }, + { + "epoch": 0.77, + "learning_rate": 1.2525107604017215e-07, + "logits/chosen": -4.126075267791748, + "logits/rejected": -3.8955910205841064, + "logps/chosen": -650.3043212890625, + "logps/rejected": -456.94110107421875, + "loss": 0.5347, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.6632064580917358, + "rewards/margins": 0.6959556341171265, + "rewards/rejected": -0.032749250531196594, + "step": 3000 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -3.9776611328125, + "eval_logits/rejected": -4.001935958862305, + "eval_logps/chosen": -544.619873046875, + "eval_logps/rejected": -438.3401184082031, + "eval_loss": 0.564439594745636, + "eval_rewards/accuracies": 0.6850000023841858, + "eval_rewards/chosen": 0.4756743311882019, + "eval_rewards/margins": 0.5091925859451294, + "eval_rewards/rejected": -0.03351828455924988, + "eval_runtime": 145.9011, + "eval_samples_per_second": 13.708, + "eval_steps_per_second": 1.713, + "step": 3000 + }, + { + "epoch": 0.78, + "learning_rate": 1.2381635581061693e-07, + "logits/chosen": -4.2549543380737305, + "logits/rejected": -4.473557472229004, + "logps/chosen": -614.374267578125, + "logps/rejected": -472.72259521484375, + "loss": 0.5698, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.49129754304885864, + "rewards/margins": 0.42049235105514526, + "rewards/rejected": 0.07080519199371338, + "step": 3010 + }, + { + "epoch": 0.78, + "learning_rate": 1.2238163558106168e-07, + "logits/chosen": -4.080137729644775, + "logits/rejected": -4.035037040710449, + "logps/chosen": -531.7971801757812, + "logps/rejected": -427.37860107421875, + "loss": 0.6113, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.28571632504463196, + "rewards/margins": 0.4164826273918152, + "rewards/rejected": -0.13076625764369965, + "step": 3020 + }, + { + "epoch": 0.78, + "learning_rate": 1.2094691535150646e-07, + "logits/chosen": -3.9666385650634766, + "logits/rejected": -4.024598121643066, + "logps/chosen": -485.36181640625, + "logps/rejected": -329.271728515625, + "loss": 0.5889, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.49098771810531616, + "rewards/margins": 0.6027761697769165, + "rewards/rejected": -0.11178841441869736, + "step": 3030 + }, + { + "epoch": 0.78, + "learning_rate": 1.195121951219512e-07, + "logits/chosen": -4.0698957443237305, + "logits/rejected": -4.125982761383057, + "logps/chosen": -544.8397827148438, + "logps/rejected": -468.3692932128906, + "loss": 0.5681, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5031709671020508, + "rewards/margins": 0.5633045434951782, + "rewards/rejected": -0.06013358756899834, + "step": 3040 + }, + { + "epoch": 0.79, + "learning_rate": 1.1807747489239597e-07, + "logits/chosen": -4.321501731872559, + "logits/rejected": -4.136828899383545, + "logps/chosen": -502.498779296875, + "logps/rejected": -387.68841552734375, + "loss": 0.6017, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.33200693130493164, + "rewards/margins": 0.5042457580566406, + "rewards/rejected": -0.17223885655403137, + "step": 3050 + }, + { + "epoch": 0.79, + "learning_rate": 1.1664275466284074e-07, + "logits/chosen": -4.194310188293457, + "logits/rejected": -4.310281753540039, + "logps/chosen": -580.3814086914062, + "logps/rejected": -446.2391662597656, + "loss": 0.5559, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.44598498940467834, + "rewards/margins": 0.5190831422805786, + "rewards/rejected": -0.07309817522764206, + "step": 3060 + }, + { + "epoch": 0.79, + "learning_rate": 1.152080344332855e-07, + "logits/chosen": -4.0883283615112305, + "logits/rejected": -4.150923728942871, + "logps/chosen": -554.2647094726562, + "logps/rejected": -424.0943298339844, + "loss": 0.5772, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.4851420521736145, + "rewards/margins": 0.5894008874893188, + "rewards/rejected": -0.10425883531570435, + "step": 3070 + }, + { + "epoch": 0.8, + "learning_rate": 1.1377331420373027e-07, + "logits/chosen": -4.141337871551514, + "logits/rejected": -4.110450267791748, + "logps/chosen": -580.8941040039062, + "logps/rejected": -452.0682067871094, + "loss": 0.5978, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5248143076896667, + "rewards/margins": 0.5298766493797302, + "rewards/rejected": -0.005062357988208532, + "step": 3080 + }, + { + "epoch": 0.8, + "learning_rate": 1.1233859397417503e-07, + "logits/chosen": -4.203800201416016, + "logits/rejected": -4.3287224769592285, + "logps/chosen": -611.932373046875, + "logps/rejected": -462.0245056152344, + "loss": 0.5423, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.6605905890464783, + "rewards/margins": 0.6125748753547668, + "rewards/rejected": 0.04801566153764725, + "step": 3090 + }, + { + "epoch": 0.8, + "learning_rate": 1.109038737446198e-07, + "logits/chosen": -3.9451992511749268, + "logits/rejected": -3.978661060333252, + "logps/chosen": -548.4806518554688, + "logps/rejected": -418.55255126953125, + "loss": 0.5837, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.4787958264350891, + "rewards/margins": 0.4884239733219147, + "rewards/rejected": -0.009628054685890675, + "step": 3100 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -3.9741623401641846, + "eval_logits/rejected": -3.997610092163086, + "eval_logps/chosen": -544.5935668945312, + "eval_logps/rejected": -438.3072509765625, + "eval_loss": 0.563690721988678, + "eval_rewards/accuracies": 0.6830000281333923, + "eval_rewards/chosen": 0.47830715775489807, + "eval_rewards/margins": 0.5085403323173523, + "eval_rewards/rejected": -0.03023313544690609, + "eval_runtime": 145.8972, + "eval_samples_per_second": 13.708, + "eval_steps_per_second": 1.714, + "step": 3100 + }, + { + "epoch": 0.8, + "learning_rate": 1.0946915351506456e-07, + "logits/chosen": -3.9184958934783936, + "logits/rejected": -3.7819457054138184, + "logps/chosen": -571.571533203125, + "logps/rejected": -445.2757263183594, + "loss": 0.5715, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.40955060720443726, + "rewards/margins": 0.4587160050868988, + "rewards/rejected": -0.049165401607751846, + "step": 3110 + }, + { + "epoch": 0.81, + "learning_rate": 1.0803443328550932e-07, + "logits/chosen": -3.905442476272583, + "logits/rejected": -3.8821640014648438, + "logps/chosen": -460.50372314453125, + "logps/rejected": -420.4170837402344, + "loss": 0.5573, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.32348185777664185, + "rewards/margins": 0.37588781118392944, + "rewards/rejected": -0.05240591615438461, + "step": 3120 + }, + { + "epoch": 0.81, + "learning_rate": 1.0659971305595408e-07, + "logits/chosen": -3.8710105419158936, + "logits/rejected": -3.9780330657958984, + "logps/chosen": -492.71075439453125, + "logps/rejected": -370.08160400390625, + "loss": 0.5272, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.4087301790714264, + "rewards/margins": 0.49260735511779785, + "rewards/rejected": -0.08387719094753265, + "step": 3130 + }, + { + "epoch": 0.81, + "learning_rate": 1.0516499282639884e-07, + "logits/chosen": -4.243491172790527, + "logits/rejected": -4.153388977050781, + "logps/chosen": -496.98260498046875, + "logps/rejected": -367.54632568359375, + "loss": 0.5874, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.44594526290893555, + "rewards/margins": 0.5719509720802307, + "rewards/rejected": -0.12600573897361755, + "step": 3140 + }, + { + "epoch": 0.81, + "learning_rate": 1.037302725968436e-07, + "logits/chosen": -3.9939746856689453, + "logits/rejected": -4.004325866699219, + "logps/chosen": -478.77056884765625, + "logps/rejected": -464.50665283203125, + "loss": 0.6232, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.35015568137168884, + "rewards/margins": 0.37633177638053894, + "rewards/rejected": -0.02617608569562435, + "step": 3150 + }, + { + "epoch": 0.82, + "learning_rate": 1.0229555236728837e-07, + "logits/chosen": -4.253720283508301, + "logits/rejected": -4.267764568328857, + "logps/chosen": -557.8858642578125, + "logps/rejected": -412.69818115234375, + "loss": 0.5698, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4118489623069763, + "rewards/margins": 0.5684719085693359, + "rewards/rejected": -0.15662303566932678, + "step": 3160 + }, + { + "epoch": 0.82, + "learning_rate": 1.0086083213773313e-07, + "logits/chosen": -3.7371535301208496, + "logits/rejected": -3.7913818359375, + "logps/chosen": -562.0777587890625, + "logps/rejected": -491.91143798828125, + "loss": 0.5108, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5288134813308716, + "rewards/margins": 0.5345078706741333, + "rewards/rejected": -0.005694452673196793, + "step": 3170 + }, + { + "epoch": 0.82, + "learning_rate": 9.94261119081779e-08, + "logits/chosen": -3.9377448558807373, + "logits/rejected": -3.955479383468628, + "logps/chosen": -502.92718505859375, + "logps/rejected": -408.81976318359375, + "loss": 0.5617, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5690404772758484, + "rewards/margins": 0.502609372138977, + "rewards/rejected": 0.06643114238977432, + "step": 3180 + }, + { + "epoch": 0.82, + "learning_rate": 9.799139167862266e-08, + "logits/chosen": -4.40088415145874, + "logits/rejected": -4.40977144241333, + "logps/chosen": -582.387451171875, + "logps/rejected": -508.1477966308594, + "loss": 0.523, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.5947442650794983, + "rewards/margins": 0.6568835377693176, + "rewards/rejected": -0.06213930994272232, + "step": 3190 + }, + { + "epoch": 0.83, + "learning_rate": 9.655667144906743e-08, + "logits/chosen": -4.194244861602783, + "logits/rejected": -4.144165992736816, + "logps/chosen": -582.037109375, + "logps/rejected": -409.24224853515625, + "loss": 0.5293, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5745357275009155, + "rewards/margins": 0.728204071521759, + "rewards/rejected": -0.15366844832897186, + "step": 3200 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -3.9778196811676025, + "eval_logits/rejected": -4.002331733703613, + "eval_logps/chosen": -544.66162109375, + "eval_logps/rejected": -438.367919921875, + "eval_loss": 0.5634328126907349, + "eval_rewards/accuracies": 0.6890000104904175, + "eval_rewards/chosen": 0.47150418162345886, + "eval_rewards/margins": 0.5078018307685852, + "eval_rewards/rejected": -0.03629762679338455, + "eval_runtime": 146.8004, + "eval_samples_per_second": 13.624, + "eval_steps_per_second": 1.703, + "step": 3200 + }, + { + "epoch": 0.83, + "learning_rate": 9.512195121951219e-08, + "logits/chosen": -4.067798614501953, + "logits/rejected": -4.208149433135986, + "logps/chosen": -511.45013427734375, + "logps/rejected": -376.13067626953125, + "loss": 0.5854, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.4113733768463135, + "rewards/margins": 0.48462361097335815, + "rewards/rejected": -0.07325027137994766, + "step": 3210 + }, + { + "epoch": 0.83, + "learning_rate": 9.368723098995696e-08, + "logits/chosen": -4.2905426025390625, + "logits/rejected": -4.173062801361084, + "logps/chosen": -573.29296875, + "logps/rejected": -427.76739501953125, + "loss": 0.5641, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.48305606842041016, + "rewards/margins": 0.5324376225471497, + "rewards/rejected": -0.049381546676158905, + "step": 3220 + }, + { + "epoch": 0.83, + "learning_rate": 9.225251076040172e-08, + "logits/chosen": -3.8998687267303467, + "logits/rejected": -3.798374891281128, + "logps/chosen": -553.4527587890625, + "logps/rejected": -378.1935119628906, + "loss": 0.5525, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4461655020713806, + "rewards/margins": 0.6628150343894958, + "rewards/rejected": -0.21664953231811523, + "step": 3230 + }, + { + "epoch": 0.84, + "learning_rate": 9.081779053084649e-08, + "logits/chosen": -4.098966121673584, + "logits/rejected": -4.07062292098999, + "logps/chosen": -563.8943481445312, + "logps/rejected": -444.6372985839844, + "loss": 0.5853, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5742425918579102, + "rewards/margins": 0.5752390027046204, + "rewards/rejected": -0.0009963444899767637, + "step": 3240 + }, + { + "epoch": 0.84, + "learning_rate": 8.938307030129125e-08, + "logits/chosen": -4.253169059753418, + "logits/rejected": -4.21138858795166, + "logps/chosen": -535.7103271484375, + "logps/rejected": -373.97003173828125, + "loss": 0.5755, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3600585460662842, + "rewards/margins": 0.45460644364356995, + "rewards/rejected": -0.09454789757728577, + "step": 3250 + }, + { + "epoch": 0.84, + "learning_rate": 8.794835007173601e-08, + "logits/chosen": -3.9807746410369873, + "logits/rejected": -3.941415309906006, + "logps/chosen": -572.0859985351562, + "logps/rejected": -468.2994079589844, + "loss": 0.58, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5067304968833923, + "rewards/margins": 0.50401771068573, + "rewards/rejected": 0.0027127789799124002, + "step": 3260 + }, + { + "epoch": 0.84, + "learning_rate": 8.651362984218078e-08, + "logits/chosen": -4.293785095214844, + "logits/rejected": -4.3718976974487305, + "logps/chosen": -584.475830078125, + "logps/rejected": -409.7781677246094, + "loss": 0.5662, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.3756251931190491, + "rewards/margins": 0.5408438444137573, + "rewards/rejected": -0.16521869599819183, + "step": 3270 + }, + { + "epoch": 0.85, + "learning_rate": 8.507890961262554e-08, + "logits/chosen": -3.9542198181152344, + "logits/rejected": -3.9630751609802246, + "logps/chosen": -513.8878173828125, + "logps/rejected": -526.8805541992188, + "loss": 0.5605, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5460586547851562, + "rewards/margins": 0.4736880362033844, + "rewards/rejected": 0.07237061113119125, + "step": 3280 + }, + { + "epoch": 0.85, + "learning_rate": 8.364418938307031e-08, + "logits/chosen": -3.925539493560791, + "logits/rejected": -3.744020462036133, + "logps/chosen": -536.7335205078125, + "logps/rejected": -374.2388610839844, + "loss": 0.5676, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4619140028953552, + "rewards/margins": 0.5450933575630188, + "rewards/rejected": -0.08317933976650238, + "step": 3290 + }, + { + "epoch": 0.85, + "learning_rate": 8.220946915351506e-08, + "logits/chosen": -3.950735569000244, + "logits/rejected": -3.9631354808807373, + "logps/chosen": -522.4519653320312, + "logps/rejected": -515.7288208007812, + "loss": 0.5128, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.4143013060092926, + "rewards/margins": 0.5980950593948364, + "rewards/rejected": -0.1837938129901886, + "step": 3300 + }, + { + "epoch": 0.85, + "eval_logits/chosen": -3.9803950786590576, + "eval_logits/rejected": -4.005295753479004, + "eval_logps/chosen": -544.6318969726562, + "eval_logps/rejected": -438.39166259765625, + "eval_loss": 0.5619609355926514, + "eval_rewards/accuracies": 0.6880000233650208, + "eval_rewards/chosen": 0.474471777677536, + "eval_rewards/margins": 0.5131421089172363, + "eval_rewards/rejected": -0.038670338690280914, + "eval_runtime": 147.8686, + "eval_samples_per_second": 13.526, + "eval_steps_per_second": 1.691, + "step": 3300 + }, + { + "epoch": 0.85, + "learning_rate": 8.077474892395982e-08, + "logits/chosen": -4.0901780128479, + "logits/rejected": -4.084465980529785, + "logps/chosen": -538.7194213867188, + "logps/rejected": -432.45892333984375, + "loss": 0.5549, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.45796340703964233, + "rewards/margins": 0.44825053215026855, + "rewards/rejected": 0.009712914004921913, + "step": 3310 + }, + { + "epoch": 0.86, + "learning_rate": 7.934002869440459e-08, + "logits/chosen": -3.90226411819458, + "logits/rejected": -4.059938907623291, + "logps/chosen": -635.9435424804688, + "logps/rejected": -362.847412109375, + "loss": 0.5811, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.3464578092098236, + "rewards/margins": 0.35814762115478516, + "rewards/rejected": -0.011689816601574421, + "step": 3320 + }, + { + "epoch": 0.86, + "learning_rate": 7.790530846484935e-08, + "logits/chosen": -4.0388689041137695, + "logits/rejected": -3.9921679496765137, + "logps/chosen": -555.1847534179688, + "logps/rejected": -400.4381408691406, + "loss": 0.5537, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.46570801734924316, + "rewards/margins": 0.5739080309867859, + "rewards/rejected": -0.1082000583410263, + "step": 3330 + }, + { + "epoch": 0.86, + "learning_rate": 7.647058823529412e-08, + "logits/chosen": -4.057839870452881, + "logits/rejected": -3.962494373321533, + "logps/chosen": -633.3861083984375, + "logps/rejected": -521.1183471679688, + "loss": 0.569, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.44552069902420044, + "rewards/margins": 0.42656344175338745, + "rewards/rejected": 0.01895725727081299, + "step": 3340 + }, + { + "epoch": 0.86, + "learning_rate": 7.503586800573888e-08, + "logits/chosen": -4.196125507354736, + "logits/rejected": -4.071255683898926, + "logps/chosen": -488.28460693359375, + "logps/rejected": -371.49169921875, + "loss": 0.5918, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.39670076966285706, + "rewards/margins": 0.3868991434574127, + "rewards/rejected": 0.009801648557186127, + "step": 3350 + }, + { + "epoch": 0.87, + "learning_rate": 7.360114777618365e-08, + "logits/chosen": -3.972092390060425, + "logits/rejected": -4.106286525726318, + "logps/chosen": -572.9547119140625, + "logps/rejected": -420.35400390625, + "loss": 0.5315, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.49660277366638184, + "rewards/margins": 0.5567241311073303, + "rewards/rejected": -0.06012127920985222, + "step": 3360 + }, + { + "epoch": 0.87, + "learning_rate": 7.21664275466284e-08, + "logits/chosen": -3.9770302772521973, + "logits/rejected": -4.03969669342041, + "logps/chosen": -547.3425903320312, + "logps/rejected": -495.91851806640625, + "loss": 0.604, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.48121723532676697, + "rewards/margins": 0.4144704341888428, + "rewards/rejected": 0.06674680858850479, + "step": 3370 + }, + { + "epoch": 0.87, + "learning_rate": 7.073170731707316e-08, + "logits/chosen": -4.059111595153809, + "logits/rejected": -4.129426002502441, + "logps/chosen": -564.0946655273438, + "logps/rejected": -506.58709716796875, + "loss": 0.549, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.40859222412109375, + "rewards/margins": 0.3801480233669281, + "rewards/rejected": 0.028444204479455948, + "step": 3380 + }, + { + "epoch": 0.88, + "learning_rate": 6.929698708751793e-08, + "logits/chosen": -4.225644111633301, + "logits/rejected": -4.292551040649414, + "logps/chosen": -588.9600830078125, + "logps/rejected": -472.1261291503906, + "loss": 0.5453, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4812944531440735, + "rewards/margins": 0.5918601155281067, + "rewards/rejected": -0.11056558787822723, + "step": 3390 + }, + { + "epoch": 0.88, + "learning_rate": 6.786226685796269e-08, + "logits/chosen": -4.155394077301025, + "logits/rejected": -4.14565896987915, + "logps/chosen": -525.122314453125, + "logps/rejected": -392.67376708984375, + "loss": 0.6204, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.31629544496536255, + "rewards/margins": 0.37368619441986084, + "rewards/rejected": -0.0573907308280468, + "step": 3400 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -3.9814672470092773, + "eval_logits/rejected": -4.006735324859619, + "eval_logps/chosen": -544.69775390625, + "eval_logps/rejected": -438.4468688964844, + "eval_loss": 0.5624998211860657, + "eval_rewards/accuracies": 0.6859999895095825, + "eval_rewards/chosen": 0.46788930892944336, + "eval_rewards/margins": 0.5120863914489746, + "eval_rewards/rejected": -0.044197000563144684, + "eval_runtime": 147.1881, + "eval_samples_per_second": 13.588, + "eval_steps_per_second": 1.699, + "step": 3400 + }, + { + "epoch": 0.88, + "learning_rate": 6.642754662840746e-08, + "logits/chosen": -4.196806907653809, + "logits/rejected": -4.363795280456543, + "logps/chosen": -574.8983154296875, + "logps/rejected": -502.860107421875, + "loss": 0.5918, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.49528592824935913, + "rewards/margins": 0.5257723927497864, + "rewards/rejected": -0.03048643469810486, + "step": 3410 + }, + { + "epoch": 0.88, + "learning_rate": 6.499282639885222e-08, + "logits/chosen": -3.957125186920166, + "logits/rejected": -3.888190507888794, + "logps/chosen": -547.2122192382812, + "logps/rejected": -404.66595458984375, + "loss": 0.5457, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4918132722377777, + "rewards/margins": 0.5843526124954224, + "rewards/rejected": -0.09253935515880585, + "step": 3420 + }, + { + "epoch": 0.89, + "learning_rate": 6.355810616929698e-08, + "logits/chosen": -3.979330539703369, + "logits/rejected": -4.06231164932251, + "logps/chosen": -524.4691772460938, + "logps/rejected": -422.9524841308594, + "loss": 0.5182, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.4810391962528229, + "rewards/margins": 0.5485955476760864, + "rewards/rejected": -0.06755636632442474, + "step": 3430 + }, + { + "epoch": 0.89, + "learning_rate": 6.212338593974175e-08, + "logits/chosen": -4.29758882522583, + "logits/rejected": -4.182176113128662, + "logps/chosen": -595.9281616210938, + "logps/rejected": -425.3091735839844, + "loss": 0.5189, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5549203157424927, + "rewards/margins": 0.5970374941825867, + "rewards/rejected": -0.04211718589067459, + "step": 3440 + }, + { + "epoch": 0.89, + "learning_rate": 6.068866571018651e-08, + "logits/chosen": -4.2188720703125, + "logits/rejected": -4.125060081481934, + "logps/chosen": -521.8778076171875, + "logps/rejected": -426.075439453125, + "loss": 0.6504, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.3750000596046448, + "rewards/margins": 0.34474819898605347, + "rewards/rejected": 0.03025185689330101, + "step": 3450 + }, + { + "epoch": 0.89, + "learning_rate": 5.925394548063128e-08, + "logits/chosen": -4.250518321990967, + "logits/rejected": -4.323083400726318, + "logps/chosen": -596.1790161132812, + "logps/rejected": -468.62078857421875, + "loss": 0.628, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.49697431921958923, + "rewards/margins": 0.5001575350761414, + "rewards/rejected": -0.00318324426189065, + "step": 3460 + }, + { + "epoch": 0.9, + "learning_rate": 5.7819225251076036e-08, + "logits/chosen": -4.112654685974121, + "logits/rejected": -3.997405529022217, + "logps/chosen": -498.8377990722656, + "logps/rejected": -473.70550537109375, + "loss": 0.5889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3584319055080414, + "rewards/margins": 0.3437018394470215, + "rewards/rejected": 0.014730053022503853, + "step": 3470 + }, + { + "epoch": 0.9, + "learning_rate": 5.63845050215208e-08, + "logits/chosen": -4.2403950691223145, + "logits/rejected": -4.099762916564941, + "logps/chosen": -572.9876098632812, + "logps/rejected": -437.2333984375, + "loss": 0.565, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.43449336290359497, + "rewards/margins": 0.5010371804237366, + "rewards/rejected": -0.06654379516839981, + "step": 3480 + }, + { + "epoch": 0.9, + "learning_rate": 5.4949784791965565e-08, + "logits/chosen": -4.034601211547852, + "logits/rejected": -4.012211322784424, + "logps/chosen": -535.9708862304688, + "logps/rejected": -390.2044372558594, + "loss": 0.5451, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5869877338409424, + "rewards/margins": 0.674468994140625, + "rewards/rejected": -0.087481290102005, + "step": 3490 + }, + { + "epoch": 0.9, + "learning_rate": 5.351506456241032e-08, + "logits/chosen": -4.010631561279297, + "logits/rejected": -3.996525526046753, + "logps/chosen": -550.5261840820312, + "logps/rejected": -470.1412658691406, + "loss": 0.5469, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5485318899154663, + "rewards/margins": 0.49418431520462036, + "rewards/rejected": 0.054347604513168335, + "step": 3500 + }, + { + "epoch": 0.9, + "eval_logits/chosen": -3.98427677154541, + "eval_logits/rejected": -4.009834289550781, + "eval_logps/chosen": -544.7650756835938, + "eval_logps/rejected": -438.4956359863281, + "eval_loss": 0.5618208050727844, + "eval_rewards/accuracies": 0.6859999895095825, + "eval_rewards/chosen": 0.4611594080924988, + "eval_rewards/margins": 0.5102306604385376, + "eval_rewards/rejected": -0.049071334302425385, + "eval_runtime": 146.3352, + "eval_samples_per_second": 13.667, + "eval_steps_per_second": 1.708, + "step": 3500 + }, + { + "epoch": 0.91, + "learning_rate": 5.208034433285509e-08, + "logits/chosen": -3.8921310901641846, + "logits/rejected": -3.848719358444214, + "logps/chosen": -647.0032958984375, + "logps/rejected": -568.3533935546875, + "loss": 0.5397, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.5782560110092163, + "rewards/margins": 0.5668593645095825, + "rewards/rejected": 0.011396640911698341, + "step": 3510 + }, + { + "epoch": 0.91, + "learning_rate": 5.064562410329985e-08, + "logits/chosen": -4.195284843444824, + "logits/rejected": -4.076591491699219, + "logps/chosen": -530.7032470703125, + "logps/rejected": -445.4291076660156, + "loss": 0.5744, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.45487624406814575, + "rewards/margins": 0.4947517514228821, + "rewards/rejected": -0.03987548500299454, + "step": 3520 + }, + { + "epoch": 0.91, + "learning_rate": 4.9210903873744616e-08, + "logits/chosen": -4.124705791473389, + "logits/rejected": -4.10734748840332, + "logps/chosen": -541.8250122070312, + "logps/rejected": -430.775146484375, + "loss": 0.6309, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.4992215633392334, + "rewards/margins": 0.41505131125450134, + "rewards/rejected": 0.08417025953531265, + "step": 3530 + }, + { + "epoch": 0.91, + "learning_rate": 4.777618364418938e-08, + "logits/chosen": -3.803776502609253, + "logits/rejected": -3.8794872760772705, + "logps/chosen": -551.4606323242188, + "logps/rejected": -393.61907958984375, + "loss": 0.5847, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.3885660171508789, + "rewards/margins": 0.4686763882637024, + "rewards/rejected": -0.08011035621166229, + "step": 3540 + }, + { + "epoch": 0.92, + "learning_rate": 4.6341463414634145e-08, + "logits/chosen": -4.222638130187988, + "logits/rejected": -4.128180027008057, + "logps/chosen": -616.28515625, + "logps/rejected": -464.65692138671875, + "loss": 0.6212, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5344708561897278, + "rewards/margins": 0.4752708971500397, + "rewards/rejected": 0.059200018644332886, + "step": 3550 + }, + { + "epoch": 0.92, + "learning_rate": 4.490674318507891e-08, + "logits/chosen": -4.027795314788818, + "logits/rejected": -3.9686641693115234, + "logps/chosen": -548.213134765625, + "logps/rejected": -391.7416076660156, + "loss": 0.6047, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4162333607673645, + "rewards/margins": 0.338506817817688, + "rewards/rejected": 0.07772652804851532, + "step": 3560 + }, + { + "epoch": 0.92, + "learning_rate": 4.3472022955523674e-08, + "logits/chosen": -4.117037773132324, + "logits/rejected": -4.153426647186279, + "logps/chosen": -632.22607421875, + "logps/rejected": -593.6041870117188, + "loss": 0.627, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.40994158387184143, + "rewards/margins": 0.3304620385169983, + "rewards/rejected": 0.07947959750890732, + "step": 3570 + }, + { + "epoch": 0.92, + "learning_rate": 4.203730272596843e-08, + "logits/chosen": -3.860865831375122, + "logits/rejected": -3.7739486694335938, + "logps/chosen": -516.7479248046875, + "logps/rejected": -392.3046569824219, + "loss": 0.6383, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.31629177927970886, + "rewards/margins": 0.39345088601112366, + "rewards/rejected": -0.07715904712677002, + "step": 3580 + }, + { + "epoch": 0.93, + "learning_rate": 4.0602582496413197e-08, + "logits/chosen": -3.8026537895202637, + "logits/rejected": -3.838305711746216, + "logps/chosen": -599.1148681640625, + "logps/rejected": -478.2513122558594, + "loss": 0.5632, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.4942142963409424, + "rewards/margins": 0.4958348274230957, + "rewards/rejected": -0.001620540046133101, + "step": 3590 + }, + { + "epoch": 0.93, + "learning_rate": 3.916786226685796e-08, + "logits/chosen": -3.803657054901123, + "logits/rejected": -3.809593677520752, + "logps/chosen": -450.1053161621094, + "logps/rejected": -387.50457763671875, + "loss": 0.5807, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.4047510623931885, + "rewards/margins": 0.3800794184207916, + "rewards/rejected": 0.0246716421097517, + "step": 3600 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -3.9818077087402344, + "eval_logits/rejected": -4.006768703460693, + "eval_logps/chosen": -544.7014770507812, + "eval_logps/rejected": -438.45843505859375, + "eval_loss": 0.561528742313385, + "eval_rewards/accuracies": 0.6890000104904175, + "eval_rewards/chosen": 0.46752142906188965, + "eval_rewards/margins": 0.5128761529922485, + "eval_rewards/rejected": -0.04535466805100441, + "eval_runtime": 146.6531, + "eval_samples_per_second": 13.638, + "eval_steps_per_second": 1.705, + "step": 3600 + }, + { + "epoch": 0.93, + "learning_rate": 3.7733142037302726e-08, + "logits/chosen": -4.237338066101074, + "logits/rejected": -4.075575828552246, + "logps/chosen": -507.56475830078125, + "logps/rejected": -460.54864501953125, + "loss": 0.5626, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.3922446668148041, + "rewards/margins": 0.3932397663593292, + "rewards/rejected": -0.000995102571323514, + "step": 3610 + }, + { + "epoch": 0.93, + "learning_rate": 3.629842180774749e-08, + "logits/chosen": -4.196072578430176, + "logits/rejected": -4.2456374168396, + "logps/chosen": -561.850830078125, + "logps/rejected": -485.9574279785156, + "loss": 0.5872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.45551833510398865, + "rewards/margins": 0.5499417185783386, + "rewards/rejected": -0.09442339837551117, + "step": 3620 + }, + { + "epoch": 0.94, + "learning_rate": 3.4863701578192255e-08, + "logits/chosen": -4.031411170959473, + "logits/rejected": -4.024487495422363, + "logps/chosen": -486.8564453125, + "logps/rejected": -412.80712890625, + "loss": 0.5609, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.395702064037323, + "rewards/margins": 0.37622708082199097, + "rewards/rejected": 0.01947496458888054, + "step": 3630 + }, + { + "epoch": 0.94, + "learning_rate": 3.342898134863702e-08, + "logits/chosen": -4.104000091552734, + "logits/rejected": -4.140475273132324, + "logps/chosen": -629.5847778320312, + "logps/rejected": -438.322021484375, + "loss": 0.5127, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.6019877195358276, + "rewards/margins": 0.7443768382072449, + "rewards/rejected": -0.1423892229795456, + "step": 3640 + }, + { + "epoch": 0.94, + "learning_rate": 3.1994261119081784e-08, + "logits/chosen": -4.0185065269470215, + "logits/rejected": -3.976111888885498, + "logps/chosen": -567.458740234375, + "logps/rejected": -433.2574157714844, + "loss": 0.5677, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.40292826294898987, + "rewards/margins": 0.45192545652389526, + "rewards/rejected": -0.048997145146131516, + "step": 3650 + }, + { + "epoch": 0.95, + "learning_rate": 3.055954088952654e-08, + "logits/chosen": -3.958566665649414, + "logits/rejected": -4.042834758758545, + "logps/chosen": -507.0655212402344, + "logps/rejected": -423.1856384277344, + "loss": 0.5869, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4078023433685303, + "rewards/margins": 0.4608895778656006, + "rewards/rejected": -0.05308721214532852, + "step": 3660 + }, + { + "epoch": 0.95, + "learning_rate": 2.9124820659971306e-08, + "logits/chosen": -4.479077339172363, + "logits/rejected": -4.497659683227539, + "logps/chosen": -593.07666015625, + "logps/rejected": -471.2254943847656, + "loss": 0.5922, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5073246359825134, + "rewards/margins": 0.5663779973983765, + "rewards/rejected": -0.059053339064121246, + "step": 3670 + }, + { + "epoch": 0.95, + "learning_rate": 2.7690100430416067e-08, + "logits/chosen": -4.271915912628174, + "logits/rejected": -4.254021644592285, + "logps/chosen": -458.9129943847656, + "logps/rejected": -393.89862060546875, + "loss": 0.5112, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.3906114995479584, + "rewards/margins": 0.6087072491645813, + "rewards/rejected": -0.2180958241224289, + "step": 3680 + }, + { + "epoch": 0.95, + "learning_rate": 2.625538020086083e-08, + "logits/chosen": -4.178504943847656, + "logits/rejected": -4.220212459564209, + "logps/chosen": -584.3404541015625, + "logps/rejected": -444.0389709472656, + "loss": 0.5676, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4277767241001129, + "rewards/margins": 0.5100394487380981, + "rewards/rejected": -0.08226276189088821, + "step": 3690 + }, + { + "epoch": 0.96, + "learning_rate": 2.4820659971305596e-08, + "logits/chosen": -4.02055549621582, + "logits/rejected": -4.129875183105469, + "logps/chosen": -557.54345703125, + "logps/rejected": -435.833984375, + "loss": 0.5265, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.48887911438941956, + "rewards/margins": 0.5414212942123413, + "rewards/rejected": -0.05254218727350235, + "step": 3700 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -3.9832568168640137, + "eval_logits/rejected": -4.008208274841309, + "eval_logps/chosen": -544.701904296875, + "eval_logps/rejected": -438.4403381347656, + "eval_loss": 0.5619760751724243, + "eval_rewards/accuracies": 0.6880000233650208, + "eval_rewards/chosen": 0.46747326850891113, + "eval_rewards/margins": 0.5110137462615967, + "eval_rewards/rejected": -0.04354046657681465, + "eval_runtime": 147.499, + "eval_samples_per_second": 13.559, + "eval_steps_per_second": 1.695, + "step": 3700 + }, + { + "epoch": 0.96, + "learning_rate": 2.3385939741750357e-08, + "logits/chosen": -4.32746696472168, + "logits/rejected": -4.259668827056885, + "logps/chosen": -622.6856079101562, + "logps/rejected": -556.1048583984375, + "loss": 0.5794, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5587274432182312, + "rewards/margins": 0.6079638600349426, + "rewards/rejected": -0.049236398190259933, + "step": 3710 + }, + { + "epoch": 0.96, + "learning_rate": 2.195121951219512e-08, + "logits/chosen": -3.798098087310791, + "logits/rejected": -4.027928829193115, + "logps/chosen": -534.12646484375, + "logps/rejected": -370.48907470703125, + "loss": 0.5292, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5056679844856262, + "rewards/margins": 0.6213759183883667, + "rewards/rejected": -0.11570799350738525, + "step": 3720 + }, + { + "epoch": 0.96, + "learning_rate": 2.0516499282639883e-08, + "logits/chosen": -4.053610801696777, + "logits/rejected": -4.082070350646973, + "logps/chosen": -639.72998046875, + "logps/rejected": -456.34844970703125, + "loss": 0.5726, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5697764158248901, + "rewards/margins": 0.5703621506690979, + "rewards/rejected": -0.0005857095238752663, + "step": 3730 + }, + { + "epoch": 0.97, + "learning_rate": 1.9081779053084647e-08, + "logits/chosen": -3.877751111984253, + "logits/rejected": -3.9127018451690674, + "logps/chosen": -583.1852416992188, + "logps/rejected": -401.8482971191406, + "loss": 0.5216, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5018226504325867, + "rewards/margins": 0.5718634724617004, + "rewards/rejected": -0.07004072517156601, + "step": 3740 + }, + { + "epoch": 0.97, + "learning_rate": 1.7647058823529412e-08, + "logits/chosen": -4.168404579162598, + "logits/rejected": -4.074638366699219, + "logps/chosen": -510.4005432128906, + "logps/rejected": -415.7406311035156, + "loss": 0.5916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4106532037258148, + "rewards/margins": 0.4389967918395996, + "rewards/rejected": -0.028343593701720238, + "step": 3750 + }, + { + "epoch": 0.97, + "learning_rate": 1.6212338593974173e-08, + "logits/chosen": -4.087225914001465, + "logits/rejected": -3.9666972160339355, + "logps/chosen": -465.10986328125, + "logps/rejected": -324.4447021484375, + "loss": 0.5872, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.3475942015647888, + "rewards/margins": 0.424374520778656, + "rewards/rejected": -0.07678033411502838, + "step": 3760 + }, + { + "epoch": 0.97, + "learning_rate": 1.4777618364418938e-08, + "logits/chosen": -4.230082035064697, + "logits/rejected": -4.243712902069092, + "logps/chosen": -635.1439208984375, + "logps/rejected": -543.9008178710938, + "loss": 0.5959, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6082009077072144, + "rewards/margins": 0.42760133743286133, + "rewards/rejected": 0.1805996149778366, + "step": 3770 + }, + { + "epoch": 0.98, + "learning_rate": 1.3342898134863702e-08, + "logits/chosen": -4.096220970153809, + "logits/rejected": -4.115006446838379, + "logps/chosen": -577.7149658203125, + "logps/rejected": -440.178955078125, + "loss": 0.6036, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.4464997351169586, + "rewards/margins": 0.4255821108818054, + "rewards/rejected": 0.020917650312185287, + "step": 3780 + }, + { + "epoch": 0.98, + "learning_rate": 1.1908177905308463e-08, + "logits/chosen": -3.8991074562072754, + "logits/rejected": -3.802464246749878, + "logps/chosen": -519.2384643554688, + "logps/rejected": -435.0231018066406, + "loss": 0.5509, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.44993266463279724, + "rewards/margins": 0.5668569207191467, + "rewards/rejected": -0.1169242262840271, + "step": 3790 + }, + { + "epoch": 0.98, + "learning_rate": 1.0473457675753228e-08, + "logits/chosen": -4.110097885131836, + "logits/rejected": -4.070154190063477, + "logps/chosen": -539.1266479492188, + "logps/rejected": -404.272216796875, + "loss": 0.5484, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5660614371299744, + "rewards/margins": 0.6612092852592468, + "rewards/rejected": -0.0951477512717247, + "step": 3800 + }, + { + "epoch": 0.98, + "eval_logits/chosen": -3.9850714206695557, + "eval_logits/rejected": -4.010331630706787, + "eval_logps/chosen": -544.69189453125, + "eval_logps/rejected": -438.4535827636719, + "eval_loss": 0.5614883899688721, + "eval_rewards/accuracies": 0.6930000185966492, + "eval_rewards/chosen": 0.46847668290138245, + "eval_rewards/margins": 0.5133422613143921, + "eval_rewards/rejected": -0.044865623116493225, + "eval_runtime": 146.7356, + "eval_samples_per_second": 13.63, + "eval_steps_per_second": 1.704, + "step": 3800 + }, + { + "epoch": 0.98, + "learning_rate": 9.03873744619799e-09, + "logits/chosen": -4.123238563537598, + "logits/rejected": -4.03770112991333, + "logps/chosen": -511.1904296875, + "logps/rejected": -446.9549865722656, + "loss": 0.5468, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.46543973684310913, + "rewards/margins": 0.4664246439933777, + "rewards/rejected": -0.0009849362540990114, + "step": 3810 + }, + { + "epoch": 0.99, + "learning_rate": 7.604017216642753e-09, + "logits/chosen": -4.272444725036621, + "logits/rejected": -4.183244705200195, + "logps/chosen": -505.7000427246094, + "logps/rejected": -433.4576721191406, + "loss": 0.5317, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5172845721244812, + "rewards/margins": 0.5496169328689575, + "rewards/rejected": -0.03233236074447632, + "step": 3820 + }, + { + "epoch": 0.99, + "learning_rate": 6.169296987087518e-09, + "logits/chosen": -4.149974346160889, + "logits/rejected": -4.188473701477051, + "logps/chosen": -587.9978637695312, + "logps/rejected": -437.9640197753906, + "loss": 0.557, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5558081269264221, + "rewards/margins": 0.5474778413772583, + "rewards/rejected": 0.008330265991389751, + "step": 3830 + }, + { + "epoch": 0.99, + "learning_rate": 4.734576757532282e-09, + "logits/chosen": -4.119086265563965, + "logits/rejected": -4.145096778869629, + "logps/chosen": -511.7611389160156, + "logps/rejected": -394.4190368652344, + "loss": 0.5667, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.42686066031455994, + "rewards/margins": 0.49367666244506836, + "rewards/rejected": -0.06681600958108902, + "step": 3840 + }, + { + "epoch": 0.99, + "learning_rate": 3.299856527977044e-09, + "logits/chosen": -3.9760982990264893, + "logits/rejected": -4.040436267852783, + "logps/chosen": -612.0133056640625, + "logps/rejected": -519.7421875, + "loss": 0.5988, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.49833711981773376, + "rewards/margins": 0.5593429803848267, + "rewards/rejected": -0.06100592762231827, + "step": 3850 + }, + { + "epoch": 1.0, + "learning_rate": 1.8651362984218077e-09, + "logits/chosen": -3.951416015625, + "logits/rejected": -3.949162006378174, + "logps/chosen": -557.078857421875, + "logps/rejected": -389.9462890625, + "loss": 0.5123, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5364678502082825, + "rewards/margins": 0.6398257613182068, + "rewards/rejected": -0.10335797071456909, + "step": 3860 + }, + { + "epoch": 1.0, + "learning_rate": 4.30416068866571e-10, + "logits/chosen": -4.126285076141357, + "logits/rejected": -4.162901878356934, + "logps/chosen": -532.77880859375, + "logps/rejected": -499.14410400390625, + "loss": 0.6348, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4620552659034729, + "rewards/margins": 0.32118576765060425, + "rewards/rejected": 0.14086945354938507, + "step": 3870 + }, + { + "epoch": 1.0, + "step": 3873, + "total_flos": 0.0, + "train_loss": 0.5913154047772216, + "train_runtime": 14580.3501, + "train_samples_per_second": 4.25, + "train_steps_per_second": 0.266 + } + ], + "logging_steps": 10, + "max_steps": 3873, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}