{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3873, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.2886597938144328e-09, "logits/chosen": -4.2921271324157715, "logits/rejected": -3.812117338180542, "logps/chosen": -664.6867065429688, "logps/rejected": -226.7833709716797, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.2886597938144328e-08, "logits/chosen": -4.003667831420898, "logits/rejected": -4.013306140899658, "logps/chosen": -559.2938232421875, "logps/rejected": -452.70074462890625, "loss": 0.6948, "rewards/accuracies": 0.2638888955116272, "rewards/chosen": -0.007192640565335751, "rewards/margins": -0.006332792341709137, "rewards/rejected": -0.000859847932588309, "step": 10 }, { "epoch": 0.01, "learning_rate": 2.5773195876288656e-08, "logits/chosen": -4.353642463684082, "logits/rejected": -4.292398929595947, "logps/chosen": -554.0906982421875, "logps/rejected": -500.97119140625, "loss": 0.6937, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.001169868279248476, "rewards/margins": 0.001462915213778615, "rewards/rejected": -0.0026327825617045164, "step": 20 }, { "epoch": 0.01, "learning_rate": 3.865979381443299e-08, "logits/chosen": -4.102766513824463, "logits/rejected": -4.200378894805908, "logps/chosen": -617.0684204101562, "logps/rejected": -476.2395935058594, "loss": 0.695, "rewards/accuracies": 0.5, "rewards/chosen": 0.001789045287296176, "rewards/margins": -0.003806379158049822, "rewards/rejected": 0.005595424212515354, "step": 30 }, { "epoch": 0.01, "learning_rate": 5.154639175257731e-08, "logits/chosen": -4.30725622177124, "logits/rejected": -4.225460052490234, "logps/chosen": -497.7335510253906, "logps/rejected": -415.4452209472656, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.0006419029086828232, "rewards/margins": 0.0031944490037858486, "rewards/rejected": -0.0025525467935949564, "step": 40 }, { "epoch": 0.01, "learning_rate": 6.443298969072164e-08, "logits/chosen": -3.88063383102417, "logits/rejected": -3.8105220794677734, "logps/chosen": -627.067626953125, "logps/rejected": -403.6964111328125, "loss": 0.6976, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00458294665440917, "rewards/margins": -0.016261283308267593, "rewards/rejected": 0.011678336188197136, "step": 50 }, { "epoch": 0.02, "learning_rate": 7.731958762886598e-08, "logits/chosen": -4.216163635253906, "logits/rejected": -4.099843978881836, "logps/chosen": -470.12115478515625, "logps/rejected": -469.4156799316406, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.0052711316384375095, "rewards/margins": -0.0011110258055850863, "rewards/rejected": 0.00638215895742178, "step": 60 }, { "epoch": 0.02, "learning_rate": 9.02061855670103e-08, "logits/chosen": -4.200804233551025, "logits/rejected": -4.1986494064331055, "logps/chosen": -648.3743896484375, "logps/rejected": -488.0792541503906, "loss": 0.6905, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0002953239600174129, "rewards/margins": 0.00705097708851099, "rewards/rejected": -0.006755652371793985, "step": 70 }, { "epoch": 0.02, "learning_rate": 1.0309278350515462e-07, "logits/chosen": -4.013070106506348, "logits/rejected": -4.1909003257751465, "logps/chosen": -538.8270263671875, "logps/rejected": -391.4429931640625, "loss": 0.6919, "rewards/accuracies": 0.4375, "rewards/chosen": 0.007235817611217499, "rewards/margins": 0.009682310745120049, "rewards/rejected": -0.002446494298055768, "step": 80 }, { "epoch": 0.02, "learning_rate": 1.1597938144329897e-07, "logits/chosen": -3.9400150775909424, "logits/rejected": -3.9281005859375, "logps/chosen": -588.8606567382812, "logps/rejected": -484.28839111328125, "loss": 0.692, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004685616586357355, "rewards/margins": 0.008750900626182556, "rewards/rejected": -0.004065284971147776, "step": 90 }, { "epoch": 0.03, "learning_rate": 1.2886597938144328e-07, "logits/chosen": -4.089522361755371, "logits/rejected": -4.070917129516602, "logps/chosen": -573.93310546875, "logps/rejected": -485.439697265625, "loss": 0.6906, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.010096365585923195, "rewards/margins": 0.0105238426476717, "rewards/rejected": -0.00042747752740979195, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": -4.051466941833496, "eval_logits/rejected": -4.089292526245117, "eval_logps/chosen": -549.3683471679688, "eval_logps/rejected": -437.9984130859375, "eval_loss": 0.6931844353675842, "eval_rewards/accuracies": 0.4860000014305115, "eval_rewards/chosen": 0.0008278049062937498, "eval_rewards/margins": 0.00017659256991464645, "eval_rewards/rejected": 0.0006512125837616622, "eval_runtime": 148.2369, "eval_samples_per_second": 13.492, "eval_steps_per_second": 1.686, "step": 100 }, { "epoch": 0.03, "learning_rate": 1.417525773195876e-07, "logits/chosen": -4.135636329650879, "logits/rejected": -4.231348991394043, "logps/chosen": -458.62255859375, "logps/rejected": -379.28094482421875, "loss": 0.691, "rewards/accuracies": 0.5, "rewards/chosen": 0.003453848185017705, "rewards/margins": 0.0036365636624395847, "rewards/rejected": -0.00018271691806148738, "step": 110 }, { "epoch": 0.03, "learning_rate": 1.5463917525773197e-07, "logits/chosen": -4.357504844665527, "logits/rejected": -4.165073871612549, "logps/chosen": -392.82891845703125, "logps/rejected": -405.0232849121094, "loss": 0.6946, "rewards/accuracies": 0.5, "rewards/chosen": 0.0029750962276011705, "rewards/margins": -0.0063691637478768826, "rewards/rejected": 0.009344260208308697, "step": 120 }, { "epoch": 0.03, "learning_rate": 1.6752577319587627e-07, "logits/chosen": -4.224671840667725, "logits/rejected": -4.147946357727051, "logps/chosen": -530.8834228515625, "logps/rejected": -379.1323547363281, "loss": 0.6934, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.001731688855215907, "rewards/margins": -0.0059483470395207405, "rewards/rejected": 0.004216659348458052, "step": 130 }, { "epoch": 0.04, "learning_rate": 1.804123711340206e-07, "logits/chosen": -4.243564128875732, "logits/rejected": -4.247513771057129, "logps/chosen": -555.7782592773438, "logps/rejected": -475.36474609375, "loss": 0.6932, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.0019786651246249676, "rewards/margins": -0.0012128886301070452, "rewards/rejected": 0.0031915525905787945, "step": 140 }, { "epoch": 0.04, "learning_rate": 1.9329896907216494e-07, "logits/chosen": -4.274221897125244, "logits/rejected": -4.187704086303711, "logps/chosen": -537.5848388671875, "logps/rejected": -444.8301696777344, "loss": 0.6938, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0053375037387013435, "rewards/margins": 1.6005151337594725e-05, "rewards/rejected": 0.005321498028934002, "step": 150 }, { "epoch": 0.04, "learning_rate": 2.0618556701030925e-07, "logits/chosen": -4.137946128845215, "logits/rejected": -4.2239580154418945, "logps/chosen": -473.9889221191406, "logps/rejected": -406.7872619628906, "loss": 0.6876, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.015802519395947456, "rewards/margins": 0.01578442193567753, "rewards/rejected": 1.8098298824043013e-05, "step": 160 }, { "epoch": 0.04, "learning_rate": 2.190721649484536e-07, "logits/chosen": -4.182999134063721, "logits/rejected": -4.228874683380127, "logps/chosen": -527.0224609375, "logps/rejected": -448.3179626464844, "loss": 0.6961, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.019780535250902176, "rewards/margins": -0.006507801823318005, "rewards/rejected": 0.026288334280252457, "step": 170 }, { "epoch": 0.05, "learning_rate": 2.3195876288659794e-07, "logits/chosen": -4.098742485046387, "logits/rejected": -4.176650524139404, "logps/chosen": -594.6082763671875, "logps/rejected": -453.4469299316406, "loss": 0.6877, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03402475267648697, "rewards/margins": 0.01976330205798149, "rewards/rejected": 0.014261451549828053, "step": 180 }, { "epoch": 0.05, "learning_rate": 2.4484536082474224e-07, "logits/chosen": -4.383849143981934, "logits/rejected": -4.319648742675781, "logps/chosen": -584.6770629882812, "logps/rejected": -408.61370849609375, "loss": 0.6902, "rewards/accuracies": 0.5625, "rewards/chosen": 0.025487428531050682, "rewards/margins": 0.011225923895835876, "rewards/rejected": 0.014261503703892231, "step": 190 }, { "epoch": 0.05, "learning_rate": 2.5773195876288655e-07, "logits/chosen": -4.132022857666016, "logits/rejected": -4.150428295135498, "logps/chosen": -518.2391357421875, "logps/rejected": -388.0254821777344, "loss": 0.6844, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.02871655486524105, "rewards/margins": 0.022094249725341797, "rewards/rejected": 0.0066223046742379665, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": -4.050084114074707, "eval_logits/rejected": -4.087072849273682, "eval_logps/chosen": -549.053955078125, "eval_logps/rejected": -437.8319396972656, "eval_loss": 0.6855266094207764, "eval_rewards/accuracies": 0.5640000104904175, "eval_rewards/chosen": 0.032268982380628586, "eval_rewards/margins": 0.014963901601731777, "eval_rewards/rejected": 0.017305083572864532, "eval_runtime": 146.4759, "eval_samples_per_second": 13.654, "eval_steps_per_second": 1.707, "step": 200 }, { "epoch": 0.05, "learning_rate": 2.706185567010309e-07, "logits/chosen": -4.013279438018799, "logits/rejected": -4.023941516876221, "logps/chosen": -581.2147827148438, "logps/rejected": -522.2059936523438, "loss": 0.6897, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.030506515875458717, "rewards/margins": -0.003913003019988537, "rewards/rejected": 0.03441951796412468, "step": 210 }, { "epoch": 0.06, "learning_rate": 2.835051546391752e-07, "logits/chosen": -4.057482719421387, "logits/rejected": -4.15061092376709, "logps/chosen": -469.19769287109375, "logps/rejected": -427.91595458984375, "loss": 0.6862, "rewards/accuracies": 0.5, "rewards/chosen": 0.03468897193670273, "rewards/margins": 0.013076464645564556, "rewards/rejected": 0.021612513810396194, "step": 220 }, { "epoch": 0.06, "learning_rate": 2.963917525773196e-07, "logits/chosen": -4.064208507537842, "logits/rejected": -4.0749077796936035, "logps/chosen": -530.9828491210938, "logps/rejected": -439.2674865722656, "loss": 0.684, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.04473203793168068, "rewards/margins": 0.025556465610861778, "rewards/rejected": 0.01917557418346405, "step": 230 }, { "epoch": 0.06, "learning_rate": 3.0927835051546394e-07, "logits/chosen": -4.100975036621094, "logits/rejected": -4.096819877624512, "logps/chosen": -526.16748046875, "logps/rejected": -439.20452880859375, "loss": 0.6816, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.059132933616638184, "rewards/margins": 0.019664695486426353, "rewards/rejected": 0.03946823999285698, "step": 240 }, { "epoch": 0.06, "learning_rate": 3.2216494845360824e-07, "logits/chosen": -4.139791488647461, "logits/rejected": -4.0367560386657715, "logps/chosen": -521.2025146484375, "logps/rejected": -388.7520751953125, "loss": 0.678, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06324665248394012, "rewards/margins": 0.04009511321783066, "rewards/rejected": 0.02315153181552887, "step": 250 }, { "epoch": 0.07, "learning_rate": 3.3505154639175255e-07, "logits/chosen": -4.206658363342285, "logits/rejected": -4.1859846115112305, "logps/chosen": -668.1943969726562, "logps/rejected": -461.34259033203125, "loss": 0.6769, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.07276210933923721, "rewards/margins": 0.03734602779150009, "rewards/rejected": 0.03541607782244682, "step": 260 }, { "epoch": 0.07, "learning_rate": 3.4793814432989685e-07, "logits/chosen": -3.937157392501831, "logits/rejected": -4.101494312286377, "logps/chosen": -664.857666015625, "logps/rejected": -487.4693908691406, "loss": 0.6737, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10912873595952988, "rewards/margins": 0.05263194441795349, "rewards/rejected": 0.056496791541576385, "step": 270 }, { "epoch": 0.07, "learning_rate": 3.608247422680412e-07, "logits/chosen": -4.2088494300842285, "logits/rejected": -4.2679290771484375, "logps/chosen": -711.7024536132812, "logps/rejected": -427.2392578125, "loss": 0.6648, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.12602953612804413, "rewards/margins": 0.07799698412418365, "rewards/rejected": 0.048032552003860474, "step": 280 }, { "epoch": 0.07, "learning_rate": 3.737113402061856e-07, "logits/chosen": -4.126033782958984, "logits/rejected": -4.118724346160889, "logps/chosen": -527.9533081054688, "logps/rejected": -442.7091369628906, "loss": 0.6779, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.06928877532482147, "rewards/margins": 0.021227989345788956, "rewards/rejected": 0.04806078225374222, "step": 290 }, { "epoch": 0.08, "learning_rate": 3.865979381443299e-07, "logits/chosen": -4.21649169921875, "logits/rejected": -4.306222438812256, "logps/chosen": -558.1029663085938, "logps/rejected": -426.37646484375, "loss": 0.6685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.10165311396121979, "rewards/margins": 0.0537085235118866, "rewards/rejected": 0.047944579273462296, "step": 300 }, { "epoch": 0.08, "eval_logits/chosen": -4.043172359466553, "eval_logits/rejected": -4.078800201416016, "eval_logps/chosen": -548.3015747070312, "eval_logps/rejected": -437.4681701660156, "eval_loss": 0.6674865484237671, "eval_rewards/accuracies": 0.6159999966621399, "eval_rewards/chosen": 0.10750828683376312, "eval_rewards/margins": 0.05382777377963066, "eval_rewards/rejected": 0.053680501878261566, "eval_runtime": 146.1324, "eval_samples_per_second": 13.686, "eval_steps_per_second": 1.711, "step": 300 }, { "epoch": 0.08, "learning_rate": 3.9948453608247424e-07, "logits/chosen": -4.439688205718994, "logits/rejected": -4.406257629394531, "logps/chosen": -576.0067138671875, "logps/rejected": -442.0852966308594, "loss": 0.6703, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.11767254769802094, "rewards/margins": 0.05607324838638306, "rewards/rejected": 0.06159929558634758, "step": 310 }, { "epoch": 0.08, "learning_rate": 4.123711340206185e-07, "logits/chosen": -4.040421962738037, "logits/rejected": -3.995241165161133, "logps/chosen": -634.211181640625, "logps/rejected": -444.74945068359375, "loss": 0.6634, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.14802002906799316, "rewards/margins": 0.08729343116283417, "rewards/rejected": 0.0607265941798687, "step": 320 }, { "epoch": 0.09, "learning_rate": 4.2525773195876285e-07, "logits/chosen": -4.013192176818848, "logits/rejected": -3.9118850231170654, "logps/chosen": -531.2618408203125, "logps/rejected": -369.8399963378906, "loss": 0.6573, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.14214035868644714, "rewards/margins": 0.0905950665473938, "rewards/rejected": 0.051545269787311554, "step": 330 }, { "epoch": 0.09, "learning_rate": 4.381443298969072e-07, "logits/chosen": -4.299261569976807, "logits/rejected": -4.219182968139648, "logps/chosen": -580.9090576171875, "logps/rejected": -442.6720275878906, "loss": 0.6589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.16068990528583527, "rewards/margins": 0.07925193011760712, "rewards/rejected": 0.08143799006938934, "step": 340 }, { "epoch": 0.09, "learning_rate": 4.510309278350515e-07, "logits/chosen": -4.036250114440918, "logits/rejected": -3.9510204792022705, "logps/chosen": -485.1849670410156, "logps/rejected": -423.96746826171875, "loss": 0.6691, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.16000322997570038, "rewards/margins": 0.0713193342089653, "rewards/rejected": 0.08868391811847687, "step": 350 }, { "epoch": 0.09, "learning_rate": 4.639175257731959e-07, "logits/chosen": -3.97419810295105, "logits/rejected": -3.945896863937378, "logps/chosen": -588.8265380859375, "logps/rejected": -500.585205078125, "loss": 0.664, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.16587187349796295, "rewards/margins": 0.048247091472148895, "rewards/rejected": 0.11762477457523346, "step": 360 }, { "epoch": 0.1, "learning_rate": 4.7680412371134024e-07, "logits/chosen": -4.279057502746582, "logits/rejected": -4.3186540603637695, "logps/chosen": -577.9805908203125, "logps/rejected": -508.83880615234375, "loss": 0.6621, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1850723773241043, "rewards/margins": 0.04405021667480469, "rewards/rejected": 0.14102217555046082, "step": 370 }, { "epoch": 0.1, "learning_rate": 4.896907216494845e-07, "logits/chosen": -4.560557842254639, "logits/rejected": -4.472795009613037, "logps/chosen": -585.3865966796875, "logps/rejected": -427.63092041015625, "loss": 0.6453, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22210833430290222, "rewards/margins": 0.14162525534629822, "rewards/rejected": 0.0804830864071846, "step": 380 }, { "epoch": 0.1, "learning_rate": 4.997130559540889e-07, "logits/chosen": -4.149146556854248, "logits/rejected": -4.130012035369873, "logps/chosen": -458.86334228515625, "logps/rejected": -402.4290466308594, "loss": 0.6574, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.155195951461792, "rewards/margins": 0.0719287320971489, "rewards/rejected": 0.0832671970129013, "step": 390 }, { "epoch": 0.1, "learning_rate": 4.982783357245337e-07, "logits/chosen": -4.101078987121582, "logits/rejected": -3.9474518299102783, "logps/chosen": -594.5633544921875, "logps/rejected": -459.3837890625, "loss": 0.6579, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22732532024383545, "rewards/margins": 0.15504160523414612, "rewards/rejected": 0.07228370010852814, "step": 400 }, { "epoch": 0.1, "eval_logits/chosen": -4.030913829803467, "eval_logits/rejected": -4.064504623413086, "eval_logps/chosen": -547.223388671875, "eval_logps/rejected": -437.06365966796875, "eval_loss": 0.6425994038581848, "eval_rewards/accuracies": 0.6430000066757202, "eval_rewards/chosen": 0.21532239019870758, "eval_rewards/margins": 0.12119224667549133, "eval_rewards/rejected": 0.09413015842437744, "eval_runtime": 146.406, "eval_samples_per_second": 13.661, "eval_steps_per_second": 1.708, "step": 400 }, { "epoch": 0.11, "learning_rate": 4.968436154949784e-07, "logits/chosen": -4.3761677742004395, "logits/rejected": -4.4744062423706055, "logps/chosen": -486.56976318359375, "logps/rejected": -388.5422058105469, "loss": 0.6246, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2251121997833252, "rewards/margins": 0.1718236207962036, "rewards/rejected": 0.05328858643770218, "step": 410 }, { "epoch": 0.11, "learning_rate": 4.954088952654232e-07, "logits/chosen": -3.916259288787842, "logits/rejected": -4.022424221038818, "logps/chosen": -609.468017578125, "logps/rejected": -489.47503662109375, "loss": 0.6397, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2788829207420349, "rewards/margins": 0.13126085698604584, "rewards/rejected": 0.14762204885482788, "step": 420 }, { "epoch": 0.11, "learning_rate": 4.93974175035868e-07, "logits/chosen": -4.073642253875732, "logits/rejected": -3.992410182952881, "logps/chosen": -589.1423950195312, "logps/rejected": -387.6160583496094, "loss": 0.625, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.27620893716812134, "rewards/margins": 0.2046860158443451, "rewards/rejected": 0.07152291387319565, "step": 430 }, { "epoch": 0.11, "learning_rate": 4.925394548063128e-07, "logits/chosen": -4.047796249389648, "logits/rejected": -4.099135875701904, "logps/chosen": -556.1654663085938, "logps/rejected": -459.1832580566406, "loss": 0.6249, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.26381629705429077, "rewards/margins": 0.18791969120502472, "rewards/rejected": 0.07589660584926605, "step": 440 }, { "epoch": 0.12, "learning_rate": 4.911047345767575e-07, "logits/chosen": -3.9531607627868652, "logits/rejected": -3.9822494983673096, "logps/chosen": -603.289306640625, "logps/rejected": -452.654541015625, "loss": 0.6246, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.24003124237060547, "rewards/margins": 0.1340305507183075, "rewards/rejected": 0.10600068420171738, "step": 450 }, { "epoch": 0.12, "learning_rate": 4.896700143472023e-07, "logits/chosen": -4.046868801116943, "logits/rejected": -3.973362684249878, "logps/chosen": -543.8755493164062, "logps/rejected": -415.2347106933594, "loss": 0.6243, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.27269724011421204, "rewards/margins": 0.13904382288455963, "rewards/rejected": 0.1336534321308136, "step": 460 }, { "epoch": 0.12, "learning_rate": 4.88235294117647e-07, "logits/chosen": -3.9586892127990723, "logits/rejected": -3.949618101119995, "logps/chosen": -521.1124267578125, "logps/rejected": -476.64599609375, "loss": 0.6414, "rewards/accuracies": 0.625, "rewards/chosen": 0.26220518350601196, "rewards/margins": 0.10140831768512726, "rewards/rejected": 0.1607969105243683, "step": 470 }, { "epoch": 0.12, "learning_rate": 4.868005738880918e-07, "logits/chosen": -4.340029716491699, "logits/rejected": -4.296602249145508, "logps/chosen": -498.50628662109375, "logps/rejected": -436.04193115234375, "loss": 0.6262, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.23516115546226501, "rewards/margins": 0.14310702681541443, "rewards/rejected": 0.09205415844917297, "step": 480 }, { "epoch": 0.13, "learning_rate": 4.853658536585365e-07, "logits/chosen": -3.9785568714141846, "logits/rejected": -3.9936375617980957, "logps/chosen": -535.5206298828125, "logps/rejected": -418.255126953125, "loss": 0.6359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.26155534386634827, "rewards/margins": 0.19521105289459229, "rewards/rejected": 0.06634429097175598, "step": 490 }, { "epoch": 0.13, "learning_rate": 4.839311334289813e-07, "logits/chosen": -4.171419620513916, "logits/rejected": -4.2884111404418945, "logps/chosen": -497.77874755859375, "logps/rejected": -401.29046630859375, "loss": 0.6331, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2628328502178192, "rewards/margins": 0.19560939073562622, "rewards/rejected": 0.06722346693277359, "step": 500 }, { "epoch": 0.13, "eval_logits/chosen": -4.0221147537231445, "eval_logits/rejected": -4.052542686462402, "eval_logps/chosen": -546.3970336914062, "eval_logps/rejected": -436.89892578125, "eval_loss": 0.6240983605384827, "eval_rewards/accuracies": 0.6430000066757202, "eval_rewards/chosen": 0.2979632318019867, "eval_rewards/margins": 0.18736404180526733, "eval_rewards/rejected": 0.11059919744729996, "eval_runtime": 146.1671, "eval_samples_per_second": 13.683, "eval_steps_per_second": 1.71, "step": 500 }, { "epoch": 0.13, "learning_rate": 4.824964131994261e-07, "logits/chosen": -4.073412895202637, "logits/rejected": -4.001163959503174, "logps/chosen": -588.8052978515625, "logps/rejected": -525.47314453125, "loss": 0.6532, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.2637428641319275, "rewards/margins": 0.08834028244018555, "rewards/rejected": 0.17540256679058075, "step": 510 }, { "epoch": 0.13, "learning_rate": 4.810616929698708e-07, "logits/chosen": -4.054637908935547, "logits/rejected": -4.115445613861084, "logps/chosen": -586.9202270507812, "logps/rejected": -401.8949890136719, "loss": 0.6252, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2919687032699585, "rewards/margins": 0.22300024330615997, "rewards/rejected": 0.06896845996379852, "step": 520 }, { "epoch": 0.14, "learning_rate": 4.796269727403156e-07, "logits/chosen": -4.366249084472656, "logits/rejected": -4.296690940856934, "logps/chosen": -501.8008728027344, "logps/rejected": -414.6390686035156, "loss": 0.6275, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2946510314941406, "rewards/margins": 0.20942220091819763, "rewards/rejected": 0.0852288231253624, "step": 530 }, { "epoch": 0.14, "learning_rate": 4.781922525107604e-07, "logits/chosen": -4.214944362640381, "logits/rejected": -4.242516040802002, "logps/chosen": -582.1668701171875, "logps/rejected": -438.54376220703125, "loss": 0.6205, "rewards/accuracies": 0.625, "rewards/chosen": 0.3577159643173218, "rewards/margins": 0.19794291257858276, "rewards/rejected": 0.159773051738739, "step": 540 }, { "epoch": 0.14, "learning_rate": 4.7675753228120513e-07, "logits/chosen": -4.113412380218506, "logits/rejected": -3.993567705154419, "logps/chosen": -564.5824584960938, "logps/rejected": -398.8680419921875, "loss": 0.6193, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.316133588552475, "rewards/margins": 0.27710846066474915, "rewards/rejected": 0.039025187492370605, "step": 550 }, { "epoch": 0.14, "learning_rate": 4.7532281205164993e-07, "logits/chosen": -4.085113048553467, "logits/rejected": -4.045032024383545, "logps/chosen": -643.7376708984375, "logps/rejected": -498.99859619140625, "loss": 0.6274, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.34889236092567444, "rewards/margins": 0.22896642982959747, "rewards/rejected": 0.11992595344781876, "step": 560 }, { "epoch": 0.15, "learning_rate": 4.738880918220947e-07, "logits/chosen": -4.168662071228027, "logits/rejected": -4.141668319702148, "logps/chosen": -560.7593994140625, "logps/rejected": -406.78143310546875, "loss": 0.6173, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.331714928150177, "rewards/margins": 0.245022252202034, "rewards/rejected": 0.0866926982998848, "step": 570 }, { "epoch": 0.15, "learning_rate": 4.7245337159253943e-07, "logits/chosen": -4.329155445098877, "logits/rejected": -4.298244476318359, "logps/chosen": -563.4876708984375, "logps/rejected": -376.99725341796875, "loss": 0.6147, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.2851625382900238, "rewards/margins": 0.19439759850502014, "rewards/rejected": 0.09076493978500366, "step": 580 }, { "epoch": 0.15, "learning_rate": 4.710186513629842e-07, "logits/chosen": -4.025614261627197, "logits/rejected": -3.995368242263794, "logps/chosen": -570.0155029296875, "logps/rejected": -456.23223876953125, "loss": 0.6397, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.38556593656539917, "rewards/margins": 0.24411602318286896, "rewards/rejected": 0.1414499133825302, "step": 590 }, { "epoch": 0.15, "learning_rate": 4.69583931133429e-07, "logits/chosen": -4.11724853515625, "logits/rejected": -4.225184440612793, "logps/chosen": -600.27685546875, "logps/rejected": -416.496826171875, "loss": 0.6229, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3451527953147888, "rewards/margins": 0.26950401067733765, "rewards/rejected": 0.07564878463745117, "step": 600 }, { "epoch": 0.15, "eval_logits/chosen": -4.0116496086120605, "eval_logits/rejected": -4.040153503417969, "eval_logps/chosen": -545.94873046875, "eval_logps/rejected": -436.90228271484375, "eval_loss": 0.6138368844985962, "eval_rewards/accuracies": 0.6579999923706055, "eval_rewards/chosen": 0.3427916169166565, "eval_rewards/margins": 0.2325276881456375, "eval_rewards/rejected": 0.11026395857334137, "eval_runtime": 145.937, "eval_samples_per_second": 13.705, "eval_steps_per_second": 1.713, "step": 600 }, { "epoch": 0.16, "learning_rate": 4.681492109038737e-07, "logits/chosen": -4.138489723205566, "logits/rejected": -4.042520046234131, "logps/chosen": -544.0598754882812, "logps/rejected": -387.63031005859375, "loss": 0.5897, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3607966899871826, "rewards/margins": 0.3171598017215729, "rewards/rejected": 0.043636929243803024, "step": 610 }, { "epoch": 0.16, "learning_rate": 4.667144906743185e-07, "logits/chosen": -4.025771617889404, "logits/rejected": -3.9127840995788574, "logps/chosen": -517.0219116210938, "logps/rejected": -439.63800048828125, "loss": 0.5769, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3815905749797821, "rewards/margins": 0.3481997549533844, "rewards/rejected": 0.033390797674655914, "step": 620 }, { "epoch": 0.16, "learning_rate": 4.6527977044476324e-07, "logits/chosen": -4.107082843780518, "logits/rejected": -4.197465419769287, "logps/chosen": -576.8883056640625, "logps/rejected": -426.826904296875, "loss": 0.5992, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4150086045265198, "rewards/margins": 0.3171616792678833, "rewards/rejected": 0.09784691035747528, "step": 630 }, { "epoch": 0.17, "learning_rate": 4.6384505021520805e-07, "logits/chosen": -4.232905864715576, "logits/rejected": -4.251595497131348, "logps/chosen": -526.0496215820312, "logps/rejected": -378.66778564453125, "loss": 0.6053, "rewards/accuracies": 0.6875, "rewards/chosen": 0.36744990944862366, "rewards/margins": 0.237229585647583, "rewards/rejected": 0.13022030889987946, "step": 640 }, { "epoch": 0.17, "learning_rate": 4.6241032998565275e-07, "logits/chosen": -3.990309953689575, "logits/rejected": -3.9665799140930176, "logps/chosen": -535.3065795898438, "logps/rejected": -371.23883056640625, "loss": 0.5688, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.35981225967407227, "rewards/margins": 0.3149539828300476, "rewards/rejected": 0.044858284294605255, "step": 650 }, { "epoch": 0.17, "learning_rate": 4.6097560975609755e-07, "logits/chosen": -4.290364742279053, "logits/rejected": -4.3908371925354, "logps/chosen": -602.7872314453125, "logps/rejected": -467.87841796875, "loss": 0.6302, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4449954628944397, "rewards/margins": 0.33234038949012756, "rewards/rejected": 0.11265511810779572, "step": 660 }, { "epoch": 0.17, "learning_rate": 4.595408895265423e-07, "logits/chosen": -4.181097507476807, "logits/rejected": -4.184117317199707, "logps/chosen": -562.30908203125, "logps/rejected": -419.0519104003906, "loss": 0.6057, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.41988906264305115, "rewards/margins": 0.33613476157188416, "rewards/rejected": 0.08375430852174759, "step": 670 }, { "epoch": 0.18, "learning_rate": 4.581061692969871e-07, "logits/chosen": -3.9935073852539062, "logits/rejected": -4.078420162200928, "logps/chosen": -594.1588134765625, "logps/rejected": -442.93218994140625, "loss": 0.5912, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.3990897536277771, "rewards/margins": 0.3166866898536682, "rewards/rejected": 0.08240304887294769, "step": 680 }, { "epoch": 0.18, "learning_rate": 4.566714490674318e-07, "logits/chosen": -3.8547301292419434, "logits/rejected": -3.8780627250671387, "logps/chosen": -467.4917907714844, "logps/rejected": -409.6250915527344, "loss": 0.5982, "rewards/accuracies": 0.75, "rewards/chosen": 0.3181690573692322, "rewards/margins": 0.30037710070610046, "rewards/rejected": 0.01779193803668022, "step": 690 }, { "epoch": 0.18, "learning_rate": 4.552367288378766e-07, "logits/chosen": -3.856755018234253, "logits/rejected": -3.7566399574279785, "logps/chosen": -496.44580078125, "logps/rejected": -416.92791748046875, "loss": 0.6008, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.3064565062522888, "rewards/margins": 0.19347265362739563, "rewards/rejected": 0.1129838228225708, "step": 700 }, { "epoch": 0.18, "eval_logits/chosen": -4.004153728485107, "eval_logits/rejected": -4.03006649017334, "eval_logps/chosen": -545.5549926757812, "eval_logps/rejected": -437.035400390625, "eval_loss": 0.6053361892700195, "eval_rewards/accuracies": 0.656000018119812, "eval_rewards/chosen": 0.3821641206741333, "eval_rewards/margins": 0.28520864248275757, "eval_rewards/rejected": 0.09695547074079514, "eval_runtime": 146.9276, "eval_samples_per_second": 13.612, "eval_steps_per_second": 1.702, "step": 700 }, { "epoch": 0.18, "learning_rate": 4.5380200860832136e-07, "logits/chosen": -4.1166276931762695, "logits/rejected": -4.0413994789123535, "logps/chosen": -559.1090087890625, "logps/rejected": -445.1997985839844, "loss": 0.6028, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.37777018547058105, "rewards/margins": 0.23465311527252197, "rewards/rejected": 0.1431170552968979, "step": 710 }, { "epoch": 0.19, "learning_rate": 4.5236728837876616e-07, "logits/chosen": -4.048049449920654, "logits/rejected": -3.983046293258667, "logps/chosen": -521.6533813476562, "logps/rejected": -423.5769958496094, "loss": 0.6113, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3277244567871094, "rewards/margins": 0.251709520816803, "rewards/rejected": 0.07601495087146759, "step": 720 }, { "epoch": 0.19, "learning_rate": 4.5093256814921086e-07, "logits/chosen": -3.96891713142395, "logits/rejected": -4.157193660736084, "logps/chosen": -527.0986328125, "logps/rejected": -350.09735107421875, "loss": 0.6191, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3868439793586731, "rewards/margins": 0.27587562799453735, "rewards/rejected": 0.11096830666065216, "step": 730 }, { "epoch": 0.19, "learning_rate": 4.4949784791965567e-07, "logits/chosen": -4.01112174987793, "logits/rejected": -3.9385008811950684, "logps/chosen": -575.333740234375, "logps/rejected": -411.80865478515625, "loss": 0.6002, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4110191762447357, "rewards/margins": 0.3183595538139343, "rewards/rejected": 0.0926596075296402, "step": 740 }, { "epoch": 0.19, "learning_rate": 4.480631276901004e-07, "logits/chosen": -3.8952746391296387, "logits/rejected": -3.9051570892333984, "logps/chosen": -587.7459716796875, "logps/rejected": -426.0521545410156, "loss": 0.6019, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4798852503299713, "rewards/margins": 0.3581300377845764, "rewards/rejected": 0.12175522744655609, "step": 750 }, { "epoch": 0.2, "learning_rate": 4.466284074605452e-07, "logits/chosen": -4.128601551055908, "logits/rejected": -4.192216396331787, "logps/chosen": -555.259033203125, "logps/rejected": -431.3056640625, "loss": 0.5987, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4251033365726471, "rewards/margins": 0.35620301961898804, "rewards/rejected": 0.06890030205249786, "step": 760 }, { "epoch": 0.2, "learning_rate": 4.451936872309899e-07, "logits/chosen": -4.191853046417236, "logits/rejected": -4.073651313781738, "logps/chosen": -564.2633056640625, "logps/rejected": -462.38232421875, "loss": 0.5874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.46583813428878784, "rewards/margins": 0.4038007855415344, "rewards/rejected": 0.0620373897254467, "step": 770 }, { "epoch": 0.2, "learning_rate": 4.437589670014347e-07, "logits/chosen": -3.9436306953430176, "logits/rejected": -4.079471111297607, "logps/chosen": -569.0813598632812, "logps/rejected": -438.1226501464844, "loss": 0.592, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.32991427183151245, "rewards/margins": 0.34178251028060913, "rewards/rejected": -0.011868256144225597, "step": 780 }, { "epoch": 0.2, "learning_rate": 4.423242467718795e-07, "logits/chosen": -4.243984699249268, "logits/rejected": -4.39116907119751, "logps/chosen": -674.5192260742188, "logps/rejected": -492.4161682128906, "loss": 0.5828, "rewards/accuracies": 0.625, "rewards/chosen": 0.5247339010238647, "rewards/margins": 0.3629537522792816, "rewards/rejected": 0.1617802083492279, "step": 790 }, { "epoch": 0.21, "learning_rate": 4.4088952654232423e-07, "logits/chosen": -3.945283889770508, "logits/rejected": -3.931304454803467, "logps/chosen": -520.6378173828125, "logps/rejected": -340.75103759765625, "loss": 0.5751, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.41377678513526917, "rewards/margins": 0.4200451374053955, "rewards/rejected": -0.006268366239964962, "step": 800 }, { "epoch": 0.21, "eval_logits/chosen": -4.009909629821777, "eval_logits/rejected": -4.035899639129639, "eval_logps/chosen": -545.2993774414062, "eval_logps/rejected": -437.1260070800781, "eval_loss": 0.5998407006263733, "eval_rewards/accuracies": 0.6539999842643738, "eval_rewards/chosen": 0.4077303409576416, "eval_rewards/margins": 0.3198363780975342, "eval_rewards/rejected": 0.0878940224647522, "eval_runtime": 145.3508, "eval_samples_per_second": 13.76, "eval_steps_per_second": 1.72, "step": 800 }, { "epoch": 0.21, "learning_rate": 4.39454806312769e-07, "logits/chosen": -3.9220452308654785, "logits/rejected": -4.041108131408691, "logps/chosen": -615.2744750976562, "logps/rejected": -500.8890686035156, "loss": 0.5732, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.49199342727661133, "rewards/margins": 0.3499099612236023, "rewards/rejected": 0.1420835256576538, "step": 810 }, { "epoch": 0.21, "learning_rate": 4.380200860832138e-07, "logits/chosen": -3.90093994140625, "logits/rejected": -3.9337615966796875, "logps/chosen": -616.523681640625, "logps/rejected": -451.52996826171875, "loss": 0.5575, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5319877862930298, "rewards/margins": 0.5240375399589539, "rewards/rejected": 0.007950320839881897, "step": 820 }, { "epoch": 0.21, "learning_rate": 4.3658536585365853e-07, "logits/chosen": -4.105984687805176, "logits/rejected": -4.126413345336914, "logps/chosen": -491.058349609375, "logps/rejected": -472.4222106933594, "loss": 0.6289, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.3649570047855377, "rewards/margins": 0.19882622361183167, "rewards/rejected": 0.16613081097602844, "step": 830 }, { "epoch": 0.22, "learning_rate": 4.351506456241033e-07, "logits/chosen": -4.366209983825684, "logits/rejected": -4.29564905166626, "logps/chosen": -573.9385375976562, "logps/rejected": -327.5928649902344, "loss": 0.5732, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.47058337926864624, "rewards/margins": 0.47838321328163147, "rewards/rejected": -0.007799782790243626, "step": 840 }, { "epoch": 0.22, "learning_rate": 4.3371592539454804e-07, "logits/chosen": -3.7974257469177246, "logits/rejected": -3.734402894973755, "logps/chosen": -471.0333557128906, "logps/rejected": -374.65673828125, "loss": 0.6266, "rewards/accuracies": 0.6875, "rewards/chosen": 0.34169143438339233, "rewards/margins": 0.2484813630580902, "rewards/rejected": 0.09321005642414093, "step": 850 }, { "epoch": 0.22, "learning_rate": 4.322812051649928e-07, "logits/chosen": -4.0287275314331055, "logits/rejected": -4.05717134475708, "logps/chosen": -469.2396545410156, "logps/rejected": -434.5414123535156, "loss": 0.5803, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4505770206451416, "rewards/margins": 0.4199690818786621, "rewards/rejected": 0.03060789778828621, "step": 860 }, { "epoch": 0.22, "learning_rate": 4.308464849354376e-07, "logits/chosen": -3.7610325813293457, "logits/rejected": -3.8557701110839844, "logps/chosen": -529.0855712890625, "logps/rejected": -426.6482849121094, "loss": 0.5811, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.43691587448120117, "rewards/margins": 0.37098073959350586, "rewards/rejected": 0.0659351572394371, "step": 870 }, { "epoch": 0.23, "learning_rate": 4.294117647058823e-07, "logits/chosen": -4.040841102600098, "logits/rejected": -4.051581382751465, "logps/chosen": -590.5636596679688, "logps/rejected": -456.1898498535156, "loss": 0.582, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3900124430656433, "rewards/margins": 0.41093358397483826, "rewards/rejected": -0.020921092480421066, "step": 880 }, { "epoch": 0.23, "learning_rate": 4.279770444763271e-07, "logits/chosen": -4.3830671310424805, "logits/rejected": -4.194474220275879, "logps/chosen": -587.708251953125, "logps/rejected": -454.063720703125, "loss": 0.6117, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4363631308078766, "rewards/margins": 0.3432873785495758, "rewards/rejected": 0.09307573735713959, "step": 890 }, { "epoch": 0.23, "learning_rate": 4.2654232424677185e-07, "logits/chosen": -4.166562080383301, "logits/rejected": -4.1289520263671875, "logps/chosen": -507.2445373535156, "logps/rejected": -396.4598083496094, "loss": 0.6485, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.27186277508735657, "rewards/margins": 0.16729417443275452, "rewards/rejected": 0.10456860065460205, "step": 900 }, { "epoch": 0.23, "eval_logits/chosen": -3.9936437606811523, "eval_logits/rejected": -4.016723155975342, "eval_logps/chosen": -545.1683349609375, "eval_logps/rejected": -437.3501281738281, "eval_loss": 0.5922096371650696, "eval_rewards/accuracies": 0.6600000262260437, "eval_rewards/chosen": 0.4208315312862396, "eval_rewards/margins": 0.3553457260131836, "eval_rewards/rejected": 0.06548583507537842, "eval_runtime": 146.2261, "eval_samples_per_second": 13.677, "eval_steps_per_second": 1.71, "step": 900 }, { "epoch": 0.23, "learning_rate": 4.2510760401721665e-07, "logits/chosen": -4.098907947540283, "logits/rejected": -4.098723411560059, "logps/chosen": -650.6366577148438, "logps/rejected": -495.94879150390625, "loss": 0.5866, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5507170557975769, "rewards/margins": 0.5398961305618286, "rewards/rejected": 0.010820944793522358, "step": 910 }, { "epoch": 0.24, "learning_rate": 4.2367288378766135e-07, "logits/chosen": -4.1348772048950195, "logits/rejected": -4.166952610015869, "logps/chosen": -591.7069702148438, "logps/rejected": -477.39776611328125, "loss": 0.5992, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.47026628255844116, "rewards/margins": 0.3723362982273102, "rewards/rejected": 0.09792999923229218, "step": 920 }, { "epoch": 0.24, "learning_rate": 4.2223816355810615e-07, "logits/chosen": -4.0753397941589355, "logits/rejected": -4.123549461364746, "logps/chosen": -559.75634765625, "logps/rejected": -458.8775329589844, "loss": 0.5799, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.39656537771224976, "rewards/margins": 0.3553692698478699, "rewards/rejected": 0.04119610786437988, "step": 930 }, { "epoch": 0.24, "learning_rate": 4.208034433285509e-07, "logits/chosen": -4.269906520843506, "logits/rejected": -4.303974628448486, "logps/chosen": -593.5145874023438, "logps/rejected": -494.8085021972656, "loss": 0.5647, "rewards/accuracies": 0.75, "rewards/chosen": 0.5839260816574097, "rewards/margins": 0.44306641817092896, "rewards/rejected": 0.14085964858531952, "step": 940 }, { "epoch": 0.25, "learning_rate": 4.1936872309899565e-07, "logits/chosen": -3.792731523513794, "logits/rejected": -3.8398139476776123, "logps/chosen": -489.6437072753906, "logps/rejected": -401.5340576171875, "loss": 0.6287, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.2587449550628662, "rewards/margins": 0.2045062780380249, "rewards/rejected": 0.054238706827163696, "step": 950 }, { "epoch": 0.25, "learning_rate": 4.179340028694404e-07, "logits/chosen": -4.206066131591797, "logits/rejected": -4.116007328033447, "logps/chosen": -482.42816162109375, "logps/rejected": -382.22845458984375, "loss": 0.5962, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3317318558692932, "rewards/margins": 0.34027567505836487, "rewards/rejected": -0.008543826639652252, "step": 960 }, { "epoch": 0.25, "learning_rate": 4.164992826398852e-07, "logits/chosen": -4.003951549530029, "logits/rejected": -3.9982573986053467, "logps/chosen": -494.906005859375, "logps/rejected": -401.5001220703125, "loss": 0.6299, "rewards/accuracies": 0.5625, "rewards/chosen": 0.266286164522171, "rewards/margins": 0.27161869406700134, "rewards/rejected": -0.005332520697265863, "step": 970 }, { "epoch": 0.25, "learning_rate": 4.1506456241032996e-07, "logits/chosen": -4.05717134475708, "logits/rejected": -3.8585472106933594, "logps/chosen": -563.6212768554688, "logps/rejected": -387.9486999511719, "loss": 0.5832, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3436369299888611, "rewards/margins": 0.378772109746933, "rewards/rejected": -0.035135164856910706, "step": 980 }, { "epoch": 0.26, "learning_rate": 4.136298421807747e-07, "logits/chosen": -4.064385890960693, "logits/rejected": -4.122750759124756, "logps/chosen": -587.16162109375, "logps/rejected": -431.41796875, "loss": 0.572, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4710933566093445, "rewards/margins": 0.4460281431674957, "rewards/rejected": 0.02506522461771965, "step": 990 }, { "epoch": 0.26, "learning_rate": 4.1219512195121946e-07, "logits/chosen": -4.021462917327881, "logits/rejected": -3.989718198776245, "logps/chosen": -584.3521728515625, "logps/rejected": -455.634033203125, "loss": 0.6164, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.48522061109542847, "rewards/margins": 0.3698544502258301, "rewards/rejected": 0.11536619812250137, "step": 1000 }, { "epoch": 0.26, "eval_logits/chosen": -3.986903429031372, "eval_logits/rejected": -4.009212017059326, "eval_logps/chosen": -545.3309326171875, "eval_logps/rejected": -437.7181701660156, "eval_loss": 0.5879542827606201, "eval_rewards/accuracies": 0.6620000004768372, "eval_rewards/chosen": 0.40457141399383545, "eval_rewards/margins": 0.37589016556739807, "eval_rewards/rejected": 0.028681199997663498, "eval_runtime": 146.6025, "eval_samples_per_second": 13.642, "eval_steps_per_second": 1.705, "step": 1000 }, { "epoch": 0.26, "learning_rate": 4.1076040172166427e-07, "logits/chosen": -4.139552116394043, "logits/rejected": -3.9534621238708496, "logps/chosen": -571.7590942382812, "logps/rejected": -444.6793518066406, "loss": 0.6451, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.3784537613391876, "rewards/margins": 0.23798270523548126, "rewards/rejected": 0.14047105610370636, "step": 1010 }, { "epoch": 0.26, "learning_rate": 4.09325681492109e-07, "logits/chosen": -4.049252510070801, "logits/rejected": -4.108782768249512, "logps/chosen": -644.1297607421875, "logps/rejected": -546.4414672851562, "loss": 0.6455, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.3904695510864258, "rewards/margins": 0.23047880828380585, "rewards/rejected": 0.15999077260494232, "step": 1020 }, { "epoch": 0.27, "learning_rate": 4.0789096126255377e-07, "logits/chosen": -4.110049247741699, "logits/rejected": -4.13530969619751, "logps/chosen": -601.107666015625, "logps/rejected": -430.6465759277344, "loss": 0.5972, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39451712369918823, "rewards/margins": 0.3030509948730469, "rewards/rejected": 0.09146615862846375, "step": 1030 }, { "epoch": 0.27, "learning_rate": 4.064562410329985e-07, "logits/chosen": -4.069981575012207, "logits/rejected": -4.104067802429199, "logps/chosen": -562.4483642578125, "logps/rejected": -496.1336975097656, "loss": 0.5667, "rewards/accuracies": 0.6875, "rewards/chosen": 0.46653634309768677, "rewards/margins": 0.4154808521270752, "rewards/rejected": 0.051055438816547394, "step": 1040 }, { "epoch": 0.27, "learning_rate": 4.050215208034433e-07, "logits/chosen": -4.096522331237793, "logits/rejected": -4.07404088973999, "logps/chosen": -597.645751953125, "logps/rejected": -389.2298889160156, "loss": 0.566, "rewards/accuracies": 0.75, "rewards/chosen": 0.43959060311317444, "rewards/margins": 0.4319628179073334, "rewards/rejected": 0.007627798710018396, "step": 1050 }, { "epoch": 0.27, "learning_rate": 4.035868005738881e-07, "logits/chosen": -4.101964950561523, "logits/rejected": -3.971134901046753, "logps/chosen": -654.4503784179688, "logps/rejected": -446.3114318847656, "loss": 0.5856, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5462326407432556, "rewards/margins": 0.52418452501297, "rewards/rejected": 0.022048136219382286, "step": 1060 }, { "epoch": 0.28, "learning_rate": 4.0215208034433283e-07, "logits/chosen": -4.149927139282227, "logits/rejected": -4.159340858459473, "logps/chosen": -569.7572631835938, "logps/rejected": -407.02545166015625, "loss": 0.575, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.25654059648513794, "rewards/margins": 0.33243244886398315, "rewards/rejected": -0.0758919045329094, "step": 1070 }, { "epoch": 0.28, "learning_rate": 4.007173601147776e-07, "logits/chosen": -4.003470420837402, "logits/rejected": -3.9572086334228516, "logps/chosen": -565.874267578125, "logps/rejected": -392.24761962890625, "loss": 0.6221, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3559120297431946, "rewards/margins": 0.3540252149105072, "rewards/rejected": 0.0018868416082113981, "step": 1080 }, { "epoch": 0.28, "learning_rate": 3.992826398852224e-07, "logits/chosen": -4.156329154968262, "logits/rejected": -4.075765132904053, "logps/chosen": -503.8531188964844, "logps/rejected": -443.68731689453125, "loss": 0.5874, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3798424005508423, "rewards/margins": 0.3366612493991852, "rewards/rejected": 0.04318114370107651, "step": 1090 }, { "epoch": 0.28, "learning_rate": 3.978479196556671e-07, "logits/chosen": -4.099778652191162, "logits/rejected": -4.040897846221924, "logps/chosen": -482.4664001464844, "logps/rejected": -434.3218688964844, "loss": 0.6225, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2912788987159729, "rewards/margins": 0.3869735598564148, "rewards/rejected": -0.09569470584392548, "step": 1100 }, { "epoch": 0.28, "eval_logits/chosen": -3.998389720916748, "eval_logits/rejected": -4.024014949798584, "eval_logps/chosen": -545.3189086914062, "eval_logps/rejected": -437.8950500488281, "eval_loss": 0.5851995944976807, "eval_rewards/accuracies": 0.6679999828338623, "eval_rewards/chosen": 0.40576791763305664, "eval_rewards/margins": 0.3947778642177582, "eval_rewards/rejected": 0.01099009346216917, "eval_runtime": 145.9401, "eval_samples_per_second": 13.704, "eval_steps_per_second": 1.713, "step": 1100 }, { "epoch": 0.29, "learning_rate": 3.964131994261119e-07, "logits/chosen": -4.306766986846924, "logits/rejected": -4.230467796325684, "logps/chosen": -549.1437377929688, "logps/rejected": -444.1548767089844, "loss": 0.5957, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4746015965938568, "rewards/margins": 0.5222647786140442, "rewards/rejected": -0.047663114964962006, "step": 1110 }, { "epoch": 0.29, "learning_rate": 3.9497847919655664e-07, "logits/chosen": -4.1666669845581055, "logits/rejected": -4.193212509155273, "logps/chosen": -506.9803161621094, "logps/rejected": -432.77783203125, "loss": 0.6335, "rewards/accuracies": 0.625, "rewards/chosen": 0.3090699315071106, "rewards/margins": 0.2045580893754959, "rewards/rejected": 0.10451184213161469, "step": 1120 }, { "epoch": 0.29, "learning_rate": 3.9354375896700144e-07, "logits/chosen": -3.8870487213134766, "logits/rejected": -3.9582340717315674, "logps/chosen": -616.6710815429688, "logps/rejected": -508.39813232421875, "loss": 0.5705, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.48951154947280884, "rewards/margins": 0.4596976637840271, "rewards/rejected": 0.02981388568878174, "step": 1130 }, { "epoch": 0.29, "learning_rate": 3.9210903873744614e-07, "logits/chosen": -3.9056262969970703, "logits/rejected": -3.7358765602111816, "logps/chosen": -550.6695556640625, "logps/rejected": -412.7212829589844, "loss": 0.5673, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3575005829334259, "rewards/margins": 0.3841975927352905, "rewards/rejected": -0.02669701538980007, "step": 1140 }, { "epoch": 0.3, "learning_rate": 3.9067431850789094e-07, "logits/chosen": -3.873683452606201, "logits/rejected": -3.945786714553833, "logps/chosen": -598.2088623046875, "logps/rejected": -395.6291198730469, "loss": 0.6215, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.40675464272499084, "rewards/margins": 0.4558509886264801, "rewards/rejected": -0.04909630864858627, "step": 1150 }, { "epoch": 0.3, "learning_rate": 3.892395982783357e-07, "logits/chosen": -3.874563217163086, "logits/rejected": -3.988626480102539, "logps/chosen": -580.8389282226562, "logps/rejected": -464.979248046875, "loss": 0.5563, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4360577464103699, "rewards/margins": 0.436431884765625, "rewards/rejected": -0.0003741338732652366, "step": 1160 }, { "epoch": 0.3, "learning_rate": 3.878048780487805e-07, "logits/chosen": -3.8558075428009033, "logits/rejected": -3.862384080886841, "logps/chosen": -603.0067138671875, "logps/rejected": -453.36126708984375, "loss": 0.5751, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5401335954666138, "rewards/margins": 0.4825173020362854, "rewards/rejected": 0.05761627480387688, "step": 1170 }, { "epoch": 0.3, "learning_rate": 3.863701578192252e-07, "logits/chosen": -4.10439395904541, "logits/rejected": -4.0709943771362305, "logps/chosen": -562.9437255859375, "logps/rejected": -468.41046142578125, "loss": 0.5989, "rewards/accuracies": 0.75, "rewards/chosen": 0.5524980425834656, "rewards/margins": 0.43577200174331665, "rewards/rejected": 0.1167261153459549, "step": 1180 }, { "epoch": 0.31, "learning_rate": 3.8493543758967e-07, "logits/chosen": -3.9623591899871826, "logits/rejected": -3.9735617637634277, "logps/chosen": -496.47479248046875, "logps/rejected": -354.4454040527344, "loss": 0.639, "rewards/accuracies": 0.625, "rewards/chosen": 0.2600507140159607, "rewards/margins": 0.2917958199977875, "rewards/rejected": -0.031745124608278275, "step": 1190 }, { "epoch": 0.31, "learning_rate": 3.8350071736011475e-07, "logits/chosen": -3.906859874725342, "logits/rejected": -3.9087185859680176, "logps/chosen": -427.0773010253906, "logps/rejected": -349.86077880859375, "loss": 0.6289, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.2505320608615875, "rewards/margins": 0.1847679316997528, "rewards/rejected": 0.06576415151357651, "step": 1200 }, { "epoch": 0.31, "eval_logits/chosen": -3.9994406700134277, "eval_logits/rejected": -4.025309085845947, "eval_logps/chosen": -545.2498168945312, "eval_logps/rejected": -437.9264831542969, "eval_loss": 0.5823842287063599, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.41267773509025574, "eval_rewards/margins": 0.4048316776752472, "eval_rewards/rejected": 0.007846098393201828, "eval_runtime": 147.2172, "eval_samples_per_second": 13.585, "eval_steps_per_second": 1.698, "step": 1200 }, { "epoch": 0.31, "learning_rate": 3.8206599713055956e-07, "logits/chosen": -4.042483329772949, "logits/rejected": -3.88130259513855, "logps/chosen": -579.4273681640625, "logps/rejected": -485.6758728027344, "loss": 0.6176, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.34652236104011536, "rewards/margins": 0.2972865104675293, "rewards/rejected": 0.049235861748456955, "step": 1210 }, { "epoch": 0.32, "learning_rate": 3.8063127690100426e-07, "logits/chosen": -4.063638210296631, "logits/rejected": -4.126063346862793, "logps/chosen": -599.88916015625, "logps/rejected": -484.5962829589844, "loss": 0.5938, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.32978740334510803, "rewards/margins": 0.2084900587797165, "rewards/rejected": 0.12129731476306915, "step": 1220 }, { "epoch": 0.32, "learning_rate": 3.7919655667144906e-07, "logits/chosen": -3.99627947807312, "logits/rejected": -3.940380573272705, "logps/chosen": -546.5948486328125, "logps/rejected": -412.3846740722656, "loss": 0.5958, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.44469523429870605, "rewards/margins": 0.48819655179977417, "rewards/rejected": -0.0435013584792614, "step": 1230 }, { "epoch": 0.32, "learning_rate": 3.777618364418938e-07, "logits/chosen": -3.9554569721221924, "logits/rejected": -3.9587948322296143, "logps/chosen": -467.00677490234375, "logps/rejected": -398.10955810546875, "loss": 0.5939, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2916651964187622, "rewards/margins": 0.2515104413032532, "rewards/rejected": 0.040154773741960526, "step": 1240 }, { "epoch": 0.32, "learning_rate": 3.763271162123386e-07, "logits/chosen": -3.7756049633026123, "logits/rejected": -3.755903720855713, "logps/chosen": -591.2271728515625, "logps/rejected": -444.3221130371094, "loss": 0.5495, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.4893193244934082, "rewards/margins": 0.5091069936752319, "rewards/rejected": -0.019787678495049477, "step": 1250 }, { "epoch": 0.33, "learning_rate": 3.748923959827833e-07, "logits/chosen": -4.159283638000488, "logits/rejected": -4.039699077606201, "logps/chosen": -449.0978088378906, "logps/rejected": -326.54791259765625, "loss": 0.624, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2927771806716919, "rewards/margins": 0.25214409828186035, "rewards/rejected": 0.040633104741573334, "step": 1260 }, { "epoch": 0.33, "learning_rate": 3.734576757532281e-07, "logits/chosen": -4.083529472351074, "logits/rejected": -4.100892543792725, "logps/chosen": -642.364501953125, "logps/rejected": -476.1664123535156, "loss": 0.5597, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5070067644119263, "rewards/margins": 0.6494277715682983, "rewards/rejected": -0.1424209624528885, "step": 1270 }, { "epoch": 0.33, "learning_rate": 3.7202295552367287e-07, "logits/chosen": -4.093569755554199, "logits/rejected": -4.279056549072266, "logps/chosen": -589.33642578125, "logps/rejected": -452.7460021972656, "loss": 0.5692, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5026015639305115, "rewards/margins": 0.4538155198097229, "rewards/rejected": 0.048786066472530365, "step": 1280 }, { "epoch": 0.33, "learning_rate": 3.705882352941176e-07, "logits/chosen": -4.209478855133057, "logits/rejected": -4.320340633392334, "logps/chosen": -601.5377197265625, "logps/rejected": -405.8938293457031, "loss": 0.5319, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5548459887504578, "rewards/margins": 0.6283925771713257, "rewards/rejected": -0.07354650646448135, "step": 1290 }, { "epoch": 0.34, "learning_rate": 3.6915351506456237e-07, "logits/chosen": -4.303310871124268, "logits/rejected": -4.392244338989258, "logps/chosen": -526.3382568359375, "logps/rejected": -336.311279296875, "loss": 0.5818, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2645590901374817, "rewards/margins": 0.33163318037986755, "rewards/rejected": -0.06707411259412766, "step": 1300 }, { "epoch": 0.34, "eval_logits/chosen": -3.9953150749206543, "eval_logits/rejected": -4.021241188049316, "eval_logps/chosen": -545.1544189453125, "eval_logps/rejected": -437.90802001953125, "eval_loss": 0.5818018913269043, "eval_rewards/accuracies": 0.6679999828338623, "eval_rewards/chosen": 0.42221859097480774, "eval_rewards/margins": 0.41252991557121277, "eval_rewards/rejected": 0.009688721038401127, "eval_runtime": 146.2307, "eval_samples_per_second": 13.677, "eval_steps_per_second": 1.71, "step": 1300 }, { "epoch": 0.34, "learning_rate": 3.677187948350072e-07, "logits/chosen": -4.2785139083862305, "logits/rejected": -4.281913757324219, "logps/chosen": -631.8258056640625, "logps/rejected": -432.3312072753906, "loss": 0.5513, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4580743908882141, "rewards/margins": 0.5590990781784058, "rewards/rejected": -0.10102470219135284, "step": 1310 }, { "epoch": 0.34, "learning_rate": 3.6628407460545193e-07, "logits/chosen": -4.18049955368042, "logits/rejected": -4.1783599853515625, "logps/chosen": -482.9546813964844, "logps/rejected": -441.1940002441406, "loss": 0.6002, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4040532112121582, "rewards/margins": 0.2903508245944977, "rewards/rejected": 0.11370239406824112, "step": 1320 }, { "epoch": 0.34, "learning_rate": 3.648493543758967e-07, "logits/chosen": -4.047448635101318, "logits/rejected": -4.031399726867676, "logps/chosen": -513.3343505859375, "logps/rejected": -439.59857177734375, "loss": 0.5949, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5155481696128845, "rewards/margins": 0.5262617468833923, "rewards/rejected": -0.01071359496563673, "step": 1330 }, { "epoch": 0.35, "learning_rate": 3.6341463414634143e-07, "logits/chosen": -4.256237983703613, "logits/rejected": -4.140265941619873, "logps/chosen": -586.2674560546875, "logps/rejected": -513.4707641601562, "loss": 0.6051, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4771571159362793, "rewards/margins": 0.3665952682495117, "rewards/rejected": 0.11056187003850937, "step": 1340 }, { "epoch": 0.35, "learning_rate": 3.6197991391678623e-07, "logits/chosen": -4.259045600891113, "logits/rejected": -4.169145584106445, "logps/chosen": -492.68572998046875, "logps/rejected": -336.1100769042969, "loss": 0.5918, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.3294292390346527, "rewards/margins": 0.42583298683166504, "rewards/rejected": -0.09640369564294815, "step": 1350 }, { "epoch": 0.35, "learning_rate": 3.60545193687231e-07, "logits/chosen": -4.042055130004883, "logits/rejected": -4.034060478210449, "logps/chosen": -437.0550231933594, "logps/rejected": -344.05828857421875, "loss": 0.5862, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3367193341255188, "rewards/margins": 0.3423077464103699, "rewards/rejected": -0.0055884262546896935, "step": 1360 }, { "epoch": 0.35, "learning_rate": 3.5911047345767574e-07, "logits/chosen": -3.9995014667510986, "logits/rejected": -4.026850700378418, "logps/chosen": -576.7128295898438, "logps/rejected": -466.55010986328125, "loss": 0.6112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4717404246330261, "rewards/margins": 0.5018633604049683, "rewards/rejected": -0.030122917145490646, "step": 1370 }, { "epoch": 0.36, "learning_rate": 3.576757532281205e-07, "logits/chosen": -3.9444518089294434, "logits/rejected": -3.91229510307312, "logps/chosen": -586.0652465820312, "logps/rejected": -464.03271484375, "loss": 0.5891, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.3900233507156372, "rewards/margins": 0.42400288581848145, "rewards/rejected": -0.03397948667407036, "step": 1380 }, { "epoch": 0.36, "learning_rate": 3.562410329985653e-07, "logits/chosen": -3.970731735229492, "logits/rejected": -4.1245927810668945, "logps/chosen": -567.556884765625, "logps/rejected": -479.55841064453125, "loss": 0.6725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.331743061542511, "rewards/margins": 0.251522034406662, "rewards/rejected": 0.08022100478410721, "step": 1390 }, { "epoch": 0.36, "learning_rate": 3.5480631276901004e-07, "logits/chosen": -4.056004524230957, "logits/rejected": -4.051678657531738, "logps/chosen": -536.676513671875, "logps/rejected": -385.55767822265625, "loss": 0.567, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.28222841024398804, "rewards/margins": 0.38999465107917786, "rewards/rejected": -0.10776624828577042, "step": 1400 }, { "epoch": 0.36, "eval_logits/chosen": -4.006156921386719, "eval_logits/rejected": -4.033264636993408, "eval_logps/chosen": -545.279052734375, "eval_logps/rejected": -438.14556884765625, "eval_loss": 0.5797023773193359, "eval_rewards/accuracies": 0.6729999780654907, "eval_rewards/chosen": 0.409759521484375, "eval_rewards/margins": 0.42382344603538513, "eval_rewards/rejected": -0.014063959941267967, "eval_runtime": 147.9054, "eval_samples_per_second": 13.522, "eval_steps_per_second": 1.69, "step": 1400 }, { "epoch": 0.36, "learning_rate": 3.533715925394548e-07, "logits/chosen": -3.957362413406372, "logits/rejected": -3.8286430835723877, "logps/chosen": -518.3594970703125, "logps/rejected": -345.4431457519531, "loss": 0.5558, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.4510959982872009, "rewards/margins": 0.5645402669906616, "rewards/rejected": -0.11344428360462189, "step": 1410 }, { "epoch": 0.37, "learning_rate": 3.5193687230989955e-07, "logits/chosen": -4.1850199699401855, "logits/rejected": -4.076201915740967, "logps/chosen": -611.8565673828125, "logps/rejected": -568.882080078125, "loss": 0.6438, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4028521478176117, "rewards/margins": 0.12559688091278076, "rewards/rejected": 0.2772553265094757, "step": 1420 }, { "epoch": 0.37, "learning_rate": 3.5050215208034435e-07, "logits/chosen": -4.432595252990723, "logits/rejected": -4.352065086364746, "logps/chosen": -611.6333618164062, "logps/rejected": -451.0577697753906, "loss": 0.5624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.40066614747047424, "rewards/margins": 0.4248287081718445, "rewards/rejected": -0.024162566289305687, "step": 1430 }, { "epoch": 0.37, "learning_rate": 3.4906743185078905e-07, "logits/chosen": -4.167354106903076, "logits/rejected": -4.119304656982422, "logps/chosen": -597.4181518554688, "logps/rejected": -468.30126953125, "loss": 0.5355, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5246739983558655, "rewards/margins": 0.6675662994384766, "rewards/rejected": -0.1428922414779663, "step": 1440 }, { "epoch": 0.37, "learning_rate": 3.4763271162123385e-07, "logits/chosen": -3.9422059059143066, "logits/rejected": -4.009974002838135, "logps/chosen": -456.62103271484375, "logps/rejected": -456.80462646484375, "loss": 0.6016, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.46443605422973633, "rewards/margins": 0.3656379282474518, "rewards/rejected": 0.09879810363054276, "step": 1450 }, { "epoch": 0.38, "learning_rate": 3.461979913916786e-07, "logits/chosen": -4.015919208526611, "logits/rejected": -4.085513114929199, "logps/chosen": -490.6377868652344, "logps/rejected": -374.94866943359375, "loss": 0.5377, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.44695836305618286, "rewards/margins": 0.5198525190353394, "rewards/rejected": -0.07289411872625351, "step": 1460 }, { "epoch": 0.38, "learning_rate": 3.447632711621234e-07, "logits/chosen": -4.100437164306641, "logits/rejected": -4.218926906585693, "logps/chosen": -555.1058349609375, "logps/rejected": -427.87139892578125, "loss": 0.6011, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3257507383823395, "rewards/margins": 0.39021044969558716, "rewards/rejected": -0.06445976346731186, "step": 1470 }, { "epoch": 0.38, "learning_rate": 3.433285509325681e-07, "logits/chosen": -3.8781065940856934, "logits/rejected": -3.8362109661102295, "logps/chosen": -423.455078125, "logps/rejected": -366.01617431640625, "loss": 0.609, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.23626752197742462, "rewards/margins": 0.28217631578445435, "rewards/rejected": -0.04590878635644913, "step": 1480 }, { "epoch": 0.38, "learning_rate": 3.418938307030129e-07, "logits/chosen": -4.141830921173096, "logits/rejected": -4.139374256134033, "logps/chosen": -490.91473388671875, "logps/rejected": -431.1683654785156, "loss": 0.5698, "rewards/accuracies": 0.625, "rewards/chosen": 0.3411320149898529, "rewards/margins": 0.3740822374820709, "rewards/rejected": -0.032950229942798615, "step": 1490 }, { "epoch": 0.39, "learning_rate": 3.4045911047345766e-07, "logits/chosen": -4.246241569519043, "logits/rejected": -4.1093244552612305, "logps/chosen": -595.197509765625, "logps/rejected": -455.29656982421875, "loss": 0.5659, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3736713230609894, "rewards/margins": 0.5091021060943604, "rewards/rejected": -0.13543078303337097, "step": 1500 }, { "epoch": 0.39, "eval_logits/chosen": -3.996328353881836, "eval_logits/rejected": -4.024491786956787, "eval_logps/chosen": -545.1725463867188, "eval_logps/rejected": -438.1591491699219, "eval_loss": 0.5790306925773621, "eval_rewards/accuracies": 0.6779999732971191, "eval_rewards/chosen": 0.4204104244709015, "eval_rewards/margins": 0.43583211302757263, "eval_rewards/rejected": -0.015421712771058083, "eval_runtime": 145.84, "eval_samples_per_second": 13.714, "eval_steps_per_second": 1.714, "step": 1500 }, { "epoch": 0.39, "learning_rate": 3.3902439024390247e-07, "logits/chosen": -3.910076141357422, "logits/rejected": -4.025428295135498, "logps/chosen": -489.580322265625, "logps/rejected": -334.91131591796875, "loss": 0.5546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.26078709959983826, "rewards/margins": 0.4329034686088562, "rewards/rejected": -0.17211636900901794, "step": 1510 }, { "epoch": 0.39, "learning_rate": 3.3758967001434716e-07, "logits/chosen": -4.3428544998168945, "logits/rejected": -4.32183837890625, "logps/chosen": -733.9967041015625, "logps/rejected": -545.9852905273438, "loss": 0.5303, "rewards/accuracies": 0.75, "rewards/chosen": 0.6733923554420471, "rewards/margins": 0.6076704263687134, "rewards/rejected": 0.06572196632623672, "step": 1520 }, { "epoch": 0.4, "learning_rate": 3.3615494978479197e-07, "logits/chosen": -4.124747276306152, "logits/rejected": -4.166211128234863, "logps/chosen": -608.829345703125, "logps/rejected": -383.62518310546875, "loss": 0.5576, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.39822494983673096, "rewards/margins": 0.5036035776138306, "rewards/rejected": -0.1053786501288414, "step": 1530 }, { "epoch": 0.4, "learning_rate": 3.347202295552367e-07, "logits/chosen": -4.033061981201172, "logits/rejected": -4.068852424621582, "logps/chosen": -511.05682373046875, "logps/rejected": -465.43475341796875, "loss": 0.6175, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.35540980100631714, "rewards/margins": 0.31782034039497375, "rewards/rejected": 0.03758946806192398, "step": 1540 }, { "epoch": 0.4, "learning_rate": 3.332855093256815e-07, "logits/chosen": -4.0989203453063965, "logits/rejected": -4.1430792808532715, "logps/chosen": -612.7939453125, "logps/rejected": -483.2908630371094, "loss": 0.5781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.567456841468811, "rewards/margins": 0.6063565611839294, "rewards/rejected": -0.03889976069331169, "step": 1550 }, { "epoch": 0.4, "learning_rate": 3.318507890961262e-07, "logits/chosen": -4.184214115142822, "logits/rejected": -4.299299716949463, "logps/chosen": -540.6097412109375, "logps/rejected": -396.2356872558594, "loss": 0.5515, "rewards/accuracies": 0.75, "rewards/chosen": 0.401826947927475, "rewards/margins": 0.48910683393478394, "rewards/rejected": -0.08727996051311493, "step": 1560 }, { "epoch": 0.41, "learning_rate": 3.3041606886657103e-07, "logits/chosen": -3.9090118408203125, "logits/rejected": -3.9316658973693848, "logps/chosen": -574.2691650390625, "logps/rejected": -485.7064514160156, "loss": 0.674, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4122130274772644, "rewards/margins": 0.2545969486236572, "rewards/rejected": 0.1576160490512848, "step": 1570 }, { "epoch": 0.41, "learning_rate": 3.289813486370158e-07, "logits/chosen": -4.144872665405273, "logits/rejected": -4.0689921379089355, "logps/chosen": -513.11181640625, "logps/rejected": -471.02606201171875, "loss": 0.6057, "rewards/accuracies": 0.625, "rewards/chosen": 0.34228435158729553, "rewards/margins": 0.4427550733089447, "rewards/rejected": -0.10047070682048798, "step": 1580 }, { "epoch": 0.41, "learning_rate": 3.275466284074606e-07, "logits/chosen": -3.9695823192596436, "logits/rejected": -4.070342063903809, "logps/chosen": -653.9967651367188, "logps/rejected": -471.898193359375, "loss": 0.5594, "rewards/accuracies": 0.8125, "rewards/chosen": 0.57194983959198, "rewards/margins": 0.6099370121955872, "rewards/rejected": -0.03798716515302658, "step": 1590 }, { "epoch": 0.41, "learning_rate": 3.261119081779053e-07, "logits/chosen": -4.089110851287842, "logits/rejected": -4.0619401931762695, "logps/chosen": -515.8906860351562, "logps/rejected": -470.29541015625, "loss": 0.5993, "rewards/accuracies": 0.625, "rewards/chosen": 0.42164483666419983, "rewards/margins": 0.3623473346233368, "rewards/rejected": 0.05929745361208916, "step": 1600 }, { "epoch": 0.41, "eval_logits/chosen": -3.990658760070801, "eval_logits/rejected": -4.0185322761535645, "eval_logps/chosen": -545.216064453125, "eval_logps/rejected": -438.2904052734375, "eval_loss": 0.5782522559165955, "eval_rewards/accuracies": 0.671999990940094, "eval_rewards/chosen": 0.41605862975120544, "eval_rewards/margins": 0.4446040093898773, "eval_rewards/rejected": -0.02854539081454277, "eval_runtime": 147.5337, "eval_samples_per_second": 13.556, "eval_steps_per_second": 1.695, "step": 1600 }, { "epoch": 0.42, "learning_rate": 3.246771879483501e-07, "logits/chosen": -3.944901704788208, "logits/rejected": -3.9903030395507812, "logps/chosen": -475.91363525390625, "logps/rejected": -396.0389099121094, "loss": 0.5996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2757423520088196, "rewards/margins": 0.3251574635505676, "rewards/rejected": -0.049415141344070435, "step": 1610 }, { "epoch": 0.42, "learning_rate": 3.2324246771879484e-07, "logits/chosen": -4.066908359527588, "logits/rejected": -3.8957467079162598, "logps/chosen": -538.4827880859375, "logps/rejected": -386.1225280761719, "loss": 0.5916, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.40634027123451233, "rewards/margins": 0.44673413038253784, "rewards/rejected": -0.04039386659860611, "step": 1620 }, { "epoch": 0.42, "learning_rate": 3.2180774748923953e-07, "logits/chosen": -4.119419097900391, "logits/rejected": -3.880350112915039, "logps/chosen": -571.5280151367188, "logps/rejected": -467.98321533203125, "loss": 0.5709, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.3607470393180847, "rewards/margins": 0.40588730573654175, "rewards/rejected": -0.04514027386903763, "step": 1630 }, { "epoch": 0.42, "learning_rate": 3.2037302725968434e-07, "logits/chosen": -3.995079517364502, "logits/rejected": -3.9660801887512207, "logps/chosen": -528.3262939453125, "logps/rejected": -391.5002746582031, "loss": 0.6413, "rewards/accuracies": 0.75, "rewards/chosen": 0.3550952970981598, "rewards/margins": 0.4048345685005188, "rewards/rejected": -0.049739234149456024, "step": 1640 }, { "epoch": 0.43, "learning_rate": 3.189383070301291e-07, "logits/chosen": -4.0606889724731445, "logits/rejected": -4.020025253295898, "logps/chosen": -606.38330078125, "logps/rejected": -492.71759033203125, "loss": 0.5607, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.392439067363739, "rewards/margins": 0.29129353165626526, "rewards/rejected": 0.10114555060863495, "step": 1650 }, { "epoch": 0.43, "learning_rate": 3.175035868005739e-07, "logits/chosen": -4.366388320922852, "logits/rejected": -4.3169779777526855, "logps/chosen": -572.692626953125, "logps/rejected": -431.1947326660156, "loss": 0.6249, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4136829972267151, "rewards/margins": 0.31379351019859314, "rewards/rejected": 0.09988941252231598, "step": 1660 }, { "epoch": 0.43, "learning_rate": 3.160688665710186e-07, "logits/chosen": -4.261553764343262, "logits/rejected": -4.20203971862793, "logps/chosen": -548.4271240234375, "logps/rejected": -461.83563232421875, "loss": 0.5295, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4961729943752289, "rewards/margins": 0.5303093194961548, "rewards/rejected": -0.034136295318603516, "step": 1670 }, { "epoch": 0.43, "learning_rate": 3.146341463414634e-07, "logits/chosen": -4.172554016113281, "logits/rejected": -4.170234680175781, "logps/chosen": -538.4212036132812, "logps/rejected": -511.212890625, "loss": 0.5634, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.45014914870262146, "rewards/margins": 0.44471946358680725, "rewards/rejected": 0.005429693963378668, "step": 1680 }, { "epoch": 0.44, "learning_rate": 3.1319942611190815e-07, "logits/chosen": -3.915037155151367, "logits/rejected": -3.8585174083709717, "logps/chosen": -497.04229736328125, "logps/rejected": -471.8094787597656, "loss": 0.5919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2910478711128235, "rewards/margins": 0.37008222937583923, "rewards/rejected": -0.07903440296649933, "step": 1690 }, { "epoch": 0.44, "learning_rate": 3.1176470588235295e-07, "logits/chosen": -3.864201307296753, "logits/rejected": -3.8585095405578613, "logps/chosen": -542.01953125, "logps/rejected": -397.53424072265625, "loss": 0.5999, "rewards/accuracies": 0.625, "rewards/chosen": 0.17917154729366302, "rewards/margins": 0.28133153915405273, "rewards/rejected": -0.10215996205806732, "step": 1700 }, { "epoch": 0.44, "eval_logits/chosen": -3.99351167678833, "eval_logits/rejected": -4.020653247833252, "eval_logps/chosen": -545.3095092773438, "eval_logps/rejected": -438.4728698730469, "eval_loss": 0.5767195820808411, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": 0.40671002864837646, "eval_rewards/margins": 0.4535037875175476, "eval_rewards/rejected": -0.04679381474852562, "eval_runtime": 147.2862, "eval_samples_per_second": 13.579, "eval_steps_per_second": 1.697, "step": 1700 }, { "epoch": 0.44, "learning_rate": 3.1032998565279765e-07, "logits/chosen": -4.243551254272461, "logits/rejected": -4.064631938934326, "logps/chosen": -478.11187744140625, "logps/rejected": -458.78692626953125, "loss": 0.5753, "rewards/accuracies": 0.6875, "rewards/chosen": 0.40352755784988403, "rewards/margins": 0.4834938645362854, "rewards/rejected": -0.07996630668640137, "step": 1710 }, { "epoch": 0.44, "learning_rate": 3.0889526542324245e-07, "logits/chosen": -4.1683268547058105, "logits/rejected": -4.173158645629883, "logps/chosen": -652.5173950195312, "logps/rejected": -432.58428955078125, "loss": 0.5737, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.4788149297237396, "rewards/margins": 0.5799158811569214, "rewards/rejected": -0.10110093653202057, "step": 1720 }, { "epoch": 0.45, "learning_rate": 3.074605451936872e-07, "logits/chosen": -4.137356758117676, "logits/rejected": -4.176325798034668, "logps/chosen": -576.1214599609375, "logps/rejected": -380.2808837890625, "loss": 0.5699, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4571780562400818, "rewards/margins": 0.49380144476890564, "rewards/rejected": -0.03662336990237236, "step": 1730 }, { "epoch": 0.45, "learning_rate": 3.06025824964132e-07, "logits/chosen": -4.188223838806152, "logits/rejected": -4.05302095413208, "logps/chosen": -480.8373107910156, "logps/rejected": -422.5328063964844, "loss": 0.5799, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2358378916978836, "rewards/margins": 0.28218406438827515, "rewards/rejected": -0.04634615033864975, "step": 1740 }, { "epoch": 0.45, "learning_rate": 3.045911047345767e-07, "logits/chosen": -4.110243797302246, "logits/rejected": -4.0695366859436035, "logps/chosen": -615.70263671875, "logps/rejected": -426.46075439453125, "loss": 0.5073, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5083123445510864, "rewards/margins": 0.7217694520950317, "rewards/rejected": -0.2134571522474289, "step": 1750 }, { "epoch": 0.45, "learning_rate": 3.031563845050215e-07, "logits/chosen": -4.203267574310303, "logits/rejected": -4.161170482635498, "logps/chosen": -590.3410034179688, "logps/rejected": -447.26715087890625, "loss": 0.5185, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5431427955627441, "rewards/margins": 0.68207848072052, "rewards/rejected": -0.13893572986125946, "step": 1760 }, { "epoch": 0.46, "learning_rate": 3.0172166427546626e-07, "logits/chosen": -4.285967826843262, "logits/rejected": -4.167950630187988, "logps/chosen": -533.8848876953125, "logps/rejected": -413.30975341796875, "loss": 0.5936, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.39957305788993835, "rewards/margins": 0.3795527517795563, "rewards/rejected": 0.020020361989736557, "step": 1770 }, { "epoch": 0.46, "learning_rate": 3.00286944045911e-07, "logits/chosen": -4.027644634246826, "logits/rejected": -3.9792587757110596, "logps/chosen": -626.9630737304688, "logps/rejected": -397.4438781738281, "loss": 0.6014, "rewards/accuracies": 0.75, "rewards/chosen": 0.45451897382736206, "rewards/margins": 0.5369467735290527, "rewards/rejected": -0.08242778480052948, "step": 1780 }, { "epoch": 0.46, "learning_rate": 2.9885222381635577e-07, "logits/chosen": -4.1345133781433105, "logits/rejected": -4.244950771331787, "logps/chosen": -562.5131225585938, "logps/rejected": -422.6846618652344, "loss": 0.5804, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.40697044134140015, "rewards/margins": 0.48475074768066406, "rewards/rejected": -0.07778030633926392, "step": 1790 }, { "epoch": 0.46, "learning_rate": 2.9741750358680057e-07, "logits/chosen": -4.032704830169678, "logits/rejected": -3.9772307872772217, "logps/chosen": -568.47802734375, "logps/rejected": -502.3460998535156, "loss": 0.6004, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.46621161699295044, "rewards/margins": 0.4852283000946045, "rewards/rejected": -0.019016731530427933, "step": 1800 }, { "epoch": 0.46, "eval_logits/chosen": -3.9943645000457764, "eval_logits/rejected": -4.0218825340271, "eval_logps/chosen": -545.1437377929688, "eval_logps/rejected": -438.3991394042969, "eval_loss": 0.5730865597724915, "eval_rewards/accuracies": 0.6830000281333923, "eval_rewards/chosen": 0.4232881963253021, "eval_rewards/margins": 0.46270594000816345, "eval_rewards/rejected": -0.03941771015524864, "eval_runtime": 148.86, "eval_samples_per_second": 13.435, "eval_steps_per_second": 1.679, "step": 1800 }, { "epoch": 0.47, "learning_rate": 2.959827833572453e-07, "logits/chosen": -4.1379075050354, "logits/rejected": -4.1423420906066895, "logps/chosen": -620.6439819335938, "logps/rejected": -438.18084716796875, "loss": 0.5651, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5286334753036499, "rewards/margins": 0.5047623515129089, "rewards/rejected": 0.02387116476893425, "step": 1810 }, { "epoch": 0.47, "learning_rate": 2.9454806312769007e-07, "logits/chosen": -4.126761436462402, "logits/rejected": -4.265500545501709, "logps/chosen": -494.80206298828125, "logps/rejected": -427.181640625, "loss": 0.6087, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4608178734779358, "rewards/margins": 0.36958831548690796, "rewards/rejected": 0.09122952073812485, "step": 1820 }, { "epoch": 0.47, "learning_rate": 2.931133428981348e-07, "logits/chosen": -4.138312339782715, "logits/rejected": -4.2697319984436035, "logps/chosen": -492.8348693847656, "logps/rejected": -405.1728515625, "loss": 0.6199, "rewards/accuracies": 0.5625, "rewards/chosen": 0.21532472968101501, "rewards/margins": 0.19911542534828186, "rewards/rejected": 0.016209278255701065, "step": 1830 }, { "epoch": 0.48, "learning_rate": 2.9167862266857963e-07, "logits/chosen": -4.26310396194458, "logits/rejected": -4.242154121398926, "logps/chosen": -562.9186401367188, "logps/rejected": -377.67303466796875, "loss": 0.5497, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5041324496269226, "rewards/margins": 0.5989420413970947, "rewards/rejected": -0.09480961412191391, "step": 1840 }, { "epoch": 0.48, "learning_rate": 2.902439024390244e-07, "logits/chosen": -4.286158561706543, "logits/rejected": -4.289405345916748, "logps/chosen": -607.9891357421875, "logps/rejected": -496.7867126464844, "loss": 0.5634, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5594509840011597, "rewards/margins": 0.6072807908058167, "rewards/rejected": -0.04782974720001221, "step": 1850 }, { "epoch": 0.48, "learning_rate": 2.8880918220946913e-07, "logits/chosen": -4.170632839202881, "logits/rejected": -4.215968132019043, "logps/chosen": -445.17559814453125, "logps/rejected": -355.8191223144531, "loss": 0.6043, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.31880825757980347, "rewards/margins": 0.2609938383102417, "rewards/rejected": 0.05781441926956177, "step": 1860 }, { "epoch": 0.48, "learning_rate": 2.873744619799139e-07, "logits/chosen": -4.265324592590332, "logits/rejected": -4.255076885223389, "logps/chosen": -590.837158203125, "logps/rejected": -441.911376953125, "loss": 0.6081, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.33519288897514343, "rewards/margins": 0.3991120457649231, "rewards/rejected": -0.06391920149326324, "step": 1870 }, { "epoch": 0.49, "learning_rate": 2.859397417503587e-07, "logits/chosen": -4.415879249572754, "logits/rejected": -4.314742565155029, "logps/chosen": -501.2169494628906, "logps/rejected": -451.86553955078125, "loss": 0.6035, "rewards/accuracies": 0.6875, "rewards/chosen": 0.33651891350746155, "rewards/margins": 0.30277958512306213, "rewards/rejected": 0.03373932093381882, "step": 1880 }, { "epoch": 0.49, "learning_rate": 2.8450502152080344e-07, "logits/chosen": -3.878053665161133, "logits/rejected": -4.040474891662598, "logps/chosen": -647.9942626953125, "logps/rejected": -437.6617736816406, "loss": 0.5379, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5844290852546692, "rewards/margins": 0.661879301071167, "rewards/rejected": -0.07745026051998138, "step": 1890 }, { "epoch": 0.49, "learning_rate": 2.830703012912482e-07, "logits/chosen": -4.195162296295166, "logits/rejected": -4.2288408279418945, "logps/chosen": -590.7817993164062, "logps/rejected": -428.28387451171875, "loss": 0.5349, "rewards/accuracies": 0.8125, "rewards/chosen": 0.510196328163147, "rewards/margins": 0.6029139757156372, "rewards/rejected": -0.09271766245365143, "step": 1900 }, { "epoch": 0.49, "eval_logits/chosen": -4.0012006759643555, "eval_logits/rejected": -4.029512405395508, "eval_logps/chosen": -545.0914306640625, "eval_logps/rejected": -438.4334716796875, "eval_loss": 0.5719799995422363, "eval_rewards/accuracies": 0.6830000281333923, "eval_rewards/chosen": 0.4285166561603546, "eval_rewards/margins": 0.4713680148124695, "eval_rewards/rejected": -0.04285132512450218, "eval_runtime": 148.6253, "eval_samples_per_second": 13.457, "eval_steps_per_second": 1.682, "step": 1900 }, { "epoch": 0.49, "learning_rate": 2.8163558106169294e-07, "logits/chosen": -4.198761940002441, "logits/rejected": -4.1362786293029785, "logps/chosen": -616.3384399414062, "logps/rejected": -427.9803771972656, "loss": 0.5013, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5358282327651978, "rewards/margins": 0.5532472729682922, "rewards/rejected": -0.017419060692191124, "step": 1910 }, { "epoch": 0.5, "learning_rate": 2.8020086083213774e-07, "logits/chosen": -4.3343377113342285, "logits/rejected": -4.233187198638916, "logps/chosen": -663.6807861328125, "logps/rejected": -496.68121337890625, "loss": 0.5346, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5831555128097534, "rewards/margins": 0.7528368830680847, "rewards/rejected": -0.16968131065368652, "step": 1920 }, { "epoch": 0.5, "learning_rate": 2.7876614060258244e-07, "logits/chosen": -4.208827018737793, "logits/rejected": -4.191887378692627, "logps/chosen": -546.9085693359375, "logps/rejected": -454.7889099121094, "loss": 0.6139, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4015926420688629, "rewards/margins": 0.4776438772678375, "rewards/rejected": -0.0760512501001358, "step": 1930 }, { "epoch": 0.5, "learning_rate": 2.7733142037302725e-07, "logits/chosen": -3.8217597007751465, "logits/rejected": -3.925053119659424, "logps/chosen": -661.263916015625, "logps/rejected": -534.646728515625, "loss": 0.5292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4541807770729065, "rewards/margins": 0.5835798382759094, "rewards/rejected": -0.12939909100532532, "step": 1940 }, { "epoch": 0.5, "learning_rate": 2.75896700143472e-07, "logits/chosen": -4.182621955871582, "logits/rejected": -3.9824492931365967, "logps/chosen": -570.587890625, "logps/rejected": -394.5463562011719, "loss": 0.5491, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5437583923339844, "rewards/margins": 0.5803283452987671, "rewards/rejected": -0.03656994178891182, "step": 1950 }, { "epoch": 0.51, "learning_rate": 2.744619799139168e-07, "logits/chosen": -4.0817694664001465, "logits/rejected": -4.021645545959473, "logps/chosen": -562.7216796875, "logps/rejected": -408.1803894042969, "loss": 0.6227, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.38751038908958435, "rewards/margins": 0.412889301776886, "rewards/rejected": -0.025378871709108353, "step": 1960 }, { "epoch": 0.51, "learning_rate": 2.730272596843615e-07, "logits/chosen": -4.210979461669922, "logits/rejected": -4.233429908752441, "logps/chosen": -531.6992797851562, "logps/rejected": -415.34130859375, "loss": 0.5544, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.4554630219936371, "rewards/margins": 0.6501585841178894, "rewards/rejected": -0.1946956068277359, "step": 1970 }, { "epoch": 0.51, "learning_rate": 2.715925394548063e-07, "logits/chosen": -3.991922378540039, "logits/rejected": -3.861186981201172, "logps/chosen": -516.5054931640625, "logps/rejected": -480.88043212890625, "loss": 0.6105, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.35484379529953003, "rewards/margins": 0.21209315955638885, "rewards/rejected": 0.14275071024894714, "step": 1980 }, { "epoch": 0.51, "learning_rate": 2.7015781922525106e-07, "logits/chosen": -3.956188678741455, "logits/rejected": -3.9556357860565186, "logps/chosen": -464.2613830566406, "logps/rejected": -448.0741271972656, "loss": 0.6277, "rewards/accuracies": 0.6875, "rewards/chosen": 0.34133443236351013, "rewards/margins": 0.3067266345024109, "rewards/rejected": 0.03460781276226044, "step": 1990 }, { "epoch": 0.52, "learning_rate": 2.6872309899569586e-07, "logits/chosen": -4.143117427825928, "logits/rejected": -4.186631679534912, "logps/chosen": -578.9446411132812, "logps/rejected": -439.56658935546875, "loss": 0.5377, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5403174161911011, "rewards/margins": 0.633220911026001, "rewards/rejected": -0.0929035171866417, "step": 2000 }, { "epoch": 0.52, "eval_logits/chosen": -4.000906944274902, "eval_logits/rejected": -4.028975009918213, "eval_logps/chosen": -545.1220092773438, "eval_logps/rejected": -438.54486083984375, "eval_loss": 0.5702030062675476, "eval_rewards/accuracies": 0.6850000023841858, "eval_rewards/chosen": 0.4254603683948517, "eval_rewards/margins": 0.4794518053531647, "eval_rewards/rejected": -0.0539914108812809, "eval_runtime": 147.8823, "eval_samples_per_second": 13.524, "eval_steps_per_second": 1.691, "step": 2000 }, { "epoch": 0.52, "learning_rate": 2.6728837876614056e-07, "logits/chosen": -4.1301422119140625, "logits/rejected": -4.1415839195251465, "logps/chosen": -582.57666015625, "logps/rejected": -450.591552734375, "loss": 0.5582, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5017611384391785, "rewards/margins": 0.6365527510643005, "rewards/rejected": -0.13479158282279968, "step": 2010 }, { "epoch": 0.52, "learning_rate": 2.6585365853658536e-07, "logits/chosen": -4.098201274871826, "logits/rejected": -4.06491756439209, "logps/chosen": -536.2640380859375, "logps/rejected": -417.99481201171875, "loss": 0.5795, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4373628497123718, "rewards/margins": 0.393180251121521, "rewards/rejected": 0.044182561337947845, "step": 2020 }, { "epoch": 0.52, "learning_rate": 2.644189383070301e-07, "logits/chosen": -4.560007572174072, "logits/rejected": -4.380262851715088, "logps/chosen": -560.2337646484375, "logps/rejected": -399.133056640625, "loss": 0.5672, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5478136539459229, "rewards/margins": 0.6600695252418518, "rewards/rejected": -0.11225590854883194, "step": 2030 }, { "epoch": 0.53, "learning_rate": 2.629842180774749e-07, "logits/chosen": -4.084300518035889, "logits/rejected": -4.194989204406738, "logps/chosen": -615.1845703125, "logps/rejected": -399.0810546875, "loss": 0.5302, "rewards/accuracies": 0.875, "rewards/chosen": 0.5594775676727295, "rewards/margins": 0.8519641160964966, "rewards/rejected": -0.2924865782260895, "step": 2040 }, { "epoch": 0.53, "learning_rate": 2.615494978479196e-07, "logits/chosen": -3.948491334915161, "logits/rejected": -3.790837049484253, "logps/chosen": -506.5896911621094, "logps/rejected": -390.9329528808594, "loss": 0.5634, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3724905848503113, "rewards/margins": 0.4565068781375885, "rewards/rejected": -0.08401624858379364, "step": 2050 }, { "epoch": 0.53, "learning_rate": 2.601147776183644e-07, "logits/chosen": -4.237751007080078, "logits/rejected": -4.1675705909729, "logps/chosen": -569.1129150390625, "logps/rejected": -415.8509826660156, "loss": 0.5397, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3635835647583008, "rewards/margins": 0.4130166471004486, "rewards/rejected": -0.04943311959505081, "step": 2060 }, { "epoch": 0.53, "learning_rate": 2.5868005738880917e-07, "logits/chosen": -4.343452453613281, "logits/rejected": -4.299803733825684, "logps/chosen": -468.18768310546875, "logps/rejected": -420.81256103515625, "loss": 0.5592, "rewards/accuracies": 0.75, "rewards/chosen": 0.4336473047733307, "rewards/margins": 0.4215630888938904, "rewards/rejected": 0.012084214016795158, "step": 2070 }, { "epoch": 0.54, "learning_rate": 2.57245337159254e-07, "logits/chosen": -4.12381649017334, "logits/rejected": -4.066357135772705, "logps/chosen": -512.6734619140625, "logps/rejected": -421.00872802734375, "loss": 0.5649, "rewards/accuracies": 0.75, "rewards/chosen": 0.26631009578704834, "rewards/margins": 0.49776148796081543, "rewards/rejected": -0.23145142197608948, "step": 2080 }, { "epoch": 0.54, "learning_rate": 2.558106169296987e-07, "logits/chosen": -4.055663108825684, "logits/rejected": -4.202220439910889, "logps/chosen": -531.7757568359375, "logps/rejected": -427.21051025390625, "loss": 0.5916, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.35120105743408203, "rewards/margins": 0.329306036233902, "rewards/rejected": 0.021895062178373337, "step": 2090 }, { "epoch": 0.54, "learning_rate": 2.543758967001435e-07, "logits/chosen": -3.896604537963867, "logits/rejected": -3.8615658283233643, "logps/chosen": -546.3208618164062, "logps/rejected": -435.53143310546875, "loss": 0.4988, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.46676820516586304, "rewards/margins": 0.7485499382019043, "rewards/rejected": -0.28178170323371887, "step": 2100 }, { "epoch": 0.54, "eval_logits/chosen": -4.003889560699463, "eval_logits/rejected": -4.031704425811768, "eval_logps/chosen": -545.0299072265625, "eval_logps/rejected": -438.5533142089844, "eval_loss": 0.5712563395500183, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": 0.43467363715171814, "eval_rewards/margins": 0.4895067512989044, "eval_rewards/rejected": -0.05483310669660568, "eval_runtime": 148.9309, "eval_samples_per_second": 13.429, "eval_steps_per_second": 1.679, "step": 2100 }, { "epoch": 0.54, "learning_rate": 2.5294117647058823e-07, "logits/chosen": -4.0738677978515625, "logits/rejected": -4.02095890045166, "logps/chosen": -547.7879638671875, "logps/rejected": -461.1922302246094, "loss": 0.5612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4428838789463043, "rewards/margins": 0.4759696424007416, "rewards/rejected": -0.03308583423495293, "step": 2110 }, { "epoch": 0.55, "learning_rate": 2.51506456241033e-07, "logits/chosen": -4.027615547180176, "logits/rejected": -4.137267112731934, "logps/chosen": -519.4240112304688, "logps/rejected": -418.49884033203125, "loss": 0.5573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.33291319012641907, "rewards/margins": 0.5208204388618469, "rewards/rejected": -0.18790724873542786, "step": 2120 }, { "epoch": 0.55, "learning_rate": 2.5007173601147773e-07, "logits/chosen": -3.9522101879119873, "logits/rejected": -4.056872367858887, "logps/chosen": -581.5064697265625, "logps/rejected": -583.0844116210938, "loss": 0.5702, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.40307894349098206, "rewards/margins": 0.49982690811157227, "rewards/rejected": -0.09674793481826782, "step": 2130 }, { "epoch": 0.55, "learning_rate": 2.486370157819225e-07, "logits/chosen": -4.075150966644287, "logits/rejected": -3.9781277179718018, "logps/chosen": -570.3604736328125, "logps/rejected": -457.1639099121094, "loss": 0.5703, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.4488401412963867, "rewards/margins": 0.4194963574409485, "rewards/rejected": 0.029343824833631516, "step": 2140 }, { "epoch": 0.56, "learning_rate": 2.472022955523673e-07, "logits/chosen": -3.9273715019226074, "logits/rejected": -4.03403377532959, "logps/chosen": -576.674072265625, "logps/rejected": -480.82330322265625, "loss": 0.5837, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4813673496246338, "rewards/margins": 0.5193904638290405, "rewards/rejected": -0.038023076951503754, "step": 2150 }, { "epoch": 0.56, "learning_rate": 2.4576757532281204e-07, "logits/chosen": -4.104135513305664, "logits/rejected": -4.141896724700928, "logps/chosen": -570.6798095703125, "logps/rejected": -480.5628967285156, "loss": 0.6344, "rewards/accuracies": 0.625, "rewards/chosen": 0.4069296419620514, "rewards/margins": 0.38408637046813965, "rewards/rejected": 0.02284328266978264, "step": 2160 }, { "epoch": 0.56, "learning_rate": 2.443328550932568e-07, "logits/chosen": -3.959376811981201, "logits/rejected": -3.8872084617614746, "logps/chosen": -552.5087280273438, "logps/rejected": -436.44189453125, "loss": 0.568, "rewards/accuracies": 0.625, "rewards/chosen": 0.29828330874443054, "rewards/margins": 0.3948608338832855, "rewards/rejected": -0.09657756984233856, "step": 2170 }, { "epoch": 0.56, "learning_rate": 2.4289813486370154e-07, "logits/chosen": -4.081685543060303, "logits/rejected": -4.045130729675293, "logps/chosen": -545.3685302734375, "logps/rejected": -373.71600341796875, "loss": 0.5179, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.462582528591156, "rewards/margins": 0.6623843908309937, "rewards/rejected": -0.19980189204216003, "step": 2180 }, { "epoch": 0.57, "learning_rate": 2.4146341463414635e-07, "logits/chosen": -4.286005973815918, "logits/rejected": -4.341670036315918, "logps/chosen": -546.548828125, "logps/rejected": -416.674072265625, "loss": 0.5689, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4305512011051178, "rewards/margins": 0.6499841213226318, "rewards/rejected": -0.21943287551403046, "step": 2190 }, { "epoch": 0.57, "learning_rate": 2.400286944045911e-07, "logits/chosen": -3.9777417182922363, "logits/rejected": -3.9910645484924316, "logps/chosen": -470.6476135253906, "logps/rejected": -453.6451721191406, "loss": 0.6093, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.38502687215805054, "rewards/margins": 0.30599719285964966, "rewards/rejected": 0.07902970165014267, "step": 2200 }, { "epoch": 0.57, "eval_logits/chosen": -4.001364231109619, "eval_logits/rejected": -4.028832912445068, "eval_logps/chosen": -544.912841796875, "eval_logps/rejected": -438.4606628417969, "eval_loss": 0.5706081986427307, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": 0.44638243317604065, "eval_rewards/margins": 0.49195748567581177, "eval_rewards/rejected": -0.045575033873319626, "eval_runtime": 146.1996, "eval_samples_per_second": 13.68, "eval_steps_per_second": 1.71, "step": 2200 }, { "epoch": 0.57, "learning_rate": 2.3859397417503585e-07, "logits/chosen": -4.030927658081055, "logits/rejected": -3.9580256938934326, "logps/chosen": -514.30712890625, "logps/rejected": -354.2815246582031, "loss": 0.5861, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4988827705383301, "rewards/margins": 0.5217560529708862, "rewards/rejected": -0.0228732917457819, "step": 2210 }, { "epoch": 0.57, "learning_rate": 2.3715925394548063e-07, "logits/chosen": -4.007624626159668, "logits/rejected": -4.2475457191467285, "logps/chosen": -661.9398193359375, "logps/rejected": -411.38848876953125, "loss": 0.538, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.663671612739563, "rewards/margins": 0.7176098227500916, "rewards/rejected": -0.05393817275762558, "step": 2220 }, { "epoch": 0.58, "learning_rate": 2.3572453371592538e-07, "logits/chosen": -4.090206146240234, "logits/rejected": -4.1433539390563965, "logps/chosen": -504.1729431152344, "logps/rejected": -387.82623291015625, "loss": 0.5351, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.45734721422195435, "rewards/margins": 0.4615212082862854, "rewards/rejected": -0.0041740150190889835, "step": 2230 }, { "epoch": 0.58, "learning_rate": 2.3428981348637013e-07, "logits/chosen": -4.289696216583252, "logits/rejected": -4.263758659362793, "logps/chosen": -579.8231201171875, "logps/rejected": -404.7850646972656, "loss": 0.5307, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6702554225921631, "rewards/margins": 0.7904798984527588, "rewards/rejected": -0.12022446095943451, "step": 2240 }, { "epoch": 0.58, "learning_rate": 2.328550932568149e-07, "logits/chosen": -3.835402727127075, "logits/rejected": -3.8511269092559814, "logps/chosen": -510.6192932128906, "logps/rejected": -446.6246643066406, "loss": 0.6048, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.38664764165878296, "rewards/margins": 0.3122571110725403, "rewards/rejected": 0.07439050823450089, "step": 2250 }, { "epoch": 0.58, "learning_rate": 2.3142037302725966e-07, "logits/chosen": -3.915778398513794, "logits/rejected": -3.8879055976867676, "logps/chosen": -513.9568481445312, "logps/rejected": -410.12310791015625, "loss": 0.5356, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4804447293281555, "rewards/margins": 0.5258998274803162, "rewards/rejected": -0.04545507952570915, "step": 2260 }, { "epoch": 0.59, "learning_rate": 2.2998565279770444e-07, "logits/chosen": -4.199291229248047, "logits/rejected": -4.164752006530762, "logps/chosen": -644.1868896484375, "logps/rejected": -394.4111022949219, "loss": 0.603, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.38507136702537537, "rewards/margins": 0.504477858543396, "rewards/rejected": -0.11940644681453705, "step": 2270 }, { "epoch": 0.59, "learning_rate": 2.285509325681492e-07, "logits/chosen": -4.0966033935546875, "logits/rejected": -4.1276421546936035, "logps/chosen": -525.5519409179688, "logps/rejected": -452.3783264160156, "loss": 0.5688, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3376069664955139, "rewards/margins": 0.48656487464904785, "rewards/rejected": -0.14895787835121155, "step": 2280 }, { "epoch": 0.59, "learning_rate": 2.2711621233859396e-07, "logits/chosen": -3.9321861267089844, "logits/rejected": -3.8997440338134766, "logps/chosen": -524.1985473632812, "logps/rejected": -368.1291198730469, "loss": 0.5533, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.33177611231803894, "rewards/margins": 0.49390387535095215, "rewards/rejected": -0.1621277630329132, "step": 2290 }, { "epoch": 0.59, "learning_rate": 2.2568149210903872e-07, "logits/chosen": -4.461883544921875, "logits/rejected": -4.530648708343506, "logps/chosen": -619.8561401367188, "logps/rejected": -468.6748046875, "loss": 0.5356, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5222944021224976, "rewards/margins": 0.6014097929000854, "rewards/rejected": -0.07911545038223267, "step": 2300 }, { "epoch": 0.59, "eval_logits/chosen": -3.998574733734131, "eval_logits/rejected": -4.025696277618408, "eval_logps/chosen": -544.8922119140625, "eval_logps/rejected": -438.4912109375, "eval_loss": 0.5689104199409485, "eval_rewards/accuracies": 0.6880000233650208, "eval_rewards/chosen": 0.448445200920105, "eval_rewards/margins": 0.49707192182540894, "eval_rewards/rejected": -0.04862673580646515, "eval_runtime": 148.9177, "eval_samples_per_second": 13.43, "eval_steps_per_second": 1.679, "step": 2300 }, { "epoch": 0.6, "learning_rate": 2.242467718794835e-07, "logits/chosen": -3.9245476722717285, "logits/rejected": -4.00443696975708, "logps/chosen": -561.0607299804688, "logps/rejected": -444.2076110839844, "loss": 0.542, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.41927021741867065, "rewards/margins": 0.5216065049171448, "rewards/rejected": -0.10233630239963531, "step": 2310 }, { "epoch": 0.6, "learning_rate": 2.2281205164992824e-07, "logits/chosen": -4.177279949188232, "logits/rejected": -4.077963352203369, "logps/chosen": -504.2713928222656, "logps/rejected": -444.5032653808594, "loss": 0.6156, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.43868088722229004, "rewards/margins": 0.4138507843017578, "rewards/rejected": 0.024830086156725883, "step": 2320 }, { "epoch": 0.6, "learning_rate": 2.2137733142037302e-07, "logits/chosen": -3.990638017654419, "logits/rejected": -4.014552593231201, "logps/chosen": -549.037353515625, "logps/rejected": -442.3534240722656, "loss": 0.5213, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5215023756027222, "rewards/margins": 0.638957679271698, "rewards/rejected": -0.11745530366897583, "step": 2330 }, { "epoch": 0.6, "learning_rate": 2.1994261119081777e-07, "logits/chosen": -4.201764106750488, "logits/rejected": -4.19627046585083, "logps/chosen": -564.8765258789062, "logps/rejected": -433.4271545410156, "loss": 0.5939, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4676602780818939, "rewards/margins": 0.5004759430885315, "rewards/rejected": -0.03281565010547638, "step": 2340 }, { "epoch": 0.61, "learning_rate": 2.1850789096126255e-07, "logits/chosen": -3.911675214767456, "logits/rejected": -4.010054588317871, "logps/chosen": -611.3627319335938, "logps/rejected": -452.5043029785156, "loss": 0.663, "rewards/accuracies": 0.625, "rewards/chosen": 0.40320587158203125, "rewards/margins": 0.3427828252315521, "rewards/rejected": 0.060423027724027634, "step": 2350 }, { "epoch": 0.61, "learning_rate": 2.170731707317073e-07, "logits/chosen": -4.167824745178223, "logits/rejected": -4.243043422698975, "logps/chosen": -556.890625, "logps/rejected": -397.8431091308594, "loss": 0.551, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.4790073037147522, "rewards/margins": 0.5861107110977173, "rewards/rejected": -0.10710340738296509, "step": 2360 }, { "epoch": 0.61, "learning_rate": 2.1563845050215208e-07, "logits/chosen": -4.110980033874512, "logits/rejected": -4.188474655151367, "logps/chosen": -569.0153198242188, "logps/rejected": -404.994384765625, "loss": 0.5716, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4905467927455902, "rewards/margins": 0.5199242830276489, "rewards/rejected": -0.029377540573477745, "step": 2370 }, { "epoch": 0.61, "learning_rate": 2.1420373027259683e-07, "logits/chosen": -4.269859313964844, "logits/rejected": -4.332370758056641, "logps/chosen": -543.8313598632812, "logps/rejected": -437.123046875, "loss": 0.5571, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.500403106212616, "rewards/margins": 0.7072377800941467, "rewards/rejected": -0.20683467388153076, "step": 2380 }, { "epoch": 0.62, "learning_rate": 2.127690100430416e-07, "logits/chosen": -3.926335096359253, "logits/rejected": -3.9738330841064453, "logps/chosen": -533.0458984375, "logps/rejected": -424.850341796875, "loss": 0.6198, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.35639142990112305, "rewards/margins": 0.4018617570400238, "rewards/rejected": -0.04547032713890076, "step": 2390 }, { "epoch": 0.62, "learning_rate": 2.1133428981348636e-07, "logits/chosen": -3.779186248779297, "logits/rejected": -3.8913798332214355, "logps/chosen": -617.508056640625, "logps/rejected": -492.209228515625, "loss": 0.5753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5383332371711731, "rewards/margins": 0.467332661151886, "rewards/rejected": 0.07100055366754532, "step": 2400 }, { "epoch": 0.62, "eval_logits/chosen": -3.9845926761627197, "eval_logits/rejected": -4.009966850280762, "eval_logps/chosen": -544.7802124023438, "eval_logps/rejected": -438.44573974609375, "eval_loss": 0.5681360960006714, "eval_rewards/accuracies": 0.6850000023841858, "eval_rewards/chosen": 0.4596436619758606, "eval_rewards/margins": 0.5037252306938171, "eval_rewards/rejected": -0.04408155009150505, "eval_runtime": 148.5709, "eval_samples_per_second": 13.462, "eval_steps_per_second": 1.683, "step": 2400 }, { "epoch": 0.62, "learning_rate": 2.098995695839311e-07, "logits/chosen": -3.978921890258789, "logits/rejected": -3.8923873901367188, "logps/chosen": -556.697998046875, "logps/rejected": -416.08184814453125, "loss": 0.534, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.4480956494808197, "rewards/margins": 0.6271561980247498, "rewards/rejected": -0.17906051874160767, "step": 2410 }, { "epoch": 0.62, "learning_rate": 2.084648493543759e-07, "logits/chosen": -4.281157493591309, "logits/rejected": -4.271050453186035, "logps/chosen": -673.8267211914062, "logps/rejected": -463.4944763183594, "loss": 0.5461, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5151551365852356, "rewards/margins": 0.6002731919288635, "rewards/rejected": -0.08511805534362793, "step": 2420 }, { "epoch": 0.63, "learning_rate": 2.0703012912482064e-07, "logits/chosen": -3.901240110397339, "logits/rejected": -3.862910509109497, "logps/chosen": -591.2846069335938, "logps/rejected": -389.8904113769531, "loss": 0.6189, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2968345284461975, "rewards/margins": 0.3309480547904968, "rewards/rejected": -0.03411349281668663, "step": 2430 }, { "epoch": 0.63, "learning_rate": 2.0559540889526542e-07, "logits/chosen": -4.235989570617676, "logits/rejected": -4.060244560241699, "logps/chosen": -597.9428100585938, "logps/rejected": -404.6048889160156, "loss": 0.5864, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.46942558884620667, "rewards/margins": 0.5440183877944946, "rewards/rejected": -0.07459276914596558, "step": 2440 }, { "epoch": 0.63, "learning_rate": 2.0416068866571017e-07, "logits/chosen": -3.8396244049072266, "logits/rejected": -3.752044677734375, "logps/chosen": -601.25341796875, "logps/rejected": -435.07647705078125, "loss": 0.5917, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.49751096963882446, "rewards/margins": 0.46320000290870667, "rewards/rejected": 0.03431097790598869, "step": 2450 }, { "epoch": 0.64, "learning_rate": 2.0272596843615495e-07, "logits/chosen": -4.197469711303711, "logits/rejected": -4.122381687164307, "logps/chosen": -553.6739501953125, "logps/rejected": -420.4598083496094, "loss": 0.5932, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.453556627035141, "rewards/margins": 0.4589596390724182, "rewards/rejected": -0.005403043236583471, "step": 2460 }, { "epoch": 0.64, "learning_rate": 2.012912482065997e-07, "logits/chosen": -4.135566711425781, "logits/rejected": -4.087862968444824, "logps/chosen": -528.1041259765625, "logps/rejected": -432.2552795410156, "loss": 0.5474, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5243477821350098, "rewards/margins": 0.636970043182373, "rewards/rejected": -0.11262223869562149, "step": 2470 }, { "epoch": 0.64, "learning_rate": 1.9985652797704448e-07, "logits/chosen": -3.9298617839813232, "logits/rejected": -3.9982573986053467, "logps/chosen": -466.2574157714844, "logps/rejected": -393.4660339355469, "loss": 0.5623, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.469001442193985, "rewards/margins": 0.48559433221817017, "rewards/rejected": -0.016592923551797867, "step": 2480 }, { "epoch": 0.64, "learning_rate": 1.9842180774748923e-07, "logits/chosen": -3.9439053535461426, "logits/rejected": -3.8986332416534424, "logps/chosen": -559.919921875, "logps/rejected": -430.98162841796875, "loss": 0.5399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4386838376522064, "rewards/margins": 0.4418070912361145, "rewards/rejected": -0.003123197006061673, "step": 2490 }, { "epoch": 0.65, "learning_rate": 1.96987087517934e-07, "logits/chosen": -4.274647235870361, "logits/rejected": -4.253532409667969, "logps/chosen": -593.2935791015625, "logps/rejected": -445.554931640625, "loss": 0.5709, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4176342487335205, "rewards/margins": 0.4632183909416199, "rewards/rejected": -0.04558416083455086, "step": 2500 }, { "epoch": 0.65, "eval_logits/chosen": -3.984861135482788, "eval_logits/rejected": -4.0099897384643555, "eval_logps/chosen": -544.6834716796875, "eval_logps/rejected": -438.3924255371094, "eval_loss": 0.5672796368598938, "eval_rewards/accuracies": 0.6909999847412109, "eval_rewards/chosen": 0.4693204462528229, "eval_rewards/margins": 0.5080692172050476, "eval_rewards/rejected": -0.03874876722693443, "eval_runtime": 146.2016, "eval_samples_per_second": 13.68, "eval_steps_per_second": 1.71, "step": 2500 }, { "epoch": 0.65, "learning_rate": 1.9555236728837876e-07, "logits/chosen": -4.1506547927856445, "logits/rejected": -4.121700286865234, "logps/chosen": -559.0662231445312, "logps/rejected": -428.6475524902344, "loss": 0.5692, "rewards/accuracies": 0.75, "rewards/chosen": 0.5197519659996033, "rewards/margins": 0.5856004953384399, "rewards/rejected": -0.06584848463535309, "step": 2510 }, { "epoch": 0.65, "learning_rate": 1.9411764705882353e-07, "logits/chosen": -4.2824015617370605, "logits/rejected": -4.196056365966797, "logps/chosen": -598.8436279296875, "logps/rejected": -525.2330322265625, "loss": 0.5574, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4597892165184021, "rewards/margins": 0.5178920030593872, "rewards/rejected": -0.05810274928808212, "step": 2520 }, { "epoch": 0.65, "learning_rate": 1.9268292682926829e-07, "logits/chosen": -4.085073947906494, "logits/rejected": -4.154143810272217, "logps/chosen": -669.5693359375, "logps/rejected": -449.7344665527344, "loss": 0.5496, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6506977081298828, "rewards/margins": 0.7707425355911255, "rewards/rejected": -0.12004482746124268, "step": 2530 }, { "epoch": 0.66, "learning_rate": 1.9124820659971306e-07, "logits/chosen": -3.931575059890747, "logits/rejected": -3.969634532928467, "logps/chosen": -670.7431640625, "logps/rejected": -442.46405029296875, "loss": 0.5956, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.47314882278442383, "rewards/margins": 0.5320797562599182, "rewards/rejected": -0.058930885046720505, "step": 2540 }, { "epoch": 0.66, "learning_rate": 1.8981348637015781e-07, "logits/chosen": -3.7605667114257812, "logits/rejected": -3.7463626861572266, "logps/chosen": -507.4091796875, "logps/rejected": -419.73919677734375, "loss": 0.6149, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4424256384372711, "rewards/margins": 0.36436089873313904, "rewards/rejected": 0.07806471735239029, "step": 2550 }, { "epoch": 0.66, "learning_rate": 1.883787661406026e-07, "logits/chosen": -3.9714431762695312, "logits/rejected": -4.080648899078369, "logps/chosen": -589.9671630859375, "logps/rejected": -402.8089904785156, "loss": 0.5552, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5086973309516907, "rewards/margins": 0.7329601645469666, "rewards/rejected": -0.2242628037929535, "step": 2560 }, { "epoch": 0.66, "learning_rate": 1.8694404591104734e-07, "logits/chosen": -4.056425094604492, "logits/rejected": -4.178628444671631, "logps/chosen": -524.011962890625, "logps/rejected": -415.20233154296875, "loss": 0.6136, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.41316717863082886, "rewards/margins": 0.47009795904159546, "rewards/rejected": -0.056930772960186005, "step": 2570 }, { "epoch": 0.67, "learning_rate": 1.855093256814921e-07, "logits/chosen": -4.216281890869141, "logits/rejected": -4.076776027679443, "logps/chosen": -544.1904296875, "logps/rejected": -466.5315856933594, "loss": 0.575, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.41413965821266174, "rewards/margins": 0.48670220375061035, "rewards/rejected": -0.07256259769201279, "step": 2580 }, { "epoch": 0.67, "learning_rate": 1.8407460545193687e-07, "logits/chosen": -4.047214984893799, "logits/rejected": -4.0597639083862305, "logps/chosen": -594.7711181640625, "logps/rejected": -453.0791015625, "loss": 0.612, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6000150442123413, "rewards/margins": 0.62732994556427, "rewards/rejected": -0.027314912527799606, "step": 2590 }, { "epoch": 0.67, "learning_rate": 1.8263988522238162e-07, "logits/chosen": -3.8675410747528076, "logits/rejected": -3.881988525390625, "logps/chosen": -478.98443603515625, "logps/rejected": -408.0418395996094, "loss": 0.5565, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.34187614917755127, "rewards/margins": 0.5348206162452698, "rewards/rejected": -0.19294443726539612, "step": 2600 }, { "epoch": 0.67, "eval_logits/chosen": -3.984271764755249, "eval_logits/rejected": -4.009637832641602, "eval_logps/chosen": -544.6849975585938, "eval_logps/rejected": -438.4054260253906, "eval_loss": 0.5665393471717834, "eval_rewards/accuracies": 0.6819999814033508, "eval_rewards/chosen": 0.469163715839386, "eval_rewards/margins": 0.5092154145240784, "eval_rewards/rejected": -0.04005170986056328, "eval_runtime": 145.7676, "eval_samples_per_second": 13.72, "eval_steps_per_second": 1.715, "step": 2600 }, { "epoch": 0.67, "learning_rate": 1.812051649928264e-07, "logits/chosen": -3.83473539352417, "logits/rejected": -3.9073386192321777, "logps/chosen": -604.2052612304688, "logps/rejected": -456.9849548339844, "loss": 0.5876, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5973328351974487, "rewards/margins": 0.4590074121952057, "rewards/rejected": 0.13832543790340424, "step": 2610 }, { "epoch": 0.68, "learning_rate": 1.7977044476327115e-07, "logits/chosen": -3.848345994949341, "logits/rejected": -3.786773681640625, "logps/chosen": -502.00653076171875, "logps/rejected": -401.30877685546875, "loss": 0.6026, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4152609407901764, "rewards/margins": 0.3713647425174713, "rewards/rejected": 0.04389624670147896, "step": 2620 }, { "epoch": 0.68, "learning_rate": 1.7833572453371593e-07, "logits/chosen": -4.126666069030762, "logits/rejected": -4.045652389526367, "logps/chosen": -495.5149841308594, "logps/rejected": -428.750244140625, "loss": 0.5802, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4449082911014557, "rewards/margins": 0.48476019501686096, "rewards/rejected": -0.03985190391540527, "step": 2630 }, { "epoch": 0.68, "learning_rate": 1.7690100430416068e-07, "logits/chosen": -4.006113529205322, "logits/rejected": -4.05719518661499, "logps/chosen": -564.3095092773438, "logps/rejected": -465.112060546875, "loss": 0.5478, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4402576982975006, "rewards/margins": 0.5477269887924194, "rewards/rejected": -0.10746929794549942, "step": 2640 }, { "epoch": 0.68, "learning_rate": 1.7546628407460546e-07, "logits/chosen": -4.016690254211426, "logits/rejected": -4.137378692626953, "logps/chosen": -540.1869506835938, "logps/rejected": -384.2567443847656, "loss": 0.5053, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.46003857254981995, "rewards/margins": 0.587317168712616, "rewards/rejected": -0.1272786259651184, "step": 2650 }, { "epoch": 0.69, "learning_rate": 1.740315638450502e-07, "logits/chosen": -4.14896297454834, "logits/rejected": -4.022231101989746, "logps/chosen": -551.8126831054688, "logps/rejected": -429.8963317871094, "loss": 0.553, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5274641513824463, "rewards/margins": 0.4932268559932709, "rewards/rejected": 0.034237295389175415, "step": 2660 }, { "epoch": 0.69, "learning_rate": 1.72596843615495e-07, "logits/chosen": -3.9441299438476562, "logits/rejected": -3.7229416370391846, "logps/chosen": -541.1564331054688, "logps/rejected": -522.1112060546875, "loss": 0.6686, "rewards/accuracies": 0.5625, "rewards/chosen": 0.36212533712387085, "rewards/margins": 0.2563532590866089, "rewards/rejected": 0.10577203333377838, "step": 2670 }, { "epoch": 0.69, "learning_rate": 1.7116212338593974e-07, "logits/chosen": -4.236396312713623, "logits/rejected": -4.220719337463379, "logps/chosen": -498.52874755859375, "logps/rejected": -399.10699462890625, "loss": 0.6321, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4771362245082855, "rewards/margins": 0.3947621285915375, "rewards/rejected": 0.08237410336732864, "step": 2680 }, { "epoch": 0.69, "learning_rate": 1.6972740315638452e-07, "logits/chosen": -4.176735877990723, "logits/rejected": -3.9685966968536377, "logps/chosen": -628.45947265625, "logps/rejected": -414.69329833984375, "loss": 0.4936, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5496100783348083, "rewards/margins": 0.7379701733589172, "rewards/rejected": -0.1883600354194641, "step": 2690 }, { "epoch": 0.7, "learning_rate": 1.6829268292682927e-07, "logits/chosen": -4.031551361083984, "logits/rejected": -4.172730445861816, "logps/chosen": -528.1746215820312, "logps/rejected": -456.8675231933594, "loss": 0.585, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5029736757278442, "rewards/margins": 0.41340795159339905, "rewards/rejected": 0.08956580609083176, "step": 2700 }, { "epoch": 0.7, "eval_logits/chosen": -3.981973171234131, "eval_logits/rejected": -4.00735330581665, "eval_logps/chosen": -544.59619140625, "eval_logps/rejected": -438.3558044433594, "eval_loss": 0.5649946331977844, "eval_rewards/accuracies": 0.6940000057220459, "eval_rewards/chosen": 0.47803932428359985, "eval_rewards/margins": 0.5131266713142395, "eval_rewards/rejected": -0.03508726879954338, "eval_runtime": 146.3353, "eval_samples_per_second": 13.667, "eval_steps_per_second": 1.708, "step": 2700 }, { "epoch": 0.7, "learning_rate": 1.6685796269727405e-07, "logits/chosen": -4.223569393157959, "logits/rejected": -4.211024284362793, "logps/chosen": -586.1395874023438, "logps/rejected": -488.5260314941406, "loss": 0.5806, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5072857141494751, "rewards/margins": 0.45038923621177673, "rewards/rejected": 0.05689648538827896, "step": 2710 }, { "epoch": 0.7, "learning_rate": 1.654232424677188e-07, "logits/chosen": -4.059657573699951, "logits/rejected": -4.050175666809082, "logps/chosen": -636.249755859375, "logps/rejected": -445.1454162597656, "loss": 0.5708, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6326101422309875, "rewards/margins": 0.6772100925445557, "rewards/rejected": -0.04459994286298752, "step": 2720 }, { "epoch": 0.7, "learning_rate": 1.6398852223816355e-07, "logits/chosen": -3.965324878692627, "logits/rejected": -3.852470874786377, "logps/chosen": -587.070556640625, "logps/rejected": -468.05755615234375, "loss": 0.5195, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.6876250505447388, "rewards/margins": 0.6847888827323914, "rewards/rejected": 0.0028361976146698, "step": 2730 }, { "epoch": 0.71, "learning_rate": 1.6255380200860833e-07, "logits/chosen": -3.9872021675109863, "logits/rejected": -4.126004695892334, "logps/chosen": -575.1105346679688, "logps/rejected": -469.9864807128906, "loss": 0.6969, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.39717382192611694, "rewards/margins": 0.3177093267440796, "rewards/rejected": 0.07946449518203735, "step": 2740 }, { "epoch": 0.71, "learning_rate": 1.6111908177905308e-07, "logits/chosen": -4.012079238891602, "logits/rejected": -3.936004161834717, "logps/chosen": -597.6954345703125, "logps/rejected": -411.5677795410156, "loss": 0.6023, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4575771391391754, "rewards/margins": 0.4063941538333893, "rewards/rejected": 0.05118294805288315, "step": 2750 }, { "epoch": 0.71, "learning_rate": 1.5968436154949786e-07, "logits/chosen": -4.271051406860352, "logits/rejected": -4.011579990386963, "logps/chosen": -563.8511962890625, "logps/rejected": -387.9336853027344, "loss": 0.5445, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.46211299300193787, "rewards/margins": 0.5439087748527527, "rewards/rejected": -0.0817958191037178, "step": 2760 }, { "epoch": 0.72, "learning_rate": 1.582496413199426e-07, "logits/chosen": -4.078734397888184, "logits/rejected": -4.14528751373291, "logps/chosen": -748.7713623046875, "logps/rejected": -491.18206787109375, "loss": 0.5499, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.6352272033691406, "rewards/margins": 0.7783417701721191, "rewards/rejected": -0.14311453700065613, "step": 2770 }, { "epoch": 0.72, "learning_rate": 1.5681492109038739e-07, "logits/chosen": -4.074445724487305, "logits/rejected": -3.9905147552490234, "logps/chosen": -471.805908203125, "logps/rejected": -419.62109375, "loss": 0.5468, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5179725885391235, "rewards/margins": 0.5387195348739624, "rewards/rejected": -0.020746838301420212, "step": 2780 }, { "epoch": 0.72, "learning_rate": 1.553802008608321e-07, "logits/chosen": -4.071807384490967, "logits/rejected": -4.146918296813965, "logps/chosen": -561.8287353515625, "logps/rejected": -447.42034912109375, "loss": 0.5752, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5403738021850586, "rewards/margins": 0.5378284454345703, "rewards/rejected": 0.0025453567504882812, "step": 2790 }, { "epoch": 0.72, "learning_rate": 1.539454806312769e-07, "logits/chosen": -3.9499289989471436, "logits/rejected": -3.7599105834960938, "logps/chosen": -573.884765625, "logps/rejected": -480.6022033691406, "loss": 0.5883, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5067580938339233, "rewards/margins": 0.46255749464035034, "rewards/rejected": 0.044200599193573, "step": 2800 }, { "epoch": 0.72, "eval_logits/chosen": -3.966898202896118, "eval_logits/rejected": -3.9893743991851807, "eval_logps/chosen": -544.46240234375, "eval_logps/rejected": -438.1562194824219, "eval_loss": 0.5670157074928284, "eval_rewards/accuracies": 0.6880000233650208, "eval_rewards/chosen": 0.4914305508136749, "eval_rewards/margins": 0.5065579414367676, "eval_rewards/rejected": -0.015127355232834816, "eval_runtime": 146.024, "eval_samples_per_second": 13.696, "eval_steps_per_second": 1.712, "step": 2800 }, { "epoch": 0.73, "learning_rate": 1.5251076040172164e-07, "logits/chosen": -4.044391632080078, "logits/rejected": -4.076410293579102, "logps/chosen": -598.0468139648438, "logps/rejected": -476.67181396484375, "loss": 0.5806, "rewards/accuracies": 0.625, "rewards/chosen": 0.5654221773147583, "rewards/margins": 0.4769902229309082, "rewards/rejected": 0.0884319394826889, "step": 2810 }, { "epoch": 0.73, "learning_rate": 1.5107604017216642e-07, "logits/chosen": -3.93943452835083, "logits/rejected": -4.029221534729004, "logps/chosen": -531.6763916015625, "logps/rejected": -357.01910400390625, "loss": 0.5708, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5427955389022827, "rewards/margins": 0.6645749807357788, "rewards/rejected": -0.12177946418523788, "step": 2820 }, { "epoch": 0.73, "learning_rate": 1.4964131994261117e-07, "logits/chosen": -4.1592698097229, "logits/rejected": -4.196699142456055, "logps/chosen": -555.0905151367188, "logps/rejected": -400.6699523925781, "loss": 0.5251, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5090584754943848, "rewards/margins": 0.5514262318611145, "rewards/rejected": -0.04236777871847153, "step": 2830 }, { "epoch": 0.73, "learning_rate": 1.4820659971305595e-07, "logits/chosen": -3.996324062347412, "logits/rejected": -3.8942997455596924, "logps/chosen": -559.6539916992188, "logps/rejected": -462.087890625, "loss": 0.5617, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.48809123039245605, "rewards/margins": 0.41656923294067383, "rewards/rejected": 0.07152204215526581, "step": 2840 }, { "epoch": 0.74, "learning_rate": 1.467718794835007e-07, "logits/chosen": -3.6908695697784424, "logits/rejected": -3.810857057571411, "logps/chosen": -488.4864196777344, "logps/rejected": -417.60400390625, "loss": 0.5171, "rewards/accuracies": 0.75, "rewards/chosen": 0.4484861493110657, "rewards/margins": 0.4763699173927307, "rewards/rejected": -0.02788383699953556, "step": 2850 }, { "epoch": 0.74, "learning_rate": 1.4533715925394547e-07, "logits/chosen": -3.739753007888794, "logits/rejected": -3.9605700969696045, "logps/chosen": -507.19659423828125, "logps/rejected": -371.3736572265625, "loss": 0.5889, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5019505023956299, "rewards/margins": 0.4500049650669098, "rewards/rejected": 0.05194549635052681, "step": 2860 }, { "epoch": 0.74, "learning_rate": 1.4390243902439023e-07, "logits/chosen": -4.046762466430664, "logits/rejected": -4.0279541015625, "logps/chosen": -556.5122680664062, "logps/rejected": -335.6501770019531, "loss": 0.5436, "rewards/accuracies": 0.75, "rewards/chosen": 0.6151873469352722, "rewards/margins": 0.7387471795082092, "rewards/rejected": -0.12355981022119522, "step": 2870 }, { "epoch": 0.74, "learning_rate": 1.4246771879483498e-07, "logits/chosen": -3.7312331199645996, "logits/rejected": -3.662278413772583, "logps/chosen": -459.88525390625, "logps/rejected": -383.14984130859375, "loss": 0.582, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3872886300086975, "rewards/margins": 0.42437830567359924, "rewards/rejected": -0.03708968311548233, "step": 2880 }, { "epoch": 0.75, "learning_rate": 1.4103299856527975e-07, "logits/chosen": -4.147296905517578, "logits/rejected": -4.07787561416626, "logps/chosen": -554.8704833984375, "logps/rejected": -398.159912109375, "loss": 0.6256, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5256353616714478, "rewards/margins": 0.47513723373413086, "rewards/rejected": 0.05049814656376839, "step": 2890 }, { "epoch": 0.75, "learning_rate": 1.395982783357245e-07, "logits/chosen": -3.914976119995117, "logits/rejected": -3.905255079269409, "logps/chosen": -563.65576171875, "logps/rejected": -461.05389404296875, "loss": 0.624, "rewards/accuracies": 0.625, "rewards/chosen": 0.48630857467651367, "rewards/margins": 0.4222942888736725, "rewards/rejected": 0.0640142410993576, "step": 2900 }, { "epoch": 0.75, "eval_logits/chosen": -3.970454692840576, "eval_logits/rejected": -3.993534803390503, "eval_logps/chosen": -544.4996948242188, "eval_logps/rejected": -438.1958312988281, "eval_loss": 0.5662667155265808, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": 0.4876936674118042, "eval_rewards/margins": 0.5067842602729797, "eval_rewards/rejected": -0.019090561196208, "eval_runtime": 146.1468, "eval_samples_per_second": 13.685, "eval_steps_per_second": 1.711, "step": 2900 }, { "epoch": 0.75, "learning_rate": 1.3816355810616928e-07, "logits/chosen": -3.885633945465088, "logits/rejected": -3.992154598236084, "logps/chosen": -606.8893432617188, "logps/rejected": -488.0694885253906, "loss": 0.6481, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.506466269493103, "rewards/margins": 0.38042640686035156, "rewards/rejected": 0.12603983283042908, "step": 2910 }, { "epoch": 0.75, "learning_rate": 1.3672883787661404e-07, "logits/chosen": -3.9882044792175293, "logits/rejected": -4.012315273284912, "logps/chosen": -598.3148193359375, "logps/rejected": -423.91839599609375, "loss": 0.5414, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4659012258052826, "rewards/margins": 0.621524453163147, "rewards/rejected": -0.15562327206134796, "step": 2920 }, { "epoch": 0.76, "learning_rate": 1.352941176470588e-07, "logits/chosen": -4.175354957580566, "logits/rejected": -4.1613287925720215, "logps/chosen": -553.1173706054688, "logps/rejected": -457.8443298339844, "loss": 0.5357, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.49676817655563354, "rewards/margins": 0.5213097333908081, "rewards/rejected": -0.024541499093174934, "step": 2930 }, { "epoch": 0.76, "learning_rate": 1.3385939741750356e-07, "logits/chosen": -4.05168342590332, "logits/rejected": -4.1532673835754395, "logps/chosen": -515.8355712890625, "logps/rejected": -411.39788818359375, "loss": 0.5648, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4736366271972656, "rewards/margins": 0.5187323689460754, "rewards/rejected": -0.045095693320035934, "step": 2940 }, { "epoch": 0.76, "learning_rate": 1.3242467718794834e-07, "logits/chosen": -4.0590500831604, "logits/rejected": -4.028027534484863, "logps/chosen": -478.349853515625, "logps/rejected": -426.6962890625, "loss": 0.5647, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4894079566001892, "rewards/margins": 0.4928358197212219, "rewards/rejected": -0.0034278512466698885, "step": 2950 }, { "epoch": 0.76, "learning_rate": 1.309899569583931e-07, "logits/chosen": -3.9538333415985107, "logits/rejected": -4.088204383850098, "logps/chosen": -615.9334106445312, "logps/rejected": -433.25189208984375, "loss": 0.5954, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5048609972000122, "rewards/margins": 0.6196298599243164, "rewards/rejected": -0.11476895958185196, "step": 2960 }, { "epoch": 0.77, "learning_rate": 1.2955523672883787e-07, "logits/chosen": -3.891871690750122, "logits/rejected": -3.8232593536376953, "logps/chosen": -516.1154174804688, "logps/rejected": -465.57977294921875, "loss": 0.55, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6237145662307739, "rewards/margins": 0.559754490852356, "rewards/rejected": 0.06396011263132095, "step": 2970 }, { "epoch": 0.77, "learning_rate": 1.2812051649928262e-07, "logits/chosen": -4.138208866119385, "logits/rejected": -4.1710052490234375, "logps/chosen": -514.2755737304688, "logps/rejected": -385.1489562988281, "loss": 0.6169, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.442038357257843, "rewards/margins": 0.44703513383865356, "rewards/rejected": -0.004996694624423981, "step": 2980 }, { "epoch": 0.77, "learning_rate": 1.266857962697274e-07, "logits/chosen": -4.293547630310059, "logits/rejected": -4.3495774269104, "logps/chosen": -570.8736572265625, "logps/rejected": -520.841796875, "loss": 0.6195, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.49560147523880005, "rewards/margins": 0.48620933294296265, "rewards/rejected": 0.009392100386321545, "step": 2990 }, { "epoch": 0.77, "learning_rate": 1.2525107604017215e-07, "logits/chosen": -4.126075267791748, "logits/rejected": -3.8955910205841064, "logps/chosen": -650.3043212890625, "logps/rejected": -456.94110107421875, "loss": 0.5347, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.6632064580917358, "rewards/margins": 0.6959556341171265, "rewards/rejected": -0.032749250531196594, "step": 3000 }, { "epoch": 0.77, "eval_logits/chosen": -3.9776611328125, "eval_logits/rejected": -4.001935958862305, "eval_logps/chosen": -544.619873046875, "eval_logps/rejected": -438.3401184082031, "eval_loss": 0.564439594745636, "eval_rewards/accuracies": 0.6850000023841858, "eval_rewards/chosen": 0.4756743311882019, "eval_rewards/margins": 0.5091925859451294, "eval_rewards/rejected": -0.03351828455924988, "eval_runtime": 145.9011, "eval_samples_per_second": 13.708, "eval_steps_per_second": 1.713, "step": 3000 }, { "epoch": 0.78, "learning_rate": 1.2381635581061693e-07, "logits/chosen": -4.2549543380737305, "logits/rejected": -4.473557472229004, "logps/chosen": -614.374267578125, "logps/rejected": -472.72259521484375, "loss": 0.5698, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.49129754304885864, "rewards/margins": 0.42049235105514526, "rewards/rejected": 0.07080519199371338, "step": 3010 }, { "epoch": 0.78, "learning_rate": 1.2238163558106168e-07, "logits/chosen": -4.080137729644775, "logits/rejected": -4.035037040710449, "logps/chosen": -531.7971801757812, "logps/rejected": -427.37860107421875, "loss": 0.6113, "rewards/accuracies": 0.6875, "rewards/chosen": 0.28571632504463196, "rewards/margins": 0.4164826273918152, "rewards/rejected": -0.13076625764369965, "step": 3020 }, { "epoch": 0.78, "learning_rate": 1.2094691535150646e-07, "logits/chosen": -3.9666385650634766, "logits/rejected": -4.024598121643066, "logps/chosen": -485.36181640625, "logps/rejected": -329.271728515625, "loss": 0.5889, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.49098771810531616, "rewards/margins": 0.6027761697769165, "rewards/rejected": -0.11178841441869736, "step": 3030 }, { "epoch": 0.78, "learning_rate": 1.195121951219512e-07, "logits/chosen": -4.0698957443237305, "logits/rejected": -4.125982761383057, "logps/chosen": -544.8397827148438, "logps/rejected": -468.3692932128906, "loss": 0.5681, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5031709671020508, "rewards/margins": 0.5633045434951782, "rewards/rejected": -0.06013358756899834, "step": 3040 }, { "epoch": 0.79, "learning_rate": 1.1807747489239597e-07, "logits/chosen": -4.321501731872559, "logits/rejected": -4.136828899383545, "logps/chosen": -502.498779296875, "logps/rejected": -387.68841552734375, "loss": 0.6017, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.33200693130493164, "rewards/margins": 0.5042457580566406, "rewards/rejected": -0.17223885655403137, "step": 3050 }, { "epoch": 0.79, "learning_rate": 1.1664275466284074e-07, "logits/chosen": -4.194310188293457, "logits/rejected": -4.310281753540039, "logps/chosen": -580.3814086914062, "logps/rejected": -446.2391662597656, "loss": 0.5559, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.44598498940467834, "rewards/margins": 0.5190831422805786, "rewards/rejected": -0.07309817522764206, "step": 3060 }, { "epoch": 0.79, "learning_rate": 1.152080344332855e-07, "logits/chosen": -4.0883283615112305, "logits/rejected": -4.150923728942871, "logps/chosen": -554.2647094726562, "logps/rejected": -424.0943298339844, "loss": 0.5772, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.4851420521736145, "rewards/margins": 0.5894008874893188, "rewards/rejected": -0.10425883531570435, "step": 3070 }, { "epoch": 0.8, "learning_rate": 1.1377331420373027e-07, "logits/chosen": -4.141337871551514, "logits/rejected": -4.110450267791748, "logps/chosen": -580.8941040039062, "logps/rejected": -452.0682067871094, "loss": 0.5978, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5248143076896667, "rewards/margins": 0.5298766493797302, "rewards/rejected": -0.005062357988208532, "step": 3080 }, { "epoch": 0.8, "learning_rate": 1.1233859397417503e-07, "logits/chosen": -4.203800201416016, "logits/rejected": -4.3287224769592285, "logps/chosen": -611.932373046875, "logps/rejected": -462.0245056152344, "loss": 0.5423, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6605905890464783, "rewards/margins": 0.6125748753547668, "rewards/rejected": 0.04801566153764725, "step": 3090 }, { "epoch": 0.8, "learning_rate": 1.109038737446198e-07, "logits/chosen": -3.9451992511749268, "logits/rejected": -3.978661060333252, "logps/chosen": -548.4806518554688, "logps/rejected": -418.55255126953125, "loss": 0.5837, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4787958264350891, "rewards/margins": 0.4884239733219147, "rewards/rejected": -0.009628054685890675, "step": 3100 }, { "epoch": 0.8, "eval_logits/chosen": -3.9741623401641846, "eval_logits/rejected": -3.997610092163086, "eval_logps/chosen": -544.5935668945312, "eval_logps/rejected": -438.3072509765625, "eval_loss": 0.563690721988678, "eval_rewards/accuracies": 0.6830000281333923, "eval_rewards/chosen": 0.47830715775489807, "eval_rewards/margins": 0.5085403323173523, "eval_rewards/rejected": -0.03023313544690609, "eval_runtime": 145.8972, "eval_samples_per_second": 13.708, "eval_steps_per_second": 1.714, "step": 3100 }, { "epoch": 0.8, "learning_rate": 1.0946915351506456e-07, "logits/chosen": -3.9184958934783936, "logits/rejected": -3.7819457054138184, "logps/chosen": -571.571533203125, "logps/rejected": -445.2757263183594, "loss": 0.5715, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.40955060720443726, "rewards/margins": 0.4587160050868988, "rewards/rejected": -0.049165401607751846, "step": 3110 }, { "epoch": 0.81, "learning_rate": 1.0803443328550932e-07, "logits/chosen": -3.905442476272583, "logits/rejected": -3.8821640014648438, "logps/chosen": -460.50372314453125, "logps/rejected": -420.4170837402344, "loss": 0.5573, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.32348185777664185, "rewards/margins": 0.37588781118392944, "rewards/rejected": -0.05240591615438461, "step": 3120 }, { "epoch": 0.81, "learning_rate": 1.0659971305595408e-07, "logits/chosen": -3.8710105419158936, "logits/rejected": -3.9780330657958984, "logps/chosen": -492.71075439453125, "logps/rejected": -370.08160400390625, "loss": 0.5272, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.4087301790714264, "rewards/margins": 0.49260735511779785, "rewards/rejected": -0.08387719094753265, "step": 3130 }, { "epoch": 0.81, "learning_rate": 1.0516499282639884e-07, "logits/chosen": -4.243491172790527, "logits/rejected": -4.153388977050781, "logps/chosen": -496.98260498046875, "logps/rejected": -367.54632568359375, "loss": 0.5874, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.44594526290893555, "rewards/margins": 0.5719509720802307, "rewards/rejected": -0.12600573897361755, "step": 3140 }, { "epoch": 0.81, "learning_rate": 1.037302725968436e-07, "logits/chosen": -3.9939746856689453, "logits/rejected": -4.004325866699219, "logps/chosen": -478.77056884765625, "logps/rejected": -464.50665283203125, "loss": 0.6232, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.35015568137168884, "rewards/margins": 0.37633177638053894, "rewards/rejected": -0.02617608569562435, "step": 3150 }, { "epoch": 0.82, "learning_rate": 1.0229555236728837e-07, "logits/chosen": -4.253720283508301, "logits/rejected": -4.267764568328857, "logps/chosen": -557.8858642578125, "logps/rejected": -412.69818115234375, "loss": 0.5698, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4118489623069763, "rewards/margins": 0.5684719085693359, "rewards/rejected": -0.15662303566932678, "step": 3160 }, { "epoch": 0.82, "learning_rate": 1.0086083213773313e-07, "logits/chosen": -3.7371535301208496, "logits/rejected": -3.7913818359375, "logps/chosen": -562.0777587890625, "logps/rejected": -491.91143798828125, "loss": 0.5108, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5288134813308716, "rewards/margins": 0.5345078706741333, "rewards/rejected": -0.005694452673196793, "step": 3170 }, { "epoch": 0.82, "learning_rate": 9.94261119081779e-08, "logits/chosen": -3.9377448558807373, "logits/rejected": -3.955479383468628, "logps/chosen": -502.92718505859375, "logps/rejected": -408.81976318359375, "loss": 0.5617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5690404772758484, "rewards/margins": 0.502609372138977, "rewards/rejected": 0.06643114238977432, "step": 3180 }, { "epoch": 0.82, "learning_rate": 9.799139167862266e-08, "logits/chosen": -4.40088415145874, "logits/rejected": -4.40977144241333, "logps/chosen": -582.387451171875, "logps/rejected": -508.1477966308594, "loss": 0.523, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.5947442650794983, "rewards/margins": 0.6568835377693176, "rewards/rejected": -0.06213930994272232, "step": 3190 }, { "epoch": 0.83, "learning_rate": 9.655667144906743e-08, "logits/chosen": -4.194244861602783, "logits/rejected": -4.144165992736816, "logps/chosen": -582.037109375, "logps/rejected": -409.24224853515625, "loss": 0.5293, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5745357275009155, "rewards/margins": 0.728204071521759, "rewards/rejected": -0.15366844832897186, "step": 3200 }, { "epoch": 0.83, "eval_logits/chosen": -3.9778196811676025, "eval_logits/rejected": -4.002331733703613, "eval_logps/chosen": -544.66162109375, "eval_logps/rejected": -438.367919921875, "eval_loss": 0.5634328126907349, "eval_rewards/accuracies": 0.6890000104904175, "eval_rewards/chosen": 0.47150418162345886, "eval_rewards/margins": 0.5078018307685852, "eval_rewards/rejected": -0.03629762679338455, "eval_runtime": 146.8004, "eval_samples_per_second": 13.624, "eval_steps_per_second": 1.703, "step": 3200 }, { "epoch": 0.83, "learning_rate": 9.512195121951219e-08, "logits/chosen": -4.067798614501953, "logits/rejected": -4.208149433135986, "logps/chosen": -511.45013427734375, "logps/rejected": -376.13067626953125, "loss": 0.5854, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4113733768463135, "rewards/margins": 0.48462361097335815, "rewards/rejected": -0.07325027137994766, "step": 3210 }, { "epoch": 0.83, "learning_rate": 9.368723098995696e-08, "logits/chosen": -4.2905426025390625, "logits/rejected": -4.173062801361084, "logps/chosen": -573.29296875, "logps/rejected": -427.76739501953125, "loss": 0.5641, "rewards/accuracies": 0.6875, "rewards/chosen": 0.48305606842041016, "rewards/margins": 0.5324376225471497, "rewards/rejected": -0.049381546676158905, "step": 3220 }, { "epoch": 0.83, "learning_rate": 9.225251076040172e-08, "logits/chosen": -3.8998687267303467, "logits/rejected": -3.798374891281128, "logps/chosen": -553.4527587890625, "logps/rejected": -378.1935119628906, "loss": 0.5525, "rewards/accuracies": 0.75, "rewards/chosen": 0.4461655020713806, "rewards/margins": 0.6628150343894958, "rewards/rejected": -0.21664953231811523, "step": 3230 }, { "epoch": 0.84, "learning_rate": 9.081779053084649e-08, "logits/chosen": -4.098966121673584, "logits/rejected": -4.07062292098999, "logps/chosen": -563.8943481445312, "logps/rejected": -444.6372985839844, "loss": 0.5853, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5742425918579102, "rewards/margins": 0.5752390027046204, "rewards/rejected": -0.0009963444899767637, "step": 3240 }, { "epoch": 0.84, "learning_rate": 8.938307030129125e-08, "logits/chosen": -4.253169059753418, "logits/rejected": -4.21138858795166, "logps/chosen": -535.7103271484375, "logps/rejected": -373.97003173828125, "loss": 0.5755, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3600585460662842, "rewards/margins": 0.45460644364356995, "rewards/rejected": -0.09454789757728577, "step": 3250 }, { "epoch": 0.84, "learning_rate": 8.794835007173601e-08, "logits/chosen": -3.9807746410369873, "logits/rejected": -3.941415309906006, "logps/chosen": -572.0859985351562, "logps/rejected": -468.2994079589844, "loss": 0.58, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5067304968833923, "rewards/margins": 0.50401771068573, "rewards/rejected": 0.0027127789799124002, "step": 3260 }, { "epoch": 0.84, "learning_rate": 8.651362984218078e-08, "logits/chosen": -4.293785095214844, "logits/rejected": -4.3718976974487305, "logps/chosen": -584.475830078125, "logps/rejected": -409.7781677246094, "loss": 0.5662, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3756251931190491, "rewards/margins": 0.5408438444137573, "rewards/rejected": -0.16521869599819183, "step": 3270 }, { "epoch": 0.85, "learning_rate": 8.507890961262554e-08, "logits/chosen": -3.9542198181152344, "logits/rejected": -3.9630751609802246, "logps/chosen": -513.8878173828125, "logps/rejected": -526.8805541992188, "loss": 0.5605, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5460586547851562, "rewards/margins": 0.4736880362033844, "rewards/rejected": 0.07237061113119125, "step": 3280 }, { "epoch": 0.85, "learning_rate": 8.364418938307031e-08, "logits/chosen": -3.925539493560791, "logits/rejected": -3.744020462036133, "logps/chosen": -536.7335205078125, "logps/rejected": -374.2388610839844, "loss": 0.5676, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4619140028953552, "rewards/margins": 0.5450933575630188, "rewards/rejected": -0.08317933976650238, "step": 3290 }, { "epoch": 0.85, "learning_rate": 8.220946915351506e-08, "logits/chosen": -3.950735569000244, "logits/rejected": -3.9631354808807373, "logps/chosen": -522.4519653320312, "logps/rejected": -515.7288208007812, "loss": 0.5128, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4143013060092926, "rewards/margins": 0.5980950593948364, "rewards/rejected": -0.1837938129901886, "step": 3300 }, { "epoch": 0.85, "eval_logits/chosen": -3.9803950786590576, "eval_logits/rejected": -4.005295753479004, "eval_logps/chosen": -544.6318969726562, "eval_logps/rejected": -438.39166259765625, "eval_loss": 0.5619609355926514, "eval_rewards/accuracies": 0.6880000233650208, "eval_rewards/chosen": 0.474471777677536, "eval_rewards/margins": 0.5131421089172363, "eval_rewards/rejected": -0.038670338690280914, "eval_runtime": 147.8686, "eval_samples_per_second": 13.526, "eval_steps_per_second": 1.691, "step": 3300 }, { "epoch": 0.85, "learning_rate": 8.077474892395982e-08, "logits/chosen": -4.0901780128479, "logits/rejected": -4.084465980529785, "logps/chosen": -538.7194213867188, "logps/rejected": -432.45892333984375, "loss": 0.5549, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.45796340703964233, "rewards/margins": 0.44825053215026855, "rewards/rejected": 0.009712914004921913, "step": 3310 }, { "epoch": 0.86, "learning_rate": 7.934002869440459e-08, "logits/chosen": -3.90226411819458, "logits/rejected": -4.059938907623291, "logps/chosen": -635.9435424804688, "logps/rejected": -362.847412109375, "loss": 0.5811, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.3464578092098236, "rewards/margins": 0.35814762115478516, "rewards/rejected": -0.011689816601574421, "step": 3320 }, { "epoch": 0.86, "learning_rate": 7.790530846484935e-08, "logits/chosen": -4.0388689041137695, "logits/rejected": -3.9921679496765137, "logps/chosen": -555.1847534179688, "logps/rejected": -400.4381408691406, "loss": 0.5537, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.46570801734924316, "rewards/margins": 0.5739080309867859, "rewards/rejected": -0.1082000583410263, "step": 3330 }, { "epoch": 0.86, "learning_rate": 7.647058823529412e-08, "logits/chosen": -4.057839870452881, "logits/rejected": -3.962494373321533, "logps/chosen": -633.3861083984375, "logps/rejected": -521.1183471679688, "loss": 0.569, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.44552069902420044, "rewards/margins": 0.42656344175338745, "rewards/rejected": 0.01895725727081299, "step": 3340 }, { "epoch": 0.86, "learning_rate": 7.503586800573888e-08, "logits/chosen": -4.196125507354736, "logits/rejected": -4.071255683898926, "logps/chosen": -488.28460693359375, "logps/rejected": -371.49169921875, "loss": 0.5918, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39670076966285706, "rewards/margins": 0.3868991434574127, "rewards/rejected": 0.009801648557186127, "step": 3350 }, { "epoch": 0.87, "learning_rate": 7.360114777618365e-08, "logits/chosen": -3.972092390060425, "logits/rejected": -4.106286525726318, "logps/chosen": -572.9547119140625, "logps/rejected": -420.35400390625, "loss": 0.5315, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.49660277366638184, "rewards/margins": 0.5567241311073303, "rewards/rejected": -0.06012127920985222, "step": 3360 }, { "epoch": 0.87, "learning_rate": 7.21664275466284e-08, "logits/chosen": -3.9770302772521973, "logits/rejected": -4.03969669342041, "logps/chosen": -547.3425903320312, "logps/rejected": -495.91851806640625, "loss": 0.604, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.48121723532676697, "rewards/margins": 0.4144704341888428, "rewards/rejected": 0.06674680858850479, "step": 3370 }, { "epoch": 0.87, "learning_rate": 7.073170731707316e-08, "logits/chosen": -4.059111595153809, "logits/rejected": -4.129426002502441, "logps/chosen": -564.0946655273438, "logps/rejected": -506.58709716796875, "loss": 0.549, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.40859222412109375, "rewards/margins": 0.3801480233669281, "rewards/rejected": 0.028444204479455948, "step": 3380 }, { "epoch": 0.88, "learning_rate": 6.929698708751793e-08, "logits/chosen": -4.225644111633301, "logits/rejected": -4.292551040649414, "logps/chosen": -588.9600830078125, "logps/rejected": -472.1261291503906, "loss": 0.5453, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4812944531440735, "rewards/margins": 0.5918601155281067, "rewards/rejected": -0.11056558787822723, "step": 3390 }, { "epoch": 0.88, "learning_rate": 6.786226685796269e-08, "logits/chosen": -4.155394077301025, "logits/rejected": -4.14565896987915, "logps/chosen": -525.122314453125, "logps/rejected": -392.67376708984375, "loss": 0.6204, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.31629544496536255, "rewards/margins": 0.37368619441986084, "rewards/rejected": -0.0573907308280468, "step": 3400 }, { "epoch": 0.88, "eval_logits/chosen": -3.9814672470092773, "eval_logits/rejected": -4.006735324859619, "eval_logps/chosen": -544.69775390625, "eval_logps/rejected": -438.4468688964844, "eval_loss": 0.5624998211860657, "eval_rewards/accuracies": 0.6859999895095825, "eval_rewards/chosen": 0.46788930892944336, "eval_rewards/margins": 0.5120863914489746, "eval_rewards/rejected": -0.044197000563144684, "eval_runtime": 147.1881, "eval_samples_per_second": 13.588, "eval_steps_per_second": 1.699, "step": 3400 }, { "epoch": 0.88, "learning_rate": 6.642754662840746e-08, "logits/chosen": -4.196806907653809, "logits/rejected": -4.363795280456543, "logps/chosen": -574.8983154296875, "logps/rejected": -502.860107421875, "loss": 0.5918, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.49528592824935913, "rewards/margins": 0.5257723927497864, "rewards/rejected": -0.03048643469810486, "step": 3410 }, { "epoch": 0.88, "learning_rate": 6.499282639885222e-08, "logits/chosen": -3.957125186920166, "logits/rejected": -3.888190507888794, "logps/chosen": -547.2122192382812, "logps/rejected": -404.66595458984375, "loss": 0.5457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4918132722377777, "rewards/margins": 0.5843526124954224, "rewards/rejected": -0.09253935515880585, "step": 3420 }, { "epoch": 0.89, "learning_rate": 6.355810616929698e-08, "logits/chosen": -3.979330539703369, "logits/rejected": -4.06231164932251, "logps/chosen": -524.4691772460938, "logps/rejected": -422.9524841308594, "loss": 0.5182, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4810391962528229, "rewards/margins": 0.5485955476760864, "rewards/rejected": -0.06755636632442474, "step": 3430 }, { "epoch": 0.89, "learning_rate": 6.212338593974175e-08, "logits/chosen": -4.29758882522583, "logits/rejected": -4.182176113128662, "logps/chosen": -595.9281616210938, "logps/rejected": -425.3091735839844, "loss": 0.5189, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5549203157424927, "rewards/margins": 0.5970374941825867, "rewards/rejected": -0.04211718589067459, "step": 3440 }, { "epoch": 0.89, "learning_rate": 6.068866571018651e-08, "logits/chosen": -4.2188720703125, "logits/rejected": -4.125060081481934, "logps/chosen": -521.8778076171875, "logps/rejected": -426.075439453125, "loss": 0.6504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3750000596046448, "rewards/margins": 0.34474819898605347, "rewards/rejected": 0.03025185689330101, "step": 3450 }, { "epoch": 0.89, "learning_rate": 5.925394548063128e-08, "logits/chosen": -4.250518321990967, "logits/rejected": -4.323083400726318, "logps/chosen": -596.1790161132812, "logps/rejected": -468.62078857421875, "loss": 0.628, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.49697431921958923, "rewards/margins": 0.5001575350761414, "rewards/rejected": -0.00318324426189065, "step": 3460 }, { "epoch": 0.9, "learning_rate": 5.7819225251076036e-08, "logits/chosen": -4.112654685974121, "logits/rejected": -3.997405529022217, "logps/chosen": -498.8377990722656, "logps/rejected": -473.70550537109375, "loss": 0.5889, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3584319055080414, "rewards/margins": 0.3437018394470215, "rewards/rejected": 0.014730053022503853, "step": 3470 }, { "epoch": 0.9, "learning_rate": 5.63845050215208e-08, "logits/chosen": -4.2403950691223145, "logits/rejected": -4.099762916564941, "logps/chosen": -572.9876098632812, "logps/rejected": -437.2333984375, "loss": 0.565, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.43449336290359497, "rewards/margins": 0.5010371804237366, "rewards/rejected": -0.06654379516839981, "step": 3480 }, { "epoch": 0.9, "learning_rate": 5.4949784791965565e-08, "logits/chosen": -4.034601211547852, "logits/rejected": -4.012211322784424, "logps/chosen": -535.9708862304688, "logps/rejected": -390.2044372558594, "loss": 0.5451, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5869877338409424, "rewards/margins": 0.674468994140625, "rewards/rejected": -0.087481290102005, "step": 3490 }, { "epoch": 0.9, "learning_rate": 5.351506456241032e-08, "logits/chosen": -4.010631561279297, "logits/rejected": -3.996525526046753, "logps/chosen": -550.5261840820312, "logps/rejected": -470.1412658691406, "loss": 0.5469, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5485318899154663, "rewards/margins": 0.49418431520462036, "rewards/rejected": 0.054347604513168335, "step": 3500 }, { "epoch": 0.9, "eval_logits/chosen": -3.98427677154541, "eval_logits/rejected": -4.009834289550781, "eval_logps/chosen": -544.7650756835938, "eval_logps/rejected": -438.4956359863281, "eval_loss": 0.5618208050727844, "eval_rewards/accuracies": 0.6859999895095825, "eval_rewards/chosen": 0.4611594080924988, "eval_rewards/margins": 0.5102306604385376, "eval_rewards/rejected": -0.049071334302425385, "eval_runtime": 146.3352, "eval_samples_per_second": 13.667, "eval_steps_per_second": 1.708, "step": 3500 }, { "epoch": 0.91, "learning_rate": 5.208034433285509e-08, "logits/chosen": -3.8921310901641846, "logits/rejected": -3.848719358444214, "logps/chosen": -647.0032958984375, "logps/rejected": -568.3533935546875, "loss": 0.5397, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5782560110092163, "rewards/margins": 0.5668593645095825, "rewards/rejected": 0.011396640911698341, "step": 3510 }, { "epoch": 0.91, "learning_rate": 5.064562410329985e-08, "logits/chosen": -4.195284843444824, "logits/rejected": -4.076591491699219, "logps/chosen": -530.7032470703125, "logps/rejected": -445.4291076660156, "loss": 0.5744, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.45487624406814575, "rewards/margins": 0.4947517514228821, "rewards/rejected": -0.03987548500299454, "step": 3520 }, { "epoch": 0.91, "learning_rate": 4.9210903873744616e-08, "logits/chosen": -4.124705791473389, "logits/rejected": -4.10734748840332, "logps/chosen": -541.8250122070312, "logps/rejected": -430.775146484375, "loss": 0.6309, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.4992215633392334, "rewards/margins": 0.41505131125450134, "rewards/rejected": 0.08417025953531265, "step": 3530 }, { "epoch": 0.91, "learning_rate": 4.777618364418938e-08, "logits/chosen": -3.803776502609253, "logits/rejected": -3.8794872760772705, "logps/chosen": -551.4606323242188, "logps/rejected": -393.61907958984375, "loss": 0.5847, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3885660171508789, "rewards/margins": 0.4686763882637024, "rewards/rejected": -0.08011035621166229, "step": 3540 }, { "epoch": 0.92, "learning_rate": 4.6341463414634145e-08, "logits/chosen": -4.222638130187988, "logits/rejected": -4.128180027008057, "logps/chosen": -616.28515625, "logps/rejected": -464.65692138671875, "loss": 0.6212, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5344708561897278, "rewards/margins": 0.4752708971500397, "rewards/rejected": 0.059200018644332886, "step": 3550 }, { "epoch": 0.92, "learning_rate": 4.490674318507891e-08, "logits/chosen": -4.027795314788818, "logits/rejected": -3.9686641693115234, "logps/chosen": -548.213134765625, "logps/rejected": -391.7416076660156, "loss": 0.6047, "rewards/accuracies": 0.625, "rewards/chosen": 0.4162333607673645, "rewards/margins": 0.338506817817688, "rewards/rejected": 0.07772652804851532, "step": 3560 }, { "epoch": 0.92, "learning_rate": 4.3472022955523674e-08, "logits/chosen": -4.117037773132324, "logits/rejected": -4.153426647186279, "logps/chosen": -632.22607421875, "logps/rejected": -593.6041870117188, "loss": 0.627, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.40994158387184143, "rewards/margins": 0.3304620385169983, "rewards/rejected": 0.07947959750890732, "step": 3570 }, { "epoch": 0.92, "learning_rate": 4.203730272596843e-08, "logits/chosen": -3.860865831375122, "logits/rejected": -3.7739486694335938, "logps/chosen": -516.7479248046875, "logps/rejected": -392.3046569824219, "loss": 0.6383, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.31629177927970886, "rewards/margins": 0.39345088601112366, "rewards/rejected": -0.07715904712677002, "step": 3580 }, { "epoch": 0.93, "learning_rate": 4.0602582496413197e-08, "logits/chosen": -3.8026537895202637, "logits/rejected": -3.838305711746216, "logps/chosen": -599.1148681640625, "logps/rejected": -478.2513122558594, "loss": 0.5632, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4942142963409424, "rewards/margins": 0.4958348274230957, "rewards/rejected": -0.001620540046133101, "step": 3590 }, { "epoch": 0.93, "learning_rate": 3.916786226685796e-08, "logits/chosen": -3.803657054901123, "logits/rejected": -3.809593677520752, "logps/chosen": -450.1053161621094, "logps/rejected": -387.50457763671875, "loss": 0.5807, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.4047510623931885, "rewards/margins": 0.3800794184207916, "rewards/rejected": 0.0246716421097517, "step": 3600 }, { "epoch": 0.93, "eval_logits/chosen": -3.9818077087402344, "eval_logits/rejected": -4.006768703460693, "eval_logps/chosen": -544.7014770507812, "eval_logps/rejected": -438.45843505859375, "eval_loss": 0.561528742313385, "eval_rewards/accuracies": 0.6890000104904175, "eval_rewards/chosen": 0.46752142906188965, "eval_rewards/margins": 0.5128761529922485, "eval_rewards/rejected": -0.04535466805100441, "eval_runtime": 146.6531, "eval_samples_per_second": 13.638, "eval_steps_per_second": 1.705, "step": 3600 }, { "epoch": 0.93, "learning_rate": 3.7733142037302726e-08, "logits/chosen": -4.237338066101074, "logits/rejected": -4.075575828552246, "logps/chosen": -507.56475830078125, "logps/rejected": -460.54864501953125, "loss": 0.5626, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3922446668148041, "rewards/margins": 0.3932397663593292, "rewards/rejected": -0.000995102571323514, "step": 3610 }, { "epoch": 0.93, "learning_rate": 3.629842180774749e-08, "logits/chosen": -4.196072578430176, "logits/rejected": -4.2456374168396, "logps/chosen": -561.850830078125, "logps/rejected": -485.9574279785156, "loss": 0.5872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.45551833510398865, "rewards/margins": 0.5499417185783386, "rewards/rejected": -0.09442339837551117, "step": 3620 }, { "epoch": 0.94, "learning_rate": 3.4863701578192255e-08, "logits/chosen": -4.031411170959473, "logits/rejected": -4.024487495422363, "logps/chosen": -486.8564453125, "logps/rejected": -412.80712890625, "loss": 0.5609, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.395702064037323, "rewards/margins": 0.37622708082199097, "rewards/rejected": 0.01947496458888054, "step": 3630 }, { "epoch": 0.94, "learning_rate": 3.342898134863702e-08, "logits/chosen": -4.104000091552734, "logits/rejected": -4.140475273132324, "logps/chosen": -629.5847778320312, "logps/rejected": -438.322021484375, "loss": 0.5127, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6019877195358276, "rewards/margins": 0.7443768382072449, "rewards/rejected": -0.1423892229795456, "step": 3640 }, { "epoch": 0.94, "learning_rate": 3.1994261119081784e-08, "logits/chosen": -4.0185065269470215, "logits/rejected": -3.976111888885498, "logps/chosen": -567.458740234375, "logps/rejected": -433.2574157714844, "loss": 0.5677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.40292826294898987, "rewards/margins": 0.45192545652389526, "rewards/rejected": -0.048997145146131516, "step": 3650 }, { "epoch": 0.95, "learning_rate": 3.055954088952654e-08, "logits/chosen": -3.958566665649414, "logits/rejected": -4.042834758758545, "logps/chosen": -507.0655212402344, "logps/rejected": -423.1856384277344, "loss": 0.5869, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4078023433685303, "rewards/margins": 0.4608895778656006, "rewards/rejected": -0.05308721214532852, "step": 3660 }, { "epoch": 0.95, "learning_rate": 2.9124820659971306e-08, "logits/chosen": -4.479077339172363, "logits/rejected": -4.497659683227539, "logps/chosen": -593.07666015625, "logps/rejected": -471.2254943847656, "loss": 0.5922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5073246359825134, "rewards/margins": 0.5663779973983765, "rewards/rejected": -0.059053339064121246, "step": 3670 }, { "epoch": 0.95, "learning_rate": 2.7690100430416067e-08, "logits/chosen": -4.271915912628174, "logits/rejected": -4.254021644592285, "logps/chosen": -458.9129943847656, "logps/rejected": -393.89862060546875, "loss": 0.5112, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.3906114995479584, "rewards/margins": 0.6087072491645813, "rewards/rejected": -0.2180958241224289, "step": 3680 }, { "epoch": 0.95, "learning_rate": 2.625538020086083e-08, "logits/chosen": -4.178504943847656, "logits/rejected": -4.220212459564209, "logps/chosen": -584.3404541015625, "logps/rejected": -444.0389709472656, "loss": 0.5676, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4277767241001129, "rewards/margins": 0.5100394487380981, "rewards/rejected": -0.08226276189088821, "step": 3690 }, { "epoch": 0.96, "learning_rate": 2.4820659971305596e-08, "logits/chosen": -4.02055549621582, "logits/rejected": -4.129875183105469, "logps/chosen": -557.54345703125, "logps/rejected": -435.833984375, "loss": 0.5265, "rewards/accuracies": 0.6875, "rewards/chosen": 0.48887911438941956, "rewards/margins": 0.5414212942123413, "rewards/rejected": -0.05254218727350235, "step": 3700 }, { "epoch": 0.96, "eval_logits/chosen": -3.9832568168640137, "eval_logits/rejected": -4.008208274841309, "eval_logps/chosen": -544.701904296875, "eval_logps/rejected": -438.4403381347656, "eval_loss": 0.5619760751724243, "eval_rewards/accuracies": 0.6880000233650208, "eval_rewards/chosen": 0.46747326850891113, "eval_rewards/margins": 0.5110137462615967, "eval_rewards/rejected": -0.04354046657681465, "eval_runtime": 147.499, "eval_samples_per_second": 13.559, "eval_steps_per_second": 1.695, "step": 3700 }, { "epoch": 0.96, "learning_rate": 2.3385939741750357e-08, "logits/chosen": -4.32746696472168, "logits/rejected": -4.259668827056885, "logps/chosen": -622.6856079101562, "logps/rejected": -556.1048583984375, "loss": 0.5794, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5587274432182312, "rewards/margins": 0.6079638600349426, "rewards/rejected": -0.049236398190259933, "step": 3710 }, { "epoch": 0.96, "learning_rate": 2.195121951219512e-08, "logits/chosen": -3.798098087310791, "logits/rejected": -4.027928829193115, "logps/chosen": -534.12646484375, "logps/rejected": -370.48907470703125, "loss": 0.5292, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5056679844856262, "rewards/margins": 0.6213759183883667, "rewards/rejected": -0.11570799350738525, "step": 3720 }, { "epoch": 0.96, "learning_rate": 2.0516499282639883e-08, "logits/chosen": -4.053610801696777, "logits/rejected": -4.082070350646973, "logps/chosen": -639.72998046875, "logps/rejected": -456.34844970703125, "loss": 0.5726, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5697764158248901, "rewards/margins": 0.5703621506690979, "rewards/rejected": -0.0005857095238752663, "step": 3730 }, { "epoch": 0.97, "learning_rate": 1.9081779053084647e-08, "logits/chosen": -3.877751111984253, "logits/rejected": -3.9127018451690674, "logps/chosen": -583.1852416992188, "logps/rejected": -401.8482971191406, "loss": 0.5216, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5018226504325867, "rewards/margins": 0.5718634724617004, "rewards/rejected": -0.07004072517156601, "step": 3740 }, { "epoch": 0.97, "learning_rate": 1.7647058823529412e-08, "logits/chosen": -4.168404579162598, "logits/rejected": -4.074638366699219, "logps/chosen": -510.4005432128906, "logps/rejected": -415.7406311035156, "loss": 0.5916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4106532037258148, "rewards/margins": 0.4389967918395996, "rewards/rejected": -0.028343593701720238, "step": 3750 }, { "epoch": 0.97, "learning_rate": 1.6212338593974173e-08, "logits/chosen": -4.087225914001465, "logits/rejected": -3.9666972160339355, "logps/chosen": -465.10986328125, "logps/rejected": -324.4447021484375, "loss": 0.5872, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3475942015647888, "rewards/margins": 0.424374520778656, "rewards/rejected": -0.07678033411502838, "step": 3760 }, { "epoch": 0.97, "learning_rate": 1.4777618364418938e-08, "logits/chosen": -4.230082035064697, "logits/rejected": -4.243712902069092, "logps/chosen": -635.1439208984375, "logps/rejected": -543.9008178710938, "loss": 0.5959, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6082009077072144, "rewards/margins": 0.42760133743286133, "rewards/rejected": 0.1805996149778366, "step": 3770 }, { "epoch": 0.98, "learning_rate": 1.3342898134863702e-08, "logits/chosen": -4.096220970153809, "logits/rejected": -4.115006446838379, "logps/chosen": -577.7149658203125, "logps/rejected": -440.178955078125, "loss": 0.6036, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4464997351169586, "rewards/margins": 0.4255821108818054, "rewards/rejected": 0.020917650312185287, "step": 3780 }, { "epoch": 0.98, "learning_rate": 1.1908177905308463e-08, "logits/chosen": -3.8991074562072754, "logits/rejected": -3.802464246749878, "logps/chosen": -519.2384643554688, "logps/rejected": -435.0231018066406, "loss": 0.5509, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.44993266463279724, "rewards/margins": 0.5668569207191467, "rewards/rejected": -0.1169242262840271, "step": 3790 }, { "epoch": 0.98, "learning_rate": 1.0473457675753228e-08, "logits/chosen": -4.110097885131836, "logits/rejected": -4.070154190063477, "logps/chosen": -539.1266479492188, "logps/rejected": -404.272216796875, "loss": 0.5484, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5660614371299744, "rewards/margins": 0.6612092852592468, "rewards/rejected": -0.0951477512717247, "step": 3800 }, { "epoch": 0.98, "eval_logits/chosen": -3.9850714206695557, "eval_logits/rejected": -4.010331630706787, "eval_logps/chosen": -544.69189453125, "eval_logps/rejected": -438.4535827636719, "eval_loss": 0.5614883899688721, "eval_rewards/accuracies": 0.6930000185966492, "eval_rewards/chosen": 0.46847668290138245, "eval_rewards/margins": 0.5133422613143921, "eval_rewards/rejected": -0.044865623116493225, "eval_runtime": 146.7356, "eval_samples_per_second": 13.63, "eval_steps_per_second": 1.704, "step": 3800 }, { "epoch": 0.98, "learning_rate": 9.03873744619799e-09, "logits/chosen": -4.123238563537598, "logits/rejected": -4.03770112991333, "logps/chosen": -511.1904296875, "logps/rejected": -446.9549865722656, "loss": 0.5468, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.46543973684310913, "rewards/margins": 0.4664246439933777, "rewards/rejected": -0.0009849362540990114, "step": 3810 }, { "epoch": 0.99, "learning_rate": 7.604017216642753e-09, "logits/chosen": -4.272444725036621, "logits/rejected": -4.183244705200195, "logps/chosen": -505.7000427246094, "logps/rejected": -433.4576721191406, "loss": 0.5317, "rewards/accuracies": 0.75, "rewards/chosen": 0.5172845721244812, "rewards/margins": 0.5496169328689575, "rewards/rejected": -0.03233236074447632, "step": 3820 }, { "epoch": 0.99, "learning_rate": 6.169296987087518e-09, "logits/chosen": -4.149974346160889, "logits/rejected": -4.188473701477051, "logps/chosen": -587.9978637695312, "logps/rejected": -437.9640197753906, "loss": 0.557, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5558081269264221, "rewards/margins": 0.5474778413772583, "rewards/rejected": 0.008330265991389751, "step": 3830 }, { "epoch": 0.99, "learning_rate": 4.734576757532282e-09, "logits/chosen": -4.119086265563965, "logits/rejected": -4.145096778869629, "logps/chosen": -511.7611389160156, "logps/rejected": -394.4190368652344, "loss": 0.5667, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.42686066031455994, "rewards/margins": 0.49367666244506836, "rewards/rejected": -0.06681600958108902, "step": 3840 }, { "epoch": 0.99, "learning_rate": 3.299856527977044e-09, "logits/chosen": -3.9760982990264893, "logits/rejected": -4.040436267852783, "logps/chosen": -612.0133056640625, "logps/rejected": -519.7421875, "loss": 0.5988, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.49833711981773376, "rewards/margins": 0.5593429803848267, "rewards/rejected": -0.06100592762231827, "step": 3850 }, { "epoch": 1.0, "learning_rate": 1.8651362984218077e-09, "logits/chosen": -3.951416015625, "logits/rejected": -3.949162006378174, "logps/chosen": -557.078857421875, "logps/rejected": -389.9462890625, "loss": 0.5123, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5364678502082825, "rewards/margins": 0.6398257613182068, "rewards/rejected": -0.10335797071456909, "step": 3860 }, { "epoch": 1.0, "learning_rate": 4.30416068866571e-10, "logits/chosen": -4.126285076141357, "logits/rejected": -4.162901878356934, "logps/chosen": -532.77880859375, "logps/rejected": -499.14410400390625, "loss": 0.6348, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4620552659034729, "rewards/margins": 0.32118576765060425, "rewards/rejected": 0.14086945354938507, "step": 3870 }, { "epoch": 1.0, "step": 3873, "total_flos": 0.0, "train_loss": 0.5913154047772216, "train_runtime": 14580.3501, "train_samples_per_second": 4.25, "train_steps_per_second": 0.266 } ], "logging_steps": 10, "max_steps": 3873, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }