{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996020692399522, "eval_steps": 100, "global_step": 1256, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.968253968253968e-09, "logits/chosen": -2.7193620204925537, "logits/rejected": -2.698728084564209, "logps/chosen": -182.0961456298828, "logps/rejected": -172.47128295898438, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 3.968253968253968e-08, "logits/chosen": -2.7041964530944824, "logits/rejected": -2.6794540882110596, "logps/chosen": -162.45831298828125, "logps/rejected": -140.5693359375, "loss": 0.6931, "rewards/accuracies": 0.5486111044883728, "rewards/chosen": 0.00032037965138442814, "rewards/margins": 0.0004935775068588555, "rewards/rejected": -0.00017319784092251211, "step": 10 }, { "epoch": 0.02, "learning_rate": 7.936507936507936e-08, "logits/chosen": -2.7177577018737793, "logits/rejected": -2.7136425971984863, "logps/chosen": -134.47242736816406, "logps/rejected": -143.55604553222656, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 8.780837379163131e-05, "rewards/margins": 0.00010721785656642169, "rewards/rejected": -1.940951551659964e-05, "step": 20 }, { "epoch": 0.02, "learning_rate": 1.1904761904761903e-07, "logits/chosen": -2.6898293495178223, "logits/rejected": -2.676154613494873, "logps/chosen": -140.94692993164062, "logps/rejected": -136.50369262695312, "loss": 0.6931, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0005466601578518748, "rewards/margins": -0.00021456097601912916, "rewards/rejected": 0.0007612211629748344, "step": 30 }, { "epoch": 0.03, "learning_rate": 1.5873015873015872e-07, "logits/chosen": -2.6958394050598145, "logits/rejected": -2.686532974243164, "logps/chosen": -134.98963928222656, "logps/rejected": -144.46652221679688, "loss": 0.6928, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0015748919686302543, "rewards/margins": 0.0009769219905138016, "rewards/rejected": 0.0005979698617011309, "step": 40 }, { "epoch": 0.04, "learning_rate": 1.984126984126984e-07, "logits/chosen": -2.7042899131774902, "logits/rejected": -2.6861345767974854, "logps/chosen": -149.71768188476562, "logps/rejected": -145.0757293701172, "loss": 0.6921, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.005199921317398548, "rewards/margins": 0.0022330707870423794, "rewards/rejected": 0.0029668500646948814, "step": 50 }, { "epoch": 0.05, "learning_rate": 2.3809523809523806e-07, "logits/chosen": -2.705153703689575, "logits/rejected": -2.685439348220825, "logps/chosen": -154.3783416748047, "logps/rejected": -151.54519653320312, "loss": 0.6912, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.00569504126906395, "rewards/margins": 0.0022000311873853207, "rewards/rejected": 0.003495010081678629, "step": 60 }, { "epoch": 0.06, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -2.7017154693603516, "logits/rejected": -2.6924962997436523, "logps/chosen": -146.3284149169922, "logps/rejected": -138.79405212402344, "loss": 0.6885, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.010588793084025383, "rewards/margins": 0.010192448273301125, "rewards/rejected": 0.00039634370477870107, "step": 70 }, { "epoch": 0.06, "learning_rate": 3.1746031746031743e-07, "logits/chosen": -2.7155232429504395, "logits/rejected": -2.696071147918701, "logps/chosen": -141.80067443847656, "logps/rejected": -147.0068817138672, "loss": 0.6867, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0049073463305830956, "rewards/margins": 0.013599385507404804, "rewards/rejected": -0.008692039176821709, "step": 80 }, { "epoch": 0.07, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -2.7175304889678955, "logits/rejected": -2.7080624103546143, "logps/chosen": -153.12509155273438, "logps/rejected": -146.53590393066406, "loss": 0.6847, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.028871387243270874, "rewards/margins": 0.017175236716866493, "rewards/rejected": -0.046046625822782516, "step": 90 }, { "epoch": 0.08, "learning_rate": 3.968253968253968e-07, "logits/chosen": -2.7524733543395996, "logits/rejected": -2.7452526092529297, "logps/chosen": -163.88070678710938, "logps/rejected": -163.61032104492188, "loss": 0.6789, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0662173330783844, "rewards/margins": 0.02977912127971649, "rewards/rejected": -0.09599645435810089, "step": 100 }, { "epoch": 0.08, "eval_logits/chosen": -2.7336502075195312, "eval_logits/rejected": -2.7255024909973145, "eval_logps/chosen": -155.19271850585938, "eval_logps/rejected": -165.35523986816406, "eval_loss": 0.6769910454750061, "eval_rewards/accuracies": 0.5914179086685181, "eval_rewards/chosen": -0.10619194805622101, "eval_rewards/margins": 0.03601696714758873, "eval_rewards/rejected": -0.14220890402793884, "eval_runtime": 184.251, "eval_samples_per_second": 46.415, "eval_steps_per_second": 0.727, "step": 100 }, { "epoch": 0.09, "learning_rate": 4.365079365079365e-07, "logits/chosen": -2.738532543182373, "logits/rejected": -2.7273170948028564, "logps/chosen": -164.2928009033203, "logps/rejected": -160.19398498535156, "loss": 0.6738, "rewards/accuracies": 0.5625, "rewards/chosen": -0.16211798787117004, "rewards/margins": 0.03163355216383934, "rewards/rejected": -0.19375154376029968, "step": 110 }, { "epoch": 0.1, "learning_rate": 4.761904761904761e-07, "logits/chosen": -2.7289297580718994, "logits/rejected": -2.705962657928467, "logps/chosen": -196.69662475585938, "logps/rejected": -197.2833251953125, "loss": 0.661, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2917623221874237, "rewards/margins": 0.08966299891471863, "rewards/rejected": -0.38142532110214233, "step": 120 }, { "epoch": 0.1, "learning_rate": 4.999845414634076e-07, "logits/chosen": -2.658005475997925, "logits/rejected": -2.6317684650421143, "logps/chosen": -187.4532928466797, "logps/rejected": -188.37689208984375, "loss": 0.6542, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3956056833267212, "rewards/margins": 0.12092368304729462, "rewards/rejected": -0.5165294408798218, "step": 130 }, { "epoch": 0.11, "learning_rate": 4.998106548810311e-07, "logits/chosen": -2.6906683444976807, "logits/rejected": -2.6913747787475586, "logps/chosen": -199.67568969726562, "logps/rejected": -253.02487182617188, "loss": 0.6171, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4227059781551361, "rewards/margins": 0.27536457777023315, "rewards/rejected": -0.6980706453323364, "step": 140 }, { "epoch": 0.12, "learning_rate": 4.994436933879359e-07, "logits/chosen": -2.6662166118621826, "logits/rejected": -2.644784927368164, "logps/chosen": -197.07180786132812, "logps/rejected": -198.4012908935547, "loss": 0.6395, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3597154915332794, "rewards/margins": 0.13716872036457062, "rewards/rejected": -0.49688419699668884, "step": 150 }, { "epoch": 0.13, "learning_rate": 4.988839406031596e-07, "logits/chosen": -2.647681474685669, "logits/rejected": -2.6395888328552246, "logps/chosen": -182.04420471191406, "logps/rejected": -206.59780883789062, "loss": 0.629, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3573477864265442, "rewards/margins": 0.2222837507724762, "rewards/rejected": -0.579631507396698, "step": 160 }, { "epoch": 0.14, "learning_rate": 4.981318291512395e-07, "logits/chosen": -2.619232654571533, "logits/rejected": -2.598362684249878, "logps/chosen": -227.0933380126953, "logps/rejected": -230.9747772216797, "loss": 0.6242, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7434185743331909, "rewards/margins": 0.21749505400657654, "rewards/rejected": -0.9609137773513794, "step": 170 }, { "epoch": 0.14, "learning_rate": 4.971879403278432e-07, "logits/chosen": -2.5654754638671875, "logits/rejected": -2.5364232063293457, "logps/chosen": -241.6617431640625, "logps/rejected": -245.66268920898438, "loss": 0.6151, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7342535257339478, "rewards/margins": 0.23685339093208313, "rewards/rejected": -0.9711068868637085, "step": 180 }, { "epoch": 0.15, "learning_rate": 4.960530036504941e-07, "logits/chosen": -2.5271048545837402, "logits/rejected": -2.486818790435791, "logps/chosen": -235.6089630126953, "logps/rejected": -251.17758178710938, "loss": 0.6215, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.807177722454071, "rewards/margins": 0.28561535477638245, "rewards/rejected": -1.0927931070327759, "step": 190 }, { "epoch": 0.16, "learning_rate": 4.947278962947386e-07, "logits/chosen": -2.4217896461486816, "logits/rejected": -2.413295269012451, "logps/chosen": -251.0736083984375, "logps/rejected": -268.6098937988281, "loss": 0.6062, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.086307406425476, "rewards/margins": 0.24874301254749298, "rewards/rejected": -1.3350504636764526, "step": 200 }, { "epoch": 0.16, "eval_logits/chosen": -2.3855514526367188, "eval_logits/rejected": -2.369593858718872, "eval_logps/chosen": -246.6970672607422, "eval_logps/rejected": -289.8621826171875, "eval_loss": 0.6079375743865967, "eval_rewards/accuracies": 0.66697758436203, "eval_rewards/chosen": -1.021235704421997, "eval_rewards/margins": 0.3660426437854767, "eval_rewards/rejected": -1.3872781991958618, "eval_runtime": 184.1922, "eval_samples_per_second": 46.43, "eval_steps_per_second": 0.728, "step": 200 }, { "epoch": 0.17, "learning_rate": 4.932136424161899e-07, "logits/chosen": -2.3366785049438477, "logits/rejected": -2.3228511810302734, "logps/chosen": -266.292236328125, "logps/rejected": -300.22894287109375, "loss": 0.5893, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2130026817321777, "rewards/margins": 0.3487839996814728, "rewards/rejected": -1.5617868900299072, "step": 210 }, { "epoch": 0.18, "learning_rate": 4.915114123589732e-07, "logits/chosen": -2.321228504180908, "logits/rejected": -2.3033699989318848, "logps/chosen": -336.34161376953125, "logps/rejected": -373.39935302734375, "loss": 0.612, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.9529145956039429, "rewards/margins": 0.2863468527793884, "rewards/rejected": -2.2392613887786865, "step": 220 }, { "epoch": 0.18, "learning_rate": 4.896225217511849e-07, "logits/chosen": -2.4310107231140137, "logits/rejected": -2.422048568725586, "logps/chosen": -291.1025695800781, "logps/rejected": -328.18963623046875, "loss": 0.6079, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4314143657684326, "rewards/margins": 0.3364001214504242, "rewards/rejected": -1.7678143978118896, "step": 230 }, { "epoch": 0.19, "learning_rate": 4.875484304880629e-07, "logits/chosen": -2.3412394523620605, "logits/rejected": -2.309183120727539, "logps/chosen": -280.8785705566406, "logps/rejected": -308.54132080078125, "loss": 0.613, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.309309482574463, "rewards/margins": 0.3731766939163208, "rewards/rejected": -1.6824861764907837, "step": 240 }, { "epoch": 0.2, "learning_rate": 4.852907416036558e-07, "logits/chosen": -2.415271282196045, "logits/rejected": -2.4072234630584717, "logps/chosen": -243.56332397460938, "logps/rejected": -298.7532043457031, "loss": 0.591, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.967076301574707, "rewards/margins": 0.4581146240234375, "rewards/rejected": -1.4251911640167236, "step": 250 }, { "epoch": 0.21, "learning_rate": 4.828512000318616e-07, "logits/chosen": -2.3924427032470703, "logits/rejected": -2.3613152503967285, "logps/chosen": -266.86572265625, "logps/rejected": -304.2983093261719, "loss": 0.5986, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2214807271957397, "rewards/margins": 0.4553411602973938, "rewards/rejected": -1.6768219470977783, "step": 260 }, { "epoch": 0.21, "learning_rate": 4.802316912577946e-07, "logits/chosen": -2.4108529090881348, "logits/rejected": -2.3902478218078613, "logps/chosen": -252.7959442138672, "logps/rejected": -295.266357421875, "loss": 0.5917, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0824626684188843, "rewards/margins": 0.39643940329551697, "rewards/rejected": -1.4789022207260132, "step": 270 }, { "epoch": 0.22, "learning_rate": 4.774342398605221e-07, "logits/chosen": -2.3505263328552246, "logits/rejected": -2.2942967414855957, "logps/chosen": -279.871337890625, "logps/rejected": -300.4220886230469, "loss": 0.5979, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1787078380584717, "rewards/margins": 0.430286169052124, "rewards/rejected": -1.6089938879013062, "step": 280 }, { "epoch": 0.23, "learning_rate": 4.744610079482978e-07, "logits/chosen": -2.3269264698028564, "logits/rejected": -2.2910802364349365, "logps/chosen": -255.27706909179688, "logps/rejected": -281.60137939453125, "loss": 0.5853, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1753785610198975, "rewards/margins": 0.3495523929595947, "rewards/rejected": -1.5249310731887817, "step": 290 }, { "epoch": 0.24, "learning_rate": 4.713142934875005e-07, "logits/chosen": -2.2868428230285645, "logits/rejected": -2.2631592750549316, "logps/chosen": -284.2200012207031, "logps/rejected": -322.45269775390625, "loss": 0.5965, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.336501955986023, "rewards/margins": 0.3968800902366638, "rewards/rejected": -1.733382225036621, "step": 300 }, { "epoch": 0.24, "eval_logits/chosen": -2.265592098236084, "eval_logits/rejected": -2.244987964630127, "eval_logps/chosen": -282.3620910644531, "eval_logps/rejected": -331.2099609375, "eval_loss": 0.5907339453697205, "eval_rewards/accuracies": 0.6623134613037109, "eval_rewards/chosen": -1.3778856992721558, "eval_rewards/margins": 0.42287060618400574, "eval_rewards/rejected": -1.8007562160491943, "eval_runtime": 184.1495, "eval_samples_per_second": 46.441, "eval_steps_per_second": 0.728, "step": 300 }, { "epoch": 0.25, "learning_rate": 4.679965285265706e-07, "logits/chosen": -2.2354235649108887, "logits/rejected": -2.23685884475708, "logps/chosen": -277.09283447265625, "logps/rejected": -347.7145080566406, "loss": 0.5612, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3514426946640015, "rewards/margins": 0.4907970428466797, "rewards/rejected": -1.8422397375106812, "step": 310 }, { "epoch": 0.25, "learning_rate": 4.64510277316316e-07, "logits/chosen": -2.2262344360351562, "logits/rejected": -2.226029634475708, "logps/chosen": -271.74212646484375, "logps/rejected": -332.5010986328125, "loss": 0.5903, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3337775468826294, "rewards/margins": 0.39512914419174194, "rewards/rejected": -1.7289068698883057, "step": 320 }, { "epoch": 0.26, "learning_rate": 4.6085823432804137e-07, "logits/chosen": -2.2451891899108887, "logits/rejected": -2.2502384185791016, "logps/chosen": -250.6347198486328, "logps/rejected": -333.8939208984375, "loss": 0.5722, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1326004266738892, "rewards/margins": 0.5066065192222595, "rewards/rejected": -1.639206886291504, "step": 330 }, { "epoch": 0.27, "learning_rate": 4.570432221710314e-07, "logits/chosen": -2.0656931400299072, "logits/rejected": -2.0213730335235596, "logps/chosen": -318.232177734375, "logps/rejected": -369.13311767578125, "loss": 0.5766, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.584176778793335, "rewards/margins": 0.5901076197624207, "rewards/rejected": -2.1742844581604004, "step": 340 }, { "epoch": 0.28, "learning_rate": 4.5306818941099866e-07, "logits/chosen": -1.9084612131118774, "logits/rejected": -1.8514792919158936, "logps/chosen": -316.9821472167969, "logps/rejected": -352.9412841796875, "loss": 0.5825, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5627154111862183, "rewards/margins": 0.5152220726013184, "rewards/rejected": -2.077937364578247, "step": 350 }, { "epoch": 0.29, "learning_rate": 4.4893620829118124e-07, "logits/chosen": -1.8860156536102295, "logits/rejected": -1.8301204442977905, "logps/chosen": -309.8200378417969, "logps/rejected": -362.0408935546875, "loss": 0.5755, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5869390964508057, "rewards/margins": 0.49348369240760803, "rewards/rejected": -2.080422878265381, "step": 360 }, { "epoch": 0.29, "learning_rate": 4.4465047235785185e-07, "logits/chosen": -1.6610889434814453, "logits/rejected": -1.585129737854004, "logps/chosen": -321.8608703613281, "logps/rejected": -380.31036376953125, "loss": 0.5697, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.774713158607483, "rewards/margins": 0.6593302488327026, "rewards/rejected": -2.4340434074401855, "step": 370 }, { "epoch": 0.3, "learning_rate": 4.40214293992074e-07, "logits/chosen": -1.385825753211975, "logits/rejected": -1.31913161277771, "logps/chosen": -377.07269287109375, "logps/rejected": -459.5557556152344, "loss": 0.5818, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1890993118286133, "rewards/margins": 0.7521292567253113, "rewards/rejected": -2.9412286281585693, "step": 380 }, { "epoch": 0.31, "learning_rate": 4.3563110184961234e-07, "logits/chosen": -1.5089499950408936, "logits/rejected": -1.4075387716293335, "logps/chosen": -338.3626708984375, "logps/rejected": -396.67578125, "loss": 0.5584, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9042552709579468, "rewards/margins": 0.5932050347328186, "rewards/rejected": -2.49746036529541, "step": 390 }, { "epoch": 0.32, "learning_rate": 4.3090443821097566e-07, "logits/chosen": -1.2587625980377197, "logits/rejected": -1.2017955780029297, "logps/chosen": -309.43377685546875, "logps/rejected": -372.00531005859375, "loss": 0.5729, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.9031288623809814, "rewards/margins": 0.5602144598960876, "rewards/rejected": -2.4633431434631348, "step": 400 }, { "epoch": 0.32, "eval_logits/chosen": -1.3760210275650024, "eval_logits/rejected": -1.2920024394989014, "eval_logps/chosen": -312.20635986328125, "eval_logps/rejected": -375.1720275878906, "eval_loss": 0.5711147785186768, "eval_rewards/accuracies": 0.6828358173370361, "eval_rewards/chosen": -1.676328182220459, "eval_rewards/margins": 0.5640482306480408, "eval_rewards/rejected": -2.2403764724731445, "eval_runtime": 183.9293, "eval_samples_per_second": 46.496, "eval_steps_per_second": 0.729, "step": 400 }, { "epoch": 0.33, "learning_rate": 4.2603795624364195e-07, "logits/chosen": -1.2894772291183472, "logits/rejected": -1.23129141330719, "logps/chosen": -299.457275390625, "logps/rejected": -370.8555908203125, "loss": 0.5666, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6054102182388306, "rewards/margins": 0.5984233021736145, "rewards/rejected": -2.203833818435669, "step": 410 }, { "epoch": 0.33, "learning_rate": 4.210354171785795e-07, "logits/chosen": -1.022984266281128, "logits/rejected": -0.9285897016525269, "logps/chosen": -324.4284973144531, "logps/rejected": -385.0074157714844, "loss": 0.5596, "rewards/accuracies": 0.6875, "rewards/chosen": -1.921677589416504, "rewards/margins": 0.5404387712478638, "rewards/rejected": -2.4621164798736572, "step": 420 }, { "epoch": 0.34, "learning_rate": 4.15900687403248e-07, "logits/chosen": -0.8059805631637573, "logits/rejected": -0.7196700572967529, "logps/chosen": -353.788330078125, "logps/rejected": -411.4853515625, "loss": 0.5865, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.1321234703063965, "rewards/margins": 0.463266521692276, "rewards/rejected": -2.5953898429870605, "step": 430 }, { "epoch": 0.35, "learning_rate": 4.1063773547332584e-07, "logits/chosen": -0.9645301699638367, "logits/rejected": -0.7601315975189209, "logps/chosen": -346.8272705078125, "logps/rejected": -392.2935791015625, "loss": 0.5591, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9153356552124023, "rewards/margins": 0.5891679525375366, "rewards/rejected": -2.5045037269592285, "step": 440 }, { "epoch": 0.36, "learning_rate": 4.0525062904547276e-07, "logits/chosen": -0.608537495136261, "logits/rejected": -0.47767123579978943, "logps/chosen": -341.55364990234375, "logps/rejected": -434.1073303222656, "loss": 0.5687, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.105318069458008, "rewards/margins": 0.6994394659996033, "rewards/rejected": -2.8047571182250977, "step": 450 }, { "epoch": 0.37, "learning_rate": 3.997435317334988e-07, "logits/chosen": -0.6356207132339478, "logits/rejected": -0.25634175539016724, "logps/chosen": -384.43780517578125, "logps/rejected": -419.24176025390625, "loss": 0.5608, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.2970900535583496, "rewards/margins": 0.6535100340843201, "rewards/rejected": -2.9506001472473145, "step": 460 }, { "epoch": 0.37, "learning_rate": 3.941206998903701e-07, "logits/chosen": -1.0318920612335205, "logits/rejected": -0.7451022267341614, "logps/chosen": -338.9430236816406, "logps/rejected": -384.64111328125, "loss": 0.5678, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9643396139144897, "rewards/margins": 0.5402536392211914, "rewards/rejected": -2.5045928955078125, "step": 470 }, { "epoch": 0.38, "learning_rate": 3.8838647931853684e-07, "logits/chosen": -0.6847028732299805, "logits/rejected": -0.5548251867294312, "logps/chosen": -339.61456298828125, "logps/rejected": -435.32061767578125, "loss": 0.5814, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1349122524261475, "rewards/margins": 0.7573872804641724, "rewards/rejected": -2.8923001289367676, "step": 480 }, { "epoch": 0.39, "learning_rate": 3.825453019111281e-07, "logits/chosen": -0.5378957986831665, "logits/rejected": -0.28533270955085754, "logps/chosen": -363.78570556640625, "logps/rejected": -430.11749267578125, "loss": 0.5327, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.134934425354004, "rewards/margins": 0.6089809536933899, "rewards/rejected": -2.743915319442749, "step": 490 }, { "epoch": 0.4, "learning_rate": 3.7660168222660824e-07, "logits/chosen": -0.6318235397338867, "logits/rejected": -0.5071814656257629, "logps/chosen": -350.5252380371094, "logps/rejected": -421.93353271484375, "loss": 0.5645, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.062009572982788, "rewards/margins": 0.5333147048950195, "rewards/rejected": -2.5953242778778076, "step": 500 }, { "epoch": 0.4, "eval_logits/chosen": -0.7860146760940552, "eval_logits/rejected": -0.6090859770774841, "eval_logps/chosen": -351.7882995605469, "eval_logps/rejected": -419.81939697265625, "eval_loss": 0.5639454126358032, "eval_rewards/accuracies": 0.6986940503120422, "eval_rewards/chosen": -2.0721471309661865, "eval_rewards/margins": 0.6147031188011169, "eval_rewards/rejected": -2.6868505477905273, "eval_runtime": 184.0154, "eval_samples_per_second": 46.474, "eval_steps_per_second": 0.728, "step": 500 }, { "epoch": 0.41, "learning_rate": 3.705602139995416e-07, "logits/chosen": -0.7258490920066833, "logits/rejected": -0.4828409254550934, "logps/chosen": -388.1371154785156, "logps/rejected": -422.11181640625, "loss": 0.574, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.290266513824463, "rewards/margins": 0.4104091227054596, "rewards/rejected": -2.7006754875183105, "step": 510 }, { "epoch": 0.41, "learning_rate": 3.6442556659016475e-07, "logits/chosen": -0.5335447192192078, "logits/rejected": -0.33706527948379517, "logps/chosen": -378.86492919921875, "logps/rejected": -429.67724609375, "loss": 0.5608, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.236337423324585, "rewards/margins": 0.556148886680603, "rewards/rejected": -2.7924864292144775, "step": 520 }, { "epoch": 0.42, "learning_rate": 3.582024813755076e-07, "logits/chosen": -0.39548322558403015, "logits/rejected": -0.10662730038166046, "logps/chosen": -368.8847961425781, "logps/rejected": -473.3500061035156, "loss": 0.5485, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.3263449668884277, "rewards/margins": 0.8236624598503113, "rewards/rejected": -3.150007724761963, "step": 530 }, { "epoch": 0.43, "learning_rate": 3.5189576808485404e-07, "logits/chosen": 0.15742243826389313, "logits/rejected": 0.31491726636886597, "logps/chosen": -394.34930419921875, "logps/rejected": -492.82232666015625, "loss": 0.5478, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6109700202941895, "rewards/margins": 0.8250136375427246, "rewards/rejected": -3.435983657836914, "step": 540 }, { "epoch": 0.44, "learning_rate": 3.4551030108237433e-07, "logits/chosen": -0.2550584375858307, "logits/rejected": -0.06936412304639816, "logps/chosen": -406.5508728027344, "logps/rejected": -448.47576904296875, "loss": 0.5562, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.5152666568756104, "rewards/margins": 0.4819938540458679, "rewards/rejected": -2.997260332107544, "step": 550 }, { "epoch": 0.45, "learning_rate": 3.390510155998023e-07, "logits/chosen": -0.5292027592658997, "logits/rejected": -0.2619571387767792, "logps/chosen": -371.6798095703125, "logps/rejected": -420.7915954589844, "loss": 0.5492, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1147050857543945, "rewards/margins": 0.6524336338043213, "rewards/rejected": -2.7671384811401367, "step": 560 }, { "epoch": 0.45, "learning_rate": 3.325229039220684e-07, "logits/chosen": -0.5881962776184082, "logits/rejected": -0.4658876061439514, "logps/chosen": -343.7039794921875, "logps/rejected": -406.14178466796875, "loss": 0.57, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0860273838043213, "rewards/margins": 0.498068630695343, "rewards/rejected": -2.5840957164764404, "step": 570 }, { "epoch": 0.46, "learning_rate": 3.2593101152883795e-07, "logits/chosen": -0.6565806269645691, "logits/rejected": -0.2549567222595215, "logps/chosen": -374.8047180175781, "logps/rejected": -430.33221435546875, "loss": 0.5512, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2211391925811768, "rewards/margins": 0.6813799142837524, "rewards/rejected": -2.9025187492370605, "step": 580 }, { "epoch": 0.47, "learning_rate": 3.192804331949349e-07, "logits/chosen": -0.07184700667858124, "logits/rejected": 0.1699156016111374, "logps/chosen": -422.27081298828125, "logps/rejected": -490.69134521484375, "loss": 0.535, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.730973720550537, "rewards/margins": 0.7726518511772156, "rewards/rejected": -3.5036251544952393, "step": 590 }, { "epoch": 0.48, "learning_rate": 3.125763090526674e-07, "logits/chosen": -0.029465889558196068, "logits/rejected": 0.15842057764530182, "logps/chosen": -417.373046875, "logps/rejected": -478.73291015625, "loss": 0.5513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.8006317615509033, "rewards/margins": 0.6451797485351562, "rewards/rejected": -3.4458115100860596, "step": 600 }, { "epoch": 0.48, "eval_logits/chosen": -0.10542195290327072, "eval_logits/rejected": 0.12242482602596283, "eval_logps/chosen": -436.9386291503906, "eval_logps/rejected": -505.02227783203125, "eval_loss": 0.5582411885261536, "eval_rewards/accuracies": 0.7108209133148193, "eval_rewards/chosen": -2.9236514568328857, "eval_rewards/margins": 0.6152271032333374, "eval_rewards/rejected": -3.5388784408569336, "eval_runtime": 183.9235, "eval_samples_per_second": 46.498, "eval_steps_per_second": 0.729, "step": 600 }, { "epoch": 0.49, "learning_rate": 3.0582382061909623e-07, "logits/chosen": -0.2445104569196701, "logits/rejected": -0.018268002197146416, "logps/chosen": -441.7857971191406, "logps/rejected": -502.60791015625, "loss": 0.5594, "rewards/accuracies": 0.6875, "rewards/chosen": -2.858261823654175, "rewards/margins": 0.5510683655738831, "rewards/rejected": -3.409330368041992, "step": 610 }, { "epoch": 0.49, "learning_rate": 2.9902818679131775e-07, "logits/chosen": -0.4190225601196289, "logits/rejected": -0.22823679447174072, "logps/chosen": -399.03924560546875, "logps/rejected": -498.6724548339844, "loss": 0.5499, "rewards/accuracies": 0.71875, "rewards/chosen": -2.650449275970459, "rewards/margins": 0.7673205137252808, "rewards/rejected": -3.4177703857421875, "step": 620 }, { "epoch": 0.5, "learning_rate": 2.921946598128571e-07, "logits/chosen": -0.43653860688209534, "logits/rejected": -0.20837187767028809, "logps/chosen": -402.82781982421875, "logps/rejected": -485.4117736816406, "loss": 0.5739, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.573000192642212, "rewards/margins": 0.7478531002998352, "rewards/rejected": -3.3208529949188232, "step": 630 }, { "epoch": 0.51, "learning_rate": 2.8532852121428733e-07, "logits/chosen": -0.43430274724960327, "logits/rejected": -0.13240045309066772, "logps/chosen": -397.2491149902344, "logps/rejected": -442.12384033203125, "loss": 0.5462, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4125733375549316, "rewards/margins": 0.5821372866630554, "rewards/rejected": -2.9947104454040527, "step": 640 }, { "epoch": 0.52, "learning_rate": 2.7843507773121414e-07, "logits/chosen": -0.4247920513153076, "logits/rejected": -0.21372787654399872, "logps/chosen": -389.4237976074219, "logps/rejected": -458.3169860839844, "loss": 0.5373, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.444688320159912, "rewards/margins": 0.7236617207527161, "rewards/rejected": -3.1683506965637207, "step": 650 }, { "epoch": 0.53, "learning_rate": 2.715196572027789e-07, "logits/chosen": -0.6697942614555359, "logits/rejected": -0.4933086931705475, "logps/chosen": -387.529296875, "logps/rejected": -472.73944091796875, "loss": 0.5685, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.3496451377868652, "rewards/margins": 0.7728831171989441, "rewards/rejected": -3.122528314590454, "step": 660 }, { "epoch": 0.53, "learning_rate": 2.645876044538521e-07, "logits/chosen": -1.0338900089263916, "logits/rejected": -0.8813627362251282, "logps/chosen": -372.53118896484375, "logps/rejected": -426.54241943359375, "loss": 0.5725, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.201908588409424, "rewards/margins": 0.5010865926742554, "rewards/rejected": -2.7029950618743896, "step": 670 }, { "epoch": 0.54, "learning_rate": 2.5764427716409815e-07, "logits/chosen": -0.9278701543807983, "logits/rejected": -0.7282145023345947, "logps/chosen": -347.2828674316406, "logps/rejected": -416.9349060058594, "loss": 0.5479, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0276436805725098, "rewards/margins": 0.743033230304718, "rewards/rejected": -2.770677089691162, "step": 680 }, { "epoch": 0.55, "learning_rate": 2.5069504172710494e-07, "logits/chosen": -0.5008482336997986, "logits/rejected": -0.34875133633613586, "logps/chosen": -373.7621154785156, "logps/rejected": -485.12884521484375, "loss": 0.5217, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.347053050994873, "rewards/margins": 0.9024646878242493, "rewards/rejected": -3.2495174407958984, "step": 690 }, { "epoch": 0.56, "learning_rate": 2.4374526910277886e-07, "logits/chosen": 0.06850005686283112, "logits/rejected": 0.41385045647621155, "logps/chosen": -411.46246337890625, "logps/rejected": -476.6162109375, "loss": 0.5571, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.6757898330688477, "rewards/margins": 0.8085702657699585, "rewards/rejected": -3.4843602180480957, "step": 700 }, { "epoch": 0.56, "eval_logits/chosen": 0.035554468631744385, "eval_logits/rejected": 0.2980235815048218, "eval_logps/chosen": -424.2823486328125, "eval_logps/rejected": -505.6960754394531, "eval_loss": 0.5558871626853943, "eval_rewards/accuracies": 0.704291045665741, "eval_rewards/chosen": -2.797088146209717, "eval_rewards/margins": 0.748529314994812, "eval_rewards/rejected": -3.5456173419952393, "eval_runtime": 183.8747, "eval_samples_per_second": 46.51, "eval_steps_per_second": 0.729, "step": 700 }, { "epoch": 0.57, "learning_rate": 2.368003306662104e-07, "logits/chosen": 0.07857178151607513, "logits/rejected": 0.3302653729915619, "logps/chosen": -413.8836975097656, "logps/rejected": -535.0875244140625, "loss": 0.5287, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7193782329559326, "rewards/margins": 1.0089346170425415, "rewards/rejected": -3.7283127307891846, "step": 710 }, { "epoch": 0.57, "learning_rate": 2.2986559405621886e-07, "logits/chosen": 0.2789291739463806, "logits/rejected": 0.4242584705352783, "logps/chosen": -422.7801818847656, "logps/rejected": -522.7840576171875, "loss": 0.5551, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.925621509552002, "rewards/margins": 0.8043605089187622, "rewards/rejected": -3.729982376098633, "step": 720 }, { "epoch": 0.58, "learning_rate": 2.2294641902678443e-07, "logits/chosen": -0.19327735900878906, "logits/rejected": 0.043265581130981445, "logps/chosen": -363.1488342285156, "logps/rejected": -470.94970703125, "loss": 0.5284, "rewards/accuracies": 0.78125, "rewards/chosen": -2.39530873298645, "rewards/margins": 0.908363938331604, "rewards/rejected": -3.3036727905273438, "step": 730 }, { "epoch": 0.59, "learning_rate": 2.160481533045751e-07, "logits/chosen": -0.37412697076797485, "logits/rejected": -0.17320053279399872, "logps/chosen": -390.2896423339844, "logps/rejected": -428.08099365234375, "loss": 0.5572, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3005330562591553, "rewards/margins": 0.48462891578674316, "rewards/rejected": -2.7851624488830566, "step": 740 }, { "epoch": 0.6, "learning_rate": 2.0917612845576882e-07, "logits/chosen": -0.26352375745773315, "logits/rejected": -0.0010178961092606187, "logps/chosen": -373.3875427246094, "logps/rejected": -440.09442138671875, "loss": 0.5534, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3315823078155518, "rewards/margins": 0.6843063235282898, "rewards/rejected": -3.0158886909484863, "step": 750 }, { "epoch": 0.6, "learning_rate": 2.0233565576536564e-07, "logits/chosen": -0.3354080021381378, "logits/rejected": -0.006600166670978069, "logps/chosen": -360.56463623046875, "logps/rejected": -440.66961669921875, "loss": 0.5328, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1626803874969482, "rewards/margins": 0.8473829030990601, "rewards/rejected": -3.010063409805298, "step": 760 }, { "epoch": 0.61, "learning_rate": 1.9553202213217537e-07, "logits/chosen": -0.021420275792479515, "logits/rejected": 0.19946305453777313, "logps/chosen": -389.1043395996094, "logps/rejected": -448.04998779296875, "loss": 0.5523, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.391838788986206, "rewards/margins": 0.6678962707519531, "rewards/rejected": -3.059735059738159, "step": 770 }, { "epoch": 0.62, "learning_rate": 1.887704859826528e-07, "logits/chosen": -0.15253478288650513, "logits/rejected": -0.00011998042464256287, "logps/chosen": -394.9501953125, "logps/rejected": -462.32843017578125, "loss": 0.5443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.476644992828369, "rewards/margins": 0.566824734210968, "rewards/rejected": -3.0434699058532715, "step": 780 }, { "epoch": 0.63, "learning_rate": 1.8205627320673836e-07, "logits/chosen": -0.17955633997917175, "logits/rejected": 0.18167546391487122, "logps/chosen": -390.32244873046875, "logps/rejected": -444.895263671875, "loss": 0.5566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4376220703125, "rewards/margins": 0.7008293271064758, "rewards/rejected": -3.138451099395752, "step": 790 }, { "epoch": 0.64, "learning_rate": 1.7539457311884675e-07, "logits/chosen": -0.09838727861642838, "logits/rejected": 0.11829495429992676, "logps/chosen": -402.4017333984375, "logps/rejected": -451.49346923828125, "loss": 0.5609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4441986083984375, "rewards/margins": 0.5067789554595947, "rewards/rejected": -2.9509775638580322, "step": 800 }, { "epoch": 0.64, "eval_logits/chosen": -0.03116540051996708, "eval_logits/rejected": 0.1922437697649002, "eval_logps/chosen": -387.7091979980469, "eval_logps/rejected": -459.44390869140625, "eval_loss": 0.5468714833259583, "eval_rewards/accuracies": 0.7108209133148193, "eval_rewards/chosen": -2.431356430053711, "eval_rewards/margins": 0.6517390012741089, "eval_rewards/rejected": -3.0830955505371094, "eval_runtime": 183.9774, "eval_samples_per_second": 46.484, "eval_steps_per_second": 0.728, "step": 800 }, { "epoch": 0.64, "learning_rate": 1.687905344471226e-07, "logits/chosen": 0.07735608518123627, "logits/rejected": 0.3973601460456848, "logps/chosen": -408.05999755859375, "logps/rejected": -459.011474609375, "loss": 0.5384, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.5008435249328613, "rewards/margins": 0.6535352468490601, "rewards/rejected": -3.154379367828369, "step": 810 }, { "epoch": 0.65, "learning_rate": 1.6224926135406693e-07, "logits/chosen": 0.1125444769859314, "logits/rejected": 0.3865428566932678, "logps/chosen": -404.16058349609375, "logps/rejected": -484.68621826171875, "loss": 0.5448, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.4405789375305176, "rewards/margins": 0.718208909034729, "rewards/rejected": -3.158787727355957, "step": 820 }, { "epoch": 0.66, "learning_rate": 1.557758094916053e-07, "logits/chosen": 0.11989516019821167, "logits/rejected": 0.30926594138145447, "logps/chosen": -370.29876708984375, "logps/rejected": -452.27911376953125, "loss": 0.5418, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.3860089778900146, "rewards/margins": 0.7260924577713013, "rewards/rejected": -3.1121015548706055, "step": 830 }, { "epoch": 0.67, "learning_rate": 1.4937518209365108e-07, "logits/chosen": -0.14239154756069183, "logits/rejected": 0.14250756800174713, "logps/chosen": -395.55755615234375, "logps/rejected": -447.6368713378906, "loss": 0.5573, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.378154754638672, "rewards/margins": 0.6160937547683716, "rewards/rejected": -2.994248390197754, "step": 840 }, { "epoch": 0.68, "learning_rate": 1.4305232610918045e-07, "logits/chosen": -0.16526366770267487, "logits/rejected": 0.16432161629199982, "logps/chosen": -373.45330810546875, "logps/rejected": -436.6773376464844, "loss": 0.5415, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.3134028911590576, "rewards/margins": 0.774810791015625, "rewards/rejected": -3.0882136821746826, "step": 850 }, { "epoch": 0.68, "learning_rate": 1.3681212837880977e-07, "logits/chosen": -0.1321481615304947, "logits/rejected": 0.23287932574748993, "logps/chosen": -364.96990966796875, "logps/rejected": -447.7923278808594, "loss": 0.5396, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.176964282989502, "rewards/margins": 0.8955341577529907, "rewards/rejected": -3.0724985599517822, "step": 860 }, { "epoch": 0.69, "learning_rate": 1.3065941185782977e-07, "logits/chosen": 0.05437428876757622, "logits/rejected": 0.2819867432117462, "logps/chosen": -383.08599853515625, "logps/rejected": -439.3629455566406, "loss": 0.5505, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.405247211456299, "rewards/margins": 0.5403125882148743, "rewards/rejected": -2.9455599784851074, "step": 870 }, { "epoch": 0.7, "learning_rate": 1.2459893188861613e-07, "logits/chosen": -0.12052659690380096, "logits/rejected": 0.12284734100103378, "logps/chosen": -367.1181640625, "logps/rejected": -468.1044921875, "loss": 0.5185, "rewards/accuracies": 0.75, "rewards/chosen": -2.39152455329895, "rewards/margins": 0.9137696027755737, "rewards/rejected": -3.3052947521209717, "step": 880 }, { "epoch": 0.71, "learning_rate": 1.1863537252529548e-07, "logits/chosen": 0.14598000049591064, "logits/rejected": 0.38815659284591675, "logps/chosen": -397.891357421875, "logps/rejected": -472.38677978515625, "loss": 0.5323, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.512676239013672, "rewards/margins": 0.7713057994842529, "rewards/rejected": -3.2839818000793457, "step": 890 }, { "epoch": 0.72, "learning_rate": 1.1277334291351145e-07, "logits/chosen": 0.15319526195526123, "logits/rejected": 0.35974830389022827, "logps/chosen": -380.77783203125, "logps/rejected": -449.54315185546875, "loss": 0.5514, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3706305027008057, "rewards/margins": 0.6724039912223816, "rewards/rejected": -3.043034076690674, "step": 900 }, { "epoch": 0.72, "eval_logits/chosen": 0.28598034381866455, "eval_logits/rejected": 0.5382024645805359, "eval_logps/chosen": -392.3096008300781, "eval_logps/rejected": -471.95330810546875, "eval_loss": 0.5473664402961731, "eval_rewards/accuracies": 0.6996268630027771, "eval_rewards/chosen": -2.4773612022399902, "eval_rewards/margins": 0.7308279275894165, "eval_rewards/rejected": -3.2081892490386963, "eval_runtime": 183.9377, "eval_samples_per_second": 46.494, "eval_steps_per_second": 0.729, "step": 900 }, { "epoch": 0.72, "learning_rate": 1.0701737372808431e-07, "logits/chosen": 0.15951867401599884, "logits/rejected": 0.46630391478538513, "logps/chosen": -383.52850341796875, "logps/rejected": -467.2303771972656, "loss": 0.5362, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2782187461853027, "rewards/margins": 0.8473943471908569, "rewards/rejected": -3.125612735748291, "step": 910 }, { "epoch": 0.73, "learning_rate": 1.0137191367132078e-07, "logits/chosen": 0.2791319191455841, "logits/rejected": 0.45174160599708557, "logps/chosen": -372.1945495605469, "logps/rejected": -446.6507263183594, "loss": 0.5458, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.3160648345947266, "rewards/margins": 0.68004310131073, "rewards/rejected": -2.996107816696167, "step": 920 }, { "epoch": 0.74, "learning_rate": 9.584132603467827e-08, "logits/chosen": -0.12192128598690033, "logits/rejected": 0.1477951854467392, "logps/chosen": -366.48321533203125, "logps/rejected": -453.130126953125, "loss": 0.5467, "rewards/accuracies": 0.75, "rewards/chosen": -2.200005531311035, "rewards/margins": 0.7978888750076294, "rewards/rejected": -2.997894287109375, "step": 930 }, { "epoch": 0.75, "learning_rate": 9.042988532644249e-08, "logits/chosen": -0.03106372058391571, "logits/rejected": 0.07721444219350815, "logps/chosen": -344.21270751953125, "logps/rejected": -438.11077880859375, "loss": 0.5161, "rewards/accuracies": 0.71875, "rewards/chosen": -2.149094581604004, "rewards/margins": 0.7353444695472717, "rewards/rejected": -2.884438991546631, "step": 940 }, { "epoch": 0.76, "learning_rate": 8.514177396802428e-08, "logits/chosen": 0.006801058538258076, "logits/rejected": 0.20282092690467834, "logps/chosen": -358.15167236328125, "logps/rejected": -436.4964294433594, "loss": 0.5385, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2220425605773926, "rewards/margins": 0.7004804611206055, "rewards/rejected": -2.922523260116577, "step": 950 }, { "epoch": 0.76, "learning_rate": 7.998107906142839e-08, "logits/chosen": 0.41448846459388733, "logits/rejected": 0.705254852771759, "logps/chosen": -371.27801513671875, "logps/rejected": -434.56866455078125, "loss": 0.5236, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2166616916656494, "rewards/margins": 0.6714047193527222, "rewards/rejected": -2.888066530227661, "step": 960 }, { "epoch": 0.77, "learning_rate": 7.495178923039396e-08, "logits/chosen": 0.23847150802612305, "logits/rejected": 0.48661884665489197, "logps/chosen": -366.28179931640625, "logps/rejected": -462.679443359375, "loss": 0.5459, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1916985511779785, "rewards/margins": 0.8472123146057129, "rewards/rejected": -3.038910388946533, "step": 970 }, { "epoch": 0.78, "learning_rate": 7.005779153764682e-08, "logits/chosen": 0.41438961029052734, "logits/rejected": 0.6912784576416016, "logps/chosen": -382.70123291015625, "logps/rejected": -461.8614807128906, "loss": 0.5453, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.4304287433624268, "rewards/margins": 0.7116767764091492, "rewards/rejected": -3.1421055793762207, "step": 980 }, { "epoch": 0.79, "learning_rate": 6.530286848064698e-08, "logits/chosen": 0.36573725938796997, "logits/rejected": 0.5834362506866455, "logps/chosen": -384.49749755859375, "logps/rejected": -466.30096435546875, "loss": 0.5528, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5111565589904785, "rewards/margins": 0.7234699130058289, "rewards/rejected": -3.234626054763794, "step": 990 }, { "epoch": 0.8, "learning_rate": 6.069069506815325e-08, "logits/chosen": 0.45530566573143005, "logits/rejected": 0.5909157991409302, "logps/chosen": -379.1433410644531, "logps/rejected": -468.88458251953125, "loss": 0.527, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.5361268520355225, "rewards/margins": 0.7407721281051636, "rewards/rejected": -3.2768986225128174, "step": 1000 }, { "epoch": 0.8, "eval_logits/chosen": 0.3871051073074341, "eval_logits/rejected": 0.6372014284133911, "eval_logps/chosen": -394.97113037109375, "eval_logps/rejected": -471.8453674316406, "eval_loss": 0.5453863739967346, "eval_rewards/accuracies": 0.70802241563797, "eval_rewards/chosen": -2.503976345062256, "eval_rewards/margins": 0.7031334638595581, "eval_rewards/rejected": -3.2071101665496826, "eval_runtime": 183.9898, "eval_samples_per_second": 46.481, "eval_steps_per_second": 0.728, "step": 1000 }, { "epoch": 0.8, "learning_rate": 5.6224835979863714e-08, "logits/chosen": 0.31174296140670776, "logits/rejected": 0.6193565130233765, "logps/chosen": -390.387451171875, "logps/rejected": -468.4959411621094, "loss": 0.5568, "rewards/accuracies": 0.71875, "rewards/chosen": -2.32766056060791, "rewards/margins": 0.68747878074646, "rewards/rejected": -3.015139102935791, "step": 1010 }, { "epoch": 0.81, "learning_rate": 5.190874281132851e-08, "logits/chosen": 0.22277125716209412, "logits/rejected": 0.6487134099006653, "logps/chosen": -402.0958557128906, "logps/rejected": -448.5992736816406, "loss": 0.5408, "rewards/accuracies": 0.71875, "rewards/chosen": -2.359062671661377, "rewards/margins": 0.6533006429672241, "rewards/rejected": -3.0123631954193115, "step": 1020 }, { "epoch": 0.82, "learning_rate": 4.774575140626316e-08, "logits/chosen": 0.23170511424541473, "logits/rejected": 0.47184085845947266, "logps/chosen": -363.46917724609375, "logps/rejected": -442.47918701171875, "loss": 0.5309, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.142770290374756, "rewards/margins": 0.7513145208358765, "rewards/rejected": -2.894084930419922, "step": 1030 }, { "epoch": 0.83, "learning_rate": 4.373907927832513e-08, "logits/chosen": 0.07573021948337555, "logits/rejected": 0.32997313141822815, "logps/chosen": -381.45599365234375, "logps/rejected": -443.0684509277344, "loss": 0.5407, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2033116817474365, "rewards/margins": 0.710732638835907, "rewards/rejected": -2.914044141769409, "step": 1040 }, { "epoch": 0.84, "learning_rate": 3.9891823124345665e-08, "logits/chosen": 0.23884686827659607, "logits/rejected": 0.6128005385398865, "logps/chosen": -364.00567626953125, "logps/rejected": -433.3273010253906, "loss": 0.5471, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.2081527709960938, "rewards/margins": 0.7639263868331909, "rewards/rejected": -2.972079038619995, "step": 1050 }, { "epoch": 0.84, "learning_rate": 3.620695643093924e-08, "logits/chosen": 0.21963253617286682, "logits/rejected": 0.6894062757492065, "logps/chosen": -399.5767517089844, "logps/rejected": -452.88909912109375, "loss": 0.5154, "rewards/accuracies": 0.71875, "rewards/chosen": -2.3384335041046143, "rewards/margins": 0.7010769844055176, "rewards/rejected": -3.0395102500915527, "step": 1060 }, { "epoch": 0.85, "learning_rate": 3.268732717634032e-08, "logits/chosen": 0.3474286198616028, "logits/rejected": 0.695271372795105, "logps/chosen": -368.0654602050781, "logps/rejected": -431.47222900390625, "loss": 0.5499, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1910276412963867, "rewards/margins": 0.7267633080482483, "rewards/rejected": -2.9177908897399902, "step": 1070 }, { "epoch": 0.86, "learning_rate": 2.9335655629243645e-08, "logits/chosen": 0.2347393035888672, "logits/rejected": 0.5894696712493896, "logps/chosen": -388.94757080078125, "logps/rejected": -447.3855895996094, "loss": 0.525, "rewards/accuracies": 0.78125, "rewards/chosen": -2.254683017730713, "rewards/margins": 0.7334609031677246, "rewards/rejected": -2.9881439208984375, "step": 1080 }, { "epoch": 0.87, "learning_rate": 2.6154532246349476e-08, "logits/chosen": 0.25378522276878357, "logits/rejected": 0.5771256685256958, "logps/chosen": -358.50640869140625, "logps/rejected": -431.145751953125, "loss": 0.5462, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1666626930236816, "rewards/margins": 0.756801426410675, "rewards/rejected": -2.923464059829712, "step": 1090 }, { "epoch": 0.88, "learning_rate": 2.31464156702382e-08, "logits/chosen": 0.35370689630508423, "logits/rejected": 0.5671936273574829, "logps/chosen": -363.0, "logps/rejected": -438.209228515625, "loss": 0.5487, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2130119800567627, "rewards/margins": 0.7499077916145325, "rewards/rejected": -2.9629194736480713, "step": 1100 }, { "epoch": 0.88, "eval_logits/chosen": 0.1857856959104538, "eval_logits/rejected": 0.43363669514656067, "eval_logps/chosen": -373.08306884765625, "eval_logps/rejected": -450.7598876953125, "eval_loss": 0.5444055199623108, "eval_rewards/accuracies": 0.7089552283287048, "eval_rewards/chosen": -2.285095453262329, "eval_rewards/margins": 0.711159884929657, "eval_rewards/rejected": -2.996255397796631, "eval_runtime": 183.9924, "eval_samples_per_second": 46.48, "eval_steps_per_second": 0.728, "step": 1100 }, { "epoch": 0.88, "learning_rate": 2.031363082912252e-08, "logits/chosen": 0.070524200797081, "logits/rejected": 0.4635602533817291, "logps/chosen": -373.29327392578125, "logps/rejected": -426.85552978515625, "loss": 0.5513, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.2541089057922363, "rewards/margins": 0.6198171973228455, "rewards/rejected": -2.8739261627197266, "step": 1110 }, { "epoch": 0.89, "learning_rate": 1.7658367139945228e-08, "logits/chosen": 0.2600646913051605, "logits/rejected": 0.5517584681510925, "logps/chosen": -390.8568115234375, "logps/rejected": -462.80828857421875, "loss": 0.5471, "rewards/accuracies": 0.6875, "rewards/chosen": -2.377396821975708, "rewards/margins": 0.6719989776611328, "rewards/rejected": -3.049395799636841, "step": 1120 }, { "epoch": 0.9, "learning_rate": 1.5182676816211632e-08, "logits/chosen": 0.04413030296564102, "logits/rejected": 0.30151715874671936, "logps/chosen": -382.0662536621094, "logps/rejected": -447.08673095703125, "loss": 0.5431, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.228654384613037, "rewards/margins": 0.6926024556159973, "rewards/rejected": -2.9212570190429688, "step": 1130 }, { "epoch": 0.91, "learning_rate": 1.2888473281864597e-08, "logits/chosen": 0.14212054014205933, "logits/rejected": 0.47429710626602173, "logps/chosen": -367.8409729003906, "logps/rejected": -435.02764892578125, "loss": 0.5369, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2534115314483643, "rewards/margins": 0.7273036241531372, "rewards/rejected": -2.980715274810791, "step": 1140 }, { "epoch": 0.92, "learning_rate": 1.0777529692427679e-08, "logits/chosen": 0.04115242511034012, "logits/rejected": 0.28970104455947876, "logps/chosen": -372.7949523925781, "logps/rejected": -456.10675048828125, "loss": 0.5265, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.300356388092041, "rewards/margins": 0.8059718012809753, "rewards/rejected": -3.106328248977661, "step": 1150 }, { "epoch": 0.92, "learning_rate": 8.851477564560061e-09, "logits/chosen": 0.0867738351225853, "logits/rejected": 0.4068300127983093, "logps/chosen": -372.08636474609375, "logps/rejected": -426.42388916015625, "loss": 0.5342, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.331385850906372, "rewards/margins": 0.6490964293479919, "rewards/rejected": -2.9804821014404297, "step": 1160 }, { "epoch": 0.93, "learning_rate": 7.111805515081531e-09, "logits/chosen": 0.02022993005812168, "logits/rejected": 0.41968393325805664, "logps/chosen": -363.818603515625, "logps/rejected": -447.7919006347656, "loss": 0.5312, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2372307777404785, "rewards/margins": 0.8540315628051758, "rewards/rejected": -3.0912623405456543, "step": 1170 }, { "epoch": 0.94, "learning_rate": 5.559858110443016e-09, "logits/chosen": 0.29695388674736023, "logits/rejected": 0.714096188545227, "logps/chosen": -372.5519714355469, "logps/rejected": -442.5354919433594, "loss": 0.5383, "rewards/accuracies": 0.71875, "rewards/chosen": -2.3107995986938477, "rewards/margins": 0.8070123791694641, "rewards/rejected": -3.117811918258667, "step": 1180 }, { "epoch": 0.95, "learning_rate": 4.196834827531276e-09, "logits/chosen": 0.140055850148201, "logits/rejected": 0.3409932255744934, "logps/chosen": -355.64324951171875, "logps/rejected": -447.585693359375, "loss": 0.5152, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.149151563644409, "rewards/margins": 0.7904965877532959, "rewards/rejected": -2.939648151397705, "step": 1190 }, { "epoch": 0.96, "learning_rate": 3.023789126611137e-09, "logits/chosen": 0.03294936567544937, "logits/rejected": 0.2933207154273987, "logps/chosen": -363.29290771484375, "logps/rejected": -435.640380859375, "loss": 0.5483, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.281057357788086, "rewards/margins": 0.7091296911239624, "rewards/rejected": -2.990186929702759, "step": 1200 }, { "epoch": 0.96, "eval_logits/chosen": 0.07418080419301987, "eval_logits/rejected": 0.32435521483421326, "eval_logps/chosen": -373.978515625, "eval_logps/rejected": -451.6764831542969, "eval_loss": 0.5440130829811096, "eval_rewards/accuracies": 0.7089552283287048, "eval_rewards/chosen": -2.2940499782562256, "eval_rewards/margins": 0.7113713622093201, "eval_rewards/rejected": -3.0054211616516113, "eval_runtime": 183.9013, "eval_samples_per_second": 46.503, "eval_steps_per_second": 0.729, "step": 1200 }, { "epoch": 0.96, "learning_rate": 2.041627637121929e-09, "logits/chosen": 0.10010697692632675, "logits/rejected": 0.3795483410358429, "logps/chosen": -348.8675231933594, "logps/rejected": -437.20361328125, "loss": 0.5398, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.087364673614502, "rewards/margins": 0.828387439250946, "rewards/rejected": -2.9157521724700928, "step": 1210 }, { "epoch": 0.97, "learning_rate": 1.2511094569571668e-09, "logits/chosen": 0.09991980344057083, "logits/rejected": 0.4467397630214691, "logps/chosen": -380.14520263671875, "logps/rejected": -440.24658203125, "loss": 0.5345, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.253425121307373, "rewards/margins": 0.702509880065918, "rewards/rejected": -2.955935001373291, "step": 1220 }, { "epoch": 0.98, "learning_rate": 6.528455657691112e-10, "logits/chosen": 0.11626466363668442, "logits/rejected": 0.41348797082901, "logps/chosen": -372.7298889160156, "logps/rejected": -427.22576904296875, "loss": 0.549, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2800345420837402, "rewards/margins": 0.6291176080703735, "rewards/rejected": -2.909151792526245, "step": 1230 }, { "epoch": 0.99, "learning_rate": 2.4729835275189016e-10, "logits/chosen": 0.06715863198041916, "logits/rejected": 0.29241910576820374, "logps/chosen": -393.8903503417969, "logps/rejected": -477.9420471191406, "loss": 0.5462, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.323488473892212, "rewards/margins": 0.8067766427993774, "rewards/rejected": -3.1302647590637207, "step": 1240 }, { "epoch": 0.99, "learning_rate": 3.478125926756337e-11, "logits/chosen": 0.25983649492263794, "logits/rejected": 0.4905417561531067, "logps/chosen": -364.73431396484375, "logps/rejected": -443.79296875, "loss": 0.5474, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2353272438049316, "rewards/margins": 0.771331787109375, "rewards/rejected": -3.0066590309143066, "step": 1250 }, { "epoch": 1.0, "step": 1256, "total_flos": 0.0, "train_loss": 0.5712926928784438, "train_runtime": 11525.4961, "train_samples_per_second": 13.952, "train_steps_per_second": 0.109 } ], "logging_steps": 10, "max_steps": 1256, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }