{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 3112, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.353319057815846e-10, "logits/chosen": -2.322030782699585, "logits/rejected": -2.360077381134033, "logps/chosen": -413.0701599121094, "logps/rejected": -503.9693603515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.353319057815845e-09, "logits/chosen": -2.3369531631469727, "logits/rejected": -2.352255344390869, "logps/chosen": -334.3316650390625, "logps/rejected": -329.3804016113281, "loss": 0.6949, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.0067011509090662, "rewards/margins": 0.011725478805601597, "rewards/rejected": -0.005024327430874109, "step": 10 }, { "epoch": 0.01, "learning_rate": 1.070663811563169e-08, "logits/chosen": -2.307915210723877, "logits/rejected": -2.3184759616851807, "logps/chosen": -383.9498291015625, "logps/rejected": -349.3071594238281, "loss": 0.6939, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.020408455282449722, "rewards/margins": -0.008849766105413437, "rewards/rejected": -0.011558687314391136, "step": 20 }, { "epoch": 0.02, "learning_rate": 1.6059957173447538e-08, "logits/chosen": -2.3540706634521484, "logits/rejected": -2.3323521614074707, "logps/chosen": -382.1279602050781, "logps/rejected": -429.32147216796875, "loss": 0.6794, "rewards/accuracies": 0.5625, "rewards/chosen": -0.009235721081495285, "rewards/margins": 0.022110218182206154, "rewards/rejected": -0.03134594112634659, "step": 30 }, { "epoch": 0.03, "learning_rate": 2.141327623126338e-08, "logits/chosen": -2.281588315963745, "logits/rejected": -2.2917075157165527, "logps/chosen": -329.18243408203125, "logps/rejected": -268.74761962890625, "loss": 0.6573, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.008301397785544395, "rewards/margins": 0.07960359007120132, "rewards/rejected": -0.07130218297243118, "step": 40 }, { "epoch": 0.03, "learning_rate": 2.676659528907923e-08, "logits/chosen": -2.2929608821868896, "logits/rejected": -2.3293919563293457, "logps/chosen": -348.1233215332031, "logps/rejected": -352.0605163574219, "loss": 0.6243, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.02957112155854702, "rewards/margins": 0.16750425100326538, "rewards/rejected": -0.1379331350326538, "step": 50 }, { "epoch": 0.04, "learning_rate": 3.2119914346895076e-08, "logits/chosen": -2.2743587493896484, "logits/rejected": -2.271085262298584, "logps/chosen": -369.2187805175781, "logps/rejected": -364.81781005859375, "loss": 0.5746, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.016968611627817154, "rewards/margins": 0.25147581100463867, "rewards/rejected": -0.23450717329978943, "step": 60 }, { "epoch": 0.04, "learning_rate": 3.747323340471092e-08, "logits/chosen": -2.3003923892974854, "logits/rejected": -2.3059000968933105, "logps/chosen": -358.39813232421875, "logps/rejected": -377.95391845703125, "loss": 0.5103, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.06770019978284836, "rewards/margins": 0.510492205619812, "rewards/rejected": -0.44279199838638306, "step": 70 }, { "epoch": 0.05, "learning_rate": 4.282655246252676e-08, "logits/chosen": -2.2736544609069824, "logits/rejected": -2.2347025871276855, "logps/chosen": -350.6732177734375, "logps/rejected": -310.04315185546875, "loss": 0.4697, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.11150838434696198, "rewards/margins": 0.5623964071273804, "rewards/rejected": -0.45088809728622437, "step": 80 }, { "epoch": 0.06, "learning_rate": 4.817987152034261e-08, "logits/chosen": -2.2885279655456543, "logits/rejected": -2.2864298820495605, "logps/chosen": -376.4400939941406, "logps/rejected": -356.20245361328125, "loss": 0.4095, "rewards/accuracies": 0.875, "rewards/chosen": 0.13773031532764435, "rewards/margins": 0.7758801579475403, "rewards/rejected": -0.6381498575210571, "step": 90 }, { "epoch": 0.06, "learning_rate": 5.353319057815846e-08, "logits/chosen": -2.2083301544189453, "logits/rejected": -2.191849946975708, "logps/chosen": -388.324462890625, "logps/rejected": -376.165771484375, "loss": 0.3876, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.19459189474582672, "rewards/margins": 0.9206008911132812, "rewards/rejected": -0.7260090112686157, "step": 100 }, { "epoch": 0.06, "eval_logits/chosen": -2.2498624324798584, "eval_logits/rejected": -2.221994400024414, "eval_logps/chosen": -328.4210510253906, "eval_logps/rejected": -324.17828369140625, "eval_loss": 0.3689849376678467, "eval_rewards/accuracies": 0.8984375, "eval_rewards/chosen": 0.0942004844546318, "eval_rewards/margins": 0.8769543170928955, "eval_rewards/rejected": -0.7827538251876831, "eval_runtime": 76.5553, "eval_samples_per_second": 13.062, "eval_steps_per_second": 0.418, "step": 100 }, { "epoch": 0.07, "learning_rate": 5.88865096359743e-08, "logits/chosen": -2.2286324501037598, "logits/rejected": -2.2091312408447266, "logps/chosen": -372.5191650390625, "logps/rejected": -399.0936584472656, "loss": 0.3497, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.1948019564151764, "rewards/margins": 1.1746506690979004, "rewards/rejected": -0.9798487424850464, "step": 110 }, { "epoch": 0.08, "learning_rate": 6.423982869379015e-08, "logits/chosen": -2.1091272830963135, "logits/rejected": -2.118820905685425, "logps/chosen": -361.56353759765625, "logps/rejected": -404.4062805175781, "loss": 0.3081, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.11251312494277954, "rewards/margins": 1.413537621498108, "rewards/rejected": -1.3010244369506836, "step": 120 }, { "epoch": 0.08, "learning_rate": 6.959314775160599e-08, "logits/chosen": -2.1156742572784424, "logits/rejected": -2.092258930206299, "logps/chosen": -406.53851318359375, "logps/rejected": -382.41717529296875, "loss": 0.2846, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.18506963551044464, "rewards/margins": 1.6774908304214478, "rewards/rejected": -1.492421269416809, "step": 130 }, { "epoch": 0.09, "learning_rate": 7.494646680942184e-08, "logits/chosen": -2.092482089996338, "logits/rejected": -2.0639469623565674, "logps/chosen": -364.37774658203125, "logps/rejected": -419.9878845214844, "loss": 0.2908, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.16728416085243225, "rewards/margins": 1.9277279376983643, "rewards/rejected": -1.760443925857544, "step": 140 }, { "epoch": 0.1, "learning_rate": 8.029978586723767e-08, "logits/chosen": -2.035637378692627, "logits/rejected": -2.007483959197998, "logps/chosen": -364.07745361328125, "logps/rejected": -378.3775939941406, "loss": 0.2519, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.009305549785494804, "rewards/margins": 1.7874408960342407, "rewards/rejected": -1.7967464923858643, "step": 150 }, { "epoch": 0.1, "learning_rate": 8.565310492505352e-08, "logits/chosen": -1.9844402074813843, "logits/rejected": -1.9815524816513062, "logps/chosen": -384.30645751953125, "logps/rejected": -454.4593811035156, "loss": 0.2155, "rewards/accuracies": 0.9375, "rewards/chosen": -0.009780190885066986, "rewards/margins": 2.3053088188171387, "rewards/rejected": -2.315088987350464, "step": 160 }, { "epoch": 0.11, "learning_rate": 9.100642398286937e-08, "logits/chosen": -1.8912779092788696, "logits/rejected": -1.8273910284042358, "logps/chosen": -378.23199462890625, "logps/rejected": -344.0122985839844, "loss": 0.2089, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13415472209453583, "rewards/margins": 2.2053399085998535, "rewards/rejected": -2.3394949436187744, "step": 170 }, { "epoch": 0.12, "learning_rate": 9.635974304068522e-08, "logits/chosen": -1.8499170541763306, "logits/rejected": -1.8102867603302002, "logps/chosen": -337.0228271484375, "logps/rejected": -421.8775329589844, "loss": 0.1974, "rewards/accuracies": 0.9375, "rewards/chosen": -0.24989835917949677, "rewards/margins": 2.7448952198028564, "rewards/rejected": -2.99479341506958, "step": 180 }, { "epoch": 0.12, "learning_rate": 1.0171306209850107e-07, "logits/chosen": -1.8178815841674805, "logits/rejected": -1.8075618743896484, "logps/chosen": -420.0069885253906, "logps/rejected": -451.9561462402344, "loss": 0.1646, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2596689462661743, "rewards/margins": 2.7779135704040527, "rewards/rejected": -3.0375826358795166, "step": 190 }, { "epoch": 0.13, "learning_rate": 1.0706638115631692e-07, "logits/chosen": -1.8774926662445068, "logits/rejected": -1.7711089849472046, "logps/chosen": -368.1988830566406, "logps/rejected": -364.1625671386719, "loss": 0.1791, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.20116904377937317, "rewards/margins": 3.0269274711608887, "rewards/rejected": -3.2280964851379395, "step": 200 }, { "epoch": 0.13, "eval_logits/chosen": -1.9170525074005127, "eval_logits/rejected": -1.8318464756011963, "eval_logps/chosen": -330.80859375, "eval_logps/rejected": -343.30322265625, "eval_loss": 0.19462376832962036, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": -0.14455503225326538, "eval_rewards/margins": 2.5506958961486816, "eval_rewards/rejected": -2.695250988006592, "eval_runtime": 76.4342, "eval_samples_per_second": 13.083, "eval_steps_per_second": 0.419, "step": 200 }, { "epoch": 0.13, "learning_rate": 1.1241970021413276e-07, "logits/chosen": -1.870164155960083, "logits/rejected": -1.7529096603393555, "logps/chosen": -394.24310302734375, "logps/rejected": -359.798828125, "loss": 0.1689, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.23141837120056152, "rewards/margins": 2.955934524536133, "rewards/rejected": -3.1873526573181152, "step": 210 }, { "epoch": 0.14, "learning_rate": 1.177730192719486e-07, "logits/chosen": -1.7515084743499756, "logits/rejected": -1.6350901126861572, "logps/chosen": -399.573486328125, "logps/rejected": -395.66864013671875, "loss": 0.1476, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.43568697571754456, "rewards/margins": 3.202346086502075, "rewards/rejected": -3.638033390045166, "step": 220 }, { "epoch": 0.15, "learning_rate": 1.2312633832976445e-07, "logits/chosen": -1.7598545551300049, "logits/rejected": -1.6209675073623657, "logps/chosen": -397.47747802734375, "logps/rejected": -407.18780517578125, "loss": 0.1534, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2008744478225708, "rewards/margins": 3.397059917449951, "rewards/rejected": -3.5979347229003906, "step": 230 }, { "epoch": 0.15, "learning_rate": 1.284796573875803e-07, "logits/chosen": -1.7311824560165405, "logits/rejected": -1.6193158626556396, "logps/chosen": -379.2996520996094, "logps/rejected": -434.6549377441406, "loss": 0.136, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4003673493862152, "rewards/margins": 3.5797741413116455, "rewards/rejected": -3.9801411628723145, "step": 240 }, { "epoch": 0.16, "learning_rate": 1.3383297644539615e-07, "logits/chosen": -1.7749998569488525, "logits/rejected": -1.6336084604263306, "logps/chosen": -384.36016845703125, "logps/rejected": -412.198974609375, "loss": 0.1424, "rewards/accuracies": 0.9375, "rewards/chosen": -0.44941583275794983, "rewards/margins": 4.18085241317749, "rewards/rejected": -4.630268096923828, "step": 250 }, { "epoch": 0.17, "learning_rate": 1.3918629550321198e-07, "logits/chosen": -1.6577208042144775, "logits/rejected": -1.4876978397369385, "logps/chosen": -368.1956481933594, "logps/rejected": -311.98223876953125, "loss": 0.1379, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5439226031303406, "rewards/margins": 3.506497621536255, "rewards/rejected": -4.050419807434082, "step": 260 }, { "epoch": 0.17, "learning_rate": 1.4453961456102785e-07, "logits/chosen": -1.6528087854385376, "logits/rejected": -1.5587131977081299, "logps/chosen": -357.02655029296875, "logps/rejected": -449.92718505859375, "loss": 0.1273, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7018794417381287, "rewards/margins": 4.135760307312012, "rewards/rejected": -4.837639808654785, "step": 270 }, { "epoch": 0.18, "learning_rate": 1.4989293361884367e-07, "logits/chosen": -1.6728260517120361, "logits/rejected": -1.4406911134719849, "logps/chosen": -383.44635009765625, "logps/rejected": -405.33428955078125, "loss": 0.1286, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.7234546542167664, "rewards/margins": 4.43070125579834, "rewards/rejected": -5.154156684875488, "step": 280 }, { "epoch": 0.19, "learning_rate": 1.5524625267665952e-07, "logits/chosen": -1.5811737775802612, "logits/rejected": -1.4415075778961182, "logps/chosen": -380.16302490234375, "logps/rejected": -385.4619140625, "loss": 0.1184, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6038004159927368, "rewards/margins": 4.522972106933594, "rewards/rejected": -5.126772403717041, "step": 290 }, { "epoch": 0.19, "learning_rate": 1.6059957173447535e-07, "logits/chosen": -1.5964694023132324, "logits/rejected": -1.5073761940002441, "logps/chosen": -348.82110595703125, "logps/rejected": -396.21234130859375, "loss": 0.1218, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7199587821960449, "rewards/margins": 4.322809219360352, "rewards/rejected": -5.042768478393555, "step": 300 }, { "epoch": 0.19, "eval_logits/chosen": -1.7016242742538452, "eval_logits/rejected": -1.5659476518630981, "eval_logps/chosen": -336.16741943359375, "eval_logps/rejected": -363.2919006347656, "eval_loss": 0.1464938521385193, "eval_rewards/accuracies": 0.921875, "eval_rewards/chosen": -0.6804376840591431, "eval_rewards/margins": 4.013677597045898, "eval_rewards/rejected": -4.69411563873291, "eval_runtime": 76.4822, "eval_samples_per_second": 13.075, "eval_steps_per_second": 0.418, "step": 300 }, { "epoch": 0.2, "learning_rate": 1.6595289079229122e-07, "logits/chosen": -1.5078377723693848, "logits/rejected": -1.4056973457336426, "logps/chosen": -412.2068786621094, "logps/rejected": -462.7405700683594, "loss": 0.1094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.718084454536438, "rewards/margins": 5.0283284187316895, "rewards/rejected": -5.746413230895996, "step": 310 }, { "epoch": 0.21, "learning_rate": 1.7130620985010704e-07, "logits/chosen": -1.586196780204773, "logits/rejected": -1.4498493671417236, "logps/chosen": -351.50775146484375, "logps/rejected": -403.0026550292969, "loss": 0.1016, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6627265214920044, "rewards/margins": 4.463755130767822, "rewards/rejected": -5.126482009887695, "step": 320 }, { "epoch": 0.21, "learning_rate": 1.766595289079229e-07, "logits/chosen": -1.5722558498382568, "logits/rejected": -1.340435266494751, "logps/chosen": -337.4762878417969, "logps/rejected": -419.3818359375, "loss": 0.1221, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.869371235370636, "rewards/margins": 5.570626258850098, "rewards/rejected": -6.439997673034668, "step": 330 }, { "epoch": 0.22, "learning_rate": 1.8201284796573874e-07, "logits/chosen": -1.563241958618164, "logits/rejected": -1.43202805519104, "logps/chosen": -332.5054016113281, "logps/rejected": -401.9011535644531, "loss": 0.1141, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7410942912101746, "rewards/margins": 5.449254512786865, "rewards/rejected": -6.1903486251831055, "step": 340 }, { "epoch": 0.22, "learning_rate": 1.873661670235546e-07, "logits/chosen": -1.509913682937622, "logits/rejected": -1.352683186531067, "logps/chosen": -428.9737243652344, "logps/rejected": -455.664306640625, "loss": 0.1092, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7532263398170471, "rewards/margins": 5.451329708099365, "rewards/rejected": -6.204555988311768, "step": 350 }, { "epoch": 0.23, "learning_rate": 1.9271948608137044e-07, "logits/chosen": -1.6219438314437866, "logits/rejected": -1.3939545154571533, "logps/chosen": -389.11297607421875, "logps/rejected": -411.80035400390625, "loss": 0.1218, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0884325504302979, "rewards/margins": 5.317915439605713, "rewards/rejected": -6.406347751617432, "step": 360 }, { "epoch": 0.24, "learning_rate": 1.980728051391863e-07, "logits/chosen": -1.465427041053772, "logits/rejected": -1.3383185863494873, "logps/chosen": -373.2828369140625, "logps/rejected": -433.7022399902344, "loss": 0.1098, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.5387589931488037, "rewards/margins": 5.308978080749512, "rewards/rejected": -6.8477373123168945, "step": 370 }, { "epoch": 0.24, "learning_rate": 2.0342612419700214e-07, "logits/chosen": -1.4776766300201416, "logits/rejected": -1.390604019165039, "logps/chosen": -384.6776428222656, "logps/rejected": -409.8622131347656, "loss": 0.1068, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1275439262390137, "rewards/margins": 5.649918079376221, "rewards/rejected": -6.777462005615234, "step": 380 }, { "epoch": 0.25, "learning_rate": 2.0877944325481796e-07, "logits/chosen": -1.496004343032837, "logits/rejected": -1.2643308639526367, "logps/chosen": -319.6006774902344, "logps/rejected": -435.0767517089844, "loss": 0.0963, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9759249687194824, "rewards/margins": 5.894278526306152, "rewards/rejected": -6.870204925537109, "step": 390 }, { "epoch": 0.26, "learning_rate": 2.1413276231263384e-07, "logits/chosen": -1.373808741569519, "logits/rejected": -1.2059423923492432, "logps/chosen": -355.2186584472656, "logps/rejected": -375.163330078125, "loss": 0.1065, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.117621660232544, "rewards/margins": 5.132685661315918, "rewards/rejected": -6.250307559967041, "step": 400 }, { "epoch": 0.26, "eval_logits/chosen": -1.5739349126815796, "eval_logits/rejected": -1.3864831924438477, "eval_logps/chosen": -342.44061279296875, "eval_logps/rejected": -380.4227600097656, "eval_loss": 0.1241927221417427, "eval_rewards/accuracies": 0.921875, "eval_rewards/chosen": -1.3077539205551147, "eval_rewards/margins": 5.099446773529053, "eval_rewards/rejected": -6.407200336456299, "eval_runtime": 76.309, "eval_samples_per_second": 13.105, "eval_steps_per_second": 0.419, "step": 400 }, { "epoch": 0.26, "learning_rate": 2.1948608137044966e-07, "logits/chosen": -1.453360915184021, "logits/rejected": -1.215315341949463, "logps/chosen": -309.32196044921875, "logps/rejected": -383.0122985839844, "loss": 0.0932, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0941340923309326, "rewards/margins": 5.864490985870361, "rewards/rejected": -6.958624839782715, "step": 410 }, { "epoch": 0.27, "learning_rate": 2.248394004282655e-07, "logits/chosen": -1.6208438873291016, "logits/rejected": -1.3579902648925781, "logps/chosen": -420.99951171875, "logps/rejected": -458.4291076660156, "loss": 0.0972, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9325690269470215, "rewards/margins": 6.371849536895752, "rewards/rejected": -7.304419040679932, "step": 420 }, { "epoch": 0.28, "learning_rate": 2.3019271948608136e-07, "logits/chosen": -1.5590957403182983, "logits/rejected": -1.3185454607009888, "logps/chosen": -433.14007568359375, "logps/rejected": -423.8736877441406, "loss": 0.0955, "rewards/accuracies": 0.9375, "rewards/chosen": -0.888096034526825, "rewards/margins": 6.284546852111816, "rewards/rejected": -7.172643184661865, "step": 430 }, { "epoch": 0.28, "learning_rate": 2.355460385438972e-07, "logits/chosen": -1.5379421710968018, "logits/rejected": -1.287793755531311, "logps/chosen": -384.4920959472656, "logps/rejected": -459.51495361328125, "loss": 0.0925, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9522300958633423, "rewards/margins": 6.38286828994751, "rewards/rejected": -7.3350982666015625, "step": 440 }, { "epoch": 0.29, "learning_rate": 2.4089935760171303e-07, "logits/chosen": -1.539567232131958, "logits/rejected": -1.2881194353103638, "logps/chosen": -360.4696960449219, "logps/rejected": -421.70953369140625, "loss": 0.0983, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.032198190689087, "rewards/margins": 6.628798484802246, "rewards/rejected": -7.660996437072754, "step": 450 }, { "epoch": 0.3, "learning_rate": 2.462526766595289e-07, "logits/chosen": -1.5270180702209473, "logits/rejected": -1.213521122932434, "logps/chosen": -341.48370361328125, "logps/rejected": -416.1814880371094, "loss": 0.0862, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2647520303726196, "rewards/margins": 7.327714443206787, "rewards/rejected": -8.592466354370117, "step": 460 }, { "epoch": 0.3, "learning_rate": 2.5160599571734473e-07, "logits/chosen": -1.539104700088501, "logits/rejected": -1.304837942123413, "logps/chosen": -412.41192626953125, "logps/rejected": -530.3709106445312, "loss": 0.0685, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0689477920532227, "rewards/margins": 7.198890686035156, "rewards/rejected": -8.267837524414062, "step": 470 }, { "epoch": 0.31, "learning_rate": 2.569593147751606e-07, "logits/chosen": -1.4452335834503174, "logits/rejected": -1.177215576171875, "logps/chosen": -356.6268005371094, "logps/rejected": -386.1933288574219, "loss": 0.0906, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5285716652870178, "rewards/margins": 7.398108005523682, "rewards/rejected": -7.9266791343688965, "step": 480 }, { "epoch": 0.31, "learning_rate": 2.6231263383297643e-07, "logits/chosen": -1.299133062362671, "logits/rejected": -1.2396031618118286, "logps/chosen": -354.5841979980469, "logps/rejected": -451.559326171875, "loss": 0.0801, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7138751745223999, "rewards/margins": 6.743910789489746, "rewards/rejected": -7.457786560058594, "step": 490 }, { "epoch": 0.32, "learning_rate": 2.676659528907923e-07, "logits/chosen": -1.2933090925216675, "logits/rejected": -1.0351635217666626, "logps/chosen": -403.50091552734375, "logps/rejected": -444.50811767578125, "loss": 0.0804, "rewards/accuracies": 1.0, "rewards/chosen": -1.2215214967727661, "rewards/margins": 7.684378623962402, "rewards/rejected": -8.905900955200195, "step": 500 }, { "epoch": 0.32, "eval_logits/chosen": -1.5204213857650757, "eval_logits/rejected": -1.313443899154663, "eval_logps/chosen": -341.24200439453125, "eval_logps/rejected": -387.53155517578125, "eval_loss": 0.11293376982212067, "eval_rewards/accuracies": 0.9296875, "eval_rewards/chosen": -1.1878938674926758, "eval_rewards/margins": 5.9301862716674805, "eval_rewards/rejected": -7.118079662322998, "eval_runtime": 76.7239, "eval_samples_per_second": 13.034, "eval_steps_per_second": 0.417, "step": 500 }, { "epoch": 0.33, "learning_rate": 2.7301927194860813e-07, "logits/chosen": -1.4917323589324951, "logits/rejected": -1.2333643436431885, "logps/chosen": -391.20941162109375, "logps/rejected": -462.3330993652344, "loss": 0.0677, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9355085492134094, "rewards/margins": 8.421406745910645, "rewards/rejected": -9.356914520263672, "step": 510 }, { "epoch": 0.33, "learning_rate": 2.7837259100642395e-07, "logits/chosen": -1.3520570993423462, "logits/rejected": -1.1625728607177734, "logps/chosen": -392.84210205078125, "logps/rejected": -441.2123107910156, "loss": 0.084, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.286471962928772, "rewards/margins": 7.407790184020996, "rewards/rejected": -8.69426155090332, "step": 520 }, { "epoch": 0.34, "learning_rate": 2.8372591006423977e-07, "logits/chosen": -1.2561280727386475, "logits/rejected": -1.0563820600509644, "logps/chosen": -355.0276794433594, "logps/rejected": -434.96099853515625, "loss": 0.0754, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.644857406616211, "rewards/margins": 7.365194797515869, "rewards/rejected": -9.010051727294922, "step": 530 }, { "epoch": 0.35, "learning_rate": 2.890792291220557e-07, "logits/chosen": -1.3786545991897583, "logits/rejected": -1.1711790561676025, "logps/chosen": -343.2646179199219, "logps/rejected": -414.624267578125, "loss": 0.0644, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.0238958597183228, "rewards/margins": 7.4112067222595215, "rewards/rejected": -8.435102462768555, "step": 540 }, { "epoch": 0.35, "learning_rate": 2.944325481798715e-07, "logits/chosen": -1.483944296836853, "logits/rejected": -1.2009809017181396, "logps/chosen": -397.4231872558594, "logps/rejected": -453.69903564453125, "loss": 0.0742, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.842347264289856, "rewards/margins": 7.386562347412109, "rewards/rejected": -8.228910446166992, "step": 550 }, { "epoch": 0.36, "learning_rate": 2.9978586723768735e-07, "logits/chosen": -1.4178438186645508, "logits/rejected": -1.243758201599121, "logps/chosen": -412.52105712890625, "logps/rejected": -464.28741455078125, "loss": 0.0944, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.449329137802124, "rewards/margins": 8.025833129882812, "rewards/rejected": -9.475163459777832, "step": 560 }, { "epoch": 0.37, "learning_rate": 3.051391862955032e-07, "logits/chosen": -1.3710880279541016, "logits/rejected": -1.218766450881958, "logps/chosen": -344.7862548828125, "logps/rejected": -453.5824279785156, "loss": 0.0435, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8545820713043213, "rewards/margins": 7.798386573791504, "rewards/rejected": -9.652968406677246, "step": 570 }, { "epoch": 0.37, "learning_rate": 3.1049250535331905e-07, "logits/chosen": -1.461828589439392, "logits/rejected": -1.0271979570388794, "logps/chosen": -387.4298095703125, "logps/rejected": -435.98663330078125, "loss": 0.0867, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7421836853027344, "rewards/margins": 7.6707444190979, "rewards/rejected": -9.412927627563477, "step": 580 }, { "epoch": 0.38, "learning_rate": 3.1584582441113487e-07, "logits/chosen": -1.3038650751113892, "logits/rejected": -1.07460618019104, "logps/chosen": -324.758544921875, "logps/rejected": -503.78387451171875, "loss": 0.0716, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3957910537719727, "rewards/margins": 8.855663299560547, "rewards/rejected": -11.25145435333252, "step": 590 }, { "epoch": 0.39, "learning_rate": 3.211991434689507e-07, "logits/chosen": -1.3503267765045166, "logits/rejected": -1.0638809204101562, "logps/chosen": -374.38348388671875, "logps/rejected": -516.7820434570312, "loss": 0.0767, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.1524558067321777, "rewards/margins": 8.407071113586426, "rewards/rejected": -10.559527397155762, "step": 600 }, { "epoch": 0.39, "eval_logits/chosen": -1.4297420978546143, "eval_logits/rejected": -1.150868535041809, "eval_logps/chosen": -353.238037109375, "eval_logps/rejected": -407.5989990234375, "eval_loss": 0.13097576797008514, "eval_rewards/accuracies": 0.8984375, "eval_rewards/chosen": -2.387495994567871, "eval_rewards/margins": 6.737332820892334, "eval_rewards/rejected": -9.124829292297363, "eval_runtime": 76.6752, "eval_samples_per_second": 13.042, "eval_steps_per_second": 0.417, "step": 600 }, { "epoch": 0.39, "learning_rate": 3.265524625267666e-07, "logits/chosen": -1.1903326511383057, "logits/rejected": -0.9525307416915894, "logps/chosen": -419.54608154296875, "logps/rejected": -444.11383056640625, "loss": 0.0831, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.369199752807617, "rewards/margins": 8.518608093261719, "rewards/rejected": -10.887807846069336, "step": 610 }, { "epoch": 0.4, "learning_rate": 3.3190578158458244e-07, "logits/chosen": -1.3438398838043213, "logits/rejected": -1.143065333366394, "logps/chosen": -369.13970947265625, "logps/rejected": -439.51715087890625, "loss": 0.0779, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6147950887680054, "rewards/margins": 7.42797327041626, "rewards/rejected": -9.042768478393555, "step": 620 }, { "epoch": 0.4, "learning_rate": 3.3725910064239827e-07, "logits/chosen": -1.4843438863754272, "logits/rejected": -1.0890620946884155, "logps/chosen": -409.73583984375, "logps/rejected": -517.4986572265625, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": -1.1902521848678589, "rewards/margins": 9.822367668151855, "rewards/rejected": -11.01262092590332, "step": 630 }, { "epoch": 0.41, "learning_rate": 3.426124197002141e-07, "logits/chosen": -1.3501672744750977, "logits/rejected": -1.0996173620224, "logps/chosen": -360.51531982421875, "logps/rejected": -489.69061279296875, "loss": 0.0968, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6506973505020142, "rewards/margins": 8.347593307495117, "rewards/rejected": -9.998289108276367, "step": 640 }, { "epoch": 0.42, "learning_rate": 3.4796573875802996e-07, "logits/chosen": -1.2105796337127686, "logits/rejected": -0.9770170450210571, "logps/chosen": -335.6686096191406, "logps/rejected": -449.52447509765625, "loss": 0.0659, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.2099552154541016, "rewards/margins": 7.764392852783203, "rewards/rejected": -9.974349021911621, "step": 650 }, { "epoch": 0.42, "learning_rate": 3.533190578158458e-07, "logits/chosen": -1.4429116249084473, "logits/rejected": -1.06112539768219, "logps/chosen": -420.8074645996094, "logps/rejected": -442.88568115234375, "loss": 0.0967, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.681353211402893, "rewards/margins": 7.297033786773682, "rewards/rejected": -8.978387832641602, "step": 660 }, { "epoch": 0.43, "learning_rate": 3.5867237687366166e-07, "logits/chosen": -1.4578073024749756, "logits/rejected": -1.1529022455215454, "logps/chosen": -362.67230224609375, "logps/rejected": -455.08367919921875, "loss": 0.0694, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.878594994544983, "rewards/margins": 8.333452224731445, "rewards/rejected": -10.212045669555664, "step": 670 }, { "epoch": 0.44, "learning_rate": 3.640256959314775e-07, "logits/chosen": -1.34915292263031, "logits/rejected": -1.125140905380249, "logps/chosen": -364.0298156738281, "logps/rejected": -492.75909423828125, "loss": 0.0744, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.043875217437744, "rewards/margins": 9.902002334594727, "rewards/rejected": -11.945877075195312, "step": 680 }, { "epoch": 0.44, "learning_rate": 3.6937901498929336e-07, "logits/chosen": -1.2846548557281494, "logits/rejected": -0.9907848238945007, "logps/chosen": -423.0797424316406, "logps/rejected": -499.71142578125, "loss": 0.0603, "rewards/accuracies": 1.0, "rewards/chosen": -2.9811477661132812, "rewards/margins": 9.256712913513184, "rewards/rejected": -12.237860679626465, "step": 690 }, { "epoch": 0.45, "learning_rate": 3.747323340471092e-07, "logits/chosen": -1.3538507223129272, "logits/rejected": -1.1379244327545166, "logps/chosen": -329.5379638671875, "logps/rejected": -413.8023986816406, "loss": 0.0759, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.2614781856536865, "rewards/margins": 8.729515075683594, "rewards/rejected": -10.990991592407227, "step": 700 }, { "epoch": 0.45, "eval_logits/chosen": -1.4886287450790405, "eval_logits/rejected": -1.2180323600769043, "eval_logps/chosen": -348.9230041503906, "eval_logps/rejected": -409.6312255859375, "eval_loss": 0.12044651806354523, "eval_rewards/accuracies": 0.9296875, "eval_rewards/chosen": -1.9559952020645142, "eval_rewards/margins": 7.372057914733887, "eval_rewards/rejected": -9.328052520751953, "eval_runtime": 76.578, "eval_samples_per_second": 13.059, "eval_steps_per_second": 0.418, "step": 700 }, { "epoch": 0.46, "learning_rate": 3.80085653104925e-07, "logits/chosen": -1.3313727378845215, "logits/rejected": -0.9123377799987793, "logps/chosen": -442.8067321777344, "logps/rejected": -468.52984619140625, "loss": 0.0905, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6124906539916992, "rewards/margins": 8.923405647277832, "rewards/rejected": -10.535896301269531, "step": 710 }, { "epoch": 0.46, "learning_rate": 3.854389721627409e-07, "logits/chosen": -1.4308016300201416, "logits/rejected": -1.0905249118804932, "logps/chosen": -378.02545166015625, "logps/rejected": -479.7505798339844, "loss": 0.0561, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7049999237060547, "rewards/margins": 8.536346435546875, "rewards/rejected": -10.241347312927246, "step": 720 }, { "epoch": 0.47, "learning_rate": 3.9079229122055676e-07, "logits/chosen": -1.3243951797485352, "logits/rejected": -0.9832109212875366, "logps/chosen": -431.05645751953125, "logps/rejected": -432.52886962890625, "loss": 0.0581, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.067716121673584, "rewards/margins": 8.114839553833008, "rewards/rejected": -10.18255615234375, "step": 730 }, { "epoch": 0.48, "learning_rate": 3.961456102783726e-07, "logits/chosen": -1.258172869682312, "logits/rejected": -1.0175604820251465, "logps/chosen": -404.3645935058594, "logps/rejected": -519.955322265625, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": -1.6845811605453491, "rewards/margins": 10.622332572937012, "rewards/rejected": -12.306914329528809, "step": 740 }, { "epoch": 0.48, "learning_rate": 4.014989293361884e-07, "logits/chosen": -1.28511381149292, "logits/rejected": -1.0871832370758057, "logps/chosen": -378.40478515625, "logps/rejected": -460.6148376464844, "loss": 0.0669, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8259923458099365, "rewards/margins": 8.321816444396973, "rewards/rejected": -10.147809982299805, "step": 750 }, { "epoch": 0.49, "learning_rate": 4.068522483940043e-07, "logits/chosen": -1.5064611434936523, "logits/rejected": -1.0779759883880615, "logps/chosen": -371.971435546875, "logps/rejected": -416.6922302246094, "loss": 0.0762, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.135917067527771, "rewards/margins": 10.22150993347168, "rewards/rejected": -11.357427597045898, "step": 760 }, { "epoch": 0.49, "learning_rate": 4.122055674518201e-07, "logits/chosen": -1.429022192955017, "logits/rejected": -1.0790882110595703, "logps/chosen": -394.6370544433594, "logps/rejected": -461.9344787597656, "loss": 0.0662, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.259974956512451, "rewards/margins": 9.807271003723145, "rewards/rejected": -12.067245483398438, "step": 770 }, { "epoch": 0.5, "learning_rate": 4.175588865096359e-07, "logits/chosen": -1.296064853668213, "logits/rejected": -0.9478403925895691, "logps/chosen": -376.76727294921875, "logps/rejected": -470.43743896484375, "loss": 0.0513, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.549682378768921, "rewards/margins": 9.523083686828613, "rewards/rejected": -11.072766304016113, "step": 780 }, { "epoch": 0.51, "learning_rate": 4.2291220556745175e-07, "logits/chosen": -1.309073567390442, "logits/rejected": -0.8560987710952759, "logps/chosen": -457.4974670410156, "logps/rejected": -528.8920288085938, "loss": 0.0616, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5288264751434326, "rewards/margins": 9.966497421264648, "rewards/rejected": -12.495325088500977, "step": 790 }, { "epoch": 0.51, "learning_rate": 4.282655246252677e-07, "logits/chosen": -1.2481553554534912, "logits/rejected": -0.764441728591919, "logps/chosen": -306.9283142089844, "logps/rejected": -417.22491455078125, "loss": 0.0681, "rewards/accuracies": 1.0, "rewards/chosen": -1.760056734085083, "rewards/margins": 10.229304313659668, "rewards/rejected": -11.989361763000488, "step": 800 }, { "epoch": 0.51, "eval_logits/chosen": -1.4168956279754639, "eval_logits/rejected": -1.0999643802642822, "eval_logps/chosen": -350.21661376953125, "eval_logps/rejected": -414.49041748046875, "eval_loss": 0.1091269925236702, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": -2.085355281829834, "eval_rewards/margins": 7.728612422943115, "eval_rewards/rejected": -9.81396770477295, "eval_runtime": 76.655, "eval_samples_per_second": 13.045, "eval_steps_per_second": 0.417, "step": 800 }, { "epoch": 0.52, "learning_rate": 4.336188436830835e-07, "logits/chosen": -1.092874526977539, "logits/rejected": -0.8753455281257629, "logps/chosen": -367.61846923828125, "logps/rejected": -431.23834228515625, "loss": 0.0496, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.793146848678589, "rewards/margins": 8.786299705505371, "rewards/rejected": -11.579446792602539, "step": 810 }, { "epoch": 0.53, "learning_rate": 4.389721627408993e-07, "logits/chosen": -1.02623450756073, "logits/rejected": -0.5294391512870789, "logps/chosen": -348.81463623046875, "logps/rejected": -391.2160339355469, "loss": 0.0634, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.466538906097412, "rewards/margins": 9.582557678222656, "rewards/rejected": -12.049097061157227, "step": 820 }, { "epoch": 0.53, "learning_rate": 4.443254817987152e-07, "logits/chosen": -1.1547324657440186, "logits/rejected": -0.6096884608268738, "logps/chosen": -426.49798583984375, "logps/rejected": -467.9422302246094, "loss": 0.072, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5411438941955566, "rewards/margins": 9.401772499084473, "rewards/rejected": -11.942916870117188, "step": 830 }, { "epoch": 0.54, "learning_rate": 4.49678800856531e-07, "logits/chosen": -1.247434377670288, "logits/rejected": -0.7016826868057251, "logps/chosen": -393.85430908203125, "logps/rejected": -482.9002990722656, "loss": 0.0467, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.051135301589966, "rewards/margins": 10.815892219543457, "rewards/rejected": -12.867027282714844, "step": 840 }, { "epoch": 0.55, "learning_rate": 4.5503211991434684e-07, "logits/chosen": -1.2653485536575317, "logits/rejected": -0.7517842054367065, "logps/chosen": -393.53643798828125, "logps/rejected": -458.5020446777344, "loss": 0.0962, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8538535833358765, "rewards/margins": 9.80111312866211, "rewards/rejected": -11.654967308044434, "step": 850 }, { "epoch": 0.55, "learning_rate": 4.603854389721627e-07, "logits/chosen": -1.0140388011932373, "logits/rejected": -0.5941162109375, "logps/chosen": -431.3036193847656, "logps/rejected": -522.5137939453125, "loss": 0.073, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.430864095687866, "rewards/margins": 10.6370267868042, "rewards/rejected": -13.067891120910645, "step": 860 }, { "epoch": 0.56, "learning_rate": 4.657387580299786e-07, "logits/chosen": -0.8724175691604614, "logits/rejected": -0.424823522567749, "logps/chosen": -405.14544677734375, "logps/rejected": -458.218994140625, "loss": 0.0896, "rewards/accuracies": 0.9375, "rewards/chosen": -3.2016243934631348, "rewards/margins": 10.069561958312988, "rewards/rejected": -13.271186828613281, "step": 870 }, { "epoch": 0.57, "learning_rate": 4.710920770877944e-07, "logits/chosen": -0.9462097883224487, "logits/rejected": -0.5895959138870239, "logps/chosen": -384.04046630859375, "logps/rejected": -430.6656188964844, "loss": 0.0703, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.604841947555542, "rewards/margins": 8.405369758605957, "rewards/rejected": -11.010213851928711, "step": 880 }, { "epoch": 0.57, "learning_rate": 4.7644539614561024e-07, "logits/chosen": -1.0459139347076416, "logits/rejected": -0.5180732011795044, "logps/chosen": -349.8458557128906, "logps/rejected": -463.24993896484375, "loss": 0.068, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0111186504364014, "rewards/margins": 10.823290824890137, "rewards/rejected": -12.8344087600708, "step": 890 }, { "epoch": 0.58, "learning_rate": 4.817987152034261e-07, "logits/chosen": -1.0149575471878052, "logits/rejected": -0.3724205493927002, "logps/chosen": -396.1781311035156, "logps/rejected": -512.572265625, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": -2.9190399646759033, "rewards/margins": 12.252616882324219, "rewards/rejected": -15.171656608581543, "step": 900 }, { "epoch": 0.58, "eval_logits/chosen": -1.1149048805236816, "eval_logits/rejected": -0.723557710647583, "eval_logps/chosen": -371.6729736328125, "eval_logps/rejected": -447.3422546386719, "eval_loss": 0.14773131906986237, "eval_rewards/accuracies": 0.890625, "eval_rewards/chosen": -4.230995178222656, "eval_rewards/margins": 8.868158340454102, "eval_rewards/rejected": -13.099154472351074, "eval_runtime": 76.5391, "eval_samples_per_second": 13.065, "eval_steps_per_second": 0.418, "step": 900 }, { "epoch": 0.58, "learning_rate": 4.871520342612419e-07, "logits/chosen": -1.0406441688537598, "logits/rejected": -0.7672456502914429, "logps/chosen": -367.68988037109375, "logps/rejected": -497.24542236328125, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": -2.375262498855591, "rewards/margins": 11.108481407165527, "rewards/rejected": -13.483744621276855, "step": 910 }, { "epoch": 0.59, "learning_rate": 4.925053533190578e-07, "logits/chosen": -1.3684611320495605, "logits/rejected": -0.7619699239730835, "logps/chosen": -395.2074279785156, "logps/rejected": -435.02490234375, "loss": 0.0771, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.186323642730713, "rewards/margins": 9.919301986694336, "rewards/rejected": -12.10562515258789, "step": 920 }, { "epoch": 0.6, "learning_rate": 4.978586723768736e-07, "logits/chosen": -1.1214783191680908, "logits/rejected": -0.5191805362701416, "logps/chosen": -415.1065979003906, "logps/rejected": -468.7010192871094, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": -3.4811503887176514, "rewards/margins": 9.906661987304688, "rewards/rejected": -13.387812614440918, "step": 930 }, { "epoch": 0.6, "learning_rate": 4.996429421566293e-07, "logits/chosen": -1.2635540962219238, "logits/rejected": -0.628160834312439, "logps/chosen": -416.54644775390625, "logps/rejected": -465.719970703125, "loss": 0.1156, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.1887431144714355, "rewards/margins": 9.464393615722656, "rewards/rejected": -12.65313720703125, "step": 940 }, { "epoch": 0.61, "learning_rate": 4.990478457510116e-07, "logits/chosen": -1.3375509977340698, "logits/rejected": -0.5667593479156494, "logps/chosen": -430.92742919921875, "logps/rejected": -532.2154541015625, "loss": 0.0961, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9553847312927246, "rewards/margins": 12.077500343322754, "rewards/rejected": -15.03288745880127, "step": 950 }, { "epoch": 0.62, "learning_rate": 4.98452749345394e-07, "logits/chosen": -1.117200493812561, "logits/rejected": -0.6453494429588318, "logps/chosen": -424.7369689941406, "logps/rejected": -452.61749267578125, "loss": 0.0766, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.5312678813934326, "rewards/margins": 10.04582405090332, "rewards/rejected": -13.577092170715332, "step": 960 }, { "epoch": 0.62, "learning_rate": 4.978576529397762e-07, "logits/chosen": -1.1340444087982178, "logits/rejected": -0.656354546546936, "logps/chosen": -389.5958557128906, "logps/rejected": -524.0951538085938, "loss": 0.0653, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.064024448394775, "rewards/margins": 11.442460060119629, "rewards/rejected": -16.506484985351562, "step": 970 }, { "epoch": 0.63, "learning_rate": 4.972625565341585e-07, "logits/chosen": -1.2187734842300415, "logits/rejected": -0.6018660664558411, "logps/chosen": -386.0228271484375, "logps/rejected": -454.13140869140625, "loss": 0.0838, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.2799274921417236, "rewards/margins": 9.308359146118164, "rewards/rejected": -12.588286399841309, "step": 980 }, { "epoch": 0.64, "learning_rate": 4.966674601285408e-07, "logits/chosen": -1.111574411392212, "logits/rejected": -0.7768339514732361, "logps/chosen": -366.3212585449219, "logps/rejected": -501.3627014160156, "loss": 0.0817, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8082051277160645, "rewards/margins": 10.757050514221191, "rewards/rejected": -13.565256118774414, "step": 990 }, { "epoch": 0.64, "learning_rate": 4.960723637229232e-07, "logits/chosen": -0.855319619178772, "logits/rejected": -0.4498973786830902, "logps/chosen": -356.251708984375, "logps/rejected": -479.78094482421875, "loss": 0.0735, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.355088710784912, "rewards/margins": 9.392342567443848, "rewards/rejected": -13.747430801391602, "step": 1000 }, { "epoch": 0.64, "eval_logits/chosen": -1.069393277168274, "eval_logits/rejected": -0.698665976524353, "eval_logps/chosen": -367.884033203125, "eval_logps/rejected": -449.0148620605469, "eval_loss": 0.12433216720819473, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": -3.8520963191986084, "eval_rewards/margins": 9.414312362670898, "eval_rewards/rejected": -13.26640796661377, "eval_runtime": 76.7244, "eval_samples_per_second": 13.034, "eval_steps_per_second": 0.417, "step": 1000 }, { "epoch": 0.65, "learning_rate": 4.954772673173054e-07, "logits/chosen": -0.8880437612533569, "logits/rejected": -0.5571062564849854, "logps/chosen": -427.5099182128906, "logps/rejected": -511.27398681640625, "loss": 0.1211, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.051557779312134, "rewards/margins": 10.788399696350098, "rewards/rejected": -13.839956283569336, "step": 1010 }, { "epoch": 0.66, "learning_rate": 4.948821709116876e-07, "logits/chosen": -1.3221044540405273, "logits/rejected": -0.6521639823913574, "logps/chosen": -380.6373596191406, "logps/rejected": -449.31817626953125, "loss": 0.0824, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.038402795791626, "rewards/margins": 11.031317710876465, "rewards/rejected": -13.069720268249512, "step": 1020 }, { "epoch": 0.66, "learning_rate": 4.9428707450607e-07, "logits/chosen": -1.1821175813674927, "logits/rejected": -0.7060034871101379, "logps/chosen": -366.3950500488281, "logps/rejected": -507.5702209472656, "loss": 0.078, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.4800446033477783, "rewards/margins": 10.978177070617676, "rewards/rejected": -14.458221435546875, "step": 1030 }, { "epoch": 0.67, "learning_rate": 4.936919781004522e-07, "logits/chosen": -1.2339891195297241, "logits/rejected": -0.8903535604476929, "logps/chosen": -411.30615234375, "logps/rejected": -553.6360473632812, "loss": 0.0829, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9073569774627686, "rewards/margins": 10.883973121643066, "rewards/rejected": -14.79133129119873, "step": 1040 }, { "epoch": 0.67, "learning_rate": 4.930968816948346e-07, "logits/chosen": -1.3873369693756104, "logits/rejected": -0.8341131210327148, "logps/chosen": -424.88543701171875, "logps/rejected": -477.947021484375, "loss": 0.0747, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.1361687183380127, "rewards/margins": 11.415266036987305, "rewards/rejected": -13.551434516906738, "step": 1050 }, { "epoch": 0.68, "learning_rate": 4.925017852892168e-07, "logits/chosen": -1.1863329410552979, "logits/rejected": -0.6122242212295532, "logps/chosen": -400.68865966796875, "logps/rejected": -515.4220581054688, "loss": 0.0844, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.35847544670105, "rewards/margins": 13.144007682800293, "rewards/rejected": -15.502484321594238, "step": 1060 }, { "epoch": 0.69, "learning_rate": 4.919066888835991e-07, "logits/chosen": -1.1419920921325684, "logits/rejected": -0.5117210149765015, "logps/chosen": -370.2403564453125, "logps/rejected": -593.1256103515625, "loss": 0.0685, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8190958499908447, "rewards/margins": 14.543069839477539, "rewards/rejected": -17.362167358398438, "step": 1070 }, { "epoch": 0.69, "learning_rate": 4.913115924779814e-07, "logits/chosen": -1.1333223581314087, "logits/rejected": -0.5887588858604431, "logps/chosen": -438.794677734375, "logps/rejected": -556.837890625, "loss": 0.0702, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.482754945755005, "rewards/margins": 12.69272232055664, "rewards/rejected": -16.175479888916016, "step": 1080 }, { "epoch": 0.7, "learning_rate": 4.907164960723638e-07, "logits/chosen": -1.0477075576782227, "logits/rejected": -0.5950930714607239, "logps/chosen": -385.4281005859375, "logps/rejected": -448.23211669921875, "loss": 0.0694, "rewards/accuracies": 0.9375, "rewards/chosen": -4.315608501434326, "rewards/margins": 10.84793472290039, "rewards/rejected": -15.163541793823242, "step": 1090 }, { "epoch": 0.71, "learning_rate": 4.90121399666746e-07, "logits/chosen": -0.8860788345336914, "logits/rejected": -0.4812610149383545, "logps/chosen": -356.5136413574219, "logps/rejected": -509.6454162597656, "loss": 0.0806, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.0979533195495605, "rewards/margins": 11.64047908782959, "rewards/rejected": -15.738431930541992, "step": 1100 }, { "epoch": 0.71, "eval_logits/chosen": -1.1228324174880981, "eval_logits/rejected": -0.7543247938156128, "eval_logps/chosen": -354.41119384765625, "eval_logps/rejected": -438.4850158691406, "eval_loss": 0.11537463963031769, "eval_rewards/accuracies": 0.9765625, "eval_rewards/chosen": -2.5048139095306396, "eval_rewards/margins": 9.708612442016602, "eval_rewards/rejected": -12.213427543640137, "eval_runtime": 76.7376, "eval_samples_per_second": 13.031, "eval_steps_per_second": 0.417, "step": 1100 }, { "epoch": 0.71, "learning_rate": 4.895263032611282e-07, "logits/chosen": -1.2738934755325317, "logits/rejected": -0.6076444387435913, "logps/chosen": -452.056396484375, "logps/rejected": -505.551513671875, "loss": 0.0556, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.597477674484253, "rewards/margins": 11.379976272583008, "rewards/rejected": -14.977453231811523, "step": 1110 }, { "epoch": 0.72, "learning_rate": 4.889312068555106e-07, "logits/chosen": -1.174073576927185, "logits/rejected": -0.7190831303596497, "logps/chosen": -400.78656005859375, "logps/rejected": -528.5311889648438, "loss": 0.0635, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.351175308227539, "rewards/margins": 11.898660659790039, "rewards/rejected": -15.249837875366211, "step": 1120 }, { "epoch": 0.73, "learning_rate": 4.883361104498928e-07, "logits/chosen": -1.2661216259002686, "logits/rejected": -0.7576111555099487, "logps/chosen": -395.97833251953125, "logps/rejected": -508.68719482421875, "loss": 0.0564, "rewards/accuracies": 1.0, "rewards/chosen": -3.6894619464874268, "rewards/margins": 13.860685348510742, "rewards/rejected": -17.550146102905273, "step": 1130 }, { "epoch": 0.73, "learning_rate": 4.877410140442752e-07, "logits/chosen": -1.3575642108917236, "logits/rejected": -0.8006072044372559, "logps/chosen": -423.7298889160156, "logps/rejected": -535.811767578125, "loss": 0.0727, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.8236665725708008, "rewards/margins": 12.405186653137207, "rewards/rejected": -14.228856086730957, "step": 1140 }, { "epoch": 0.74, "learning_rate": 4.871459176386574e-07, "logits/chosen": -1.1852174997329712, "logits/rejected": -0.7334953546524048, "logps/chosen": -447.42724609375, "logps/rejected": -565.5848388671875, "loss": 0.0614, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.648531436920166, "rewards/margins": 13.974637031555176, "rewards/rejected": -17.6231689453125, "step": 1150 }, { "epoch": 0.75, "learning_rate": 4.865508212330398e-07, "logits/chosen": -1.1278337240219116, "logits/rejected": -0.5487757325172424, "logps/chosen": -394.2236022949219, "logps/rejected": -564.2678833007812, "loss": 0.05, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.3238348960876465, "rewards/margins": 13.824040412902832, "rewards/rejected": -16.147876739501953, "step": 1160 }, { "epoch": 0.75, "learning_rate": 4.85955724827422e-07, "logits/chosen": -0.7651220560073853, "logits/rejected": -0.06668927520513535, "logps/chosen": -444.3988342285156, "logps/rejected": -548.660888671875, "loss": 0.0583, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.920553684234619, "rewards/margins": 13.667207717895508, "rewards/rejected": -16.5877628326416, "step": 1170 }, { "epoch": 0.76, "learning_rate": 4.853606284218044e-07, "logits/chosen": -1.0830278396606445, "logits/rejected": -0.4501362442970276, "logps/chosen": -332.0751037597656, "logps/rejected": -418.5309143066406, "loss": 0.0789, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7456166744232178, "rewards/margins": 11.250526428222656, "rewards/rejected": -12.996142387390137, "step": 1180 }, { "epoch": 0.76, "learning_rate": 4.847655320161866e-07, "logits/chosen": -0.6781784296035767, "logits/rejected": -0.27196237444877625, "logps/chosen": -396.753662109375, "logps/rejected": -491.14892578125, "loss": 0.0668, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8804516792297363, "rewards/margins": 11.853898048400879, "rewards/rejected": -14.734350204467773, "step": 1190 }, { "epoch": 0.77, "learning_rate": 4.841704356105689e-07, "logits/chosen": -0.7234100103378296, "logits/rejected": -0.2980794310569763, "logps/chosen": -367.587158203125, "logps/rejected": -433.124755859375, "loss": 0.0822, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.162593364715576, "rewards/margins": 10.242734909057617, "rewards/rejected": -13.405328750610352, "step": 1200 }, { "epoch": 0.77, "eval_logits/chosen": -1.0001459121704102, "eval_logits/rejected": -0.6194829940795898, "eval_logps/chosen": -365.28955078125, "eval_logps/rejected": -458.57135009765625, "eval_loss": 0.13022372126579285, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": -3.5926475524902344, "eval_rewards/margins": 10.629414558410645, "eval_rewards/rejected": -14.222061157226562, "eval_runtime": 76.5043, "eval_samples_per_second": 13.071, "eval_steps_per_second": 0.418, "step": 1200 }, { "epoch": 0.78, "learning_rate": 4.835753392049512e-07, "logits/chosen": -0.6337307691574097, "logits/rejected": -0.23596186935901642, "logps/chosen": -356.19122314453125, "logps/rejected": -514.327392578125, "loss": 0.1085, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.958400249481201, "rewards/margins": 13.92591667175293, "rewards/rejected": -17.884319305419922, "step": 1210 }, { "epoch": 0.78, "learning_rate": 4.829802427993334e-07, "logits/chosen": -0.6189112663269043, "logits/rejected": -0.34567025303840637, "logps/chosen": -374.00286865234375, "logps/rejected": -474.2106018066406, "loss": 0.1089, "rewards/accuracies": 0.9375, "rewards/chosen": -3.2338080406188965, "rewards/margins": 9.657389640808105, "rewards/rejected": -12.891199111938477, "step": 1220 }, { "epoch": 0.79, "learning_rate": 4.823851463937158e-07, "logits/chosen": -0.3865962028503418, "logits/rejected": 0.04075580835342407, "logps/chosen": -359.3363342285156, "logps/rejected": -501.40240478515625, "loss": 0.1142, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.306814193725586, "rewards/margins": 13.71052074432373, "rewards/rejected": -18.017335891723633, "step": 1230 }, { "epoch": 0.8, "learning_rate": 4.81790049988098e-07, "logits/chosen": -0.7158625721931458, "logits/rejected": -0.21917279064655304, "logps/chosen": -434.91253662109375, "logps/rejected": -528.3048095703125, "loss": 0.0757, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.414928913116455, "rewards/margins": 12.52367115020752, "rewards/rejected": -15.938600540161133, "step": 1240 }, { "epoch": 0.8, "learning_rate": 4.811949535824804e-07, "logits/chosen": -0.7639582753181458, "logits/rejected": -0.4316403269767761, "logps/chosen": -365.72784423828125, "logps/rejected": -567.1033935546875, "loss": 0.084, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.063331127166748, "rewards/margins": 12.180209159851074, "rewards/rejected": -15.243539810180664, "step": 1250 }, { "epoch": 0.81, "learning_rate": 4.805998571768626e-07, "logits/chosen": -1.2161794900894165, "logits/rejected": -0.6932582259178162, "logps/chosen": -406.9449768066406, "logps/rejected": -520.5137939453125, "loss": 0.0951, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.868889331817627, "rewards/margins": 10.655603408813477, "rewards/rejected": -14.524490356445312, "step": 1260 }, { "epoch": 0.82, "learning_rate": 4.80004760771245e-07, "logits/chosen": -0.9595744013786316, "logits/rejected": -0.43417948484420776, "logps/chosen": -411.44677734375, "logps/rejected": -526.41845703125, "loss": 0.0691, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0373916625976562, "rewards/margins": 9.937090873718262, "rewards/rejected": -12.97448444366455, "step": 1270 }, { "epoch": 0.82, "learning_rate": 4.794096643656272e-07, "logits/chosen": -1.0694881677627563, "logits/rejected": -0.5757830142974854, "logps/chosen": -382.9389343261719, "logps/rejected": -500.0220642089844, "loss": 0.0743, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.738356113433838, "rewards/margins": 10.426973342895508, "rewards/rejected": -14.165328979492188, "step": 1280 }, { "epoch": 0.83, "learning_rate": 4.788145679600095e-07, "logits/chosen": -0.9496662020683289, "logits/rejected": -0.3601847290992737, "logps/chosen": -358.3319396972656, "logps/rejected": -560.3468627929688, "loss": 0.0742, "rewards/accuracies": 1.0, "rewards/chosen": -2.929766893386841, "rewards/margins": 13.46239185333252, "rewards/rejected": -16.39215660095215, "step": 1290 }, { "epoch": 0.84, "learning_rate": 4.782194715543918e-07, "logits/chosen": -0.6319989562034607, "logits/rejected": -0.09417597949504852, "logps/chosen": -379.7413024902344, "logps/rejected": -522.1488037109375, "loss": 0.063, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.54395055770874, "rewards/margins": 13.589025497436523, "rewards/rejected": -18.132978439331055, "step": 1300 }, { "epoch": 0.84, "eval_logits/chosen": -0.9766608476638794, "eval_logits/rejected": -0.467482328414917, "eval_logps/chosen": -374.5415954589844, "eval_logps/rejected": -472.6826477050781, "eval_loss": 0.18041668832302094, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": -4.517853260040283, "eval_rewards/margins": 11.115335464477539, "eval_rewards/rejected": -15.633190155029297, "eval_runtime": 76.7717, "eval_samples_per_second": 13.026, "eval_steps_per_second": 0.417, "step": 1300 }, { "epoch": 0.84, "learning_rate": 4.77624375148774e-07, "logits/chosen": -0.6732112169265747, "logits/rejected": 0.014696260914206505, "logps/chosen": -417.83074951171875, "logps/rejected": -488.5224609375, "loss": 0.0634, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.793654441833496, "rewards/margins": 13.286532402038574, "rewards/rejected": -18.08018684387207, "step": 1310 }, { "epoch": 0.85, "learning_rate": 4.770292787431564e-07, "logits/chosen": -0.3928489685058594, "logits/rejected": -0.07063998281955719, "logps/chosen": -365.6458435058594, "logps/rejected": -472.38043212890625, "loss": 0.0885, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.527663707733154, "rewards/margins": 11.80773639678955, "rewards/rejected": -16.335399627685547, "step": 1320 }, { "epoch": 0.85, "learning_rate": 4.764341823375387e-07, "logits/chosen": -0.5924497842788696, "logits/rejected": 0.10169048607349396, "logps/chosen": -394.43292236328125, "logps/rejected": -514.439697265625, "loss": 0.0762, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.806927442550659, "rewards/margins": 12.637249946594238, "rewards/rejected": -16.444177627563477, "step": 1330 }, { "epoch": 0.86, "learning_rate": 4.7583908593192097e-07, "logits/chosen": -0.6815664172172546, "logits/rejected": -0.29060396552085876, "logps/chosen": -420.83221435546875, "logps/rejected": -551.6920776367188, "loss": 0.0532, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1024906635284424, "rewards/margins": 13.700825691223145, "rewards/rejected": -16.80331802368164, "step": 1340 }, { "epoch": 0.87, "learning_rate": 4.752439895263032e-07, "logits/chosen": -0.689470112323761, "logits/rejected": -0.20767569541931152, "logps/chosen": -385.32037353515625, "logps/rejected": -533.0687255859375, "loss": 0.0878, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.7341361045837402, "rewards/margins": 14.102251052856445, "rewards/rejected": -17.83638572692871, "step": 1350 }, { "epoch": 0.87, "learning_rate": 4.746488931206855e-07, "logits/chosen": -0.7405373454093933, "logits/rejected": -0.07327382266521454, "logps/chosen": -377.8034973144531, "logps/rejected": -466.72784423828125, "loss": 0.0851, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.5931084156036377, "rewards/margins": 12.507670402526855, "rewards/rejected": -16.100778579711914, "step": 1360 }, { "epoch": 0.88, "learning_rate": 4.7405379671506785e-07, "logits/chosen": -0.5811715126037598, "logits/rejected": -0.06195932626724243, "logps/chosen": -411.6004943847656, "logps/rejected": -548.6055908203125, "loss": 0.1093, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.2639055252075195, "rewards/margins": 11.680307388305664, "rewards/rejected": -15.944211959838867, "step": 1370 }, { "epoch": 0.89, "learning_rate": 4.734587003094501e-07, "logits/chosen": -0.8370053172111511, "logits/rejected": -0.14809687435626984, "logps/chosen": -447.0265197753906, "logps/rejected": -535.3525390625, "loss": 0.081, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.550225734710693, "rewards/margins": 12.257158279418945, "rewards/rejected": -16.807382583618164, "step": 1380 }, { "epoch": 0.89, "learning_rate": 4.728636039038324e-07, "logits/chosen": -0.7418749928474426, "logits/rejected": -0.25607532262802124, "logps/chosen": -374.479248046875, "logps/rejected": -532.3735961914062, "loss": 0.0738, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.4016432762146, "rewards/margins": 13.285806655883789, "rewards/rejected": -18.687450408935547, "step": 1390 }, { "epoch": 0.9, "learning_rate": 4.722685074982147e-07, "logits/chosen": -0.8056036233901978, "logits/rejected": -0.2640933394432068, "logps/chosen": -372.2264709472656, "logps/rejected": -565.7047729492188, "loss": 0.0648, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8431191444396973, "rewards/margins": 13.19017505645752, "rewards/rejected": -17.033292770385742, "step": 1400 }, { "epoch": 0.9, "eval_logits/chosen": -1.118211030960083, "eval_logits/rejected": -0.5943832397460938, "eval_logps/chosen": -354.5495300292969, "eval_logps/rejected": -446.5892333984375, "eval_loss": 0.13570235669612885, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": -2.518648147583008, "eval_rewards/margins": 10.50519847869873, "eval_rewards/rejected": -13.023846626281738, "eval_runtime": 76.6799, "eval_samples_per_second": 13.041, "eval_steps_per_second": 0.417, "step": 1400 }, { "epoch": 0.91, "learning_rate": 4.7167341109259703e-07, "logits/chosen": -1.0566179752349854, "logits/rejected": -0.2900046706199646, "logps/chosen": -398.8577575683594, "logps/rejected": -454.0634765625, "loss": 0.094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2813262939453125, "rewards/margins": 11.221616744995117, "rewards/rejected": -13.502942085266113, "step": 1410 }, { "epoch": 0.91, "learning_rate": 4.710783146869793e-07, "logits/chosen": -0.9073772430419922, "logits/rejected": -0.400244802236557, "logps/chosen": -357.8586730957031, "logps/rejected": -546.3787841796875, "loss": 0.0928, "rewards/accuracies": 0.9375, "rewards/chosen": -3.4375388622283936, "rewards/margins": 11.90378475189209, "rewards/rejected": -15.341323852539062, "step": 1420 }, { "epoch": 0.92, "learning_rate": 4.7048321828136157e-07, "logits/chosen": -0.6786088347434998, "logits/rejected": -0.10559716075658798, "logps/chosen": -417.87939453125, "logps/rejected": -530.9561767578125, "loss": 0.042, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.6138105392456055, "rewards/margins": 13.705339431762695, "rewards/rejected": -19.319150924682617, "step": 1430 }, { "epoch": 0.93, "learning_rate": 4.698881218757438e-07, "logits/chosen": -0.5866619348526001, "logits/rejected": -0.026242520660161972, "logps/chosen": -422.72613525390625, "logps/rejected": -541.8472900390625, "loss": 0.1036, "rewards/accuracies": 0.9375, "rewards/chosen": -4.567929267883301, "rewards/margins": 11.613755226135254, "rewards/rejected": -16.181686401367188, "step": 1440 }, { "epoch": 0.93, "learning_rate": 4.692930254701261e-07, "logits/chosen": -0.7376483082771301, "logits/rejected": -0.09872325509786606, "logps/chosen": -425.328857421875, "logps/rejected": -529.0191650390625, "loss": 0.0628, "rewards/accuracies": 0.9375, "rewards/chosen": -4.07589054107666, "rewards/margins": 12.2763032913208, "rewards/rejected": -16.352191925048828, "step": 1450 }, { "epoch": 0.94, "learning_rate": 4.6869792906450845e-07, "logits/chosen": -0.5996996760368347, "logits/rejected": 0.09782281517982483, "logps/chosen": -426.3692321777344, "logps/rejected": -495.6768493652344, "loss": 0.0448, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.688590049743652, "rewards/margins": 13.677145004272461, "rewards/rejected": -18.36573600769043, "step": 1460 }, { "epoch": 0.94, "learning_rate": 4.6810283265889075e-07, "logits/chosen": -0.6974693536758423, "logits/rejected": 0.09173402935266495, "logps/chosen": -361.08746337890625, "logps/rejected": -483.73211669921875, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": -4.233609199523926, "rewards/margins": 13.777618408203125, "rewards/rejected": -18.011228561401367, "step": 1470 }, { "epoch": 0.95, "learning_rate": 4.67507736253273e-07, "logits/chosen": -0.8001763224601746, "logits/rejected": -0.1673848032951355, "logps/chosen": -368.32196044921875, "logps/rejected": -508.84368896484375, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": -2.7094593048095703, "rewards/margins": 13.913045883178711, "rewards/rejected": -16.622507095336914, "step": 1480 }, { "epoch": 0.96, "learning_rate": 4.669126398476553e-07, "logits/chosen": -0.589401125907898, "logits/rejected": 0.09411343187093735, "logps/chosen": -448.2190856933594, "logps/rejected": -579.1790161132812, "loss": 0.07, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9501609802246094, "rewards/margins": 15.417715072631836, "rewards/rejected": -19.367877960205078, "step": 1490 }, { "epoch": 0.96, "learning_rate": 4.6631754344203763e-07, "logits/chosen": -0.6391473412513733, "logits/rejected": -0.1050008162856102, "logps/chosen": -376.50579833984375, "logps/rejected": -544.2982177734375, "loss": 0.0714, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.8969063758850098, "rewards/margins": 13.172216415405273, "rewards/rejected": -17.069122314453125, "step": 1500 }, { "epoch": 0.96, "eval_logits/chosen": -1.0033119916915894, "eval_logits/rejected": -0.4300505220890045, "eval_logps/chosen": -374.3122863769531, "eval_logps/rejected": -470.21783447265625, "eval_loss": 0.12438826262950897, "eval_rewards/accuracies": 0.9453125, "eval_rewards/chosen": -4.494921684265137, "eval_rewards/margins": 10.89178466796875, "eval_rewards/rejected": -15.38670539855957, "eval_runtime": 76.5689, "eval_samples_per_second": 13.06, "eval_steps_per_second": 0.418, "step": 1500 }, { "epoch": 0.97, "learning_rate": 4.657224470364199e-07, "logits/chosen": -0.43436574935913086, "logits/rejected": 0.07220065593719482, "logps/chosen": -391.37725830078125, "logps/rejected": -588.6260986328125, "loss": 0.0546, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.231613636016846, "rewards/margins": 13.30431842803955, "rewards/rejected": -18.535930633544922, "step": 1510 }, { "epoch": 0.98, "learning_rate": 4.6512735063080217e-07, "logits/chosen": -0.6157822608947754, "logits/rejected": 0.005131366662681103, "logps/chosen": -411.13946533203125, "logps/rejected": -483.9326171875, "loss": 0.0527, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.003513336181641, "rewards/margins": 10.31053638458252, "rewards/rejected": -15.314050674438477, "step": 1520 }, { "epoch": 0.98, "learning_rate": 4.6453225422518447e-07, "logits/chosen": -0.6011049747467041, "logits/rejected": -0.10970073938369751, "logps/chosen": -406.9378356933594, "logps/rejected": -468.97314453125, "loss": 0.0739, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.8249969482421875, "rewards/margins": 11.424562454223633, "rewards/rejected": -16.24955940246582, "step": 1530 }, { "epoch": 0.99, "learning_rate": 4.6393715781956676e-07, "logits/chosen": -0.6942557096481323, "logits/rejected": 0.16295239329338074, "logps/chosen": -407.1282653808594, "logps/rejected": -560.19970703125, "loss": 0.0629, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.603503704071045, "rewards/margins": 13.417546272277832, "rewards/rejected": -18.02104949951172, "step": 1540 }, { "epoch": 1.0, "learning_rate": 4.6334206141394905e-07, "logits/chosen": -0.3027026057243347, "logits/rejected": 0.10327012836933136, "logps/chosen": -405.692626953125, "logps/rejected": -576.5963134765625, "loss": 0.0673, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.6844029426574707, "rewards/margins": 13.106195449829102, "rewards/rejected": -16.790597915649414, "step": 1550 }, { "epoch": 1.0, "learning_rate": 4.6274696500833135e-07, "logits/chosen": -0.5037144422531128, "logits/rejected": 0.06204764172434807, "logps/chosen": -412.0623474121094, "logps/rejected": -525.1038818359375, "loss": 0.061, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6065430641174316, "rewards/margins": 14.261393547058105, "rewards/rejected": -16.867937088012695, "step": 1560 }, { "epoch": 1.01, "learning_rate": 4.621518686027136e-07, "logits/chosen": -0.526481568813324, "logits/rejected": 0.24668976664543152, "logps/chosen": -410.71746826171875, "logps/rejected": -473.8052673339844, "loss": 0.026, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.839338779449463, "rewards/margins": 13.472638130187988, "rewards/rejected": -16.31197738647461, "step": 1570 }, { "epoch": 1.02, "learning_rate": 4.6155677219709594e-07, "logits/chosen": -0.2807101607322693, "logits/rejected": 0.3707982003688812, "logps/chosen": -430.853759765625, "logps/rejected": -465.86328125, "loss": 0.016, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3595848083496094, "rewards/margins": 12.908388137817383, "rewards/rejected": -16.267974853515625, "step": 1580 }, { "epoch": 1.02, "learning_rate": 4.6096167579147823e-07, "logits/chosen": -0.2973923683166504, "logits/rejected": 0.30676135420799255, "logps/chosen": -384.2185974121094, "logps/rejected": -527.7244873046875, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -3.731484889984131, "rewards/margins": 15.676348686218262, "rewards/rejected": -19.407833099365234, "step": 1590 }, { "epoch": 1.03, "learning_rate": 4.603665793858605e-07, "logits/chosen": -0.5543831586837769, "logits/rejected": 0.3015816807746887, "logps/chosen": -395.59368896484375, "logps/rejected": -544.031494140625, "loss": 0.0095, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.2728145122528076, "rewards/margins": 15.476274490356445, "rewards/rejected": -18.74909019470215, "step": 1600 }, { "epoch": 1.03, "eval_logits/chosen": -0.7791086435317993, "eval_logits/rejected": -0.20377439260482788, "eval_logps/chosen": -380.9189453125, "eval_logps/rejected": -480.3507995605469, "eval_loss": 0.10078604519367218, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": -5.155591011047363, "eval_rewards/margins": 11.244410514831543, "eval_rewards/rejected": -16.40000343322754, "eval_runtime": 76.553, "eval_samples_per_second": 13.063, "eval_steps_per_second": 0.418, "step": 1600 }, { "epoch": 1.03, "learning_rate": 4.5977148298024277e-07, "logits/chosen": -0.5526180267333984, "logits/rejected": 0.3482429087162018, "logps/chosen": -391.2712707519531, "logps/rejected": -509.62847900390625, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -3.7447268962860107, "rewards/margins": 15.208663940429688, "rewards/rejected": -18.95339012145996, "step": 1610 }, { "epoch": 1.04, "learning_rate": 4.5917638657462507e-07, "logits/chosen": -0.4088626801967621, "logits/rejected": 0.3109140992164612, "logps/chosen": -426.03643798828125, "logps/rejected": -556.555908203125, "loss": 0.0053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.894583702087402, "rewards/margins": 14.70109748840332, "rewards/rejected": -19.595678329467773, "step": 1620 }, { "epoch": 1.05, "learning_rate": 4.5858129016900736e-07, "logits/chosen": -0.5371668934822083, "logits/rejected": 0.3167404532432556, "logps/chosen": -393.68524169921875, "logps/rejected": -562.7149658203125, "loss": 0.0129, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.771970272064209, "rewards/margins": 15.018495559692383, "rewards/rejected": -19.79046630859375, "step": 1630 }, { "epoch": 1.05, "learning_rate": 4.5798619376338966e-07, "logits/chosen": -0.247820645570755, "logits/rejected": 0.3391680121421814, "logps/chosen": -370.0538024902344, "logps/rejected": -491.40252685546875, "loss": 0.0067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.405149936676025, "rewards/margins": 14.295089721679688, "rewards/rejected": -18.700239181518555, "step": 1640 }, { "epoch": 1.06, "learning_rate": 4.5739109735777195e-07, "logits/chosen": -0.2800091505050659, "logits/rejected": 0.2654980719089508, "logps/chosen": -439.1228942871094, "logps/rejected": -581.3944091796875, "loss": 0.0115, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.183625221252441, "rewards/margins": 16.414287567138672, "rewards/rejected": -22.597911834716797, "step": 1650 }, { "epoch": 1.07, "learning_rate": 4.567960009521542e-07, "logits/chosen": -0.32536178827285767, "logits/rejected": 0.2308044731616974, "logps/chosen": -362.37945556640625, "logps/rejected": -542.333984375, "loss": 0.0122, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.910571575164795, "rewards/margins": 15.097010612487793, "rewards/rejected": -19.007579803466797, "step": 1660 }, { "epoch": 1.07, "learning_rate": 4.5620090454653654e-07, "logits/chosen": -0.44402211904525757, "logits/rejected": 0.07348278164863586, "logps/chosen": -377.78759765625, "logps/rejected": -574.8585205078125, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -5.063399314880371, "rewards/margins": 16.107744216918945, "rewards/rejected": -21.171140670776367, "step": 1670 }, { "epoch": 1.08, "learning_rate": 4.5560580814091884e-07, "logits/chosen": -0.6580984592437744, "logits/rejected": 0.15282706916332245, "logps/chosen": -388.4255065917969, "logps/rejected": -543.6886596679688, "loss": 0.0331, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.285706996917725, "rewards/margins": 15.84802532196045, "rewards/rejected": -20.13373374938965, "step": 1680 }, { "epoch": 1.09, "learning_rate": 4.550107117353011e-07, "logits/chosen": -0.5916538834571838, "logits/rejected": 0.042367033660411835, "logps/chosen": -425.89013671875, "logps/rejected": -556.3162841796875, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -4.548110008239746, "rewards/margins": 14.834815979003906, "rewards/rejected": -19.38292694091797, "step": 1690 }, { "epoch": 1.09, "learning_rate": 4.5441561532968337e-07, "logits/chosen": -0.6858727335929871, "logits/rejected": -0.02261565811932087, "logps/chosen": -391.042724609375, "logps/rejected": -572.4518432617188, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -3.9488537311553955, "rewards/margins": 17.172945022583008, "rewards/rejected": -21.121801376342773, "step": 1700 }, { "epoch": 1.09, "eval_logits/chosen": -0.9992736577987671, "eval_logits/rejected": -0.40412917733192444, "eval_logps/chosen": -377.3610534667969, "eval_logps/rejected": -483.4149169921875, "eval_loss": 0.1330709010362625, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": -4.799799919128418, "eval_rewards/margins": 11.906618118286133, "eval_rewards/rejected": -16.706417083740234, "eval_runtime": 76.6396, "eval_samples_per_second": 13.048, "eval_steps_per_second": 0.418, "step": 1700 }, { "epoch": 1.1, "learning_rate": 4.538205189240657e-07, "logits/chosen": -0.4306742548942566, "logits/rejected": 0.3547573983669281, "logps/chosen": -398.1122741699219, "logps/rejected": -550.4100341796875, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -5.38922643661499, "rewards/margins": 16.458080291748047, "rewards/rejected": -21.847307205200195, "step": 1710 }, { "epoch": 1.11, "learning_rate": 4.5322542251844796e-07, "logits/chosen": -0.5155312418937683, "logits/rejected": -0.017366236075758934, "logps/chosen": -410.91412353515625, "logps/rejected": -553.8331298828125, "loss": 0.0058, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.796280384063721, "rewards/margins": 16.963947296142578, "rewards/rejected": -21.76022720336914, "step": 1720 }, { "epoch": 1.11, "learning_rate": 4.5263032611283026e-07, "logits/chosen": -0.12997198104858398, "logits/rejected": 0.2565564215183258, "logps/chosen": -405.981201171875, "logps/rejected": -620.6024169921875, "loss": 0.014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.828683376312256, "rewards/margins": 18.5574893951416, "rewards/rejected": -23.386173248291016, "step": 1730 }, { "epoch": 1.12, "learning_rate": 4.5203522970721255e-07, "logits/chosen": -0.0680803433060646, "logits/rejected": 0.4802146852016449, "logps/chosen": -421.7552185058594, "logps/rejected": -488.4523010253906, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -3.658025026321411, "rewards/margins": 15.093057632446289, "rewards/rejected": -18.751081466674805, "step": 1740 }, { "epoch": 1.12, "learning_rate": 4.5144013330159485e-07, "logits/chosen": -0.3530596196651459, "logits/rejected": 0.2572212815284729, "logps/chosen": -413.51629638671875, "logps/rejected": -577.3765869140625, "loss": 0.0179, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.011878967285156, "rewards/margins": 15.384442329406738, "rewards/rejected": -19.39632225036621, "step": 1750 }, { "epoch": 1.13, "learning_rate": 4.5084503689597714e-07, "logits/chosen": -0.28345853090286255, "logits/rejected": 0.42750459909439087, "logps/chosen": -437.33941650390625, "logps/rejected": -554.0641479492188, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -4.84152889251709, "rewards/margins": 14.463783264160156, "rewards/rejected": -19.305313110351562, "step": 1760 }, { "epoch": 1.14, "learning_rate": 4.5024994049035944e-07, "logits/chosen": -0.09664113819599152, "logits/rejected": 0.22168950736522675, "logps/chosen": -378.29632568359375, "logps/rejected": -565.90185546875, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -4.932016372680664, "rewards/margins": 17.979869842529297, "rewards/rejected": -22.911888122558594, "step": 1770 }, { "epoch": 1.14, "learning_rate": 4.496548440847417e-07, "logits/chosen": -0.049262501299381256, "logits/rejected": 0.4335893988609314, "logps/chosen": -402.48980712890625, "logps/rejected": -552.8167114257812, "loss": 0.0279, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.7815442085266113, "rewards/margins": 15.186810493469238, "rewards/rejected": -18.968353271484375, "step": 1780 }, { "epoch": 1.15, "learning_rate": 4.49059747679124e-07, "logits/chosen": -0.28909042477607727, "logits/rejected": 0.3799073398113251, "logps/chosen": -430.60760498046875, "logps/rejected": -537.1256713867188, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": -4.874712944030762, "rewards/margins": 15.4542875289917, "rewards/rejected": -20.32900047302246, "step": 1790 }, { "epoch": 1.16, "learning_rate": 4.484646512735063e-07, "logits/chosen": -0.3206165134906769, "logits/rejected": 0.06656259298324585, "logps/chosen": -471.58001708984375, "logps/rejected": -507.40301513671875, "loss": 0.0293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.206088066101074, "rewards/margins": 13.663434982299805, "rewards/rejected": -17.869524002075195, "step": 1800 }, { "epoch": 1.16, "eval_logits/chosen": -0.8347997665405273, "eval_logits/rejected": -0.41860231757164, "eval_logps/chosen": -372.9380187988281, "eval_logps/rejected": -465.8204345703125, "eval_loss": 0.14275716245174408, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": -4.35749626159668, "eval_rewards/margins": 10.589475631713867, "eval_rewards/rejected": -14.94697093963623, "eval_runtime": 76.7934, "eval_samples_per_second": 13.022, "eval_steps_per_second": 0.417, "step": 1800 }, { "epoch": 1.16, "learning_rate": 4.4786955486788856e-07, "logits/chosen": -0.3631385266780853, "logits/rejected": 0.07067543268203735, "logps/chosen": -377.46551513671875, "logps/rejected": -519.6616821289062, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": -3.011387586593628, "rewards/margins": 15.720315933227539, "rewards/rejected": -18.73170280456543, "step": 1810 }, { "epoch": 1.17, "learning_rate": 4.4727445846227086e-07, "logits/chosen": -0.36144647002220154, "logits/rejected": -0.008104220032691956, "logps/chosen": -397.82940673828125, "logps/rejected": -593.34521484375, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -5.28765869140625, "rewards/margins": 15.561511039733887, "rewards/rejected": -20.849170684814453, "step": 1820 }, { "epoch": 1.18, "learning_rate": 4.4667936205665315e-07, "logits/chosen": -0.1907232105731964, "logits/rejected": 0.313555508852005, "logps/chosen": -393.31719970703125, "logps/rejected": -510.4048767089844, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -4.2387800216674805, "rewards/margins": 14.292640686035156, "rewards/rejected": -18.531421661376953, "step": 1830 }, { "epoch": 1.18, "learning_rate": 4.4608426565103545e-07, "logits/chosen": -0.6258490085601807, "logits/rejected": 0.040771596133708954, "logps/chosen": -419.44549560546875, "logps/rejected": -544.8959350585938, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -3.875296115875244, "rewards/margins": 15.84538459777832, "rewards/rejected": -19.720678329467773, "step": 1840 }, { "epoch": 1.19, "learning_rate": 4.4548916924541774e-07, "logits/chosen": -0.3880535066127777, "logits/rejected": -0.07375472038984299, "logps/chosen": -354.2195739746094, "logps/rejected": -516.4483642578125, "loss": 0.024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.647956609725952, "rewards/margins": 14.303915023803711, "rewards/rejected": -16.95186996459961, "step": 1850 }, { "epoch": 1.2, "learning_rate": 4.4489407283980004e-07, "logits/chosen": 0.03841676935553551, "logits/rejected": 0.44405698776245117, "logps/chosen": -405.54351806640625, "logps/rejected": -583.1129760742188, "loss": 0.0113, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.536320686340332, "rewards/margins": 15.479736328125, "rewards/rejected": -21.016056060791016, "step": 1860 }, { "epoch": 1.2, "learning_rate": 4.442989764341823e-07, "logits/chosen": 0.409410297870636, "logits/rejected": 0.6078455448150635, "logps/chosen": -425.523193359375, "logps/rejected": -559.0198364257812, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -7.118569374084473, "rewards/margins": 15.779667854309082, "rewards/rejected": -22.898235321044922, "step": 1870 }, { "epoch": 1.21, "learning_rate": 4.437038800285646e-07, "logits/chosen": 0.023495376110076904, "logits/rejected": 0.8307549357414246, "logps/chosen": -367.8580017089844, "logps/rejected": -482.8976135253906, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": -4.339402198791504, "rewards/margins": 14.32616901397705, "rewards/rejected": -18.665569305419922, "step": 1880 }, { "epoch": 1.21, "learning_rate": 4.431087836229469e-07, "logits/chosen": -0.1776140034198761, "logits/rejected": 0.4008842408657074, "logps/chosen": -368.744873046875, "logps/rejected": -584.6207885742188, "loss": 0.0183, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.053328514099121, "rewards/margins": 16.808618545532227, "rewards/rejected": -20.861948013305664, "step": 1890 }, { "epoch": 1.22, "learning_rate": 4.4251368721732916e-07, "logits/chosen": -0.10556875169277191, "logits/rejected": 0.6400087475776672, "logps/chosen": -395.7705993652344, "logps/rejected": -568.189697265625, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": -4.5353851318359375, "rewards/margins": 15.502912521362305, "rewards/rejected": -20.038297653198242, "step": 1900 }, { "epoch": 1.22, "eval_logits/chosen": -0.3349679708480835, "eval_logits/rejected": 0.1110042855143547, "eval_logps/chosen": -412.0855712890625, "eval_logps/rejected": -525.799560546875, "eval_loss": 0.16337917745113373, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": -8.27225112915039, "eval_rewards/margins": 12.672636032104492, "eval_rewards/rejected": -20.94488525390625, "eval_runtime": 76.7623, "eval_samples_per_second": 13.027, "eval_steps_per_second": 0.417, "step": 1900 }, { "epoch": 1.23, "learning_rate": 4.4191859081171146e-07, "logits/chosen": 0.13517367839813232, "logits/rejected": 0.3502965569496155, "logps/chosen": -432.887451171875, "logps/rejected": -610.2424926757812, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": -5.2362380027771, "rewards/margins": 17.042871475219727, "rewards/rejected": -22.279109954833984, "step": 1910 }, { "epoch": 1.23, "learning_rate": 4.413234944060938e-07, "logits/chosen": -0.011228932067751884, "logits/rejected": 0.4079880714416504, "logps/chosen": -397.6374206542969, "logps/rejected": -609.8255615234375, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -3.824807643890381, "rewards/margins": 15.99610424041748, "rewards/rejected": -19.820913314819336, "step": 1920 }, { "epoch": 1.24, "learning_rate": 4.4072839800047605e-07, "logits/chosen": -0.048855990171432495, "logits/rejected": 0.2614760994911194, "logps/chosen": -425.06182861328125, "logps/rejected": -693.6744384765625, "loss": 0.0236, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.437471866607666, "rewards/margins": 17.300281524658203, "rewards/rejected": -21.73775291442871, "step": 1930 }, { "epoch": 1.25, "learning_rate": 4.4013330159485834e-07, "logits/chosen": -0.6388794183731079, "logits/rejected": -0.16498331725597382, "logps/chosen": -455.348388671875, "logps/rejected": -607.502685546875, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -2.9607555866241455, "rewards/margins": 16.582752227783203, "rewards/rejected": -19.543508529663086, "step": 1940 }, { "epoch": 1.25, "learning_rate": 4.3953820518924064e-07, "logits/chosen": -0.7284079194068909, "logits/rejected": -0.07122499495744705, "logps/chosen": -416.238525390625, "logps/rejected": -586.7880249023438, "loss": 0.0142, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.067997932434082, "rewards/margins": 16.282215118408203, "rewards/rejected": -20.350215911865234, "step": 1950 }, { "epoch": 1.26, "learning_rate": 4.3894310878362293e-07, "logits/chosen": -0.7452508211135864, "logits/rejected": 0.1260381042957306, "logps/chosen": -449.06268310546875, "logps/rejected": -529.99072265625, "loss": 0.0258, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.9167428016662598, "rewards/margins": 15.735280990600586, "rewards/rejected": -18.65202522277832, "step": 1960 }, { "epoch": 1.27, "learning_rate": 4.3834801237800523e-07, "logits/chosen": -0.20991234481334686, "logits/rejected": 0.1470331847667694, "logps/chosen": -391.4617919921875, "logps/rejected": -590.724365234375, "loss": 0.0332, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.234528541564941, "rewards/margins": 17.127511978149414, "rewards/rejected": -21.362041473388672, "step": 1970 }, { "epoch": 1.27, "learning_rate": 4.377529159723875e-07, "logits/chosen": -0.1375136375427246, "logits/rejected": 0.26706498861312866, "logps/chosen": -444.4725646972656, "logps/rejected": -529.6869506835938, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": -5.4430928230285645, "rewards/margins": 15.911623001098633, "rewards/rejected": -21.354717254638672, "step": 1980 }, { "epoch": 1.28, "learning_rate": 4.3715781956676976e-07, "logits/chosen": -0.21475636959075928, "logits/rejected": 0.3777112364768982, "logps/chosen": -406.0322570800781, "logps/rejected": -613.7669067382812, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -4.220062732696533, "rewards/margins": 17.0797119140625, "rewards/rejected": -21.299776077270508, "step": 1990 }, { "epoch": 1.29, "learning_rate": 4.365627231611521e-07, "logits/chosen": -0.42644554376602173, "logits/rejected": 0.14130757749080658, "logps/chosen": -414.5654296875, "logps/rejected": -642.5784912109375, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": -3.209155321121216, "rewards/margins": 17.73525619506836, "rewards/rejected": -20.944412231445312, "step": 2000 }, { "epoch": 1.29, "eval_logits/chosen": -0.6391128301620483, "eval_logits/rejected": -0.05341078341007233, "eval_logps/chosen": -384.2879638671875, "eval_logps/rejected": -496.6387023925781, "eval_loss": 0.15107305347919464, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": -5.492494106292725, "eval_rewards/margins": 12.536298751831055, "eval_rewards/rejected": -18.028793334960938, "eval_runtime": 76.6391, "eval_samples_per_second": 13.048, "eval_steps_per_second": 0.418, "step": 2000 }, { "epoch": 1.29, "learning_rate": 4.359676267555344e-07, "logits/chosen": -0.516070544719696, "logits/rejected": 0.47864165902137756, "logps/chosen": -344.65240478515625, "logps/rejected": -633.1707763671875, "loss": 0.0116, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.215503692626953, "rewards/margins": 20.52490234375, "rewards/rejected": -24.740406036376953, "step": 2010 }, { "epoch": 1.3, "learning_rate": 4.3537253034991665e-07, "logits/chosen": -0.2483837604522705, "logits/rejected": 0.22826921939849854, "logps/chosen": -390.77874755859375, "logps/rejected": -561.1676025390625, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -4.410299777984619, "rewards/margins": 18.708133697509766, "rewards/rejected": -23.11842918395996, "step": 2020 }, { "epoch": 1.3, "learning_rate": 4.3477743394429894e-07, "logits/chosen": -0.058499228209257126, "logits/rejected": 0.5287091135978699, "logps/chosen": -399.7431640625, "logps/rejected": -509.27593994140625, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -4.519265651702881, "rewards/margins": 16.500688552856445, "rewards/rejected": -21.01995277404785, "step": 2030 }, { "epoch": 1.31, "learning_rate": 4.3418233753868124e-07, "logits/chosen": 0.22348590195178986, "logits/rejected": 0.7664919495582581, "logps/chosen": -411.91668701171875, "logps/rejected": -567.1552124023438, "loss": 0.0393, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.37440299987793, "rewards/margins": 16.45658302307129, "rewards/rejected": -21.830982208251953, "step": 2040 }, { "epoch": 1.32, "learning_rate": 4.3358724113306353e-07, "logits/chosen": 0.23079347610473633, "logits/rejected": 0.849484920501709, "logps/chosen": -399.9824523925781, "logps/rejected": -545.28271484375, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -5.708091735839844, "rewards/margins": 18.776620864868164, "rewards/rejected": -24.484712600708008, "step": 2050 }, { "epoch": 1.32, "learning_rate": 4.3299214472744583e-07, "logits/chosen": 0.5551694631576538, "logits/rejected": 1.1838432550430298, "logps/chosen": -406.03460693359375, "logps/rejected": -533.4246826171875, "loss": 0.0198, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.767632961273193, "rewards/margins": 15.5044527053833, "rewards/rejected": -22.27208709716797, "step": 2060 }, { "epoch": 1.33, "learning_rate": 4.323970483218281e-07, "logits/chosen": 0.5585372447967529, "logits/rejected": 1.1072343587875366, "logps/chosen": -390.73870849609375, "logps/rejected": -581.7123413085938, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -5.473759651184082, "rewards/margins": 17.37258529663086, "rewards/rejected": -22.846345901489258, "step": 2070 }, { "epoch": 1.34, "learning_rate": 4.3180195191621036e-07, "logits/chosen": 0.19852328300476074, "logits/rejected": 0.6014672517776489, "logps/chosen": -408.9750061035156, "logps/rejected": -573.5753784179688, "loss": 0.0317, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9805541038513184, "rewards/margins": 16.70760154724121, "rewards/rejected": -20.688152313232422, "step": 2080 }, { "epoch": 1.34, "learning_rate": 4.312068555105927e-07, "logits/chosen": 0.2863103151321411, "logits/rejected": 1.09657883644104, "logps/chosen": -437.265380859375, "logps/rejected": -549.7987060546875, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -4.587510108947754, "rewards/margins": 17.130268096923828, "rewards/rejected": -21.7177791595459, "step": 2090 }, { "epoch": 1.35, "learning_rate": 4.30611759104975e-07, "logits/chosen": 0.16401129961013794, "logits/rejected": 0.43381816148757935, "logps/chosen": -367.1972351074219, "logps/rejected": -574.0045776367188, "loss": 0.0181, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.683870315551758, "rewards/margins": 18.48448944091797, "rewards/rejected": -22.168359756469727, "step": 2100 }, { "epoch": 1.35, "eval_logits/chosen": -0.38096773624420166, "eval_logits/rejected": 0.2048163115978241, "eval_logps/chosen": -379.0364074707031, "eval_logps/rejected": -488.2851867675781, "eval_loss": 0.16009199619293213, "eval_rewards/accuracies": 0.9140625, "eval_rewards/chosen": -4.9673357009887695, "eval_rewards/margins": 12.22611141204834, "eval_rewards/rejected": -17.19344711303711, "eval_runtime": 77.0334, "eval_samples_per_second": 12.981, "eval_steps_per_second": 0.415, "step": 2100 }, { "epoch": 1.36, "learning_rate": 4.3001666269935725e-07, "logits/chosen": 0.4979768395423889, "logits/rejected": 0.9932268261909485, "logps/chosen": -439.8809509277344, "logps/rejected": -572.9110107421875, "loss": 0.0196, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.661294937133789, "rewards/margins": 16.660076141357422, "rewards/rejected": -21.32137107849121, "step": 2110 }, { "epoch": 1.36, "learning_rate": 4.2942156629373954e-07, "logits/chosen": 0.38851994276046753, "logits/rejected": 1.0913450717926025, "logps/chosen": -391.9513854980469, "logps/rejected": -557.0726318359375, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": -4.788923263549805, "rewards/margins": 16.37869644165039, "rewards/rejected": -21.167619705200195, "step": 2120 }, { "epoch": 1.37, "learning_rate": 4.288264698881219e-07, "logits/chosen": 0.7406013607978821, "logits/rejected": 1.1288832426071167, "logps/chosen": -434.83880615234375, "logps/rejected": -587.9083251953125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -5.380326271057129, "rewards/margins": 18.681997299194336, "rewards/rejected": -24.06232261657715, "step": 2130 }, { "epoch": 1.38, "learning_rate": 4.2823137348250413e-07, "logits/chosen": 0.521850049495697, "logits/rejected": 0.9274239540100098, "logps/chosen": -380.3211669921875, "logps/rejected": -581.7938232421875, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -4.798120021820068, "rewards/margins": 17.523136138916016, "rewards/rejected": -22.321256637573242, "step": 2140 }, { "epoch": 1.38, "learning_rate": 4.2763627707688643e-07, "logits/chosen": 0.12944528460502625, "logits/rejected": 0.8914273381233215, "logps/chosen": -443.59954833984375, "logps/rejected": -575.8558349609375, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -4.318188190460205, "rewards/margins": 17.411693572998047, "rewards/rejected": -21.729881286621094, "step": 2150 }, { "epoch": 1.39, "learning_rate": 4.270411806712687e-07, "logits/chosen": -0.12583580613136292, "logits/rejected": 0.797269344329834, "logps/chosen": -445.8939514160156, "logps/rejected": -576.5032958984375, "loss": 0.0453, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.454132080078125, "rewards/margins": 17.303661346435547, "rewards/rejected": -23.757795333862305, "step": 2160 }, { "epoch": 1.39, "learning_rate": 4.26446084265651e-07, "logits/chosen": -0.158376082777977, "logits/rejected": 0.6286741495132446, "logps/chosen": -425.6141662597656, "logps/rejected": -587.5155029296875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -6.210710048675537, "rewards/margins": 16.975194931030273, "rewards/rejected": -23.18590545654297, "step": 2170 }, { "epoch": 1.4, "learning_rate": 4.258509878600333e-07, "logits/chosen": -0.22286149859428406, "logits/rejected": 0.8965922594070435, "logps/chosen": -482.9183654785156, "logps/rejected": -620.5008544921875, "loss": 0.0323, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.59103536605835, "rewards/margins": 16.49752426147461, "rewards/rejected": -24.088560104370117, "step": 2180 }, { "epoch": 1.41, "learning_rate": 4.252558914544156e-07, "logits/chosen": 0.22360090911388397, "logits/rejected": 0.7913219928741455, "logps/chosen": -467.44757080078125, "logps/rejected": -694.8656616210938, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -8.326881408691406, "rewards/margins": 19.117225646972656, "rewards/rejected": -27.444107055664062, "step": 2190 }, { "epoch": 1.41, "learning_rate": 4.2466079504879785e-07, "logits/chosen": -0.1699143350124359, "logits/rejected": 0.9544417262077332, "logps/chosen": -444.05224609375, "logps/rejected": -574.4688110351562, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.666506767272949, "rewards/margins": 15.971659660339355, "rewards/rejected": -23.638164520263672, "step": 2200 }, { "epoch": 1.41, "eval_logits/chosen": -0.5077850222587585, "eval_logits/rejected": 0.1793254315853119, "eval_logps/chosen": -393.3812561035156, "eval_logps/rejected": -497.8894958496094, "eval_loss": 0.1433764100074768, "eval_rewards/accuracies": 0.9296875, "eval_rewards/chosen": -6.401820182800293, "eval_rewards/margins": 11.752058029174805, "eval_rewards/rejected": -18.15387725830078, "eval_runtime": 76.5649, "eval_samples_per_second": 13.061, "eval_steps_per_second": 0.418, "step": 2200 }, { "epoch": 1.42, "learning_rate": 4.240656986431802e-07, "logits/chosen": 0.25252458453178406, "logits/rejected": 0.6517130136489868, "logps/chosen": -439.65478515625, "logps/rejected": -589.1119995117188, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": -5.0924177169799805, "rewards/margins": 17.32897186279297, "rewards/rejected": -22.421390533447266, "step": 2210 }, { "epoch": 1.43, "learning_rate": 4.234706022375625e-07, "logits/chosen": 0.16541361808776855, "logits/rejected": 0.7442089319229126, "logps/chosen": -340.91363525390625, "logps/rejected": -542.9158935546875, "loss": 0.0378, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.545586585998535, "rewards/margins": 16.971027374267578, "rewards/rejected": -23.51661491394043, "step": 2220 }, { "epoch": 1.43, "learning_rate": 4.2287550583194473e-07, "logits/chosen": 0.12309785187244415, "logits/rejected": 0.7068944573402405, "logps/chosen": -416.27166748046875, "logps/rejected": -671.1712646484375, "loss": 0.0373, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.756946563720703, "rewards/margins": 19.396137237548828, "rewards/rejected": -26.1530818939209, "step": 2230 }, { "epoch": 1.44, "learning_rate": 4.2228040942632703e-07, "logits/chosen": -0.17779667675495148, "logits/rejected": 0.7514945268630981, "logps/chosen": -387.315185546875, "logps/rejected": -554.4407348632812, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -6.591644287109375, "rewards/margins": 16.485776901245117, "rewards/rejected": -23.07741928100586, "step": 2240 }, { "epoch": 1.45, "learning_rate": 4.216853130207093e-07, "logits/chosen": -0.4565064311027527, "logits/rejected": 0.8017956018447876, "logps/chosen": -407.85028076171875, "logps/rejected": -557.0631103515625, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": -4.639117240905762, "rewards/margins": 16.890506744384766, "rewards/rejected": -21.529621124267578, "step": 2250 }, { "epoch": 1.45, "learning_rate": 4.210902166150916e-07, "logits/chosen": -0.1683514416217804, "logits/rejected": 0.797057569026947, "logps/chosen": -448.7789611816406, "logps/rejected": -617.1752319335938, "loss": 0.0067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.5801100730896, "rewards/margins": 17.799306869506836, "rewards/rejected": -24.379417419433594, "step": 2260 }, { "epoch": 1.46, "learning_rate": 4.204951202094739e-07, "logits/chosen": 0.12932386994361877, "logits/rejected": 0.9783223867416382, "logps/chosen": -392.2380676269531, "logps/rejected": -563.818603515625, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": -5.884858131408691, "rewards/margins": 17.019224166870117, "rewards/rejected": -22.904081344604492, "step": 2270 }, { "epoch": 1.47, "learning_rate": 4.199000238038562e-07, "logits/chosen": 0.17704737186431885, "logits/rejected": 0.9605581164360046, "logps/chosen": -393.4834289550781, "logps/rejected": -520.2174072265625, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": -4.810626983642578, "rewards/margins": 17.23440170288086, "rewards/rejected": -22.045028686523438, "step": 2280 }, { "epoch": 1.47, "learning_rate": 4.1930492739823845e-07, "logits/chosen": 0.23696310818195343, "logits/rejected": 0.8548523783683777, "logps/chosen": -413.873046875, "logps/rejected": -593.3323974609375, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": -5.712353229522705, "rewards/margins": 18.40825653076172, "rewards/rejected": -24.120609283447266, "step": 2290 }, { "epoch": 1.48, "learning_rate": 4.187098309926208e-07, "logits/chosen": 0.11162440478801727, "logits/rejected": 0.8643198013305664, "logps/chosen": -446.72216796875, "logps/rejected": -609.0733642578125, "loss": 0.0207, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.024069786071777, "rewards/margins": 17.165082931518555, "rewards/rejected": -24.18915367126465, "step": 2300 }, { "epoch": 1.48, "eval_logits/chosen": -0.3969336152076721, "eval_logits/rejected": 0.30485790967941284, "eval_logps/chosen": -409.54925537109375, "eval_logps/rejected": -527.1227416992188, "eval_loss": 0.1328170895576477, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": -8.018616676330566, "eval_rewards/margins": 13.058581352233887, "eval_rewards/rejected": -21.077199935913086, "eval_runtime": 76.7293, "eval_samples_per_second": 13.033, "eval_steps_per_second": 0.417, "step": 2300 }, { "epoch": 1.48, "learning_rate": 4.181147345870031e-07, "logits/chosen": -0.14238300919532776, "logits/rejected": 0.5450645685195923, "logps/chosen": -449.2244567871094, "logps/rejected": -655.3807373046875, "loss": 0.0182, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.570483207702637, "rewards/margins": 16.494831085205078, "rewards/rejected": -23.065311431884766, "step": 2310 }, { "epoch": 1.49, "learning_rate": 4.1751963818138534e-07, "logits/chosen": -0.011528102681040764, "logits/rejected": 0.896866500377655, "logps/chosen": -477.01226806640625, "logps/rejected": -661.6298217773438, "loss": 0.0098, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.549506187438965, "rewards/margins": 19.142070770263672, "rewards/rejected": -25.691574096679688, "step": 2320 }, { "epoch": 1.5, "learning_rate": 4.1692454177576763e-07, "logits/chosen": -0.2801293432712555, "logits/rejected": 0.5617603063583374, "logps/chosen": -442.40167236328125, "logps/rejected": -573.85205078125, "loss": 0.0289, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.618128776550293, "rewards/margins": 17.062252044677734, "rewards/rejected": -23.680381774902344, "step": 2330 }, { "epoch": 1.5, "learning_rate": 4.1632944537015e-07, "logits/chosen": -0.1246495246887207, "logits/rejected": 0.732117772102356, "logps/chosen": -445.9453125, "logps/rejected": -648.7326049804688, "loss": 0.0567, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.793832302093506, "rewards/margins": 16.911283493041992, "rewards/rejected": -24.705114364624023, "step": 2340 }, { "epoch": 1.51, "learning_rate": 4.157343489645322e-07, "logits/chosen": -0.13531716167926788, "logits/rejected": 0.6813434362411499, "logps/chosen": -500.760986328125, "logps/rejected": -748.8970947265625, "loss": 0.0249, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.698297500610352, "rewards/margins": 22.551307678222656, "rewards/rejected": -31.249608993530273, "step": 2350 }, { "epoch": 1.52, "learning_rate": 4.151392525589145e-07, "logits/chosen": -0.17930714786052704, "logits/rejected": 0.5675514340400696, "logps/chosen": -507.093994140625, "logps/rejected": -617.2818603515625, "loss": 0.0077, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.938199043273926, "rewards/margins": 18.702774047851562, "rewards/rejected": -28.640972137451172, "step": 2360 }, { "epoch": 1.52, "learning_rate": 4.145441561532968e-07, "logits/chosen": -0.02535531297326088, "logits/rejected": 0.5533861517906189, "logps/chosen": -441.52362060546875, "logps/rejected": -658.696044921875, "loss": 0.0219, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.094923496246338, "rewards/margins": 19.90808868408203, "rewards/rejected": -27.00301170349121, "step": 2370 }, { "epoch": 1.53, "learning_rate": 4.139490597476791e-07, "logits/chosen": -0.01725180074572563, "logits/rejected": 0.545805811882019, "logps/chosen": -431.83197021484375, "logps/rejected": -565.3300170898438, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -4.328795909881592, "rewards/margins": 17.926631927490234, "rewards/rejected": -22.255428314208984, "step": 2380 }, { "epoch": 1.54, "learning_rate": 4.133539633420614e-07, "logits/chosen": -0.21750584244728088, "logits/rejected": 0.8260287046432495, "logps/chosen": -426.7081604003906, "logps/rejected": -571.2562255859375, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -4.599783897399902, "rewards/margins": 18.57124900817871, "rewards/rejected": -23.17103385925293, "step": 2390 }, { "epoch": 1.54, "learning_rate": 4.127588669364437e-07, "logits/chosen": 0.12132026255130768, "logits/rejected": 0.586942195892334, "logps/chosen": -387.7074890136719, "logps/rejected": -575.88623046875, "loss": 0.0237, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.400722980499268, "rewards/margins": 19.196186065673828, "rewards/rejected": -24.596908569335938, "step": 2400 }, { "epoch": 1.54, "eval_logits/chosen": -0.4777054190635681, "eval_logits/rejected": 0.22216691076755524, "eval_logps/chosen": -392.7734069824219, "eval_logps/rejected": -515.349365234375, "eval_loss": 0.1322525590658188, "eval_rewards/accuracies": 0.9765625, "eval_rewards/chosen": -6.341035842895508, "eval_rewards/margins": 13.558823585510254, "eval_rewards/rejected": -19.899858474731445, "eval_runtime": 76.575, "eval_samples_per_second": 13.059, "eval_steps_per_second": 0.418, "step": 2400 }, { "epoch": 1.55, "learning_rate": 4.1216377053082594e-07, "logits/chosen": 0.19748568534851074, "logits/rejected": 0.791664719581604, "logps/chosen": -461.057861328125, "logps/rejected": -645.6085205078125, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -6.15015172958374, "rewards/margins": 19.398887634277344, "rewards/rejected": -25.549039840698242, "step": 2410 }, { "epoch": 1.56, "learning_rate": 4.115686741252083e-07, "logits/chosen": 0.5446011424064636, "logits/rejected": 0.8287714123725891, "logps/chosen": -428.01068115234375, "logps/rejected": -596.6788330078125, "loss": 0.0192, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.512643337249756, "rewards/margins": 18.827342987060547, "rewards/rejected": -25.33998680114746, "step": 2420 }, { "epoch": 1.56, "learning_rate": 4.109735777195906e-07, "logits/chosen": 0.2738257646560669, "logits/rejected": 0.6724745035171509, "logps/chosen": -443.29852294921875, "logps/rejected": -617.8185424804688, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -4.934741497039795, "rewards/margins": 18.175350189208984, "rewards/rejected": -23.110088348388672, "step": 2430 }, { "epoch": 1.57, "learning_rate": 4.103784813139728e-07, "logits/chosen": -0.21786899864673615, "logits/rejected": 0.508806049823761, "logps/chosen": -386.5865173339844, "logps/rejected": -540.3848876953125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -3.1630804538726807, "rewards/margins": 18.559303283691406, "rewards/rejected": -21.72238540649414, "step": 2440 }, { "epoch": 1.57, "learning_rate": 4.097833849083551e-07, "logits/chosen": 0.27359846234321594, "logits/rejected": 0.8219190835952759, "logps/chosen": -489.2318420410156, "logps/rejected": -654.9146118164062, "loss": 0.0167, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.5973005294799805, "rewards/margins": 19.05999183654785, "rewards/rejected": -25.65729331970215, "step": 2450 }, { "epoch": 1.58, "learning_rate": 4.091882885027374e-07, "logits/chosen": 0.23141400516033173, "logits/rejected": 0.8404422998428345, "logps/chosen": -470.26300048828125, "logps/rejected": -647.7288208007812, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -5.572993278503418, "rewards/margins": 19.47740364074707, "rewards/rejected": -25.050395965576172, "step": 2460 }, { "epoch": 1.59, "learning_rate": 4.0859319209711976e-07, "logits/chosen": 0.21321940422058105, "logits/rejected": 0.9413207769393921, "logps/chosen": -481.8892517089844, "logps/rejected": -568.3132934570312, "loss": 0.0106, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.117605686187744, "rewards/margins": 17.3239688873291, "rewards/rejected": -22.441572189331055, "step": 2470 }, { "epoch": 1.59, "learning_rate": 4.07998095691502e-07, "logits/chosen": 0.34440717101097107, "logits/rejected": 1.0420324802398682, "logps/chosen": -412.06756591796875, "logps/rejected": -661.225830078125, "loss": 0.0085, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.927117347717285, "rewards/margins": 19.294017791748047, "rewards/rejected": -25.22113609313965, "step": 2480 }, { "epoch": 1.6, "learning_rate": 4.074029992858843e-07, "logits/chosen": 0.617647647857666, "logits/rejected": 1.0356745719909668, "logps/chosen": -466.6419372558594, "logps/rejected": -593.55517578125, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -5.356731414794922, "rewards/margins": 18.503705978393555, "rewards/rejected": -23.860441207885742, "step": 2490 }, { "epoch": 1.61, "learning_rate": 4.0680790288026654e-07, "logits/chosen": 0.6595112681388855, "logits/rejected": 1.4305124282836914, "logps/chosen": -434.178466796875, "logps/rejected": -573.2113647460938, "loss": 0.0269, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.551410675048828, "rewards/margins": 15.67224407196045, "rewards/rejected": -22.223651885986328, "step": 2500 }, { "epoch": 1.61, "eval_logits/chosen": -0.08062884956598282, "eval_logits/rejected": 0.4895774722099304, "eval_logps/chosen": -393.9366149902344, "eval_logps/rejected": -509.43963623046875, "eval_loss": 0.151441290974617, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": -6.457356929779053, "eval_rewards/margins": 12.851531982421875, "eval_rewards/rejected": -19.308889389038086, "eval_runtime": 76.6294, "eval_samples_per_second": 13.05, "eval_steps_per_second": 0.418, "step": 2500 }, { "epoch": 1.61, "learning_rate": 4.062128064746489e-07, "logits/chosen": 0.7047534584999084, "logits/rejected": 1.2956479787826538, "logps/chosen": -373.82366943359375, "logps/rejected": -593.6276245117188, "loss": 0.0182, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.4311394691467285, "rewards/margins": 15.736666679382324, "rewards/rejected": -22.167804718017578, "step": 2510 }, { "epoch": 1.62, "learning_rate": 4.056177100690312e-07, "logits/chosen": 0.23054015636444092, "logits/rejected": 0.9308466911315918, "logps/chosen": -403.5622253417969, "logps/rejected": -522.812744140625, "loss": 0.03, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.561907768249512, "rewards/margins": 15.371617317199707, "rewards/rejected": -22.933523178100586, "step": 2520 }, { "epoch": 1.63, "learning_rate": 4.050226136634135e-07, "logits/chosen": 0.061969585716724396, "logits/rejected": 0.7286633253097534, "logps/chosen": -414.0792541503906, "logps/rejected": -600.1736450195312, "loss": 0.0162, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.574122428894043, "rewards/margins": 19.657848358154297, "rewards/rejected": -25.231969833374023, "step": 2530 }, { "epoch": 1.63, "learning_rate": 4.044275172577957e-07, "logits/chosen": 0.05762529373168945, "logits/rejected": 1.0145411491394043, "logps/chosen": -445.4334411621094, "logps/rejected": -645.9708251953125, "loss": 0.0138, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.179165840148926, "rewards/margins": 19.949363708496094, "rewards/rejected": -27.128530502319336, "step": 2540 }, { "epoch": 1.64, "learning_rate": 4.0383242085217806e-07, "logits/chosen": 0.2469167709350586, "logits/rejected": 0.8062774538993835, "logps/chosen": -454.03668212890625, "logps/rejected": -624.8360595703125, "loss": 0.0212, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.777960777282715, "rewards/margins": 17.905128479003906, "rewards/rejected": -24.68309211730957, "step": 2550 }, { "epoch": 1.65, "learning_rate": 4.0323732444656036e-07, "logits/chosen": 0.52375328540802, "logits/rejected": 1.2141704559326172, "logps/chosen": -423.7889099121094, "logps/rejected": -609.6292114257812, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -6.806221961975098, "rewards/margins": 18.522531509399414, "rewards/rejected": -25.328754425048828, "step": 2560 }, { "epoch": 1.65, "learning_rate": 4.026422280409426e-07, "logits/chosen": 0.5363117456436157, "logits/rejected": 0.9718011617660522, "logps/chosen": -423.1117248535156, "logps/rejected": -559.2742309570312, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -5.507771015167236, "rewards/margins": 17.65149688720703, "rewards/rejected": -23.159269332885742, "step": 2570 }, { "epoch": 1.66, "learning_rate": 4.020471316353249e-07, "logits/chosen": 0.7948285341262817, "logits/rejected": 1.6965904235839844, "logps/chosen": -450.24945068359375, "logps/rejected": -587.7679443359375, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -7.323611259460449, "rewards/margins": 19.197834014892578, "rewards/rejected": -26.521448135375977, "step": 2580 }, { "epoch": 1.66, "learning_rate": 4.0145203522970724e-07, "logits/chosen": 0.6111919283866882, "logits/rejected": 1.2076199054718018, "logps/chosen": -393.1330871582031, "logps/rejected": -658.733154296875, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": -6.268210411071777, "rewards/margins": 22.285709381103516, "rewards/rejected": -28.553918838500977, "step": 2590 }, { "epoch": 1.67, "learning_rate": 4.008569388240895e-07, "logits/chosen": 0.8838823437690735, "logits/rejected": 1.2930363416671753, "logps/chosen": -433.79248046875, "logps/rejected": -662.343994140625, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": -6.181797027587891, "rewards/margins": 20.533702850341797, "rewards/rejected": -26.715499877929688, "step": 2600 }, { "epoch": 1.67, "eval_logits/chosen": 0.019558865576982498, "eval_logits/rejected": 0.6054279804229736, "eval_logps/chosen": -407.05767822265625, "eval_logps/rejected": -531.6161499023438, "eval_loss": 0.177625834941864, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": -7.769463539123535, "eval_rewards/margins": 13.75707721710205, "eval_rewards/rejected": -21.526540756225586, "eval_runtime": 76.8929, "eval_samples_per_second": 13.005, "eval_steps_per_second": 0.416, "step": 2600 }, { "epoch": 1.68, "learning_rate": 4.002618424184718e-07, "logits/chosen": 0.37892434000968933, "logits/rejected": 0.9920595288276672, "logps/chosen": -391.6972351074219, "logps/rejected": -585.7908935546875, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": -5.025783538818359, "rewards/margins": 17.12643814086914, "rewards/rejected": -22.1522216796875, "step": 2610 }, { "epoch": 1.68, "learning_rate": 3.996667460128541e-07, "logits/chosen": 0.08298696577548981, "logits/rejected": 0.9364360570907593, "logps/chosen": -374.4390563964844, "logps/rejected": -576.79541015625, "loss": 0.0334, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.216673851013184, "rewards/margins": 18.307117462158203, "rewards/rejected": -23.523792266845703, "step": 2620 }, { "epoch": 1.69, "learning_rate": 3.990716496072363e-07, "logits/chosen": 0.30024105310440063, "logits/rejected": 0.9240363836288452, "logps/chosen": -419.29510498046875, "logps/rejected": -590.0726928710938, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": -6.619715690612793, "rewards/margins": 17.221851348876953, "rewards/rejected": -23.841564178466797, "step": 2630 }, { "epoch": 1.7, "learning_rate": 3.9847655320161867e-07, "logits/chosen": 0.2850233018398285, "logits/rejected": 0.6386219263076782, "logps/chosen": -397.0573425292969, "logps/rejected": -577.4563598632812, "loss": 0.0089, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.049335479736328, "rewards/margins": 16.51738739013672, "rewards/rejected": -22.56671905517578, "step": 2640 }, { "epoch": 1.7, "learning_rate": 3.9788145679600096e-07, "logits/chosen": 0.2091820240020752, "logits/rejected": 0.9494321942329407, "logps/chosen": -415.09869384765625, "logps/rejected": -584.553466796875, "loss": 0.0211, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.250141620635986, "rewards/margins": 17.63558578491211, "rewards/rejected": -22.88572883605957, "step": 2650 }, { "epoch": 1.71, "learning_rate": 3.972863603903832e-07, "logits/chosen": 0.15588752925395966, "logits/rejected": 0.9101377725601196, "logps/chosen": -440.03546142578125, "logps/rejected": -556.76953125, "loss": 0.0258, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.330071449279785, "rewards/margins": 16.938446044921875, "rewards/rejected": -23.26851463317871, "step": 2660 }, { "epoch": 1.72, "learning_rate": 3.966912639847655e-07, "logits/chosen": 0.43947821855545044, "logits/rejected": 1.1439796686172485, "logps/chosen": -408.59869384765625, "logps/rejected": -561.8624267578125, "loss": 0.0274, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.567991733551025, "rewards/margins": 18.473587036132812, "rewards/rejected": -25.041580200195312, "step": 2670 }, { "epoch": 1.72, "learning_rate": 3.9609616757914784e-07, "logits/chosen": 0.639157772064209, "logits/rejected": 1.2126656770706177, "logps/chosen": -433.49432373046875, "logps/rejected": -545.498779296875, "loss": 0.0441, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.019906044006348, "rewards/margins": 16.54349708557129, "rewards/rejected": -22.563400268554688, "step": 2680 }, { "epoch": 1.73, "learning_rate": 3.955010711735301e-07, "logits/chosen": 1.0705523490905762, "logits/rejected": 1.574342966079712, "logps/chosen": -402.82073974609375, "logps/rejected": -556.3751831054688, "loss": 0.0333, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.881136417388916, "rewards/margins": 16.096803665161133, "rewards/rejected": -22.977941513061523, "step": 2690 }, { "epoch": 1.74, "learning_rate": 3.949059747679124e-07, "logits/chosen": 0.6093908548355103, "logits/rejected": 1.4320948123931885, "logps/chosen": -416.2469177246094, "logps/rejected": -563.9771118164062, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": -5.77686071395874, "rewards/margins": 17.765539169311523, "rewards/rejected": -23.54239845275879, "step": 2700 }, { "epoch": 1.74, "eval_logits/chosen": -0.009464547038078308, "eval_logits/rejected": 0.5373153686523438, "eval_logps/chosen": -408.0279541015625, "eval_logps/rejected": -531.8214721679688, "eval_loss": 0.14935775101184845, "eval_rewards/accuracies": 0.9296875, "eval_rewards/chosen": -7.866490364074707, "eval_rewards/margins": 13.680585861206055, "eval_rewards/rejected": -21.547077178955078, "eval_runtime": 76.6739, "eval_samples_per_second": 13.042, "eval_steps_per_second": 0.417, "step": 2700 }, { "epoch": 1.74, "learning_rate": 3.943108783622947e-07, "logits/chosen": 0.5636851787567139, "logits/rejected": 0.969277024269104, "logps/chosen": -465.9778747558594, "logps/rejected": -580.30810546875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -7.8137688636779785, "rewards/margins": 17.516647338867188, "rewards/rejected": -25.330415725708008, "step": 2710 }, { "epoch": 1.75, "learning_rate": 3.9371578195667697e-07, "logits/chosen": 0.28570881485939026, "logits/rejected": 1.0974262952804565, "logps/chosen": -420.02520751953125, "logps/rejected": -557.4937744140625, "loss": 0.0242, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.895697116851807, "rewards/margins": 18.642253875732422, "rewards/rejected": -24.53795051574707, "step": 2720 }, { "epoch": 1.75, "learning_rate": 3.9312068555105927e-07, "logits/chosen": 0.2833688259124756, "logits/rejected": 1.2028071880340576, "logps/chosen": -428.30242919921875, "logps/rejected": -601.4442138671875, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -5.424818992614746, "rewards/margins": 16.82640838623047, "rewards/rejected": -22.2512264251709, "step": 2730 }, { "epoch": 1.76, "learning_rate": 3.9252558914544156e-07, "logits/chosen": -0.17283181846141815, "logits/rejected": 0.6285573840141296, "logps/chosen": -407.96063232421875, "logps/rejected": -656.8048095703125, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -5.4999213218688965, "rewards/margins": 18.290767669677734, "rewards/rejected": -23.790691375732422, "step": 2740 }, { "epoch": 1.77, "learning_rate": 3.919304927398238e-07, "logits/chosen": 0.1602792888879776, "logits/rejected": 0.6874667406082153, "logps/chosen": -375.77960205078125, "logps/rejected": -546.3209228515625, "loss": 0.0288, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.720765113830566, "rewards/margins": 16.913997650146484, "rewards/rejected": -22.634761810302734, "step": 2750 }, { "epoch": 1.77, "learning_rate": 3.9133539633420615e-07, "logits/chosen": 0.2454497367143631, "logits/rejected": 0.6986191272735596, "logps/chosen": -390.337890625, "logps/rejected": -560.3748779296875, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -5.7264814376831055, "rewards/margins": 17.356287002563477, "rewards/rejected": -23.082767486572266, "step": 2760 }, { "epoch": 1.78, "learning_rate": 3.9074029992858845e-07, "logits/chosen": 0.38854673504829407, "logits/rejected": 1.452370047569275, "logps/chosen": -415.73297119140625, "logps/rejected": -565.9852294921875, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -6.159702777862549, "rewards/margins": 17.43438148498535, "rewards/rejected": -23.594083786010742, "step": 2770 }, { "epoch": 1.79, "learning_rate": 3.901452035229707e-07, "logits/chosen": 0.7071353197097778, "logits/rejected": 1.089143991470337, "logps/chosen": -402.5384216308594, "logps/rejected": -598.9990234375, "loss": 0.0203, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.571539402008057, "rewards/margins": 18.129215240478516, "rewards/rejected": -25.700754165649414, "step": 2780 }, { "epoch": 1.79, "learning_rate": 3.89550107117353e-07, "logits/chosen": 0.5019019842147827, "logits/rejected": 1.2631019353866577, "logps/chosen": -412.53607177734375, "logps/rejected": -484.9837341308594, "loss": 0.0231, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.363646984100342, "rewards/margins": 15.107401847839355, "rewards/rejected": -19.471050262451172, "step": 2790 }, { "epoch": 1.8, "learning_rate": 3.8895501071173533e-07, "logits/chosen": 0.5883782505989075, "logits/rejected": 1.2746624946594238, "logps/chosen": -431.44287109375, "logps/rejected": -637.4125366210938, "loss": 0.0335, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.525457859039307, "rewards/margins": 19.470340728759766, "rewards/rejected": -25.995798110961914, "step": 2800 }, { "epoch": 1.8, "eval_logits/chosen": -0.053867146372795105, "eval_logits/rejected": 0.6149381399154663, "eval_logps/chosen": -404.7615051269531, "eval_logps/rejected": -530.05126953125, "eval_loss": 0.15537042915821075, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": -7.539842128753662, "eval_rewards/margins": 13.830206871032715, "eval_rewards/rejected": -21.37004852294922, "eval_runtime": 76.7186, "eval_samples_per_second": 13.035, "eval_steps_per_second": 0.417, "step": 2800 }, { "epoch": 1.81, "learning_rate": 3.8835991430611757e-07, "logits/chosen": 0.6285505294799805, "logits/rejected": 1.1924630403518677, "logps/chosen": -426.795166015625, "logps/rejected": -537.3714599609375, "loss": 0.0291, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.4925079345703125, "rewards/margins": 17.20037078857422, "rewards/rejected": -23.69287872314453, "step": 2810 }, { "epoch": 1.81, "learning_rate": 3.8776481790049987e-07, "logits/chosen": 0.3737705945968628, "logits/rejected": 1.088739275932312, "logps/chosen": -472.11669921875, "logps/rejected": -613.3414916992188, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": -5.82584285736084, "rewards/margins": 20.265369415283203, "rewards/rejected": -26.09120750427246, "step": 2820 }, { "epoch": 1.82, "learning_rate": 3.8716972149488216e-07, "logits/chosen": 0.4724366068840027, "logits/rejected": 1.5313258171081543, "logps/chosen": -474.59326171875, "logps/rejected": -591.4876098632812, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": -5.544611930847168, "rewards/margins": 20.092144012451172, "rewards/rejected": -25.636754989624023, "step": 2830 }, { "epoch": 1.83, "learning_rate": 3.865746250892644e-07, "logits/chosen": 0.6960526704788208, "logits/rejected": 1.0484097003936768, "logps/chosen": -442.74298095703125, "logps/rejected": -641.082763671875, "loss": 0.0359, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.976676940917969, "rewards/margins": 18.82603645324707, "rewards/rejected": -24.802715301513672, "step": 2840 }, { "epoch": 1.83, "learning_rate": 3.8597952868364675e-07, "logits/chosen": 0.6734079122543335, "logits/rejected": 1.2869551181793213, "logps/chosen": -410.61846923828125, "logps/rejected": -535.3711547851562, "loss": 0.0256, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.3238630294799805, "rewards/margins": 16.99979019165039, "rewards/rejected": -22.323650360107422, "step": 2850 }, { "epoch": 1.84, "learning_rate": 3.8538443227802905e-07, "logits/chosen": 0.39041590690612793, "logits/rejected": 1.0508239269256592, "logps/chosen": -455.080322265625, "logps/rejected": -550.6773681640625, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": -4.037131309509277, "rewards/margins": 17.071130752563477, "rewards/rejected": -21.10826301574707, "step": 2860 }, { "epoch": 1.84, "learning_rate": 3.847893358724113e-07, "logits/chosen": 0.21260789036750793, "logits/rejected": 1.1434606313705444, "logps/chosen": -434.0419006347656, "logps/rejected": -555.1299438476562, "loss": 0.0233, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.177916526794434, "rewards/margins": 15.855340957641602, "rewards/rejected": -21.03325843811035, "step": 2870 }, { "epoch": 1.85, "learning_rate": 3.841942394667936e-07, "logits/chosen": 0.26587316393852234, "logits/rejected": 1.1289548873901367, "logps/chosen": -388.0738830566406, "logps/rejected": -598.3447265625, "loss": 0.0343, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.688809394836426, "rewards/margins": 18.783828735351562, "rewards/rejected": -24.472637176513672, "step": 2880 }, { "epoch": 1.86, "learning_rate": 3.8359914306117593e-07, "logits/chosen": 0.9435780644416809, "logits/rejected": 1.0387550592422485, "logps/chosen": -420.2930603027344, "logps/rejected": -602.9673461914062, "loss": 0.0159, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.682863712310791, "rewards/margins": 18.17510986328125, "rewards/rejected": -24.857975006103516, "step": 2890 }, { "epoch": 1.86, "learning_rate": 3.8300404665555817e-07, "logits/chosen": 0.5122434496879578, "logits/rejected": 1.1984080076217651, "logps/chosen": -479.920654296875, "logps/rejected": -683.6166381835938, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -6.739544868469238, "rewards/margins": 21.551074981689453, "rewards/rejected": -28.290618896484375, "step": 2900 }, { "epoch": 1.86, "eval_logits/chosen": -0.22228558361530304, "eval_logits/rejected": 0.4165645241737366, "eval_logps/chosen": -403.52783203125, "eval_logps/rejected": -530.5889892578125, "eval_loss": 0.1386074274778366, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": -7.416477680206299, "eval_rewards/margins": 14.007347106933594, "eval_rewards/rejected": -21.423826217651367, "eval_runtime": 76.7881, "eval_samples_per_second": 13.023, "eval_steps_per_second": 0.417, "step": 2900 }, { "epoch": 1.87, "learning_rate": 3.8240895024994047e-07, "logits/chosen": 0.35297220945358276, "logits/rejected": 1.0394313335418701, "logps/chosen": -442.6025390625, "logps/rejected": -655.9295654296875, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": -6.260979175567627, "rewards/margins": 20.33395004272461, "rewards/rejected": -26.59493064880371, "step": 2910 }, { "epoch": 1.88, "learning_rate": 3.8181385384432276e-07, "logits/chosen": 0.402191162109375, "logits/rejected": 1.116523265838623, "logps/chosen": -393.9117126464844, "logps/rejected": -693.4283447265625, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -4.633070468902588, "rewards/margins": 21.96987533569336, "rewards/rejected": -26.602941513061523, "step": 2920 }, { "epoch": 1.88, "learning_rate": 3.8121875743870506e-07, "logits/chosen": 0.16651079058647156, "logits/rejected": 0.6804031133651733, "logps/chosen": -412.31512451171875, "logps/rejected": -665.4926147460938, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -4.822356700897217, "rewards/margins": 19.13541030883789, "rewards/rejected": -23.957767486572266, "step": 2930 }, { "epoch": 1.89, "learning_rate": 3.8062366103308735e-07, "logits/chosen": 0.1117088571190834, "logits/rejected": 1.083188772201538, "logps/chosen": -369.11370849609375, "logps/rejected": -570.9990844726562, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -3.329820156097412, "rewards/margins": 19.094078063964844, "rewards/rejected": -22.423898696899414, "step": 2940 }, { "epoch": 1.9, "learning_rate": 3.8002856462746965e-07, "logits/chosen": 0.11049242317676544, "logits/rejected": 0.9105658531188965, "logps/chosen": -433.6129455566406, "logps/rejected": -589.2352294921875, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -3.2948660850524902, "rewards/margins": 18.181903839111328, "rewards/rejected": -21.47677230834961, "step": 2950 }, { "epoch": 1.9, "learning_rate": 3.794334682218519e-07, "logits/chosen": -0.10087742656469345, "logits/rejected": 0.8825875520706177, "logps/chosen": -401.3361511230469, "logps/rejected": -503.46710205078125, "loss": 0.0172, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8647804260253906, "rewards/margins": 15.990776062011719, "rewards/rejected": -19.85555648803711, "step": 2960 }, { "epoch": 1.91, "learning_rate": 3.7883837181623424e-07, "logits/chosen": 0.07600893825292587, "logits/rejected": 0.9553617238998413, "logps/chosen": -408.1060485839844, "logps/rejected": -534.631591796875, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -3.9009666442871094, "rewards/margins": 17.22827911376953, "rewards/rejected": -21.12924575805664, "step": 2970 }, { "epoch": 1.92, "learning_rate": 3.7824327541061653e-07, "logits/chosen": 0.2739563286304474, "logits/rejected": 0.8527243733406067, "logps/chosen": -437.29638671875, "logps/rejected": -695.6834106445312, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -5.193564414978027, "rewards/margins": 21.4495792388916, "rewards/rejected": -26.643142700195312, "step": 2980 }, { "epoch": 1.92, "learning_rate": 3.776481790049988e-07, "logits/chosen": 0.43024197220802307, "logits/rejected": 1.1188406944274902, "logps/chosen": -386.9818115234375, "logps/rejected": -554.8717041015625, "loss": 0.0291, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.847230911254883, "rewards/margins": 17.605852127075195, "rewards/rejected": -23.453083038330078, "step": 2990 }, { "epoch": 1.93, "learning_rate": 3.7705308259938107e-07, "logits/chosen": 0.6990832090377808, "logits/rejected": 1.3339582681655884, "logps/chosen": -375.6474609375, "logps/rejected": -568.9274291992188, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -3.8585574626922607, "rewards/margins": 18.85175323486328, "rewards/rejected": -22.710311889648438, "step": 3000 }, { "epoch": 1.93, "eval_logits/chosen": -0.04803978279232979, "eval_logits/rejected": 0.6017779111862183, "eval_logps/chosen": -375.2929992675781, "eval_logps/rejected": -498.0708312988281, "eval_loss": 0.1309323012828827, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": -4.592996120452881, "eval_rewards/margins": 13.579015731811523, "eval_rewards/rejected": -18.17201042175293, "eval_runtime": 76.759, "eval_samples_per_second": 13.028, "eval_steps_per_second": 0.417, "step": 3000 }, { "epoch": 1.93, "learning_rate": 3.764579861937634e-07, "logits/chosen": 0.5929206609725952, "logits/rejected": 1.196090579032898, "logps/chosen": -398.5981750488281, "logps/rejected": -553.6544799804688, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -4.344789028167725, "rewards/margins": 19.267980575561523, "rewards/rejected": -23.612768173217773, "step": 3010 }, { "epoch": 1.94, "learning_rate": 3.7586288978814566e-07, "logits/chosen": 0.46780499815940857, "logits/rejected": 1.3755557537078857, "logps/chosen": -408.9537658691406, "logps/rejected": -639.6270141601562, "loss": 0.0271, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.787570953369141, "rewards/margins": 19.320308685302734, "rewards/rejected": -26.107879638671875, "step": 3020 }, { "epoch": 1.95, "learning_rate": 3.7526779338252795e-07, "logits/chosen": 0.47549518942832947, "logits/rejected": 1.1248539686203003, "logps/chosen": -458.66864013671875, "logps/rejected": -612.8165283203125, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -5.648183345794678, "rewards/margins": 18.065431594848633, "rewards/rejected": -23.713613510131836, "step": 3030 }, { "epoch": 1.95, "learning_rate": 3.7467269697691025e-07, "logits/chosen": 0.022746117785573006, "logits/rejected": 1.0212655067443848, "logps/chosen": -437.48583984375, "logps/rejected": -603.9195556640625, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -5.6978535652160645, "rewards/margins": 19.632654190063477, "rewards/rejected": -25.330509185791016, "step": 3040 }, { "epoch": 1.96, "learning_rate": 3.740776005712925e-07, "logits/chosen": -0.12851546704769135, "logits/rejected": 0.7596645951271057, "logps/chosen": -444.29461669921875, "logps/rejected": -648.9785766601562, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -5.788307189941406, "rewards/margins": 18.562524795532227, "rewards/rejected": -24.350830078125, "step": 3050 }, { "epoch": 1.97, "learning_rate": 3.7348250416567484e-07, "logits/chosen": 0.014139672741293907, "logits/rejected": 0.8716185688972473, "logps/chosen": -395.7372131347656, "logps/rejected": -608.2803955078125, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -4.413556098937988, "rewards/margins": 19.46448516845703, "rewards/rejected": -23.878040313720703, "step": 3060 }, { "epoch": 1.97, "learning_rate": 3.7288740776005713e-07, "logits/chosen": 0.32760128378868103, "logits/rejected": 0.5136176943778992, "logps/chosen": -420.23480224609375, "logps/rejected": -669.7601318359375, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -4.701635360717773, "rewards/margins": 19.267168045043945, "rewards/rejected": -23.96880531311035, "step": 3070 }, { "epoch": 1.98, "learning_rate": 3.722923113544394e-07, "logits/chosen": 0.4951232373714447, "logits/rejected": 1.083812952041626, "logps/chosen": -412.07464599609375, "logps/rejected": -682.0787353515625, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -4.654479026794434, "rewards/margins": 18.33094596862793, "rewards/rejected": -22.985427856445312, "step": 3080 }, { "epoch": 1.99, "learning_rate": 3.7169721494882167e-07, "logits/chosen": 0.7839171290397644, "logits/rejected": 1.5858978033065796, "logps/chosen": -398.9652099609375, "logps/rejected": -555.3340454101562, "loss": 0.022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.493814945220947, "rewards/margins": 17.03528594970703, "rewards/rejected": -21.52910041809082, "step": 3090 }, { "epoch": 1.99, "learning_rate": 3.71102118543204e-07, "logits/chosen": 0.8546144366264343, "logits/rejected": 1.6159175634384155, "logps/chosen": -427.02978515625, "logps/rejected": -589.3820190429688, "loss": 0.0187, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.833371639251709, "rewards/margins": 18.236379623413086, "rewards/rejected": -25.069753646850586, "step": 3100 }, { "epoch": 1.99, "eval_logits/chosen": 0.0045381635427474976, "eval_logits/rejected": 0.7645629048347473, "eval_logps/chosen": -400.106201171875, "eval_logps/rejected": -530.0531616210938, "eval_loss": 0.13586518168449402, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": -7.074316024780273, "eval_rewards/margins": 14.295927047729492, "eval_rewards/rejected": -21.370241165161133, "eval_runtime": 76.6017, "eval_samples_per_second": 13.055, "eval_steps_per_second": 0.418, "step": 3100 }, { "epoch": 2.0, "learning_rate": 3.7050702213758626e-07, "logits/chosen": 0.6572461128234863, "logits/rejected": 1.7969856262207031, "logps/chosen": -398.08453369140625, "logps/rejected": -607.6263427734375, "loss": 0.0082, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.214932918548584, "rewards/margins": 21.25518035888672, "rewards/rejected": -27.470117568969727, "step": 3110 } ], "logging_steps": 10, "max_steps": 9336, "num_train_epochs": 6, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }