diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4883 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 100, + "global_step": 3112, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.353319057815846e-10, + "logits/chosen": -2.322030782699585, + "logits/rejected": -2.360077381134033, + "logps/chosen": -413.0701599121094, + "logps/rejected": -503.9693603515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 5.353319057815845e-09, + "logits/chosen": -2.3369531631469727, + "logits/rejected": -2.352255344390869, + "logps/chosen": -334.3316650390625, + "logps/rejected": -329.3804016113281, + "loss": 0.6949, + "rewards/accuracies": 0.5416666865348816, + "rewards/chosen": 0.0067011509090662, + "rewards/margins": 0.011725478805601597, + "rewards/rejected": -0.005024327430874109, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 1.070663811563169e-08, + "logits/chosen": -2.307915210723877, + "logits/rejected": -2.3184759616851807, + "logps/chosen": -383.9498291015625, + "logps/rejected": -349.3071594238281, + "loss": 0.6939, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.020408455282449722, + "rewards/margins": -0.008849766105413437, + "rewards/rejected": -0.011558687314391136, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 1.6059957173447538e-08, + "logits/chosen": -2.3540706634521484, + "logits/rejected": -2.3323521614074707, + "logps/chosen": -382.1279602050781, + "logps/rejected": -429.32147216796875, + "loss": 0.6794, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.009235721081495285, + "rewards/margins": 0.022110218182206154, + "rewards/rejected": -0.03134594112634659, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 2.141327623126338e-08, + "logits/chosen": -2.281588315963745, + "logits/rejected": -2.2917075157165527, + "logps/chosen": -329.18243408203125, + "logps/rejected": -268.74761962890625, + "loss": 0.6573, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.008301397785544395, + "rewards/margins": 0.07960359007120132, + "rewards/rejected": -0.07130218297243118, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 2.676659528907923e-08, + "logits/chosen": -2.2929608821868896, + "logits/rejected": -2.3293919563293457, + "logps/chosen": -348.1233215332031, + "logps/rejected": -352.0605163574219, + "loss": 0.6243, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.02957112155854702, + "rewards/margins": 0.16750425100326538, + "rewards/rejected": -0.1379331350326538, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 3.2119914346895076e-08, + "logits/chosen": -2.2743587493896484, + "logits/rejected": -2.271085262298584, + "logps/chosen": -369.2187805175781, + "logps/rejected": -364.81781005859375, + "loss": 0.5746, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.016968611627817154, + "rewards/margins": 0.25147581100463867, + "rewards/rejected": -0.23450717329978943, + "step": 60 + }, + { + "epoch": 0.04, + "learning_rate": 3.747323340471092e-08, + "logits/chosen": -2.3003923892974854, + "logits/rejected": -2.3059000968933105, + "logps/chosen": -358.39813232421875, + "logps/rejected": -377.95391845703125, + "loss": 0.5103, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.06770019978284836, + "rewards/margins": 0.510492205619812, + "rewards/rejected": -0.44279199838638306, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 4.282655246252676e-08, + "logits/chosen": -2.2736544609069824, + "logits/rejected": -2.2347025871276855, + "logps/chosen": -350.6732177734375, + "logps/rejected": -310.04315185546875, + "loss": 0.4697, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.11150838434696198, + "rewards/margins": 0.5623964071273804, + "rewards/rejected": -0.45088809728622437, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 4.817987152034261e-08, + "logits/chosen": -2.2885279655456543, + "logits/rejected": -2.2864298820495605, + "logps/chosen": -376.4400939941406, + "logps/rejected": -356.20245361328125, + "loss": 0.4095, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13773031532764435, + "rewards/margins": 0.7758801579475403, + "rewards/rejected": -0.6381498575210571, + "step": 90 + }, + { + "epoch": 0.06, + "learning_rate": 5.353319057815846e-08, + "logits/chosen": -2.2083301544189453, + "logits/rejected": -2.191849946975708, + "logps/chosen": -388.324462890625, + "logps/rejected": -376.165771484375, + "loss": 0.3876, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.19459189474582672, + "rewards/margins": 0.9206008911132812, + "rewards/rejected": -0.7260090112686157, + "step": 100 + }, + { + "epoch": 0.06, + "eval_logits/chosen": -2.2498624324798584, + "eval_logits/rejected": -2.221994400024414, + "eval_logps/chosen": -328.4210510253906, + "eval_logps/rejected": -324.17828369140625, + "eval_loss": 0.3689849376678467, + "eval_rewards/accuracies": 0.8984375, + "eval_rewards/chosen": 0.0942004844546318, + "eval_rewards/margins": 0.8769543170928955, + "eval_rewards/rejected": -0.7827538251876831, + "eval_runtime": 76.5553, + "eval_samples_per_second": 13.062, + "eval_steps_per_second": 0.418, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 5.88865096359743e-08, + "logits/chosen": -2.2286324501037598, + "logits/rejected": -2.2091312408447266, + "logps/chosen": -372.5191650390625, + "logps/rejected": -399.0936584472656, + "loss": 0.3497, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.1948019564151764, + "rewards/margins": 1.1746506690979004, + "rewards/rejected": -0.9798487424850464, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 6.423982869379015e-08, + "logits/chosen": -2.1091272830963135, + "logits/rejected": -2.118820905685425, + "logps/chosen": -361.56353759765625, + "logps/rejected": -404.4062805175781, + "loss": 0.3081, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.11251312494277954, + "rewards/margins": 1.413537621498108, + "rewards/rejected": -1.3010244369506836, + "step": 120 + }, + { + "epoch": 0.08, + "learning_rate": 6.959314775160599e-08, + "logits/chosen": -2.1156742572784424, + "logits/rejected": -2.092258930206299, + "logps/chosen": -406.53851318359375, + "logps/rejected": -382.41717529296875, + "loss": 0.2846, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.18506963551044464, + "rewards/margins": 1.6774908304214478, + "rewards/rejected": -1.492421269416809, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 7.494646680942184e-08, + "logits/chosen": -2.092482089996338, + "logits/rejected": -2.0639469623565674, + "logps/chosen": -364.37774658203125, + "logps/rejected": -419.9878845214844, + "loss": 0.2908, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.16728416085243225, + "rewards/margins": 1.9277279376983643, + "rewards/rejected": -1.760443925857544, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 8.029978586723767e-08, + "logits/chosen": -2.035637378692627, + "logits/rejected": -2.007483959197998, + "logps/chosen": -364.07745361328125, + "logps/rejected": -378.3775939941406, + "loss": 0.2519, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.009305549785494804, + "rewards/margins": 1.7874408960342407, + "rewards/rejected": -1.7967464923858643, + "step": 150 + }, + { + "epoch": 0.1, + "learning_rate": 8.565310492505352e-08, + "logits/chosen": -1.9844402074813843, + "logits/rejected": -1.9815524816513062, + "logps/chosen": -384.30645751953125, + "logps/rejected": -454.4593811035156, + "loss": 0.2155, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.009780190885066986, + "rewards/margins": 2.3053088188171387, + "rewards/rejected": -2.315088987350464, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 9.100642398286937e-08, + "logits/chosen": -1.8912779092788696, + "logits/rejected": -1.8273910284042358, + "logps/chosen": -378.23199462890625, + "logps/rejected": -344.0122985839844, + "loss": 0.2089, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13415472209453583, + "rewards/margins": 2.2053399085998535, + "rewards/rejected": -2.3394949436187744, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 9.635974304068522e-08, + "logits/chosen": -1.8499170541763306, + "logits/rejected": -1.8102867603302002, + "logps/chosen": -337.0228271484375, + "logps/rejected": -421.8775329589844, + "loss": 0.1974, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.24989835917949677, + "rewards/margins": 2.7448952198028564, + "rewards/rejected": -2.99479341506958, + "step": 180 + }, + { + "epoch": 0.12, + "learning_rate": 1.0171306209850107e-07, + "logits/chosen": -1.8178815841674805, + "logits/rejected": -1.8075618743896484, + "logps/chosen": -420.0069885253906, + "logps/rejected": -451.9561462402344, + "loss": 0.1646, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2596689462661743, + "rewards/margins": 2.7779135704040527, + "rewards/rejected": -3.0375826358795166, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 1.0706638115631692e-07, + "logits/chosen": -1.8774926662445068, + "logits/rejected": -1.7711089849472046, + "logps/chosen": -368.1988830566406, + "logps/rejected": -364.1625671386719, + "loss": 0.1791, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.20116904377937317, + "rewards/margins": 3.0269274711608887, + "rewards/rejected": -3.2280964851379395, + "step": 200 + }, + { + "epoch": 0.13, + "eval_logits/chosen": -1.9170525074005127, + "eval_logits/rejected": -1.8318464756011963, + "eval_logps/chosen": -330.80859375, + "eval_logps/rejected": -343.30322265625, + "eval_loss": 0.19462376832962036, + "eval_rewards/accuracies": 0.90625, + "eval_rewards/chosen": -0.14455503225326538, + "eval_rewards/margins": 2.5506958961486816, + "eval_rewards/rejected": -2.695250988006592, + "eval_runtime": 76.4342, + "eval_samples_per_second": 13.083, + "eval_steps_per_second": 0.419, + "step": 200 + }, + { + "epoch": 0.13, + "learning_rate": 1.1241970021413276e-07, + "logits/chosen": -1.870164155960083, + "logits/rejected": -1.7529096603393555, + "logps/chosen": -394.24310302734375, + "logps/rejected": -359.798828125, + "loss": 0.1689, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.23141837120056152, + "rewards/margins": 2.955934524536133, + "rewards/rejected": -3.1873526573181152, + "step": 210 + }, + { + "epoch": 0.14, + "learning_rate": 1.177730192719486e-07, + "logits/chosen": -1.7515084743499756, + "logits/rejected": -1.6350901126861572, + "logps/chosen": -399.573486328125, + "logps/rejected": -395.66864013671875, + "loss": 0.1476, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.43568697571754456, + "rewards/margins": 3.202346086502075, + "rewards/rejected": -3.638033390045166, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 1.2312633832976445e-07, + "logits/chosen": -1.7598545551300049, + "logits/rejected": -1.6209675073623657, + "logps/chosen": -397.47747802734375, + "logps/rejected": -407.18780517578125, + "loss": 0.1534, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2008744478225708, + "rewards/margins": 3.397059917449951, + "rewards/rejected": -3.5979347229003906, + "step": 230 + }, + { + "epoch": 0.15, + "learning_rate": 1.284796573875803e-07, + "logits/chosen": -1.7311824560165405, + "logits/rejected": -1.6193158626556396, + "logps/chosen": -379.2996520996094, + "logps/rejected": -434.6549377441406, + "loss": 0.136, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4003673493862152, + "rewards/margins": 3.5797741413116455, + "rewards/rejected": -3.9801411628723145, + "step": 240 + }, + { + "epoch": 0.16, + "learning_rate": 1.3383297644539615e-07, + "logits/chosen": -1.7749998569488525, + "logits/rejected": -1.6336084604263306, + "logps/chosen": -384.36016845703125, + "logps/rejected": -412.198974609375, + "loss": 0.1424, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.44941583275794983, + "rewards/margins": 4.18085241317749, + "rewards/rejected": -4.630268096923828, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 1.3918629550321198e-07, + "logits/chosen": -1.6577208042144775, + "logits/rejected": -1.4876978397369385, + "logps/chosen": -368.1956481933594, + "logps/rejected": -311.98223876953125, + "loss": 0.1379, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5439226031303406, + "rewards/margins": 3.506497621536255, + "rewards/rejected": -4.050419807434082, + "step": 260 + }, + { + "epoch": 0.17, + "learning_rate": 1.4453961456102785e-07, + "logits/chosen": -1.6528087854385376, + "logits/rejected": -1.5587131977081299, + "logps/chosen": -357.02655029296875, + "logps/rejected": -449.92718505859375, + "loss": 0.1273, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7018794417381287, + "rewards/margins": 4.135760307312012, + "rewards/rejected": -4.837639808654785, + "step": 270 + }, + { + "epoch": 0.18, + "learning_rate": 1.4989293361884367e-07, + "logits/chosen": -1.6728260517120361, + "logits/rejected": -1.4406911134719849, + "logps/chosen": -383.44635009765625, + "logps/rejected": -405.33428955078125, + "loss": 0.1286, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.7234546542167664, + "rewards/margins": 4.43070125579834, + "rewards/rejected": -5.154156684875488, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 1.5524625267665952e-07, + "logits/chosen": -1.5811737775802612, + "logits/rejected": -1.4415075778961182, + "logps/chosen": -380.16302490234375, + "logps/rejected": -385.4619140625, + "loss": 0.1184, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6038004159927368, + "rewards/margins": 4.522972106933594, + "rewards/rejected": -5.126772403717041, + "step": 290 + }, + { + "epoch": 0.19, + "learning_rate": 1.6059957173447535e-07, + "logits/chosen": -1.5964694023132324, + "logits/rejected": -1.5073761940002441, + "logps/chosen": -348.82110595703125, + "logps/rejected": -396.21234130859375, + "loss": 0.1218, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7199587821960449, + "rewards/margins": 4.322809219360352, + "rewards/rejected": -5.042768478393555, + "step": 300 + }, + { + "epoch": 0.19, + "eval_logits/chosen": -1.7016242742538452, + "eval_logits/rejected": -1.5659476518630981, + "eval_logps/chosen": -336.16741943359375, + "eval_logps/rejected": -363.2919006347656, + "eval_loss": 0.1464938521385193, + "eval_rewards/accuracies": 0.921875, + "eval_rewards/chosen": -0.6804376840591431, + "eval_rewards/margins": 4.013677597045898, + "eval_rewards/rejected": -4.69411563873291, + "eval_runtime": 76.4822, + "eval_samples_per_second": 13.075, + "eval_steps_per_second": 0.418, + "step": 300 + }, + { + "epoch": 0.2, + "learning_rate": 1.6595289079229122e-07, + "logits/chosen": -1.5078377723693848, + "logits/rejected": -1.4056973457336426, + "logps/chosen": -412.2068786621094, + "logps/rejected": -462.7405700683594, + "loss": 0.1094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.718084454536438, + "rewards/margins": 5.0283284187316895, + "rewards/rejected": -5.746413230895996, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 1.7130620985010704e-07, + "logits/chosen": -1.586196780204773, + "logits/rejected": -1.4498493671417236, + "logps/chosen": -351.50775146484375, + "logps/rejected": -403.0026550292969, + "loss": 0.1016, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.6627265214920044, + "rewards/margins": 4.463755130767822, + "rewards/rejected": -5.126482009887695, + "step": 320 + }, + { + "epoch": 0.21, + "learning_rate": 1.766595289079229e-07, + "logits/chosen": -1.5722558498382568, + "logits/rejected": -1.340435266494751, + "logps/chosen": -337.4762878417969, + "logps/rejected": -419.3818359375, + "loss": 0.1221, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.869371235370636, + "rewards/margins": 5.570626258850098, + "rewards/rejected": -6.439997673034668, + "step": 330 + }, + { + "epoch": 0.22, + "learning_rate": 1.8201284796573874e-07, + "logits/chosen": -1.563241958618164, + "logits/rejected": -1.43202805519104, + "logps/chosen": -332.5054016113281, + "logps/rejected": -401.9011535644531, + "loss": 0.1141, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7410942912101746, + "rewards/margins": 5.449254512786865, + "rewards/rejected": -6.1903486251831055, + "step": 340 + }, + { + "epoch": 0.22, + "learning_rate": 1.873661670235546e-07, + "logits/chosen": -1.509913682937622, + "logits/rejected": -1.352683186531067, + "logps/chosen": -428.9737243652344, + "logps/rejected": -455.664306640625, + "loss": 0.1092, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.7532263398170471, + "rewards/margins": 5.451329708099365, + "rewards/rejected": -6.204555988311768, + "step": 350 + }, + { + "epoch": 0.23, + "learning_rate": 1.9271948608137044e-07, + "logits/chosen": -1.6219438314437866, + "logits/rejected": -1.3939545154571533, + "logps/chosen": -389.11297607421875, + "logps/rejected": -411.80035400390625, + "loss": 0.1218, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0884325504302979, + "rewards/margins": 5.317915439605713, + "rewards/rejected": -6.406347751617432, + "step": 360 + }, + { + "epoch": 0.24, + "learning_rate": 1.980728051391863e-07, + "logits/chosen": -1.465427041053772, + "logits/rejected": -1.3383185863494873, + "logps/chosen": -373.2828369140625, + "logps/rejected": -433.7022399902344, + "loss": 0.1098, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.5387589931488037, + "rewards/margins": 5.308978080749512, + "rewards/rejected": -6.8477373123168945, + "step": 370 + }, + { + "epoch": 0.24, + "learning_rate": 2.0342612419700214e-07, + "logits/chosen": -1.4776766300201416, + "logits/rejected": -1.390604019165039, + "logps/chosen": -384.6776428222656, + "logps/rejected": -409.8622131347656, + "loss": 0.1068, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.1275439262390137, + "rewards/margins": 5.649918079376221, + "rewards/rejected": -6.777462005615234, + "step": 380 + }, + { + "epoch": 0.25, + "learning_rate": 2.0877944325481796e-07, + "logits/chosen": -1.496004343032837, + "logits/rejected": -1.2643308639526367, + "logps/chosen": -319.6006774902344, + "logps/rejected": -435.0767517089844, + "loss": 0.0963, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9759249687194824, + "rewards/margins": 5.894278526306152, + "rewards/rejected": -6.870204925537109, + "step": 390 + }, + { + "epoch": 0.26, + "learning_rate": 2.1413276231263384e-07, + "logits/chosen": -1.373808741569519, + "logits/rejected": -1.2059423923492432, + "logps/chosen": -355.2186584472656, + "logps/rejected": -375.163330078125, + "loss": 0.1065, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.117621660232544, + "rewards/margins": 5.132685661315918, + "rewards/rejected": -6.250307559967041, + "step": 400 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -1.5739349126815796, + "eval_logits/rejected": -1.3864831924438477, + "eval_logps/chosen": -342.44061279296875, + "eval_logps/rejected": -380.4227600097656, + "eval_loss": 0.1241927221417427, + "eval_rewards/accuracies": 0.921875, + "eval_rewards/chosen": -1.3077539205551147, + "eval_rewards/margins": 5.099446773529053, + "eval_rewards/rejected": -6.407200336456299, + "eval_runtime": 76.309, + "eval_samples_per_second": 13.105, + "eval_steps_per_second": 0.419, + "step": 400 + }, + { + "epoch": 0.26, + "learning_rate": 2.1948608137044966e-07, + "logits/chosen": -1.453360915184021, + "logits/rejected": -1.215315341949463, + "logps/chosen": -309.32196044921875, + "logps/rejected": -383.0122985839844, + "loss": 0.0932, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0941340923309326, + "rewards/margins": 5.864490985870361, + "rewards/rejected": -6.958624839782715, + "step": 410 + }, + { + "epoch": 0.27, + "learning_rate": 2.248394004282655e-07, + "logits/chosen": -1.6208438873291016, + "logits/rejected": -1.3579902648925781, + "logps/chosen": -420.99951171875, + "logps/rejected": -458.4291076660156, + "loss": 0.0972, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9325690269470215, + "rewards/margins": 6.371849536895752, + "rewards/rejected": -7.304419040679932, + "step": 420 + }, + { + "epoch": 0.28, + "learning_rate": 2.3019271948608136e-07, + "logits/chosen": -1.5590957403182983, + "logits/rejected": -1.3185454607009888, + "logps/chosen": -433.14007568359375, + "logps/rejected": -423.8736877441406, + "loss": 0.0955, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.888096034526825, + "rewards/margins": 6.284546852111816, + "rewards/rejected": -7.172643184661865, + "step": 430 + }, + { + "epoch": 0.28, + "learning_rate": 2.355460385438972e-07, + "logits/chosen": -1.5379421710968018, + "logits/rejected": -1.287793755531311, + "logps/chosen": -384.4920959472656, + "logps/rejected": -459.51495361328125, + "loss": 0.0925, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9522300958633423, + "rewards/margins": 6.38286828994751, + "rewards/rejected": -7.3350982666015625, + "step": 440 + }, + { + "epoch": 0.29, + "learning_rate": 2.4089935760171303e-07, + "logits/chosen": -1.539567232131958, + "logits/rejected": -1.2881194353103638, + "logps/chosen": -360.4696960449219, + "logps/rejected": -421.70953369140625, + "loss": 0.0983, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.032198190689087, + "rewards/margins": 6.628798484802246, + "rewards/rejected": -7.660996437072754, + "step": 450 + }, + { + "epoch": 0.3, + "learning_rate": 2.462526766595289e-07, + "logits/chosen": -1.5270180702209473, + "logits/rejected": -1.213521122932434, + "logps/chosen": -341.48370361328125, + "logps/rejected": -416.1814880371094, + "loss": 0.0862, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.2647520303726196, + "rewards/margins": 7.327714443206787, + "rewards/rejected": -8.592466354370117, + "step": 460 + }, + { + "epoch": 0.3, + "learning_rate": 2.5160599571734473e-07, + "logits/chosen": -1.539104700088501, + "logits/rejected": -1.304837942123413, + "logps/chosen": -412.41192626953125, + "logps/rejected": -530.3709106445312, + "loss": 0.0685, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.0689477920532227, + "rewards/margins": 7.198890686035156, + "rewards/rejected": -8.267837524414062, + "step": 470 + }, + { + "epoch": 0.31, + "learning_rate": 2.569593147751606e-07, + "logits/chosen": -1.4452335834503174, + "logits/rejected": -1.177215576171875, + "logps/chosen": -356.6268005371094, + "logps/rejected": -386.1933288574219, + "loss": 0.0906, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5285716652870178, + "rewards/margins": 7.398108005523682, + "rewards/rejected": -7.9266791343688965, + "step": 480 + }, + { + "epoch": 0.31, + "learning_rate": 2.6231263383297643e-07, + "logits/chosen": -1.299133062362671, + "logits/rejected": -1.2396031618118286, + "logps/chosen": -354.5841979980469, + "logps/rejected": -451.559326171875, + "loss": 0.0801, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7138751745223999, + "rewards/margins": 6.743910789489746, + "rewards/rejected": -7.457786560058594, + "step": 490 + }, + { + "epoch": 0.32, + "learning_rate": 2.676659528907923e-07, + "logits/chosen": -1.2933090925216675, + "logits/rejected": -1.0351635217666626, + "logps/chosen": -403.50091552734375, + "logps/rejected": -444.50811767578125, + "loss": 0.0804, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2215214967727661, + "rewards/margins": 7.684378623962402, + "rewards/rejected": -8.905900955200195, + "step": 500 + }, + { + "epoch": 0.32, + "eval_logits/chosen": -1.5204213857650757, + "eval_logits/rejected": -1.313443899154663, + "eval_logps/chosen": -341.24200439453125, + "eval_logps/rejected": -387.53155517578125, + "eval_loss": 0.11293376982212067, + "eval_rewards/accuracies": 0.9296875, + "eval_rewards/chosen": -1.1878938674926758, + "eval_rewards/margins": 5.9301862716674805, + "eval_rewards/rejected": -7.118079662322998, + "eval_runtime": 76.7239, + "eval_samples_per_second": 13.034, + "eval_steps_per_second": 0.417, + "step": 500 + }, + { + "epoch": 0.33, + "learning_rate": 2.7301927194860813e-07, + "logits/chosen": -1.4917323589324951, + "logits/rejected": -1.2333643436431885, + "logps/chosen": -391.20941162109375, + "logps/rejected": -462.3330993652344, + "loss": 0.0677, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9355085492134094, + "rewards/margins": 8.421406745910645, + "rewards/rejected": -9.356914520263672, + "step": 510 + }, + { + "epoch": 0.33, + "learning_rate": 2.7837259100642395e-07, + "logits/chosen": -1.3520570993423462, + "logits/rejected": -1.1625728607177734, + "logps/chosen": -392.84210205078125, + "logps/rejected": -441.2123107910156, + "loss": 0.084, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.286471962928772, + "rewards/margins": 7.407790184020996, + "rewards/rejected": -8.69426155090332, + "step": 520 + }, + { + "epoch": 0.34, + "learning_rate": 2.8372591006423977e-07, + "logits/chosen": -1.2561280727386475, + "logits/rejected": -1.0563820600509644, + "logps/chosen": -355.0276794433594, + "logps/rejected": -434.96099853515625, + "loss": 0.0754, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.644857406616211, + "rewards/margins": 7.365194797515869, + "rewards/rejected": -9.010051727294922, + "step": 530 + }, + { + "epoch": 0.35, + "learning_rate": 2.890792291220557e-07, + "logits/chosen": -1.3786545991897583, + "logits/rejected": -1.1711790561676025, + "logps/chosen": -343.2646179199219, + "logps/rejected": -414.624267578125, + "loss": 0.0644, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.0238958597183228, + "rewards/margins": 7.4112067222595215, + "rewards/rejected": -8.435102462768555, + "step": 540 + }, + { + "epoch": 0.35, + "learning_rate": 2.944325481798715e-07, + "logits/chosen": -1.483944296836853, + "logits/rejected": -1.2009809017181396, + "logps/chosen": -397.4231872558594, + "logps/rejected": -453.69903564453125, + "loss": 0.0742, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.842347264289856, + "rewards/margins": 7.386562347412109, + "rewards/rejected": -8.228910446166992, + "step": 550 + }, + { + "epoch": 0.36, + "learning_rate": 2.9978586723768735e-07, + "logits/chosen": -1.4178438186645508, + "logits/rejected": -1.243758201599121, + "logps/chosen": -412.52105712890625, + "logps/rejected": -464.28741455078125, + "loss": 0.0944, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.449329137802124, + "rewards/margins": 8.025833129882812, + "rewards/rejected": -9.475163459777832, + "step": 560 + }, + { + "epoch": 0.37, + "learning_rate": 3.051391862955032e-07, + "logits/chosen": -1.3710880279541016, + "logits/rejected": -1.218766450881958, + "logps/chosen": -344.7862548828125, + "logps/rejected": -453.5824279785156, + "loss": 0.0435, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8545820713043213, + "rewards/margins": 7.798386573791504, + "rewards/rejected": -9.652968406677246, + "step": 570 + }, + { + "epoch": 0.37, + "learning_rate": 3.1049250535331905e-07, + "logits/chosen": -1.461828589439392, + "logits/rejected": -1.0271979570388794, + "logps/chosen": -387.4298095703125, + "logps/rejected": -435.98663330078125, + "loss": 0.0867, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7421836853027344, + "rewards/margins": 7.6707444190979, + "rewards/rejected": -9.412927627563477, + "step": 580 + }, + { + "epoch": 0.38, + "learning_rate": 3.1584582441113487e-07, + "logits/chosen": -1.3038650751113892, + "logits/rejected": -1.07460618019104, + "logps/chosen": -324.758544921875, + "logps/rejected": -503.78387451171875, + "loss": 0.0716, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3957910537719727, + "rewards/margins": 8.855663299560547, + "rewards/rejected": -11.25145435333252, + "step": 590 + }, + { + "epoch": 0.39, + "learning_rate": 3.211991434689507e-07, + "logits/chosen": -1.3503267765045166, + "logits/rejected": -1.0638809204101562, + "logps/chosen": -374.38348388671875, + "logps/rejected": -516.7820434570312, + "loss": 0.0767, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.1524558067321777, + "rewards/margins": 8.407071113586426, + "rewards/rejected": -10.559527397155762, + "step": 600 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -1.4297420978546143, + "eval_logits/rejected": -1.150868535041809, + "eval_logps/chosen": -353.238037109375, + "eval_logps/rejected": -407.5989990234375, + "eval_loss": 0.13097576797008514, + "eval_rewards/accuracies": 0.8984375, + "eval_rewards/chosen": -2.387495994567871, + "eval_rewards/margins": 6.737332820892334, + "eval_rewards/rejected": -9.124829292297363, + "eval_runtime": 76.6752, + "eval_samples_per_second": 13.042, + "eval_steps_per_second": 0.417, + "step": 600 + }, + { + "epoch": 0.39, + "learning_rate": 3.265524625267666e-07, + "logits/chosen": -1.1903326511383057, + "logits/rejected": -0.9525307416915894, + "logps/chosen": -419.54608154296875, + "logps/rejected": -444.11383056640625, + "loss": 0.0831, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.369199752807617, + "rewards/margins": 8.518608093261719, + "rewards/rejected": -10.887807846069336, + "step": 610 + }, + { + "epoch": 0.4, + "learning_rate": 3.3190578158458244e-07, + "logits/chosen": -1.3438398838043213, + "logits/rejected": -1.143065333366394, + "logps/chosen": -369.13970947265625, + "logps/rejected": -439.51715087890625, + "loss": 0.0779, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.6147950887680054, + "rewards/margins": 7.42797327041626, + "rewards/rejected": -9.042768478393555, + "step": 620 + }, + { + "epoch": 0.4, + "learning_rate": 3.3725910064239827e-07, + "logits/chosen": -1.4843438863754272, + "logits/rejected": -1.0890620946884155, + "logps/chosen": -409.73583984375, + "logps/rejected": -517.4986572265625, + "loss": 0.0703, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1902521848678589, + "rewards/margins": 9.822367668151855, + "rewards/rejected": -11.01262092590332, + "step": 630 + }, + { + "epoch": 0.41, + "learning_rate": 3.426124197002141e-07, + "logits/chosen": -1.3501672744750977, + "logits/rejected": -1.0996173620224, + "logps/chosen": -360.51531982421875, + "logps/rejected": -489.69061279296875, + "loss": 0.0968, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.6506973505020142, + "rewards/margins": 8.347593307495117, + "rewards/rejected": -9.998289108276367, + "step": 640 + }, + { + "epoch": 0.42, + "learning_rate": 3.4796573875802996e-07, + "logits/chosen": -1.2105796337127686, + "logits/rejected": -0.9770170450210571, + "logps/chosen": -335.6686096191406, + "logps/rejected": -449.52447509765625, + "loss": 0.0659, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.2099552154541016, + "rewards/margins": 7.764392852783203, + "rewards/rejected": -9.974349021911621, + "step": 650 + }, + { + "epoch": 0.42, + "learning_rate": 3.533190578158458e-07, + "logits/chosen": -1.4429116249084473, + "logits/rejected": -1.06112539768219, + "logps/chosen": -420.8074645996094, + "logps/rejected": -442.88568115234375, + "loss": 0.0967, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.681353211402893, + "rewards/margins": 7.297033786773682, + "rewards/rejected": -8.978387832641602, + "step": 660 + }, + { + "epoch": 0.43, + "learning_rate": 3.5867237687366166e-07, + "logits/chosen": -1.4578073024749756, + "logits/rejected": -1.1529022455215454, + "logps/chosen": -362.67230224609375, + "logps/rejected": -455.08367919921875, + "loss": 0.0694, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.878594994544983, + "rewards/margins": 8.333452224731445, + "rewards/rejected": -10.212045669555664, + "step": 670 + }, + { + "epoch": 0.44, + "learning_rate": 3.640256959314775e-07, + "logits/chosen": -1.34915292263031, + "logits/rejected": -1.125140905380249, + "logps/chosen": -364.0298156738281, + "logps/rejected": -492.75909423828125, + "loss": 0.0744, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.043875217437744, + "rewards/margins": 9.902002334594727, + "rewards/rejected": -11.945877075195312, + "step": 680 + }, + { + "epoch": 0.44, + "learning_rate": 3.6937901498929336e-07, + "logits/chosen": -1.2846548557281494, + "logits/rejected": -0.9907848238945007, + "logps/chosen": -423.0797424316406, + "logps/rejected": -499.71142578125, + "loss": 0.0603, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9811477661132812, + "rewards/margins": 9.256712913513184, + "rewards/rejected": -12.237860679626465, + "step": 690 + }, + { + "epoch": 0.45, + "learning_rate": 3.747323340471092e-07, + "logits/chosen": -1.3538507223129272, + "logits/rejected": -1.1379244327545166, + "logps/chosen": -329.5379638671875, + "logps/rejected": -413.8023986816406, + "loss": 0.0759, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.2614781856536865, + "rewards/margins": 8.729515075683594, + "rewards/rejected": -10.990991592407227, + "step": 700 + }, + { + "epoch": 0.45, + "eval_logits/chosen": -1.4886287450790405, + "eval_logits/rejected": -1.2180323600769043, + "eval_logps/chosen": -348.9230041503906, + "eval_logps/rejected": -409.6312255859375, + "eval_loss": 0.12044651806354523, + "eval_rewards/accuracies": 0.9296875, + "eval_rewards/chosen": -1.9559952020645142, + "eval_rewards/margins": 7.372057914733887, + "eval_rewards/rejected": -9.328052520751953, + "eval_runtime": 76.578, + "eval_samples_per_second": 13.059, + "eval_steps_per_second": 0.418, + "step": 700 + }, + { + "epoch": 0.46, + "learning_rate": 3.80085653104925e-07, + "logits/chosen": -1.3313727378845215, + "logits/rejected": -0.9123377799987793, + "logps/chosen": -442.8067321777344, + "logps/rejected": -468.52984619140625, + "loss": 0.0905, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6124906539916992, + "rewards/margins": 8.923405647277832, + "rewards/rejected": -10.535896301269531, + "step": 710 + }, + { + "epoch": 0.46, + "learning_rate": 3.854389721627409e-07, + "logits/chosen": -1.4308016300201416, + "logits/rejected": -1.0905249118804932, + "logps/chosen": -378.02545166015625, + "logps/rejected": -479.7505798339844, + "loss": 0.0561, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.7049999237060547, + "rewards/margins": 8.536346435546875, + "rewards/rejected": -10.241347312927246, + "step": 720 + }, + { + "epoch": 0.47, + "learning_rate": 3.9079229122055676e-07, + "logits/chosen": -1.3243951797485352, + "logits/rejected": -0.9832109212875366, + "logps/chosen": -431.05645751953125, + "logps/rejected": -432.52886962890625, + "loss": 0.0581, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.067716121673584, + "rewards/margins": 8.114839553833008, + "rewards/rejected": -10.18255615234375, + "step": 730 + }, + { + "epoch": 0.48, + "learning_rate": 3.961456102783726e-07, + "logits/chosen": -1.258172869682312, + "logits/rejected": -1.0175604820251465, + "logps/chosen": -404.3645935058594, + "logps/rejected": -519.955322265625, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6845811605453491, + "rewards/margins": 10.622332572937012, + "rewards/rejected": -12.306914329528809, + "step": 740 + }, + { + "epoch": 0.48, + "learning_rate": 4.014989293361884e-07, + "logits/chosen": -1.28511381149292, + "logits/rejected": -1.0871832370758057, + "logps/chosen": -378.40478515625, + "logps/rejected": -460.6148376464844, + "loss": 0.0669, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.8259923458099365, + "rewards/margins": 8.321816444396973, + "rewards/rejected": -10.147809982299805, + "step": 750 + }, + { + "epoch": 0.49, + "learning_rate": 4.068522483940043e-07, + "logits/chosen": -1.5064611434936523, + "logits/rejected": -1.0779759883880615, + "logps/chosen": -371.971435546875, + "logps/rejected": -416.6922302246094, + "loss": 0.0762, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.135917067527771, + "rewards/margins": 10.22150993347168, + "rewards/rejected": -11.357427597045898, + "step": 760 + }, + { + "epoch": 0.49, + "learning_rate": 4.122055674518201e-07, + "logits/chosen": -1.429022192955017, + "logits/rejected": -1.0790882110595703, + "logps/chosen": -394.6370544433594, + "logps/rejected": -461.9344787597656, + "loss": 0.0662, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.259974956512451, + "rewards/margins": 9.807271003723145, + "rewards/rejected": -12.067245483398438, + "step": 770 + }, + { + "epoch": 0.5, + "learning_rate": 4.175588865096359e-07, + "logits/chosen": -1.296064853668213, + "logits/rejected": -0.9478403925895691, + "logps/chosen": -376.76727294921875, + "logps/rejected": -470.43743896484375, + "loss": 0.0513, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.549682378768921, + "rewards/margins": 9.523083686828613, + "rewards/rejected": -11.072766304016113, + "step": 780 + }, + { + "epoch": 0.51, + "learning_rate": 4.2291220556745175e-07, + "logits/chosen": -1.309073567390442, + "logits/rejected": -0.8560987710952759, + "logps/chosen": -457.4974670410156, + "logps/rejected": -528.8920288085938, + "loss": 0.0616, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.5288264751434326, + "rewards/margins": 9.966497421264648, + "rewards/rejected": -12.495325088500977, + "step": 790 + }, + { + "epoch": 0.51, + "learning_rate": 4.282655246252677e-07, + "logits/chosen": -1.2481553554534912, + "logits/rejected": -0.764441728591919, + "logps/chosen": -306.9283142089844, + "logps/rejected": -417.22491455078125, + "loss": 0.0681, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.760056734085083, + "rewards/margins": 10.229304313659668, + "rewards/rejected": -11.989361763000488, + "step": 800 + }, + { + "epoch": 0.51, + "eval_logits/chosen": -1.4168956279754639, + "eval_logits/rejected": -1.0999643802642822, + "eval_logps/chosen": -350.21661376953125, + "eval_logps/rejected": -414.49041748046875, + "eval_loss": 0.1091269925236702, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": -2.085355281829834, + "eval_rewards/margins": 7.728612422943115, + "eval_rewards/rejected": -9.81396770477295, + "eval_runtime": 76.655, + "eval_samples_per_second": 13.045, + "eval_steps_per_second": 0.417, + "step": 800 + }, + { + "epoch": 0.52, + "learning_rate": 4.336188436830835e-07, + "logits/chosen": -1.092874526977539, + "logits/rejected": -0.8753455281257629, + "logps/chosen": -367.61846923828125, + "logps/rejected": -431.23834228515625, + "loss": 0.0496, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.793146848678589, + "rewards/margins": 8.786299705505371, + "rewards/rejected": -11.579446792602539, + "step": 810 + }, + { + "epoch": 0.53, + "learning_rate": 4.389721627408993e-07, + "logits/chosen": -1.02623450756073, + "logits/rejected": -0.5294391512870789, + "logps/chosen": -348.81463623046875, + "logps/rejected": -391.2160339355469, + "loss": 0.0634, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.466538906097412, + "rewards/margins": 9.582557678222656, + "rewards/rejected": -12.049097061157227, + "step": 820 + }, + { + "epoch": 0.53, + "learning_rate": 4.443254817987152e-07, + "logits/chosen": -1.1547324657440186, + "logits/rejected": -0.6096884608268738, + "logps/chosen": -426.49798583984375, + "logps/rejected": -467.9422302246094, + "loss": 0.072, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5411438941955566, + "rewards/margins": 9.401772499084473, + "rewards/rejected": -11.942916870117188, + "step": 830 + }, + { + "epoch": 0.54, + "learning_rate": 4.49678800856531e-07, + "logits/chosen": -1.247434377670288, + "logits/rejected": -0.7016826868057251, + "logps/chosen": -393.85430908203125, + "logps/rejected": -482.9002990722656, + "loss": 0.0467, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.051135301589966, + "rewards/margins": 10.815892219543457, + "rewards/rejected": -12.867027282714844, + "step": 840 + }, + { + "epoch": 0.55, + "learning_rate": 4.5503211991434684e-07, + "logits/chosen": -1.2653485536575317, + "logits/rejected": -0.7517842054367065, + "logps/chosen": -393.53643798828125, + "logps/rejected": -458.5020446777344, + "loss": 0.0962, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8538535833358765, + "rewards/margins": 9.80111312866211, + "rewards/rejected": -11.654967308044434, + "step": 850 + }, + { + "epoch": 0.55, + "learning_rate": 4.603854389721627e-07, + "logits/chosen": -1.0140388011932373, + "logits/rejected": -0.5941162109375, + "logps/chosen": -431.3036193847656, + "logps/rejected": -522.5137939453125, + "loss": 0.073, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.430864095687866, + "rewards/margins": 10.6370267868042, + "rewards/rejected": -13.067891120910645, + "step": 860 + }, + { + "epoch": 0.56, + "learning_rate": 4.657387580299786e-07, + "logits/chosen": -0.8724175691604614, + "logits/rejected": -0.424823522567749, + "logps/chosen": -405.14544677734375, + "logps/rejected": -458.218994140625, + "loss": 0.0896, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2016243934631348, + "rewards/margins": 10.069561958312988, + "rewards/rejected": -13.271186828613281, + "step": 870 + }, + { + "epoch": 0.57, + "learning_rate": 4.710920770877944e-07, + "logits/chosen": -0.9462097883224487, + "logits/rejected": -0.5895959138870239, + "logps/chosen": -384.04046630859375, + "logps/rejected": -430.6656188964844, + "loss": 0.0703, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.604841947555542, + "rewards/margins": 8.405369758605957, + "rewards/rejected": -11.010213851928711, + "step": 880 + }, + { + "epoch": 0.57, + "learning_rate": 4.7644539614561024e-07, + "logits/chosen": -1.0459139347076416, + "logits/rejected": -0.5180732011795044, + "logps/chosen": -349.8458557128906, + "logps/rejected": -463.24993896484375, + "loss": 0.068, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.0111186504364014, + "rewards/margins": 10.823290824890137, + "rewards/rejected": -12.8344087600708, + "step": 890 + }, + { + "epoch": 0.58, + "learning_rate": 4.817987152034261e-07, + "logits/chosen": -1.0149575471878052, + "logits/rejected": -0.3724205493927002, + "logps/chosen": -396.1781311035156, + "logps/rejected": -512.572265625, + "loss": 0.0494, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9190399646759033, + "rewards/margins": 12.252616882324219, + "rewards/rejected": -15.171656608581543, + "step": 900 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -1.1149048805236816, + "eval_logits/rejected": -0.723557710647583, + "eval_logps/chosen": -371.6729736328125, + "eval_logps/rejected": -447.3422546386719, + "eval_loss": 0.14773131906986237, + "eval_rewards/accuracies": 0.890625, + "eval_rewards/chosen": -4.230995178222656, + "eval_rewards/margins": 8.868158340454102, + "eval_rewards/rejected": -13.099154472351074, + "eval_runtime": 76.5391, + "eval_samples_per_second": 13.065, + "eval_steps_per_second": 0.418, + "step": 900 + }, + { + "epoch": 0.58, + "learning_rate": 4.871520342612419e-07, + "logits/chosen": -1.0406441688537598, + "logits/rejected": -0.7672456502914429, + "logps/chosen": -367.68988037109375, + "logps/rejected": -497.24542236328125, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.375262498855591, + "rewards/margins": 11.108481407165527, + "rewards/rejected": -13.483744621276855, + "step": 910 + }, + { + "epoch": 0.59, + "learning_rate": 4.925053533190578e-07, + "logits/chosen": -1.3684611320495605, + "logits/rejected": -0.7619699239730835, + "logps/chosen": -395.2074279785156, + "logps/rejected": -435.02490234375, + "loss": 0.0771, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.186323642730713, + "rewards/margins": 9.919301986694336, + "rewards/rejected": -12.10562515258789, + "step": 920 + }, + { + "epoch": 0.6, + "learning_rate": 4.978586723768736e-07, + "logits/chosen": -1.1214783191680908, + "logits/rejected": -0.5191805362701416, + "logps/chosen": -415.1065979003906, + "logps/rejected": -468.7010192871094, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4811503887176514, + "rewards/margins": 9.906661987304688, + "rewards/rejected": -13.387812614440918, + "step": 930 + }, + { + "epoch": 0.6, + "learning_rate": 4.996429421566293e-07, + "logits/chosen": -1.2635540962219238, + "logits/rejected": -0.628160834312439, + "logps/chosen": -416.54644775390625, + "logps/rejected": -465.719970703125, + "loss": 0.1156, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1887431144714355, + "rewards/margins": 9.464393615722656, + "rewards/rejected": -12.65313720703125, + "step": 940 + }, + { + "epoch": 0.61, + "learning_rate": 4.990478457510116e-07, + "logits/chosen": -1.3375509977340698, + "logits/rejected": -0.5667593479156494, + "logps/chosen": -430.92742919921875, + "logps/rejected": -532.2154541015625, + "loss": 0.0961, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9553847312927246, + "rewards/margins": 12.077500343322754, + "rewards/rejected": -15.03288745880127, + "step": 950 + }, + { + "epoch": 0.62, + "learning_rate": 4.98452749345394e-07, + "logits/chosen": -1.117200493812561, + "logits/rejected": -0.6453494429588318, + "logps/chosen": -424.7369689941406, + "logps/rejected": -452.61749267578125, + "loss": 0.0766, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.5312678813934326, + "rewards/margins": 10.04582405090332, + "rewards/rejected": -13.577092170715332, + "step": 960 + }, + { + "epoch": 0.62, + "learning_rate": 4.978576529397762e-07, + "logits/chosen": -1.1340444087982178, + "logits/rejected": -0.656354546546936, + "logps/chosen": -389.5958557128906, + "logps/rejected": -524.0951538085938, + "loss": 0.0653, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.064024448394775, + "rewards/margins": 11.442460060119629, + "rewards/rejected": -16.506484985351562, + "step": 970 + }, + { + "epoch": 0.63, + "learning_rate": 4.972625565341585e-07, + "logits/chosen": -1.2187734842300415, + "logits/rejected": -0.6018660664558411, + "logps/chosen": -386.0228271484375, + "logps/rejected": -454.13140869140625, + "loss": 0.0838, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.2799274921417236, + "rewards/margins": 9.308359146118164, + "rewards/rejected": -12.588286399841309, + "step": 980 + }, + { + "epoch": 0.64, + "learning_rate": 4.966674601285408e-07, + "logits/chosen": -1.111574411392212, + "logits/rejected": -0.7768339514732361, + "logps/chosen": -366.3212585449219, + "logps/rejected": -501.3627014160156, + "loss": 0.0817, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8082051277160645, + "rewards/margins": 10.757050514221191, + "rewards/rejected": -13.565256118774414, + "step": 990 + }, + { + "epoch": 0.64, + "learning_rate": 4.960723637229232e-07, + "logits/chosen": -0.855319619178772, + "logits/rejected": -0.4498973786830902, + "logps/chosen": -356.251708984375, + "logps/rejected": -479.78094482421875, + "loss": 0.0735, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.355088710784912, + "rewards/margins": 9.392342567443848, + "rewards/rejected": -13.747430801391602, + "step": 1000 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -1.069393277168274, + "eval_logits/rejected": -0.698665976524353, + "eval_logps/chosen": -367.884033203125, + "eval_logps/rejected": -449.0148620605469, + "eval_loss": 0.12433216720819473, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": -3.8520963191986084, + "eval_rewards/margins": 9.414312362670898, + "eval_rewards/rejected": -13.26640796661377, + "eval_runtime": 76.7244, + "eval_samples_per_second": 13.034, + "eval_steps_per_second": 0.417, + "step": 1000 + }, + { + "epoch": 0.65, + "learning_rate": 4.954772673173054e-07, + "logits/chosen": -0.8880437612533569, + "logits/rejected": -0.5571062564849854, + "logps/chosen": -427.5099182128906, + "logps/rejected": -511.27398681640625, + "loss": 0.1211, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.051557779312134, + "rewards/margins": 10.788399696350098, + "rewards/rejected": -13.839956283569336, + "step": 1010 + }, + { + "epoch": 0.66, + "learning_rate": 4.948821709116876e-07, + "logits/chosen": -1.3221044540405273, + "logits/rejected": -0.6521639823913574, + "logps/chosen": -380.6373596191406, + "logps/rejected": -449.31817626953125, + "loss": 0.0824, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.038402795791626, + "rewards/margins": 11.031317710876465, + "rewards/rejected": -13.069720268249512, + "step": 1020 + }, + { + "epoch": 0.66, + "learning_rate": 4.9428707450607e-07, + "logits/chosen": -1.1821175813674927, + "logits/rejected": -0.7060034871101379, + "logps/chosen": -366.3950500488281, + "logps/rejected": -507.5702209472656, + "loss": 0.078, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.4800446033477783, + "rewards/margins": 10.978177070617676, + "rewards/rejected": -14.458221435546875, + "step": 1030 + }, + { + "epoch": 0.67, + "learning_rate": 4.936919781004522e-07, + "logits/chosen": -1.2339891195297241, + "logits/rejected": -0.8903535604476929, + "logps/chosen": -411.30615234375, + "logps/rejected": -553.6360473632812, + "loss": 0.0829, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.9073569774627686, + "rewards/margins": 10.883973121643066, + "rewards/rejected": -14.79133129119873, + "step": 1040 + }, + { + "epoch": 0.67, + "learning_rate": 4.930968816948346e-07, + "logits/chosen": -1.3873369693756104, + "logits/rejected": -0.8341131210327148, + "logps/chosen": -424.88543701171875, + "logps/rejected": -477.947021484375, + "loss": 0.0747, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.1361687183380127, + "rewards/margins": 11.415266036987305, + "rewards/rejected": -13.551434516906738, + "step": 1050 + }, + { + "epoch": 0.68, + "learning_rate": 4.925017852892168e-07, + "logits/chosen": -1.1863329410552979, + "logits/rejected": -0.6122242212295532, + "logps/chosen": -400.68865966796875, + "logps/rejected": -515.4220581054688, + "loss": 0.0844, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.35847544670105, + "rewards/margins": 13.144007682800293, + "rewards/rejected": -15.502484321594238, + "step": 1060 + }, + { + "epoch": 0.69, + "learning_rate": 4.919066888835991e-07, + "logits/chosen": -1.1419920921325684, + "logits/rejected": -0.5117210149765015, + "logps/chosen": -370.2403564453125, + "logps/rejected": -593.1256103515625, + "loss": 0.0685, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.8190958499908447, + "rewards/margins": 14.543069839477539, + "rewards/rejected": -17.362167358398438, + "step": 1070 + }, + { + "epoch": 0.69, + "learning_rate": 4.913115924779814e-07, + "logits/chosen": -1.1333223581314087, + "logits/rejected": -0.5887588858604431, + "logps/chosen": -438.794677734375, + "logps/rejected": -556.837890625, + "loss": 0.0702, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.482754945755005, + "rewards/margins": 12.69272232055664, + "rewards/rejected": -16.175479888916016, + "step": 1080 + }, + { + "epoch": 0.7, + "learning_rate": 4.907164960723638e-07, + "logits/chosen": -1.0477075576782227, + "logits/rejected": -0.5950930714607239, + "logps/chosen": -385.4281005859375, + "logps/rejected": -448.23211669921875, + "loss": 0.0694, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.315608501434326, + "rewards/margins": 10.84793472290039, + "rewards/rejected": -15.163541793823242, + "step": 1090 + }, + { + "epoch": 0.71, + "learning_rate": 4.90121399666746e-07, + "logits/chosen": -0.8860788345336914, + "logits/rejected": -0.4812610149383545, + "logps/chosen": -356.5136413574219, + "logps/rejected": -509.6454162597656, + "loss": 0.0806, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.0979533195495605, + "rewards/margins": 11.64047908782959, + "rewards/rejected": -15.738431930541992, + "step": 1100 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -1.1228324174880981, + "eval_logits/rejected": -0.7543247938156128, + "eval_logps/chosen": -354.41119384765625, + "eval_logps/rejected": -438.4850158691406, + "eval_loss": 0.11537463963031769, + "eval_rewards/accuracies": 0.9765625, + "eval_rewards/chosen": -2.5048139095306396, + "eval_rewards/margins": 9.708612442016602, + "eval_rewards/rejected": -12.213427543640137, + "eval_runtime": 76.7376, + "eval_samples_per_second": 13.031, + "eval_steps_per_second": 0.417, + "step": 1100 + }, + { + "epoch": 0.71, + "learning_rate": 4.895263032611282e-07, + "logits/chosen": -1.2738934755325317, + "logits/rejected": -0.6076444387435913, + "logps/chosen": -452.056396484375, + "logps/rejected": -505.551513671875, + "loss": 0.0556, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.597477674484253, + "rewards/margins": 11.379976272583008, + "rewards/rejected": -14.977453231811523, + "step": 1110 + }, + { + "epoch": 0.72, + "learning_rate": 4.889312068555106e-07, + "logits/chosen": -1.174073576927185, + "logits/rejected": -0.7190831303596497, + "logps/chosen": -400.78656005859375, + "logps/rejected": -528.5311889648438, + "loss": 0.0635, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.351175308227539, + "rewards/margins": 11.898660659790039, + "rewards/rejected": -15.249837875366211, + "step": 1120 + }, + { + "epoch": 0.73, + "learning_rate": 4.883361104498928e-07, + "logits/chosen": -1.2661216259002686, + "logits/rejected": -0.7576111555099487, + "logps/chosen": -395.97833251953125, + "logps/rejected": -508.68719482421875, + "loss": 0.0564, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6894619464874268, + "rewards/margins": 13.860685348510742, + "rewards/rejected": -17.550146102905273, + "step": 1130 + }, + { + "epoch": 0.73, + "learning_rate": 4.877410140442752e-07, + "logits/chosen": -1.3575642108917236, + "logits/rejected": -0.8006072044372559, + "logps/chosen": -423.7298889160156, + "logps/rejected": -535.811767578125, + "loss": 0.0727, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.8236665725708008, + "rewards/margins": 12.405186653137207, + "rewards/rejected": -14.228856086730957, + "step": 1140 + }, + { + "epoch": 0.74, + "learning_rate": 4.871459176386574e-07, + "logits/chosen": -1.1852174997329712, + "logits/rejected": -0.7334953546524048, + "logps/chosen": -447.42724609375, + "logps/rejected": -565.5848388671875, + "loss": 0.0614, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.648531436920166, + "rewards/margins": 13.974637031555176, + "rewards/rejected": -17.6231689453125, + "step": 1150 + }, + { + "epoch": 0.75, + "learning_rate": 4.865508212330398e-07, + "logits/chosen": -1.1278337240219116, + "logits/rejected": -0.5487757325172424, + "logps/chosen": -394.2236022949219, + "logps/rejected": -564.2678833007812, + "loss": 0.05, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.3238348960876465, + "rewards/margins": 13.824040412902832, + "rewards/rejected": -16.147876739501953, + "step": 1160 + }, + { + "epoch": 0.75, + "learning_rate": 4.85955724827422e-07, + "logits/chosen": -0.7651220560073853, + "logits/rejected": -0.06668927520513535, + "logps/chosen": -444.3988342285156, + "logps/rejected": -548.660888671875, + "loss": 0.0583, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.920553684234619, + "rewards/margins": 13.667207717895508, + "rewards/rejected": -16.5877628326416, + "step": 1170 + }, + { + "epoch": 0.76, + "learning_rate": 4.853606284218044e-07, + "logits/chosen": -1.0830278396606445, + "logits/rejected": -0.4501362442970276, + "logps/chosen": -332.0751037597656, + "logps/rejected": -418.5309143066406, + "loss": 0.0789, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7456166744232178, + "rewards/margins": 11.250526428222656, + "rewards/rejected": -12.996142387390137, + "step": 1180 + }, + { + "epoch": 0.76, + "learning_rate": 4.847655320161866e-07, + "logits/chosen": -0.6781784296035767, + "logits/rejected": -0.27196237444877625, + "logps/chosen": -396.753662109375, + "logps/rejected": -491.14892578125, + "loss": 0.0668, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.8804516792297363, + "rewards/margins": 11.853898048400879, + "rewards/rejected": -14.734350204467773, + "step": 1190 + }, + { + "epoch": 0.77, + "learning_rate": 4.841704356105689e-07, + "logits/chosen": -0.7234100103378296, + "logits/rejected": -0.2980794310569763, + "logps/chosen": -367.587158203125, + "logps/rejected": -433.124755859375, + "loss": 0.0822, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.162593364715576, + "rewards/margins": 10.242734909057617, + "rewards/rejected": -13.405328750610352, + "step": 1200 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -1.0001459121704102, + "eval_logits/rejected": -0.6194829940795898, + "eval_logps/chosen": -365.28955078125, + "eval_logps/rejected": -458.57135009765625, + "eval_loss": 0.13022372126579285, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": -3.5926475524902344, + "eval_rewards/margins": 10.629414558410645, + "eval_rewards/rejected": -14.222061157226562, + "eval_runtime": 76.5043, + "eval_samples_per_second": 13.071, + "eval_steps_per_second": 0.418, + "step": 1200 + }, + { + "epoch": 0.78, + "learning_rate": 4.835753392049512e-07, + "logits/chosen": -0.6337307691574097, + "logits/rejected": -0.23596186935901642, + "logps/chosen": -356.19122314453125, + "logps/rejected": -514.327392578125, + "loss": 0.1085, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.958400249481201, + "rewards/margins": 13.92591667175293, + "rewards/rejected": -17.884319305419922, + "step": 1210 + }, + { + "epoch": 0.78, + "learning_rate": 4.829802427993334e-07, + "logits/chosen": -0.6189112663269043, + "logits/rejected": -0.34567025303840637, + "logps/chosen": -374.00286865234375, + "logps/rejected": -474.2106018066406, + "loss": 0.1089, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2338080406188965, + "rewards/margins": 9.657389640808105, + "rewards/rejected": -12.891199111938477, + "step": 1220 + }, + { + "epoch": 0.79, + "learning_rate": 4.823851463937158e-07, + "logits/chosen": -0.3865962028503418, + "logits/rejected": 0.04075580835342407, + "logps/chosen": -359.3363342285156, + "logps/rejected": -501.40240478515625, + "loss": 0.1142, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.306814193725586, + "rewards/margins": 13.71052074432373, + "rewards/rejected": -18.017335891723633, + "step": 1230 + }, + { + "epoch": 0.8, + "learning_rate": 4.81790049988098e-07, + "logits/chosen": -0.7158625721931458, + "logits/rejected": -0.21917279064655304, + "logps/chosen": -434.91253662109375, + "logps/rejected": -528.3048095703125, + "loss": 0.0757, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.414928913116455, + "rewards/margins": 12.52367115020752, + "rewards/rejected": -15.938600540161133, + "step": 1240 + }, + { + "epoch": 0.8, + "learning_rate": 4.811949535824804e-07, + "logits/chosen": -0.7639582753181458, + "logits/rejected": -0.4316403269767761, + "logps/chosen": -365.72784423828125, + "logps/rejected": -567.1033935546875, + "loss": 0.084, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.063331127166748, + "rewards/margins": 12.180209159851074, + "rewards/rejected": -15.243539810180664, + "step": 1250 + }, + { + "epoch": 0.81, + "learning_rate": 4.805998571768626e-07, + "logits/chosen": -1.2161794900894165, + "logits/rejected": -0.6932582259178162, + "logps/chosen": -406.9449768066406, + "logps/rejected": -520.5137939453125, + "loss": 0.0951, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.868889331817627, + "rewards/margins": 10.655603408813477, + "rewards/rejected": -14.524490356445312, + "step": 1260 + }, + { + "epoch": 0.82, + "learning_rate": 4.80004760771245e-07, + "logits/chosen": -0.9595744013786316, + "logits/rejected": -0.43417948484420776, + "logps/chosen": -411.44677734375, + "logps/rejected": -526.41845703125, + "loss": 0.0691, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.0373916625976562, + "rewards/margins": 9.937090873718262, + "rewards/rejected": -12.97448444366455, + "step": 1270 + }, + { + "epoch": 0.82, + "learning_rate": 4.794096643656272e-07, + "logits/chosen": -1.0694881677627563, + "logits/rejected": -0.5757830142974854, + "logps/chosen": -382.9389343261719, + "logps/rejected": -500.0220642089844, + "loss": 0.0743, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.738356113433838, + "rewards/margins": 10.426973342895508, + "rewards/rejected": -14.165328979492188, + "step": 1280 + }, + { + "epoch": 0.83, + "learning_rate": 4.788145679600095e-07, + "logits/chosen": -0.9496662020683289, + "logits/rejected": -0.3601847290992737, + "logps/chosen": -358.3319396972656, + "logps/rejected": -560.3468627929688, + "loss": 0.0742, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.929766893386841, + "rewards/margins": 13.46239185333252, + "rewards/rejected": -16.39215660095215, + "step": 1290 + }, + { + "epoch": 0.84, + "learning_rate": 4.782194715543918e-07, + "logits/chosen": -0.6319989562034607, + "logits/rejected": -0.09417597949504852, + "logps/chosen": -379.7413024902344, + "logps/rejected": -522.1488037109375, + "loss": 0.063, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.54395055770874, + "rewards/margins": 13.589025497436523, + "rewards/rejected": -18.132978439331055, + "step": 1300 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -0.9766608476638794, + "eval_logits/rejected": -0.467482328414917, + "eval_logps/chosen": -374.5415954589844, + "eval_logps/rejected": -472.6826477050781, + "eval_loss": 0.18041668832302094, + "eval_rewards/accuracies": 0.9375, + "eval_rewards/chosen": -4.517853260040283, + "eval_rewards/margins": 11.115335464477539, + "eval_rewards/rejected": -15.633190155029297, + "eval_runtime": 76.7717, + "eval_samples_per_second": 13.026, + "eval_steps_per_second": 0.417, + "step": 1300 + }, + { + "epoch": 0.84, + "learning_rate": 4.77624375148774e-07, + "logits/chosen": -0.6732112169265747, + "logits/rejected": 0.014696260914206505, + "logps/chosen": -417.83074951171875, + "logps/rejected": -488.5224609375, + "loss": 0.0634, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.793654441833496, + "rewards/margins": 13.286532402038574, + "rewards/rejected": -18.08018684387207, + "step": 1310 + }, + { + "epoch": 0.85, + "learning_rate": 4.770292787431564e-07, + "logits/chosen": -0.3928489685058594, + "logits/rejected": -0.07063998281955719, + "logps/chosen": -365.6458435058594, + "logps/rejected": -472.38043212890625, + "loss": 0.0885, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.527663707733154, + "rewards/margins": 11.80773639678955, + "rewards/rejected": -16.335399627685547, + "step": 1320 + }, + { + "epoch": 0.85, + "learning_rate": 4.764341823375387e-07, + "logits/chosen": -0.5924497842788696, + "logits/rejected": 0.10169048607349396, + "logps/chosen": -394.43292236328125, + "logps/rejected": -514.439697265625, + "loss": 0.0762, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.806927442550659, + "rewards/margins": 12.637249946594238, + "rewards/rejected": -16.444177627563477, + "step": 1330 + }, + { + "epoch": 0.86, + "learning_rate": 4.7583908593192097e-07, + "logits/chosen": -0.6815664172172546, + "logits/rejected": -0.29060396552085876, + "logps/chosen": -420.83221435546875, + "logps/rejected": -551.6920776367188, + "loss": 0.0532, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1024906635284424, + "rewards/margins": 13.700825691223145, + "rewards/rejected": -16.80331802368164, + "step": 1340 + }, + { + "epoch": 0.87, + "learning_rate": 4.752439895263032e-07, + "logits/chosen": -0.689470112323761, + "logits/rejected": -0.20767569541931152, + "logps/chosen": -385.32037353515625, + "logps/rejected": -533.0687255859375, + "loss": 0.0878, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.7341361045837402, + "rewards/margins": 14.102251052856445, + "rewards/rejected": -17.83638572692871, + "step": 1350 + }, + { + "epoch": 0.87, + "learning_rate": 4.746488931206855e-07, + "logits/chosen": -0.7405373454093933, + "logits/rejected": -0.07327382266521454, + "logps/chosen": -377.8034973144531, + "logps/rejected": -466.72784423828125, + "loss": 0.0851, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.5931084156036377, + "rewards/margins": 12.507670402526855, + "rewards/rejected": -16.100778579711914, + "step": 1360 + }, + { + "epoch": 0.88, + "learning_rate": 4.7405379671506785e-07, + "logits/chosen": -0.5811715126037598, + "logits/rejected": -0.06195932626724243, + "logps/chosen": -411.6004943847656, + "logps/rejected": -548.6055908203125, + "loss": 0.1093, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.2639055252075195, + "rewards/margins": 11.680307388305664, + "rewards/rejected": -15.944211959838867, + "step": 1370 + }, + { + "epoch": 0.89, + "learning_rate": 4.734587003094501e-07, + "logits/chosen": -0.8370053172111511, + "logits/rejected": -0.14809687435626984, + "logps/chosen": -447.0265197753906, + "logps/rejected": -535.3525390625, + "loss": 0.081, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.550225734710693, + "rewards/margins": 12.257158279418945, + "rewards/rejected": -16.807382583618164, + "step": 1380 + }, + { + "epoch": 0.89, + "learning_rate": 4.728636039038324e-07, + "logits/chosen": -0.7418749928474426, + "logits/rejected": -0.25607532262802124, + "logps/chosen": -374.479248046875, + "logps/rejected": -532.3735961914062, + "loss": 0.0738, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.4016432762146, + "rewards/margins": 13.285806655883789, + "rewards/rejected": -18.687450408935547, + "step": 1390 + }, + { + "epoch": 0.9, + "learning_rate": 4.722685074982147e-07, + "logits/chosen": -0.8056036233901978, + "logits/rejected": -0.2640933394432068, + "logps/chosen": -372.2264709472656, + "logps/rejected": -565.7047729492188, + "loss": 0.0648, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.8431191444396973, + "rewards/margins": 13.19017505645752, + "rewards/rejected": -17.033292770385742, + "step": 1400 + }, + { + "epoch": 0.9, + "eval_logits/chosen": -1.118211030960083, + "eval_logits/rejected": -0.5943832397460938, + "eval_logps/chosen": -354.5495300292969, + "eval_logps/rejected": -446.5892333984375, + "eval_loss": 0.13570235669612885, + "eval_rewards/accuracies": 0.953125, + "eval_rewards/chosen": -2.518648147583008, + "eval_rewards/margins": 10.50519847869873, + "eval_rewards/rejected": -13.023846626281738, + "eval_runtime": 76.6799, + "eval_samples_per_second": 13.041, + "eval_steps_per_second": 0.417, + "step": 1400 + }, + { + "epoch": 0.91, + "learning_rate": 4.7167341109259703e-07, + "logits/chosen": -1.0566179752349854, + "logits/rejected": -0.2900046706199646, + "logps/chosen": -398.8577575683594, + "logps/rejected": -454.0634765625, + "loss": 0.094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.2813262939453125, + "rewards/margins": 11.221616744995117, + "rewards/rejected": -13.502942085266113, + "step": 1410 + }, + { + "epoch": 0.91, + "learning_rate": 4.710783146869793e-07, + "logits/chosen": -0.9073772430419922, + "logits/rejected": -0.400244802236557, + "logps/chosen": -357.8586730957031, + "logps/rejected": -546.3787841796875, + "loss": 0.0928, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4375388622283936, + "rewards/margins": 11.90378475189209, + "rewards/rejected": -15.341323852539062, + "step": 1420 + }, + { + "epoch": 0.92, + "learning_rate": 4.7048321828136157e-07, + "logits/chosen": -0.6786088347434998, + "logits/rejected": -0.10559716075658798, + "logps/chosen": -417.87939453125, + "logps/rejected": -530.9561767578125, + "loss": 0.042, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.6138105392456055, + "rewards/margins": 13.705339431762695, + "rewards/rejected": -19.319150924682617, + "step": 1430 + }, + { + "epoch": 0.93, + "learning_rate": 4.698881218757438e-07, + "logits/chosen": -0.5866619348526001, + "logits/rejected": -0.026242520660161972, + "logps/chosen": -422.72613525390625, + "logps/rejected": -541.8472900390625, + "loss": 0.1036, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.567929267883301, + "rewards/margins": 11.613755226135254, + "rewards/rejected": -16.181686401367188, + "step": 1440 + }, + { + "epoch": 0.93, + "learning_rate": 4.692930254701261e-07, + "logits/chosen": -0.7376483082771301, + "logits/rejected": -0.09872325509786606, + "logps/chosen": -425.328857421875, + "logps/rejected": -529.0191650390625, + "loss": 0.0628, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.07589054107666, + "rewards/margins": 12.2763032913208, + "rewards/rejected": -16.352191925048828, + "step": 1450 + }, + { + "epoch": 0.94, + "learning_rate": 4.6869792906450845e-07, + "logits/chosen": -0.5996996760368347, + "logits/rejected": 0.09782281517982483, + "logps/chosen": -426.3692321777344, + "logps/rejected": -495.6768493652344, + "loss": 0.0448, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.688590049743652, + "rewards/margins": 13.677145004272461, + "rewards/rejected": -18.36573600769043, + "step": 1460 + }, + { + "epoch": 0.94, + "learning_rate": 4.6810283265889075e-07, + "logits/chosen": -0.6974693536758423, + "logits/rejected": 0.09173402935266495, + "logps/chosen": -361.08746337890625, + "logps/rejected": -483.73211669921875, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.233609199523926, + "rewards/margins": 13.777618408203125, + "rewards/rejected": -18.011228561401367, + "step": 1470 + }, + { + "epoch": 0.95, + "learning_rate": 4.67507736253273e-07, + "logits/chosen": -0.8001763224601746, + "logits/rejected": -0.1673848032951355, + "logps/chosen": -368.32196044921875, + "logps/rejected": -508.84368896484375, + "loss": 0.0676, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7094593048095703, + "rewards/margins": 13.913045883178711, + "rewards/rejected": -16.622507095336914, + "step": 1480 + }, + { + "epoch": 0.96, + "learning_rate": 4.669126398476553e-07, + "logits/chosen": -0.589401125907898, + "logits/rejected": 0.09411343187093735, + "logps/chosen": -448.2190856933594, + "logps/rejected": -579.1790161132812, + "loss": 0.07, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.9501609802246094, + "rewards/margins": 15.417715072631836, + "rewards/rejected": -19.367877960205078, + "step": 1490 + }, + { + "epoch": 0.96, + "learning_rate": 4.6631754344203763e-07, + "logits/chosen": -0.6391473412513733, + "logits/rejected": -0.1050008162856102, + "logps/chosen": -376.50579833984375, + "logps/rejected": -544.2982177734375, + "loss": 0.0714, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.8969063758850098, + "rewards/margins": 13.172216415405273, + "rewards/rejected": -17.069122314453125, + "step": 1500 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -1.0033119916915894, + "eval_logits/rejected": -0.4300505220890045, + "eval_logps/chosen": -374.3122863769531, + "eval_logps/rejected": -470.21783447265625, + "eval_loss": 0.12438826262950897, + "eval_rewards/accuracies": 0.9453125, + "eval_rewards/chosen": -4.494921684265137, + "eval_rewards/margins": 10.89178466796875, + "eval_rewards/rejected": -15.38670539855957, + "eval_runtime": 76.5689, + "eval_samples_per_second": 13.06, + "eval_steps_per_second": 0.418, + "step": 1500 + }, + { + "epoch": 0.97, + "learning_rate": 4.657224470364199e-07, + "logits/chosen": -0.43436574935913086, + "logits/rejected": 0.07220065593719482, + "logps/chosen": -391.37725830078125, + "logps/rejected": -588.6260986328125, + "loss": 0.0546, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.231613636016846, + "rewards/margins": 13.30431842803955, + "rewards/rejected": -18.535930633544922, + "step": 1510 + }, + { + "epoch": 0.98, + "learning_rate": 4.6512735063080217e-07, + "logits/chosen": -0.6157822608947754, + "logits/rejected": 0.005131366662681103, + "logps/chosen": -411.13946533203125, + "logps/rejected": -483.9326171875, + "loss": 0.0527, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.003513336181641, + "rewards/margins": 10.31053638458252, + "rewards/rejected": -15.314050674438477, + "step": 1520 + }, + { + "epoch": 0.98, + "learning_rate": 4.6453225422518447e-07, + "logits/chosen": -0.6011049747467041, + "logits/rejected": -0.10970073938369751, + "logps/chosen": -406.9378356933594, + "logps/rejected": -468.97314453125, + "loss": 0.0739, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.8249969482421875, + "rewards/margins": 11.424562454223633, + "rewards/rejected": -16.24955940246582, + "step": 1530 + }, + { + "epoch": 0.99, + "learning_rate": 4.6393715781956676e-07, + "logits/chosen": -0.6942557096481323, + "logits/rejected": 0.16295239329338074, + "logps/chosen": -407.1282653808594, + "logps/rejected": -560.19970703125, + "loss": 0.0629, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.603503704071045, + "rewards/margins": 13.417546272277832, + "rewards/rejected": -18.02104949951172, + "step": 1540 + }, + { + "epoch": 1.0, + "learning_rate": 4.6334206141394905e-07, + "logits/chosen": -0.3027026057243347, + "logits/rejected": 0.10327012836933136, + "logps/chosen": -405.692626953125, + "logps/rejected": -576.5963134765625, + "loss": 0.0673, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6844029426574707, + "rewards/margins": 13.106195449829102, + "rewards/rejected": -16.790597915649414, + "step": 1550 + }, + { + "epoch": 1.0, + "learning_rate": 4.6274696500833135e-07, + "logits/chosen": -0.5037144422531128, + "logits/rejected": 0.06204764172434807, + "logps/chosen": -412.0623474121094, + "logps/rejected": -525.1038818359375, + "loss": 0.061, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.6065430641174316, + "rewards/margins": 14.261393547058105, + "rewards/rejected": -16.867937088012695, + "step": 1560 + }, + { + "epoch": 1.01, + "learning_rate": 4.621518686027136e-07, + "logits/chosen": -0.526481568813324, + "logits/rejected": 0.24668976664543152, + "logps/chosen": -410.71746826171875, + "logps/rejected": -473.8052673339844, + "loss": 0.026, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.839338779449463, + "rewards/margins": 13.472638130187988, + "rewards/rejected": -16.31197738647461, + "step": 1570 + }, + { + "epoch": 1.02, + "learning_rate": 4.6155677219709594e-07, + "logits/chosen": -0.2807101607322693, + "logits/rejected": 0.3707982003688812, + "logps/chosen": -430.853759765625, + "logps/rejected": -465.86328125, + "loss": 0.016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.3595848083496094, + "rewards/margins": 12.908388137817383, + "rewards/rejected": -16.267974853515625, + "step": 1580 + }, + { + "epoch": 1.02, + "learning_rate": 4.6096167579147823e-07, + "logits/chosen": -0.2973923683166504, + "logits/rejected": 0.30676135420799255, + "logps/chosen": -384.2185974121094, + "logps/rejected": -527.7244873046875, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.731484889984131, + "rewards/margins": 15.676348686218262, + "rewards/rejected": -19.407833099365234, + "step": 1590 + }, + { + "epoch": 1.03, + "learning_rate": 4.603665793858605e-07, + "logits/chosen": -0.5543831586837769, + "logits/rejected": 0.3015816807746887, + "logps/chosen": -395.59368896484375, + "logps/rejected": -544.031494140625, + "loss": 0.0095, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.2728145122528076, + "rewards/margins": 15.476274490356445, + "rewards/rejected": -18.74909019470215, + "step": 1600 + }, + { + "epoch": 1.03, + "eval_logits/chosen": -0.7791086435317993, + "eval_logits/rejected": -0.20377439260482788, + "eval_logps/chosen": -380.9189453125, + "eval_logps/rejected": -480.3507995605469, + "eval_loss": 0.10078604519367218, + "eval_rewards/accuracies": 0.9375, + "eval_rewards/chosen": -5.155591011047363, + "eval_rewards/margins": 11.244410514831543, + "eval_rewards/rejected": -16.40000343322754, + "eval_runtime": 76.553, + "eval_samples_per_second": 13.063, + "eval_steps_per_second": 0.418, + "step": 1600 + }, + { + "epoch": 1.03, + "learning_rate": 4.5977148298024277e-07, + "logits/chosen": -0.5526180267333984, + "logits/rejected": 0.3482429087162018, + "logps/chosen": -391.2712707519531, + "logps/rejected": -509.62847900390625, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7447268962860107, + "rewards/margins": 15.208663940429688, + "rewards/rejected": -18.95339012145996, + "step": 1610 + }, + { + "epoch": 1.04, + "learning_rate": 4.5917638657462507e-07, + "logits/chosen": -0.4088626801967621, + "logits/rejected": 0.3109140992164612, + "logps/chosen": -426.03643798828125, + "logps/rejected": -556.555908203125, + "loss": 0.0053, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.894583702087402, + "rewards/margins": 14.70109748840332, + "rewards/rejected": -19.595678329467773, + "step": 1620 + }, + { + "epoch": 1.05, + "learning_rate": 4.5858129016900736e-07, + "logits/chosen": -0.5371668934822083, + "logits/rejected": 0.3167404532432556, + "logps/chosen": -393.68524169921875, + "logps/rejected": -562.7149658203125, + "loss": 0.0129, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.771970272064209, + "rewards/margins": 15.018495559692383, + "rewards/rejected": -19.79046630859375, + "step": 1630 + }, + { + "epoch": 1.05, + "learning_rate": 4.5798619376338966e-07, + "logits/chosen": -0.247820645570755, + "logits/rejected": 0.3391680121421814, + "logps/chosen": -370.0538024902344, + "logps/rejected": -491.40252685546875, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.405149936676025, + "rewards/margins": 14.295089721679688, + "rewards/rejected": -18.700239181518555, + "step": 1640 + }, + { + "epoch": 1.06, + "learning_rate": 4.5739109735777195e-07, + "logits/chosen": -0.2800091505050659, + "logits/rejected": 0.2654980719089508, + "logps/chosen": -439.1228942871094, + "logps/rejected": -581.3944091796875, + "loss": 0.0115, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.183625221252441, + "rewards/margins": 16.414287567138672, + "rewards/rejected": -22.597911834716797, + "step": 1650 + }, + { + "epoch": 1.07, + "learning_rate": 4.567960009521542e-07, + "logits/chosen": -0.32536178827285767, + "logits/rejected": 0.2308044731616974, + "logps/chosen": -362.37945556640625, + "logps/rejected": -542.333984375, + "loss": 0.0122, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.910571575164795, + "rewards/margins": 15.097010612487793, + "rewards/rejected": -19.007579803466797, + "step": 1660 + }, + { + "epoch": 1.07, + "learning_rate": 4.5620090454653654e-07, + "logits/chosen": -0.44402211904525757, + "logits/rejected": 0.07348278164863586, + "logps/chosen": -377.78759765625, + "logps/rejected": -574.8585205078125, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.063399314880371, + "rewards/margins": 16.107744216918945, + "rewards/rejected": -21.171140670776367, + "step": 1670 + }, + { + "epoch": 1.08, + "learning_rate": 4.5560580814091884e-07, + "logits/chosen": -0.6580984592437744, + "logits/rejected": 0.15282706916332245, + "logps/chosen": -388.4255065917969, + "logps/rejected": -543.6886596679688, + "loss": 0.0331, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.285706996917725, + "rewards/margins": 15.84802532196045, + "rewards/rejected": -20.13373374938965, + "step": 1680 + }, + { + "epoch": 1.09, + "learning_rate": 4.550107117353011e-07, + "logits/chosen": -0.5916538834571838, + "logits/rejected": 0.042367033660411835, + "logps/chosen": -425.89013671875, + "logps/rejected": -556.3162841796875, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.548110008239746, + "rewards/margins": 14.834815979003906, + "rewards/rejected": -19.38292694091797, + "step": 1690 + }, + { + "epoch": 1.09, + "learning_rate": 4.5441561532968337e-07, + "logits/chosen": -0.6858727335929871, + "logits/rejected": -0.02261565811932087, + "logps/chosen": -391.042724609375, + "logps/rejected": -572.4518432617188, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9488537311553955, + "rewards/margins": 17.172945022583008, + "rewards/rejected": -21.121801376342773, + "step": 1700 + }, + { + "epoch": 1.09, + "eval_logits/chosen": -0.9992736577987671, + "eval_logits/rejected": -0.40412917733192444, + "eval_logps/chosen": -377.3610534667969, + "eval_logps/rejected": -483.4149169921875, + "eval_loss": 0.1330709010362625, + "eval_rewards/accuracies": 0.953125, + "eval_rewards/chosen": -4.799799919128418, + "eval_rewards/margins": 11.906618118286133, + "eval_rewards/rejected": -16.706417083740234, + "eval_runtime": 76.6396, + "eval_samples_per_second": 13.048, + "eval_steps_per_second": 0.418, + "step": 1700 + }, + { + "epoch": 1.1, + "learning_rate": 4.538205189240657e-07, + "logits/chosen": -0.4306742548942566, + "logits/rejected": 0.3547573983669281, + "logps/chosen": -398.1122741699219, + "logps/rejected": -550.4100341796875, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.38922643661499, + "rewards/margins": 16.458080291748047, + "rewards/rejected": -21.847307205200195, + "step": 1710 + }, + { + "epoch": 1.11, + "learning_rate": 4.5322542251844796e-07, + "logits/chosen": -0.5155312418937683, + "logits/rejected": -0.017366236075758934, + "logps/chosen": -410.91412353515625, + "logps/rejected": -553.8331298828125, + "loss": 0.0058, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.796280384063721, + "rewards/margins": 16.963947296142578, + "rewards/rejected": -21.76022720336914, + "step": 1720 + }, + { + "epoch": 1.11, + "learning_rate": 4.5263032611283026e-07, + "logits/chosen": -0.12997198104858398, + "logits/rejected": 0.2565564215183258, + "logps/chosen": -405.981201171875, + "logps/rejected": -620.6024169921875, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.828683376312256, + "rewards/margins": 18.5574893951416, + "rewards/rejected": -23.386173248291016, + "step": 1730 + }, + { + "epoch": 1.12, + "learning_rate": 4.5203522970721255e-07, + "logits/chosen": -0.0680803433060646, + "logits/rejected": 0.4802146852016449, + "logps/chosen": -421.7552185058594, + "logps/rejected": -488.4523010253906, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.658025026321411, + "rewards/margins": 15.093057632446289, + "rewards/rejected": -18.751081466674805, + "step": 1740 + }, + { + "epoch": 1.12, + "learning_rate": 4.5144013330159485e-07, + "logits/chosen": -0.3530596196651459, + "logits/rejected": 0.2572212815284729, + "logps/chosen": -413.51629638671875, + "logps/rejected": -577.3765869140625, + "loss": 0.0179, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.011878967285156, + "rewards/margins": 15.384442329406738, + "rewards/rejected": -19.39632225036621, + "step": 1750 + }, + { + "epoch": 1.13, + "learning_rate": 4.5084503689597714e-07, + "logits/chosen": -0.28345853090286255, + "logits/rejected": 0.42750459909439087, + "logps/chosen": -437.33941650390625, + "logps/rejected": -554.0641479492188, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.84152889251709, + "rewards/margins": 14.463783264160156, + "rewards/rejected": -19.305313110351562, + "step": 1760 + }, + { + "epoch": 1.14, + "learning_rate": 4.5024994049035944e-07, + "logits/chosen": -0.09664113819599152, + "logits/rejected": 0.22168950736522675, + "logps/chosen": -378.29632568359375, + "logps/rejected": -565.90185546875, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.932016372680664, + "rewards/margins": 17.979869842529297, + "rewards/rejected": -22.911888122558594, + "step": 1770 + }, + { + "epoch": 1.14, + "learning_rate": 4.496548440847417e-07, + "logits/chosen": -0.049262501299381256, + "logits/rejected": 0.4335893988609314, + "logps/chosen": -402.48980712890625, + "logps/rejected": -552.8167114257812, + "loss": 0.0279, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.7815442085266113, + "rewards/margins": 15.186810493469238, + "rewards/rejected": -18.968353271484375, + "step": 1780 + }, + { + "epoch": 1.15, + "learning_rate": 4.49059747679124e-07, + "logits/chosen": -0.28909042477607727, + "logits/rejected": 0.3799073398113251, + "logps/chosen": -430.60760498046875, + "logps/rejected": -537.1256713867188, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.874712944030762, + "rewards/margins": 15.4542875289917, + "rewards/rejected": -20.32900047302246, + "step": 1790 + }, + { + "epoch": 1.16, + "learning_rate": 4.484646512735063e-07, + "logits/chosen": -0.3206165134906769, + "logits/rejected": 0.06656259298324585, + "logps/chosen": -471.58001708984375, + "logps/rejected": -507.40301513671875, + "loss": 0.0293, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.206088066101074, + "rewards/margins": 13.663434982299805, + "rewards/rejected": -17.869524002075195, + "step": 1800 + }, + { + "epoch": 1.16, + "eval_logits/chosen": -0.8347997665405273, + "eval_logits/rejected": -0.41860231757164, + "eval_logps/chosen": -372.9380187988281, + "eval_logps/rejected": -465.8204345703125, + "eval_loss": 0.14275716245174408, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": -4.35749626159668, + "eval_rewards/margins": 10.589475631713867, + "eval_rewards/rejected": -14.94697093963623, + "eval_runtime": 76.7934, + "eval_samples_per_second": 13.022, + "eval_steps_per_second": 0.417, + "step": 1800 + }, + { + "epoch": 1.16, + "learning_rate": 4.4786955486788856e-07, + "logits/chosen": -0.3631385266780853, + "logits/rejected": 0.07067543268203735, + "logps/chosen": -377.46551513671875, + "logps/rejected": -519.6616821289062, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.011387586593628, + "rewards/margins": 15.720315933227539, + "rewards/rejected": -18.73170280456543, + "step": 1810 + }, + { + "epoch": 1.17, + "learning_rate": 4.4727445846227086e-07, + "logits/chosen": -0.36144647002220154, + "logits/rejected": -0.008104220032691956, + "logps/chosen": -397.82940673828125, + "logps/rejected": -593.34521484375, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.28765869140625, + "rewards/margins": 15.561511039733887, + "rewards/rejected": -20.849170684814453, + "step": 1820 + }, + { + "epoch": 1.18, + "learning_rate": 4.4667936205665315e-07, + "logits/chosen": -0.1907232105731964, + "logits/rejected": 0.313555508852005, + "logps/chosen": -393.31719970703125, + "logps/rejected": -510.4048767089844, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2387800216674805, + "rewards/margins": 14.292640686035156, + "rewards/rejected": -18.531421661376953, + "step": 1830 + }, + { + "epoch": 1.18, + "learning_rate": 4.4608426565103545e-07, + "logits/chosen": -0.6258490085601807, + "logits/rejected": 0.040771596133708954, + "logps/chosen": -419.44549560546875, + "logps/rejected": -544.8959350585938, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.875296115875244, + "rewards/margins": 15.84538459777832, + "rewards/rejected": -19.720678329467773, + "step": 1840 + }, + { + "epoch": 1.19, + "learning_rate": 4.4548916924541774e-07, + "logits/chosen": -0.3880535066127777, + "logits/rejected": -0.07375472038984299, + "logps/chosen": -354.2195739746094, + "logps/rejected": -516.4483642578125, + "loss": 0.024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.647956609725952, + "rewards/margins": 14.303915023803711, + "rewards/rejected": -16.95186996459961, + "step": 1850 + }, + { + "epoch": 1.2, + "learning_rate": 4.4489407283980004e-07, + "logits/chosen": 0.03841676935553551, + "logits/rejected": 0.44405698776245117, + "logps/chosen": -405.54351806640625, + "logps/rejected": -583.1129760742188, + "loss": 0.0113, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.536320686340332, + "rewards/margins": 15.479736328125, + "rewards/rejected": -21.016056060791016, + "step": 1860 + }, + { + "epoch": 1.2, + "learning_rate": 4.442989764341823e-07, + "logits/chosen": 0.409410297870636, + "logits/rejected": 0.6078455448150635, + "logps/chosen": -425.523193359375, + "logps/rejected": -559.0198364257812, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.118569374084473, + "rewards/margins": 15.779667854309082, + "rewards/rejected": -22.898235321044922, + "step": 1870 + }, + { + "epoch": 1.21, + "learning_rate": 4.437038800285646e-07, + "logits/chosen": 0.023495376110076904, + "logits/rejected": 0.8307549357414246, + "logps/chosen": -367.8580017089844, + "logps/rejected": -482.8976135253906, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.339402198791504, + "rewards/margins": 14.32616901397705, + "rewards/rejected": -18.665569305419922, + "step": 1880 + }, + { + "epoch": 1.21, + "learning_rate": 4.431087836229469e-07, + "logits/chosen": -0.1776140034198761, + "logits/rejected": 0.4008842408657074, + "logps/chosen": -368.744873046875, + "logps/rejected": -584.6207885742188, + "loss": 0.0183, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.053328514099121, + "rewards/margins": 16.808618545532227, + "rewards/rejected": -20.861948013305664, + "step": 1890 + }, + { + "epoch": 1.22, + "learning_rate": 4.4251368721732916e-07, + "logits/chosen": -0.10556875169277191, + "logits/rejected": 0.6400087475776672, + "logps/chosen": -395.7705993652344, + "logps/rejected": -568.189697265625, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.5353851318359375, + "rewards/margins": 15.502912521362305, + "rewards/rejected": -20.038297653198242, + "step": 1900 + }, + { + "epoch": 1.22, + "eval_logits/chosen": -0.3349679708480835, + "eval_logits/rejected": 0.1110042855143547, + "eval_logps/chosen": -412.0855712890625, + "eval_logps/rejected": -525.799560546875, + "eval_loss": 0.16337917745113373, + "eval_rewards/accuracies": 0.90625, + "eval_rewards/chosen": -8.27225112915039, + "eval_rewards/margins": 12.672636032104492, + "eval_rewards/rejected": -20.94488525390625, + "eval_runtime": 76.7623, + "eval_samples_per_second": 13.027, + "eval_steps_per_second": 0.417, + "step": 1900 + }, + { + "epoch": 1.23, + "learning_rate": 4.4191859081171146e-07, + "logits/chosen": 0.13517367839813232, + "logits/rejected": 0.3502965569496155, + "logps/chosen": -432.887451171875, + "logps/rejected": -610.2424926757812, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.2362380027771, + "rewards/margins": 17.042871475219727, + "rewards/rejected": -22.279109954833984, + "step": 1910 + }, + { + "epoch": 1.23, + "learning_rate": 4.413234944060938e-07, + "logits/chosen": -0.011228932067751884, + "logits/rejected": 0.4079880714416504, + "logps/chosen": -397.6374206542969, + "logps/rejected": -609.8255615234375, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.824807643890381, + "rewards/margins": 15.99610424041748, + "rewards/rejected": -19.820913314819336, + "step": 1920 + }, + { + "epoch": 1.24, + "learning_rate": 4.4072839800047605e-07, + "logits/chosen": -0.048855990171432495, + "logits/rejected": 0.2614760994911194, + "logps/chosen": -425.06182861328125, + "logps/rejected": -693.6744384765625, + "loss": 0.0236, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.437471866607666, + "rewards/margins": 17.300281524658203, + "rewards/rejected": -21.73775291442871, + "step": 1930 + }, + { + "epoch": 1.25, + "learning_rate": 4.4013330159485834e-07, + "logits/chosen": -0.6388794183731079, + "logits/rejected": -0.16498331725597382, + "logps/chosen": -455.348388671875, + "logps/rejected": -607.502685546875, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9607555866241455, + "rewards/margins": 16.582752227783203, + "rewards/rejected": -19.543508529663086, + "step": 1940 + }, + { + "epoch": 1.25, + "learning_rate": 4.3953820518924064e-07, + "logits/chosen": -0.7284079194068909, + "logits/rejected": -0.07122499495744705, + "logps/chosen": -416.238525390625, + "logps/rejected": -586.7880249023438, + "loss": 0.0142, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.067997932434082, + "rewards/margins": 16.282215118408203, + "rewards/rejected": -20.350215911865234, + "step": 1950 + }, + { + "epoch": 1.26, + "learning_rate": 4.3894310878362293e-07, + "logits/chosen": -0.7452508211135864, + "logits/rejected": 0.1260381042957306, + "logps/chosen": -449.06268310546875, + "logps/rejected": -529.99072265625, + "loss": 0.0258, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.9167428016662598, + "rewards/margins": 15.735280990600586, + "rewards/rejected": -18.65202522277832, + "step": 1960 + }, + { + "epoch": 1.27, + "learning_rate": 4.3834801237800523e-07, + "logits/chosen": -0.20991234481334686, + "logits/rejected": 0.1470331847667694, + "logps/chosen": -391.4617919921875, + "logps/rejected": -590.724365234375, + "loss": 0.0332, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.234528541564941, + "rewards/margins": 17.127511978149414, + "rewards/rejected": -21.362041473388672, + "step": 1970 + }, + { + "epoch": 1.27, + "learning_rate": 4.377529159723875e-07, + "logits/chosen": -0.1375136375427246, + "logits/rejected": 0.26706498861312866, + "logps/chosen": -444.4725646972656, + "logps/rejected": -529.6869506835938, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.4430928230285645, + "rewards/margins": 15.911623001098633, + "rewards/rejected": -21.354717254638672, + "step": 1980 + }, + { + "epoch": 1.28, + "learning_rate": 4.3715781956676976e-07, + "logits/chosen": -0.21475636959075928, + "logits/rejected": 0.3777112364768982, + "logps/chosen": -406.0322570800781, + "logps/rejected": -613.7669067382812, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.220062732696533, + "rewards/margins": 17.0797119140625, + "rewards/rejected": -21.299776077270508, + "step": 1990 + }, + { + "epoch": 1.29, + "learning_rate": 4.365627231611521e-07, + "logits/chosen": -0.42644554376602173, + "logits/rejected": 0.14130757749080658, + "logps/chosen": -414.5654296875, + "logps/rejected": -642.5784912109375, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.209155321121216, + "rewards/margins": 17.73525619506836, + "rewards/rejected": -20.944412231445312, + "step": 2000 + }, + { + "epoch": 1.29, + "eval_logits/chosen": -0.6391128301620483, + "eval_logits/rejected": -0.05341078341007233, + "eval_logps/chosen": -384.2879638671875, + "eval_logps/rejected": -496.6387023925781, + "eval_loss": 0.15107305347919464, + "eval_rewards/accuracies": 0.953125, + "eval_rewards/chosen": -5.492494106292725, + "eval_rewards/margins": 12.536298751831055, + "eval_rewards/rejected": -18.028793334960938, + "eval_runtime": 76.6391, + "eval_samples_per_second": 13.048, + "eval_steps_per_second": 0.418, + "step": 2000 + }, + { + "epoch": 1.29, + "learning_rate": 4.359676267555344e-07, + "logits/chosen": -0.516070544719696, + "logits/rejected": 0.47864165902137756, + "logps/chosen": -344.65240478515625, + "logps/rejected": -633.1707763671875, + "loss": 0.0116, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.215503692626953, + "rewards/margins": 20.52490234375, + "rewards/rejected": -24.740406036376953, + "step": 2010 + }, + { + "epoch": 1.3, + "learning_rate": 4.3537253034991665e-07, + "logits/chosen": -0.2483837604522705, + "logits/rejected": 0.22826921939849854, + "logps/chosen": -390.77874755859375, + "logps/rejected": -561.1676025390625, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.410299777984619, + "rewards/margins": 18.708133697509766, + "rewards/rejected": -23.11842918395996, + "step": 2020 + }, + { + "epoch": 1.3, + "learning_rate": 4.3477743394429894e-07, + "logits/chosen": -0.058499228209257126, + "logits/rejected": 0.5287091135978699, + "logps/chosen": -399.7431640625, + "logps/rejected": -509.27593994140625, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.519265651702881, + "rewards/margins": 16.500688552856445, + "rewards/rejected": -21.01995277404785, + "step": 2030 + }, + { + "epoch": 1.31, + "learning_rate": 4.3418233753868124e-07, + "logits/chosen": 0.22348590195178986, + "logits/rejected": 0.7664919495582581, + "logps/chosen": -411.91668701171875, + "logps/rejected": -567.1552124023438, + "loss": 0.0393, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.37440299987793, + "rewards/margins": 16.45658302307129, + "rewards/rejected": -21.830982208251953, + "step": 2040 + }, + { + "epoch": 1.32, + "learning_rate": 4.3358724113306353e-07, + "logits/chosen": 0.23079347610473633, + "logits/rejected": 0.849484920501709, + "logps/chosen": -399.9824523925781, + "logps/rejected": -545.28271484375, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.708091735839844, + "rewards/margins": 18.776620864868164, + "rewards/rejected": -24.484712600708008, + "step": 2050 + }, + { + "epoch": 1.32, + "learning_rate": 4.3299214472744583e-07, + "logits/chosen": 0.5551694631576538, + "logits/rejected": 1.1838432550430298, + "logps/chosen": -406.03460693359375, + "logps/rejected": -533.4246826171875, + "loss": 0.0198, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.767632961273193, + "rewards/margins": 15.5044527053833, + "rewards/rejected": -22.27208709716797, + "step": 2060 + }, + { + "epoch": 1.33, + "learning_rate": 4.323970483218281e-07, + "logits/chosen": 0.5585372447967529, + "logits/rejected": 1.1072343587875366, + "logps/chosen": -390.73870849609375, + "logps/rejected": -581.7123413085938, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.473759651184082, + "rewards/margins": 17.37258529663086, + "rewards/rejected": -22.846345901489258, + "step": 2070 + }, + { + "epoch": 1.34, + "learning_rate": 4.3180195191621036e-07, + "logits/chosen": 0.19852328300476074, + "logits/rejected": 0.6014672517776489, + "logps/chosen": -408.9750061035156, + "logps/rejected": -573.5753784179688, + "loss": 0.0317, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.9805541038513184, + "rewards/margins": 16.70760154724121, + "rewards/rejected": -20.688152313232422, + "step": 2080 + }, + { + "epoch": 1.34, + "learning_rate": 4.312068555105927e-07, + "logits/chosen": 0.2863103151321411, + "logits/rejected": 1.09657883644104, + "logps/chosen": -437.265380859375, + "logps/rejected": -549.7987060546875, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.587510108947754, + "rewards/margins": 17.130268096923828, + "rewards/rejected": -21.7177791595459, + "step": 2090 + }, + { + "epoch": 1.35, + "learning_rate": 4.30611759104975e-07, + "logits/chosen": 0.16401129961013794, + "logits/rejected": 0.43381816148757935, + "logps/chosen": -367.1972351074219, + "logps/rejected": -574.0045776367188, + "loss": 0.0181, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.683870315551758, + "rewards/margins": 18.48448944091797, + "rewards/rejected": -22.168359756469727, + "step": 2100 + }, + { + "epoch": 1.35, + "eval_logits/chosen": -0.38096773624420166, + "eval_logits/rejected": 0.2048163115978241, + "eval_logps/chosen": -379.0364074707031, + "eval_logps/rejected": -488.2851867675781, + "eval_loss": 0.16009199619293213, + "eval_rewards/accuracies": 0.9140625, + "eval_rewards/chosen": -4.9673357009887695, + "eval_rewards/margins": 12.22611141204834, + "eval_rewards/rejected": -17.19344711303711, + "eval_runtime": 77.0334, + "eval_samples_per_second": 12.981, + "eval_steps_per_second": 0.415, + "step": 2100 + }, + { + "epoch": 1.36, + "learning_rate": 4.3001666269935725e-07, + "logits/chosen": 0.4979768395423889, + "logits/rejected": 0.9932268261909485, + "logps/chosen": -439.8809509277344, + "logps/rejected": -572.9110107421875, + "loss": 0.0196, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.661294937133789, + "rewards/margins": 16.660076141357422, + "rewards/rejected": -21.32137107849121, + "step": 2110 + }, + { + "epoch": 1.36, + "learning_rate": 4.2942156629373954e-07, + "logits/chosen": 0.38851994276046753, + "logits/rejected": 1.0913450717926025, + "logps/chosen": -391.9513854980469, + "logps/rejected": -557.0726318359375, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.788923263549805, + "rewards/margins": 16.37869644165039, + "rewards/rejected": -21.167619705200195, + "step": 2120 + }, + { + "epoch": 1.37, + "learning_rate": 4.288264698881219e-07, + "logits/chosen": 0.7406013607978821, + "logits/rejected": 1.1288832426071167, + "logps/chosen": -434.83880615234375, + "logps/rejected": -587.9083251953125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.380326271057129, + "rewards/margins": 18.681997299194336, + "rewards/rejected": -24.06232261657715, + "step": 2130 + }, + { + "epoch": 1.38, + "learning_rate": 4.2823137348250413e-07, + "logits/chosen": 0.521850049495697, + "logits/rejected": 0.9274239540100098, + "logps/chosen": -380.3211669921875, + "logps/rejected": -581.7938232421875, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.798120021820068, + "rewards/margins": 17.523136138916016, + "rewards/rejected": -22.321256637573242, + "step": 2140 + }, + { + "epoch": 1.38, + "learning_rate": 4.2763627707688643e-07, + "logits/chosen": 0.12944528460502625, + "logits/rejected": 0.8914273381233215, + "logps/chosen": -443.59954833984375, + "logps/rejected": -575.8558349609375, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.318188190460205, + "rewards/margins": 17.411693572998047, + "rewards/rejected": -21.729881286621094, + "step": 2150 + }, + { + "epoch": 1.39, + "learning_rate": 4.270411806712687e-07, + "logits/chosen": -0.12583580613136292, + "logits/rejected": 0.797269344329834, + "logps/chosen": -445.8939514160156, + "logps/rejected": -576.5032958984375, + "loss": 0.0453, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.454132080078125, + "rewards/margins": 17.303661346435547, + "rewards/rejected": -23.757795333862305, + "step": 2160 + }, + { + "epoch": 1.39, + "learning_rate": 4.26446084265651e-07, + "logits/chosen": -0.158376082777977, + "logits/rejected": 0.6286741495132446, + "logps/chosen": -425.6141662597656, + "logps/rejected": -587.5155029296875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.210710048675537, + "rewards/margins": 16.975194931030273, + "rewards/rejected": -23.18590545654297, + "step": 2170 + }, + { + "epoch": 1.4, + "learning_rate": 4.258509878600333e-07, + "logits/chosen": -0.22286149859428406, + "logits/rejected": 0.8965922594070435, + "logps/chosen": -482.9183654785156, + "logps/rejected": -620.5008544921875, + "loss": 0.0323, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -7.59103536605835, + "rewards/margins": 16.49752426147461, + "rewards/rejected": -24.088560104370117, + "step": 2180 + }, + { + "epoch": 1.41, + "learning_rate": 4.252558914544156e-07, + "logits/chosen": 0.22360090911388397, + "logits/rejected": 0.7913219928741455, + "logps/chosen": -467.44757080078125, + "logps/rejected": -694.8656616210938, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.326881408691406, + "rewards/margins": 19.117225646972656, + "rewards/rejected": -27.444107055664062, + "step": 2190 + }, + { + "epoch": 1.41, + "learning_rate": 4.2466079504879785e-07, + "logits/chosen": -0.1699143350124359, + "logits/rejected": 0.9544417262077332, + "logps/chosen": -444.05224609375, + "logps/rejected": -574.4688110351562, + "loss": 0.0107, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -7.666506767272949, + "rewards/margins": 15.971659660339355, + "rewards/rejected": -23.638164520263672, + "step": 2200 + }, + { + "epoch": 1.41, + "eval_logits/chosen": -0.5077850222587585, + "eval_logits/rejected": 0.1793254315853119, + "eval_logps/chosen": -393.3812561035156, + "eval_logps/rejected": -497.8894958496094, + "eval_loss": 0.1433764100074768, + "eval_rewards/accuracies": 0.9296875, + "eval_rewards/chosen": -6.401820182800293, + "eval_rewards/margins": 11.752058029174805, + "eval_rewards/rejected": -18.15387725830078, + "eval_runtime": 76.5649, + "eval_samples_per_second": 13.061, + "eval_steps_per_second": 0.418, + "step": 2200 + }, + { + "epoch": 1.42, + "learning_rate": 4.240656986431802e-07, + "logits/chosen": 0.25252458453178406, + "logits/rejected": 0.6517130136489868, + "logps/chosen": -439.65478515625, + "logps/rejected": -589.1119995117188, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0924177169799805, + "rewards/margins": 17.32897186279297, + "rewards/rejected": -22.421390533447266, + "step": 2210 + }, + { + "epoch": 1.43, + "learning_rate": 4.234706022375625e-07, + "logits/chosen": 0.16541361808776855, + "logits/rejected": 0.7442089319229126, + "logps/chosen": -340.91363525390625, + "logps/rejected": -542.9158935546875, + "loss": 0.0378, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.545586585998535, + "rewards/margins": 16.971027374267578, + "rewards/rejected": -23.51661491394043, + "step": 2220 + }, + { + "epoch": 1.43, + "learning_rate": 4.2287550583194473e-07, + "logits/chosen": 0.12309785187244415, + "logits/rejected": 0.7068944573402405, + "logps/chosen": -416.27166748046875, + "logps/rejected": -671.1712646484375, + "loss": 0.0373, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.756946563720703, + "rewards/margins": 19.396137237548828, + "rewards/rejected": -26.1530818939209, + "step": 2230 + }, + { + "epoch": 1.44, + "learning_rate": 4.2228040942632703e-07, + "logits/chosen": -0.17779667675495148, + "logits/rejected": 0.7514945268630981, + "logps/chosen": -387.315185546875, + "logps/rejected": -554.4407348632812, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.591644287109375, + "rewards/margins": 16.485776901245117, + "rewards/rejected": -23.07741928100586, + "step": 2240 + }, + { + "epoch": 1.45, + "learning_rate": 4.216853130207093e-07, + "logits/chosen": -0.4565064311027527, + "logits/rejected": 0.8017956018447876, + "logps/chosen": -407.85028076171875, + "logps/rejected": -557.0631103515625, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.639117240905762, + "rewards/margins": 16.890506744384766, + "rewards/rejected": -21.529621124267578, + "step": 2250 + }, + { + "epoch": 1.45, + "learning_rate": 4.210902166150916e-07, + "logits/chosen": -0.1683514416217804, + "logits/rejected": 0.797057569026947, + "logps/chosen": -448.7789611816406, + "logps/rejected": -617.1752319335938, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.5801100730896, + "rewards/margins": 17.799306869506836, + "rewards/rejected": -24.379417419433594, + "step": 2260 + }, + { + "epoch": 1.46, + "learning_rate": 4.204951202094739e-07, + "logits/chosen": 0.12932386994361877, + "logits/rejected": 0.9783223867416382, + "logps/chosen": -392.2380676269531, + "logps/rejected": -563.818603515625, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.884858131408691, + "rewards/margins": 17.019224166870117, + "rewards/rejected": -22.904081344604492, + "step": 2270 + }, + { + "epoch": 1.47, + "learning_rate": 4.199000238038562e-07, + "logits/chosen": 0.17704737186431885, + "logits/rejected": 0.9605581164360046, + "logps/chosen": -393.4834289550781, + "logps/rejected": -520.2174072265625, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.810626983642578, + "rewards/margins": 17.23440170288086, + "rewards/rejected": -22.045028686523438, + "step": 2280 + }, + { + "epoch": 1.47, + "learning_rate": 4.1930492739823845e-07, + "logits/chosen": 0.23696310818195343, + "logits/rejected": 0.8548523783683777, + "logps/chosen": -413.873046875, + "logps/rejected": -593.3323974609375, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.712353229522705, + "rewards/margins": 18.40825653076172, + "rewards/rejected": -24.120609283447266, + "step": 2290 + }, + { + "epoch": 1.48, + "learning_rate": 4.187098309926208e-07, + "logits/chosen": 0.11162440478801727, + "logits/rejected": 0.8643198013305664, + "logps/chosen": -446.72216796875, + "logps/rejected": -609.0733642578125, + "loss": 0.0207, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -7.024069786071777, + "rewards/margins": 17.165082931518555, + "rewards/rejected": -24.18915367126465, + "step": 2300 + }, + { + "epoch": 1.48, + "eval_logits/chosen": -0.3969336152076721, + "eval_logits/rejected": 0.30485790967941284, + "eval_logps/chosen": -409.54925537109375, + "eval_logps/rejected": -527.1227416992188, + "eval_loss": 0.1328170895576477, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": -8.018616676330566, + "eval_rewards/margins": 13.058581352233887, + "eval_rewards/rejected": -21.077199935913086, + "eval_runtime": 76.7293, + "eval_samples_per_second": 13.033, + "eval_steps_per_second": 0.417, + "step": 2300 + }, + { + "epoch": 1.48, + "learning_rate": 4.181147345870031e-07, + "logits/chosen": -0.14238300919532776, + "logits/rejected": 0.5450645685195923, + "logps/chosen": -449.2244567871094, + "logps/rejected": -655.3807373046875, + "loss": 0.0182, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.570483207702637, + "rewards/margins": 16.494831085205078, + "rewards/rejected": -23.065311431884766, + "step": 2310 + }, + { + "epoch": 1.49, + "learning_rate": 4.1751963818138534e-07, + "logits/chosen": -0.011528102681040764, + "logits/rejected": 0.896866500377655, + "logps/chosen": -477.01226806640625, + "logps/rejected": -661.6298217773438, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.549506187438965, + "rewards/margins": 19.142070770263672, + "rewards/rejected": -25.691574096679688, + "step": 2320 + }, + { + "epoch": 1.5, + "learning_rate": 4.1692454177576763e-07, + "logits/chosen": -0.2801293432712555, + "logits/rejected": 0.5617603063583374, + "logps/chosen": -442.40167236328125, + "logps/rejected": -573.85205078125, + "loss": 0.0289, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.618128776550293, + "rewards/margins": 17.062252044677734, + "rewards/rejected": -23.680381774902344, + "step": 2330 + }, + { + "epoch": 1.5, + "learning_rate": 4.1632944537015e-07, + "logits/chosen": -0.1246495246887207, + "logits/rejected": 0.732117772102356, + "logps/chosen": -445.9453125, + "logps/rejected": -648.7326049804688, + "loss": 0.0567, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -7.793832302093506, + "rewards/margins": 16.911283493041992, + "rewards/rejected": -24.705114364624023, + "step": 2340 + }, + { + "epoch": 1.51, + "learning_rate": 4.157343489645322e-07, + "logits/chosen": -0.13531716167926788, + "logits/rejected": 0.6813434362411499, + "logps/chosen": -500.760986328125, + "logps/rejected": -748.8970947265625, + "loss": 0.0249, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -8.698297500610352, + "rewards/margins": 22.551307678222656, + "rewards/rejected": -31.249608993530273, + "step": 2350 + }, + { + "epoch": 1.52, + "learning_rate": 4.151392525589145e-07, + "logits/chosen": -0.17930714786052704, + "logits/rejected": 0.5675514340400696, + "logps/chosen": -507.093994140625, + "logps/rejected": -617.2818603515625, + "loss": 0.0077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.938199043273926, + "rewards/margins": 18.702774047851562, + "rewards/rejected": -28.640972137451172, + "step": 2360 + }, + { + "epoch": 1.52, + "learning_rate": 4.145441561532968e-07, + "logits/chosen": -0.02535531297326088, + "logits/rejected": 0.5533861517906189, + "logps/chosen": -441.52362060546875, + "logps/rejected": -658.696044921875, + "loss": 0.0219, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -7.094923496246338, + "rewards/margins": 19.90808868408203, + "rewards/rejected": -27.00301170349121, + "step": 2370 + }, + { + "epoch": 1.53, + "learning_rate": 4.139490597476791e-07, + "logits/chosen": -0.01725180074572563, + "logits/rejected": 0.545805811882019, + "logps/chosen": -431.83197021484375, + "logps/rejected": -565.3300170898438, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.328795909881592, + "rewards/margins": 17.926631927490234, + "rewards/rejected": -22.255428314208984, + "step": 2380 + }, + { + "epoch": 1.54, + "learning_rate": 4.133539633420614e-07, + "logits/chosen": -0.21750584244728088, + "logits/rejected": 0.8260287046432495, + "logps/chosen": -426.7081604003906, + "logps/rejected": -571.2562255859375, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.599783897399902, + "rewards/margins": 18.57124900817871, + "rewards/rejected": -23.17103385925293, + "step": 2390 + }, + { + "epoch": 1.54, + "learning_rate": 4.127588669364437e-07, + "logits/chosen": 0.12132026255130768, + "logits/rejected": 0.586942195892334, + "logps/chosen": -387.7074890136719, + "logps/rejected": -575.88623046875, + "loss": 0.0237, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.400722980499268, + "rewards/margins": 19.196186065673828, + "rewards/rejected": -24.596908569335938, + "step": 2400 + }, + { + "epoch": 1.54, + "eval_logits/chosen": -0.4777054190635681, + "eval_logits/rejected": 0.22216691076755524, + "eval_logps/chosen": -392.7734069824219, + "eval_logps/rejected": -515.349365234375, + "eval_loss": 0.1322525590658188, + "eval_rewards/accuracies": 0.9765625, + "eval_rewards/chosen": -6.341035842895508, + "eval_rewards/margins": 13.558823585510254, + "eval_rewards/rejected": -19.899858474731445, + "eval_runtime": 76.575, + "eval_samples_per_second": 13.059, + "eval_steps_per_second": 0.418, + "step": 2400 + }, + { + "epoch": 1.55, + "learning_rate": 4.1216377053082594e-07, + "logits/chosen": 0.19748568534851074, + "logits/rejected": 0.791664719581604, + "logps/chosen": -461.057861328125, + "logps/rejected": -645.6085205078125, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.15015172958374, + "rewards/margins": 19.398887634277344, + "rewards/rejected": -25.549039840698242, + "step": 2410 + }, + { + "epoch": 1.56, + "learning_rate": 4.115686741252083e-07, + "logits/chosen": 0.5446011424064636, + "logits/rejected": 0.8287714123725891, + "logps/chosen": -428.01068115234375, + "logps/rejected": -596.6788330078125, + "loss": 0.0192, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.512643337249756, + "rewards/margins": 18.827342987060547, + "rewards/rejected": -25.33998680114746, + "step": 2420 + }, + { + "epoch": 1.56, + "learning_rate": 4.109735777195906e-07, + "logits/chosen": 0.2738257646560669, + "logits/rejected": 0.6724745035171509, + "logps/chosen": -443.29852294921875, + "logps/rejected": -617.8185424804688, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.934741497039795, + "rewards/margins": 18.175350189208984, + "rewards/rejected": -23.110088348388672, + "step": 2430 + }, + { + "epoch": 1.57, + "learning_rate": 4.103784813139728e-07, + "logits/chosen": -0.21786899864673615, + "logits/rejected": 0.508806049823761, + "logps/chosen": -386.5865173339844, + "logps/rejected": -540.3848876953125, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1630804538726807, + "rewards/margins": 18.559303283691406, + "rewards/rejected": -21.72238540649414, + "step": 2440 + }, + { + "epoch": 1.57, + "learning_rate": 4.097833849083551e-07, + "logits/chosen": 0.27359846234321594, + "logits/rejected": 0.8219190835952759, + "logps/chosen": -489.2318420410156, + "logps/rejected": -654.9146118164062, + "loss": 0.0167, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.5973005294799805, + "rewards/margins": 19.05999183654785, + "rewards/rejected": -25.65729331970215, + "step": 2450 + }, + { + "epoch": 1.58, + "learning_rate": 4.091882885027374e-07, + "logits/chosen": 0.23141400516033173, + "logits/rejected": 0.8404422998428345, + "logps/chosen": -470.26300048828125, + "logps/rejected": -647.7288208007812, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.572993278503418, + "rewards/margins": 19.47740364074707, + "rewards/rejected": -25.050395965576172, + "step": 2460 + }, + { + "epoch": 1.59, + "learning_rate": 4.0859319209711976e-07, + "logits/chosen": 0.21321940422058105, + "logits/rejected": 0.9413207769393921, + "logps/chosen": -481.8892517089844, + "logps/rejected": -568.3132934570312, + "loss": 0.0106, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.117605686187744, + "rewards/margins": 17.3239688873291, + "rewards/rejected": -22.441572189331055, + "step": 2470 + }, + { + "epoch": 1.59, + "learning_rate": 4.07998095691502e-07, + "logits/chosen": 0.34440717101097107, + "logits/rejected": 1.0420324802398682, + "logps/chosen": -412.06756591796875, + "logps/rejected": -661.225830078125, + "loss": 0.0085, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.927117347717285, + "rewards/margins": 19.294017791748047, + "rewards/rejected": -25.22113609313965, + "step": 2480 + }, + { + "epoch": 1.6, + "learning_rate": 4.074029992858843e-07, + "logits/chosen": 0.617647647857666, + "logits/rejected": 1.0356745719909668, + "logps/chosen": -466.6419372558594, + "logps/rejected": -593.55517578125, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.356731414794922, + "rewards/margins": 18.503705978393555, + "rewards/rejected": -23.860441207885742, + "step": 2490 + }, + { + "epoch": 1.61, + "learning_rate": 4.0680790288026654e-07, + "logits/chosen": 0.6595112681388855, + "logits/rejected": 1.4305124282836914, + "logps/chosen": -434.178466796875, + "logps/rejected": -573.2113647460938, + "loss": 0.0269, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.551410675048828, + "rewards/margins": 15.67224407196045, + "rewards/rejected": -22.223651885986328, + "step": 2500 + }, + { + "epoch": 1.61, + "eval_logits/chosen": -0.08062884956598282, + "eval_logits/rejected": 0.4895774722099304, + "eval_logps/chosen": -393.9366149902344, + "eval_logps/rejected": -509.43963623046875, + "eval_loss": 0.151441290974617, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": -6.457356929779053, + "eval_rewards/margins": 12.851531982421875, + "eval_rewards/rejected": -19.308889389038086, + "eval_runtime": 76.6294, + "eval_samples_per_second": 13.05, + "eval_steps_per_second": 0.418, + "step": 2500 + }, + { + "epoch": 1.61, + "learning_rate": 4.062128064746489e-07, + "logits/chosen": 0.7047534584999084, + "logits/rejected": 1.2956479787826538, + "logps/chosen": -373.82366943359375, + "logps/rejected": -593.6276245117188, + "loss": 0.0182, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.4311394691467285, + "rewards/margins": 15.736666679382324, + "rewards/rejected": -22.167804718017578, + "step": 2510 + }, + { + "epoch": 1.62, + "learning_rate": 4.056177100690312e-07, + "logits/chosen": 0.23054015636444092, + "logits/rejected": 0.9308466911315918, + "logps/chosen": -403.5622253417969, + "logps/rejected": -522.812744140625, + "loss": 0.03, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.561907768249512, + "rewards/margins": 15.371617317199707, + "rewards/rejected": -22.933523178100586, + "step": 2520 + }, + { + "epoch": 1.63, + "learning_rate": 4.050226136634135e-07, + "logits/chosen": 0.061969585716724396, + "logits/rejected": 0.7286633253097534, + "logps/chosen": -414.0792541503906, + "logps/rejected": -600.1736450195312, + "loss": 0.0162, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.574122428894043, + "rewards/margins": 19.657848358154297, + "rewards/rejected": -25.231969833374023, + "step": 2530 + }, + { + "epoch": 1.63, + "learning_rate": 4.044275172577957e-07, + "logits/chosen": 0.05762529373168945, + "logits/rejected": 1.0145411491394043, + "logps/chosen": -445.4334411621094, + "logps/rejected": -645.9708251953125, + "loss": 0.0138, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -7.179165840148926, + "rewards/margins": 19.949363708496094, + "rewards/rejected": -27.128530502319336, + "step": 2540 + }, + { + "epoch": 1.64, + "learning_rate": 4.0383242085217806e-07, + "logits/chosen": 0.2469167709350586, + "logits/rejected": 0.8062774538993835, + "logps/chosen": -454.03668212890625, + "logps/rejected": -624.8360595703125, + "loss": 0.0212, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.777960777282715, + "rewards/margins": 17.905128479003906, + "rewards/rejected": -24.68309211730957, + "step": 2550 + }, + { + "epoch": 1.65, + "learning_rate": 4.0323732444656036e-07, + "logits/chosen": 0.52375328540802, + "logits/rejected": 1.2141704559326172, + "logps/chosen": -423.7889099121094, + "logps/rejected": -609.6292114257812, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.806221961975098, + "rewards/margins": 18.522531509399414, + "rewards/rejected": -25.328754425048828, + "step": 2560 + }, + { + "epoch": 1.65, + "learning_rate": 4.026422280409426e-07, + "logits/chosen": 0.5363117456436157, + "logits/rejected": 0.9718011617660522, + "logps/chosen": -423.1117248535156, + "logps/rejected": -559.2742309570312, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.507771015167236, + "rewards/margins": 17.65149688720703, + "rewards/rejected": -23.159269332885742, + "step": 2570 + }, + { + "epoch": 1.66, + "learning_rate": 4.020471316353249e-07, + "logits/chosen": 0.7948285341262817, + "logits/rejected": 1.6965904235839844, + "logps/chosen": -450.24945068359375, + "logps/rejected": -587.7679443359375, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.323611259460449, + "rewards/margins": 19.197834014892578, + "rewards/rejected": -26.521448135375977, + "step": 2580 + }, + { + "epoch": 1.66, + "learning_rate": 4.0145203522970724e-07, + "logits/chosen": 0.6111919283866882, + "logits/rejected": 1.2076199054718018, + "logps/chosen": -393.1330871582031, + "logps/rejected": -658.733154296875, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.268210411071777, + "rewards/margins": 22.285709381103516, + "rewards/rejected": -28.553918838500977, + "step": 2590 + }, + { + "epoch": 1.67, + "learning_rate": 4.008569388240895e-07, + "logits/chosen": 0.8838823437690735, + "logits/rejected": 1.2930363416671753, + "logps/chosen": -433.79248046875, + "logps/rejected": -662.343994140625, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.181797027587891, + "rewards/margins": 20.533702850341797, + "rewards/rejected": -26.715499877929688, + "step": 2600 + }, + { + "epoch": 1.67, + "eval_logits/chosen": 0.019558865576982498, + "eval_logits/rejected": 0.6054279804229736, + "eval_logps/chosen": -407.05767822265625, + "eval_logps/rejected": -531.6161499023438, + "eval_loss": 0.177625834941864, + "eval_rewards/accuracies": 0.9375, + "eval_rewards/chosen": -7.769463539123535, + "eval_rewards/margins": 13.75707721710205, + "eval_rewards/rejected": -21.526540756225586, + "eval_runtime": 76.8929, + "eval_samples_per_second": 13.005, + "eval_steps_per_second": 0.416, + "step": 2600 + }, + { + "epoch": 1.68, + "learning_rate": 4.002618424184718e-07, + "logits/chosen": 0.37892434000968933, + "logits/rejected": 0.9920595288276672, + "logps/chosen": -391.6972351074219, + "logps/rejected": -585.7908935546875, + "loss": 0.0218, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.025783538818359, + "rewards/margins": 17.12643814086914, + "rewards/rejected": -22.1522216796875, + "step": 2610 + }, + { + "epoch": 1.68, + "learning_rate": 3.996667460128541e-07, + "logits/chosen": 0.08298696577548981, + "logits/rejected": 0.9364360570907593, + "logps/chosen": -374.4390563964844, + "logps/rejected": -576.79541015625, + "loss": 0.0334, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.216673851013184, + "rewards/margins": 18.307117462158203, + "rewards/rejected": -23.523792266845703, + "step": 2620 + }, + { + "epoch": 1.69, + "learning_rate": 3.990716496072363e-07, + "logits/chosen": 0.30024105310440063, + "logits/rejected": 0.9240363836288452, + "logps/chosen": -419.29510498046875, + "logps/rejected": -590.0726928710938, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.619715690612793, + "rewards/margins": 17.221851348876953, + "rewards/rejected": -23.841564178466797, + "step": 2630 + }, + { + "epoch": 1.7, + "learning_rate": 3.9847655320161867e-07, + "logits/chosen": 0.2850233018398285, + "logits/rejected": 0.6386219263076782, + "logps/chosen": -397.0573425292969, + "logps/rejected": -577.4563598632812, + "loss": 0.0089, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.049335479736328, + "rewards/margins": 16.51738739013672, + "rewards/rejected": -22.56671905517578, + "step": 2640 + }, + { + "epoch": 1.7, + "learning_rate": 3.9788145679600096e-07, + "logits/chosen": 0.2091820240020752, + "logits/rejected": 0.9494321942329407, + "logps/chosen": -415.09869384765625, + "logps/rejected": -584.553466796875, + "loss": 0.0211, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.250141620635986, + "rewards/margins": 17.63558578491211, + "rewards/rejected": -22.88572883605957, + "step": 2650 + }, + { + "epoch": 1.71, + "learning_rate": 3.972863603903832e-07, + "logits/chosen": 0.15588752925395966, + "logits/rejected": 0.9101377725601196, + "logps/chosen": -440.03546142578125, + "logps/rejected": -556.76953125, + "loss": 0.0258, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.330071449279785, + "rewards/margins": 16.938446044921875, + "rewards/rejected": -23.26851463317871, + "step": 2660 + }, + { + "epoch": 1.72, + "learning_rate": 3.966912639847655e-07, + "logits/chosen": 0.43947821855545044, + "logits/rejected": 1.1439796686172485, + "logps/chosen": -408.59869384765625, + "logps/rejected": -561.8624267578125, + "loss": 0.0274, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.567991733551025, + "rewards/margins": 18.473587036132812, + "rewards/rejected": -25.041580200195312, + "step": 2670 + }, + { + "epoch": 1.72, + "learning_rate": 3.9609616757914784e-07, + "logits/chosen": 0.639157772064209, + "logits/rejected": 1.2126656770706177, + "logps/chosen": -433.49432373046875, + "logps/rejected": -545.498779296875, + "loss": 0.0441, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.019906044006348, + "rewards/margins": 16.54349708557129, + "rewards/rejected": -22.563400268554688, + "step": 2680 + }, + { + "epoch": 1.73, + "learning_rate": 3.955010711735301e-07, + "logits/chosen": 1.0705523490905762, + "logits/rejected": 1.574342966079712, + "logps/chosen": -402.82073974609375, + "logps/rejected": -556.3751831054688, + "loss": 0.0333, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.881136417388916, + "rewards/margins": 16.096803665161133, + "rewards/rejected": -22.977941513061523, + "step": 2690 + }, + { + "epoch": 1.74, + "learning_rate": 3.949059747679124e-07, + "logits/chosen": 0.6093908548355103, + "logits/rejected": 1.4320948123931885, + "logps/chosen": -416.2469177246094, + "logps/rejected": -563.9771118164062, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.77686071395874, + "rewards/margins": 17.765539169311523, + "rewards/rejected": -23.54239845275879, + "step": 2700 + }, + { + "epoch": 1.74, + "eval_logits/chosen": -0.009464547038078308, + "eval_logits/rejected": 0.5373153686523438, + "eval_logps/chosen": -408.0279541015625, + "eval_logps/rejected": -531.8214721679688, + "eval_loss": 0.14935775101184845, + "eval_rewards/accuracies": 0.9296875, + "eval_rewards/chosen": -7.866490364074707, + "eval_rewards/margins": 13.680585861206055, + "eval_rewards/rejected": -21.547077178955078, + "eval_runtime": 76.6739, + "eval_samples_per_second": 13.042, + "eval_steps_per_second": 0.417, + "step": 2700 + }, + { + "epoch": 1.74, + "learning_rate": 3.943108783622947e-07, + "logits/chosen": 0.5636851787567139, + "logits/rejected": 0.969277024269104, + "logps/chosen": -465.9778747558594, + "logps/rejected": -580.30810546875, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.8137688636779785, + "rewards/margins": 17.516647338867188, + "rewards/rejected": -25.330415725708008, + "step": 2710 + }, + { + "epoch": 1.75, + "learning_rate": 3.9371578195667697e-07, + "logits/chosen": 0.28570881485939026, + "logits/rejected": 1.0974262952804565, + "logps/chosen": -420.02520751953125, + "logps/rejected": -557.4937744140625, + "loss": 0.0242, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.895697116851807, + "rewards/margins": 18.642253875732422, + "rewards/rejected": -24.53795051574707, + "step": 2720 + }, + { + "epoch": 1.75, + "learning_rate": 3.9312068555105927e-07, + "logits/chosen": 0.2833688259124756, + "logits/rejected": 1.2028071880340576, + "logps/chosen": -428.30242919921875, + "logps/rejected": -601.4442138671875, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.424818992614746, + "rewards/margins": 16.82640838623047, + "rewards/rejected": -22.2512264251709, + "step": 2730 + }, + { + "epoch": 1.76, + "learning_rate": 3.9252558914544156e-07, + "logits/chosen": -0.17283181846141815, + "logits/rejected": 0.6285573840141296, + "logps/chosen": -407.96063232421875, + "logps/rejected": -656.8048095703125, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.4999213218688965, + "rewards/margins": 18.290767669677734, + "rewards/rejected": -23.790691375732422, + "step": 2740 + }, + { + "epoch": 1.77, + "learning_rate": 3.919304927398238e-07, + "logits/chosen": 0.1602792888879776, + "logits/rejected": 0.6874667406082153, + "logps/chosen": -375.77960205078125, + "logps/rejected": -546.3209228515625, + "loss": 0.0288, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.720765113830566, + "rewards/margins": 16.913997650146484, + "rewards/rejected": -22.634761810302734, + "step": 2750 + }, + { + "epoch": 1.77, + "learning_rate": 3.9133539633420615e-07, + "logits/chosen": 0.2454497367143631, + "logits/rejected": 0.6986191272735596, + "logps/chosen": -390.337890625, + "logps/rejected": -560.3748779296875, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.7264814376831055, + "rewards/margins": 17.356287002563477, + "rewards/rejected": -23.082767486572266, + "step": 2760 + }, + { + "epoch": 1.78, + "learning_rate": 3.9074029992858845e-07, + "logits/chosen": 0.38854673504829407, + "logits/rejected": 1.452370047569275, + "logps/chosen": -415.73297119140625, + "logps/rejected": -565.9852294921875, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.159702777862549, + "rewards/margins": 17.43438148498535, + "rewards/rejected": -23.594083786010742, + "step": 2770 + }, + { + "epoch": 1.79, + "learning_rate": 3.901452035229707e-07, + "logits/chosen": 0.7071353197097778, + "logits/rejected": 1.089143991470337, + "logps/chosen": -402.5384216308594, + "logps/rejected": -598.9990234375, + "loss": 0.0203, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -7.571539402008057, + "rewards/margins": 18.129215240478516, + "rewards/rejected": -25.700754165649414, + "step": 2780 + }, + { + "epoch": 1.79, + "learning_rate": 3.89550107117353e-07, + "logits/chosen": 0.5019019842147827, + "logits/rejected": 1.2631019353866577, + "logps/chosen": -412.53607177734375, + "logps/rejected": -484.9837341308594, + "loss": 0.0231, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.363646984100342, + "rewards/margins": 15.107401847839355, + "rewards/rejected": -19.471050262451172, + "step": 2790 + }, + { + "epoch": 1.8, + "learning_rate": 3.8895501071173533e-07, + "logits/chosen": 0.5883782505989075, + "logits/rejected": 1.2746624946594238, + "logps/chosen": -431.44287109375, + "logps/rejected": -637.4125366210938, + "loss": 0.0335, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.525457859039307, + "rewards/margins": 19.470340728759766, + "rewards/rejected": -25.995798110961914, + "step": 2800 + }, + { + "epoch": 1.8, + "eval_logits/chosen": -0.053867146372795105, + "eval_logits/rejected": 0.6149381399154663, + "eval_logps/chosen": -404.7615051269531, + "eval_logps/rejected": -530.05126953125, + "eval_loss": 0.15537042915821075, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": -7.539842128753662, + "eval_rewards/margins": 13.830206871032715, + "eval_rewards/rejected": -21.37004852294922, + "eval_runtime": 76.7186, + "eval_samples_per_second": 13.035, + "eval_steps_per_second": 0.417, + "step": 2800 + }, + { + "epoch": 1.81, + "learning_rate": 3.8835991430611757e-07, + "logits/chosen": 0.6285505294799805, + "logits/rejected": 1.1924630403518677, + "logps/chosen": -426.795166015625, + "logps/rejected": -537.3714599609375, + "loss": 0.0291, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.4925079345703125, + "rewards/margins": 17.20037078857422, + "rewards/rejected": -23.69287872314453, + "step": 2810 + }, + { + "epoch": 1.81, + "learning_rate": 3.8776481790049987e-07, + "logits/chosen": 0.3737705945968628, + "logits/rejected": 1.088739275932312, + "logps/chosen": -472.11669921875, + "logps/rejected": -613.3414916992188, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.82584285736084, + "rewards/margins": 20.265369415283203, + "rewards/rejected": -26.09120750427246, + "step": 2820 + }, + { + "epoch": 1.82, + "learning_rate": 3.8716972149488216e-07, + "logits/chosen": 0.4724366068840027, + "logits/rejected": 1.5313258171081543, + "logps/chosen": -474.59326171875, + "logps/rejected": -591.4876098632812, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.544611930847168, + "rewards/margins": 20.092144012451172, + "rewards/rejected": -25.636754989624023, + "step": 2830 + }, + { + "epoch": 1.83, + "learning_rate": 3.865746250892644e-07, + "logits/chosen": 0.6960526704788208, + "logits/rejected": 1.0484097003936768, + "logps/chosen": -442.74298095703125, + "logps/rejected": -641.082763671875, + "loss": 0.0359, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.976676940917969, + "rewards/margins": 18.82603645324707, + "rewards/rejected": -24.802715301513672, + "step": 2840 + }, + { + "epoch": 1.83, + "learning_rate": 3.8597952868364675e-07, + "logits/chosen": 0.6734079122543335, + "logits/rejected": 1.2869551181793213, + "logps/chosen": -410.61846923828125, + "logps/rejected": -535.3711547851562, + "loss": 0.0256, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.3238630294799805, + "rewards/margins": 16.99979019165039, + "rewards/rejected": -22.323650360107422, + "step": 2850 + }, + { + "epoch": 1.84, + "learning_rate": 3.8538443227802905e-07, + "logits/chosen": 0.39041590690612793, + "logits/rejected": 1.0508239269256592, + "logps/chosen": -455.080322265625, + "logps/rejected": -550.6773681640625, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.037131309509277, + "rewards/margins": 17.071130752563477, + "rewards/rejected": -21.10826301574707, + "step": 2860 + }, + { + "epoch": 1.84, + "learning_rate": 3.847893358724113e-07, + "logits/chosen": 0.21260789036750793, + "logits/rejected": 1.1434606313705444, + "logps/chosen": -434.0419006347656, + "logps/rejected": -555.1299438476562, + "loss": 0.0233, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.177916526794434, + "rewards/margins": 15.855340957641602, + "rewards/rejected": -21.03325843811035, + "step": 2870 + }, + { + "epoch": 1.85, + "learning_rate": 3.841942394667936e-07, + "logits/chosen": 0.26587316393852234, + "logits/rejected": 1.1289548873901367, + "logps/chosen": -388.0738830566406, + "logps/rejected": -598.3447265625, + "loss": 0.0343, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.688809394836426, + "rewards/margins": 18.783828735351562, + "rewards/rejected": -24.472637176513672, + "step": 2880 + }, + { + "epoch": 1.86, + "learning_rate": 3.8359914306117593e-07, + "logits/chosen": 0.9435780644416809, + "logits/rejected": 1.0387550592422485, + "logps/chosen": -420.2930603027344, + "logps/rejected": -602.9673461914062, + "loss": 0.0159, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.682863712310791, + "rewards/margins": 18.17510986328125, + "rewards/rejected": -24.857975006103516, + "step": 2890 + }, + { + "epoch": 1.86, + "learning_rate": 3.8300404665555817e-07, + "logits/chosen": 0.5122434496879578, + "logits/rejected": 1.1984080076217651, + "logps/chosen": -479.920654296875, + "logps/rejected": -683.6166381835938, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.739544868469238, + "rewards/margins": 21.551074981689453, + "rewards/rejected": -28.290618896484375, + "step": 2900 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -0.22228558361530304, + "eval_logits/rejected": 0.4165645241737366, + "eval_logps/chosen": -403.52783203125, + "eval_logps/rejected": -530.5889892578125, + "eval_loss": 0.1386074274778366, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": -7.416477680206299, + "eval_rewards/margins": 14.007347106933594, + "eval_rewards/rejected": -21.423826217651367, + "eval_runtime": 76.7881, + "eval_samples_per_second": 13.023, + "eval_steps_per_second": 0.417, + "step": 2900 + }, + { + "epoch": 1.87, + "learning_rate": 3.8240895024994047e-07, + "logits/chosen": 0.35297220945358276, + "logits/rejected": 1.0394313335418701, + "logps/chosen": -442.6025390625, + "logps/rejected": -655.9295654296875, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.260979175567627, + "rewards/margins": 20.33395004272461, + "rewards/rejected": -26.59493064880371, + "step": 2910 + }, + { + "epoch": 1.88, + "learning_rate": 3.8181385384432276e-07, + "logits/chosen": 0.402191162109375, + "logits/rejected": 1.116523265838623, + "logps/chosen": -393.9117126464844, + "logps/rejected": -693.4283447265625, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.633070468902588, + "rewards/margins": 21.96987533569336, + "rewards/rejected": -26.602941513061523, + "step": 2920 + }, + { + "epoch": 1.88, + "learning_rate": 3.8121875743870506e-07, + "logits/chosen": 0.16651079058647156, + "logits/rejected": 0.6804031133651733, + "logps/chosen": -412.31512451171875, + "logps/rejected": -665.4926147460938, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.822356700897217, + "rewards/margins": 19.13541030883789, + "rewards/rejected": -23.957767486572266, + "step": 2930 + }, + { + "epoch": 1.89, + "learning_rate": 3.8062366103308735e-07, + "logits/chosen": 0.1117088571190834, + "logits/rejected": 1.083188772201538, + "logps/chosen": -369.11370849609375, + "logps/rejected": -570.9990844726562, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.329820156097412, + "rewards/margins": 19.094078063964844, + "rewards/rejected": -22.423898696899414, + "step": 2940 + }, + { + "epoch": 1.9, + "learning_rate": 3.8002856462746965e-07, + "logits/chosen": 0.11049242317676544, + "logits/rejected": 0.9105658531188965, + "logps/chosen": -433.6129455566406, + "logps/rejected": -589.2352294921875, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2948660850524902, + "rewards/margins": 18.181903839111328, + "rewards/rejected": -21.47677230834961, + "step": 2950 + }, + { + "epoch": 1.9, + "learning_rate": 3.794334682218519e-07, + "logits/chosen": -0.10087742656469345, + "logits/rejected": 0.8825875520706177, + "logps/chosen": -401.3361511230469, + "logps/rejected": -503.46710205078125, + "loss": 0.0172, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.8647804260253906, + "rewards/margins": 15.990776062011719, + "rewards/rejected": -19.85555648803711, + "step": 2960 + }, + { + "epoch": 1.91, + "learning_rate": 3.7883837181623424e-07, + "logits/chosen": 0.07600893825292587, + "logits/rejected": 0.9553617238998413, + "logps/chosen": -408.1060485839844, + "logps/rejected": -534.631591796875, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9009666442871094, + "rewards/margins": 17.22827911376953, + "rewards/rejected": -21.12924575805664, + "step": 2970 + }, + { + "epoch": 1.92, + "learning_rate": 3.7824327541061653e-07, + "logits/chosen": 0.2739563286304474, + "logits/rejected": 0.8527243733406067, + "logps/chosen": -437.29638671875, + "logps/rejected": -695.6834106445312, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.193564414978027, + "rewards/margins": 21.4495792388916, + "rewards/rejected": -26.643142700195312, + "step": 2980 + }, + { + "epoch": 1.92, + "learning_rate": 3.776481790049988e-07, + "logits/chosen": 0.43024197220802307, + "logits/rejected": 1.1188406944274902, + "logps/chosen": -386.9818115234375, + "logps/rejected": -554.8717041015625, + "loss": 0.0291, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.847230911254883, + "rewards/margins": 17.605852127075195, + "rewards/rejected": -23.453083038330078, + "step": 2990 + }, + { + "epoch": 1.93, + "learning_rate": 3.7705308259938107e-07, + "logits/chosen": 0.6990832090377808, + "logits/rejected": 1.3339582681655884, + "logps/chosen": -375.6474609375, + "logps/rejected": -568.9274291992188, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8585574626922607, + "rewards/margins": 18.85175323486328, + "rewards/rejected": -22.710311889648438, + "step": 3000 + }, + { + "epoch": 1.93, + "eval_logits/chosen": -0.04803978279232979, + "eval_logits/rejected": 0.6017779111862183, + "eval_logps/chosen": -375.2929992675781, + "eval_logps/rejected": -498.0708312988281, + "eval_loss": 0.1309323012828827, + "eval_rewards/accuracies": 0.953125, + "eval_rewards/chosen": -4.592996120452881, + "eval_rewards/margins": 13.579015731811523, + "eval_rewards/rejected": -18.17201042175293, + "eval_runtime": 76.759, + "eval_samples_per_second": 13.028, + "eval_steps_per_second": 0.417, + "step": 3000 + }, + { + "epoch": 1.93, + "learning_rate": 3.764579861937634e-07, + "logits/chosen": 0.5929206609725952, + "logits/rejected": 1.196090579032898, + "logps/chosen": -398.5981750488281, + "logps/rejected": -553.6544799804688, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.344789028167725, + "rewards/margins": 19.267980575561523, + "rewards/rejected": -23.612768173217773, + "step": 3010 + }, + { + "epoch": 1.94, + "learning_rate": 3.7586288978814566e-07, + "logits/chosen": 0.46780499815940857, + "logits/rejected": 1.3755557537078857, + "logps/chosen": -408.9537658691406, + "logps/rejected": -639.6270141601562, + "loss": 0.0271, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.787570953369141, + "rewards/margins": 19.320308685302734, + "rewards/rejected": -26.107879638671875, + "step": 3020 + }, + { + "epoch": 1.95, + "learning_rate": 3.7526779338252795e-07, + "logits/chosen": 0.47549518942832947, + "logits/rejected": 1.1248539686203003, + "logps/chosen": -458.66864013671875, + "logps/rejected": -612.8165283203125, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.648183345794678, + "rewards/margins": 18.065431594848633, + "rewards/rejected": -23.713613510131836, + "step": 3030 + }, + { + "epoch": 1.95, + "learning_rate": 3.7467269697691025e-07, + "logits/chosen": 0.022746117785573006, + "logits/rejected": 1.0212655067443848, + "logps/chosen": -437.48583984375, + "logps/rejected": -603.9195556640625, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.6978535652160645, + "rewards/margins": 19.632654190063477, + "rewards/rejected": -25.330509185791016, + "step": 3040 + }, + { + "epoch": 1.96, + "learning_rate": 3.740776005712925e-07, + "logits/chosen": -0.12851546704769135, + "logits/rejected": 0.7596645951271057, + "logps/chosen": -444.29461669921875, + "logps/rejected": -648.9785766601562, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.788307189941406, + "rewards/margins": 18.562524795532227, + "rewards/rejected": -24.350830078125, + "step": 3050 + }, + { + "epoch": 1.97, + "learning_rate": 3.7348250416567484e-07, + "logits/chosen": 0.014139672741293907, + "logits/rejected": 0.8716185688972473, + "logps/chosen": -395.7372131347656, + "logps/rejected": -608.2803955078125, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.413556098937988, + "rewards/margins": 19.46448516845703, + "rewards/rejected": -23.878040313720703, + "step": 3060 + }, + { + "epoch": 1.97, + "learning_rate": 3.7288740776005713e-07, + "logits/chosen": 0.32760128378868103, + "logits/rejected": 0.5136176943778992, + "logps/chosen": -420.23480224609375, + "logps/rejected": -669.7601318359375, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.701635360717773, + "rewards/margins": 19.267168045043945, + "rewards/rejected": -23.96880531311035, + "step": 3070 + }, + { + "epoch": 1.98, + "learning_rate": 3.722923113544394e-07, + "logits/chosen": 0.4951232373714447, + "logits/rejected": 1.083812952041626, + "logps/chosen": -412.07464599609375, + "logps/rejected": -682.0787353515625, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.654479026794434, + "rewards/margins": 18.33094596862793, + "rewards/rejected": -22.985427856445312, + "step": 3080 + }, + { + "epoch": 1.99, + "learning_rate": 3.7169721494882167e-07, + "logits/chosen": 0.7839171290397644, + "logits/rejected": 1.5858978033065796, + "logps/chosen": -398.9652099609375, + "logps/rejected": -555.3340454101562, + "loss": 0.022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.493814945220947, + "rewards/margins": 17.03528594970703, + "rewards/rejected": -21.52910041809082, + "step": 3090 + }, + { + "epoch": 1.99, + "learning_rate": 3.71102118543204e-07, + "logits/chosen": 0.8546144366264343, + "logits/rejected": 1.6159175634384155, + "logps/chosen": -427.02978515625, + "logps/rejected": -589.3820190429688, + "loss": 0.0187, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.833371639251709, + "rewards/margins": 18.236379623413086, + "rewards/rejected": -25.069753646850586, + "step": 3100 + }, + { + "epoch": 1.99, + "eval_logits/chosen": 0.0045381635427474976, + "eval_logits/rejected": 0.7645629048347473, + "eval_logps/chosen": -400.106201171875, + "eval_logps/rejected": -530.0531616210938, + "eval_loss": 0.13586518168449402, + "eval_rewards/accuracies": 0.953125, + "eval_rewards/chosen": -7.074316024780273, + "eval_rewards/margins": 14.295927047729492, + "eval_rewards/rejected": -21.370241165161133, + "eval_runtime": 76.6017, + "eval_samples_per_second": 13.055, + "eval_steps_per_second": 0.418, + "step": 3100 + }, + { + "epoch": 2.0, + "learning_rate": 3.7050702213758626e-07, + "logits/chosen": 0.6572461128234863, + "logits/rejected": 1.7969856262207031, + "logps/chosen": -398.08453369140625, + "logps/rejected": -607.6263427734375, + "loss": 0.0082, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.214932918548584, + "rewards/margins": 21.25518035888672, + "rewards/rejected": -27.470117568969727, + "step": 3110 + } + ], + "logging_steps": 10, + "max_steps": 9336, + "num_train_epochs": 6, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}