{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9988002399520095, "eval_steps": 10000, "global_step": 1666, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 5.988023952095808e-08, "logits/chosen": 0.08604730665683746, "logits/rejected": 0.14735615253448486, "logps/chosen": -306.490966796875, "logps/rejected": -284.1272277832031, "loss": 0.3265, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00030196673469617963, "rewards/margins": 0.0003302523400634527, "rewards/rejected": -2.8285570806474425e-05, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.1976047904191617e-07, "logits/chosen": 0.07679140567779541, "logits/rejected": 0.13602732121944427, "logps/chosen": -270.78839111328125, "logps/rejected": -239.93063354492188, "loss": 0.3261, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0004934846656396985, "rewards/margins": 0.0003515507560223341, "rewards/rejected": 0.00014193373499438167, "step": 20 }, { "epoch": 0.04, "learning_rate": 1.7964071856287425e-07, "logits/chosen": 0.08353379368782043, "logits/rejected": 0.18339572846889496, "logps/chosen": -350.9549255371094, "logps/rejected": -297.22540283203125, "loss": 0.3184, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0014796562027186155, "rewards/margins": 0.0019894675351679325, "rewards/rejected": -0.0005098110414110124, "step": 30 }, { "epoch": 0.05, "learning_rate": 2.3952095808383233e-07, "logits/chosen": 0.019822608679533005, "logits/rejected": 0.08296267688274384, "logps/chosen": -319.4927062988281, "logps/rejected": -283.9751892089844, "loss": 0.3256, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.002856344450265169, "rewards/margins": 0.002463708631694317, "rewards/rejected": 0.00039263576036319137, "step": 40 }, { "epoch": 0.06, "learning_rate": 2.9940119760479036e-07, "logits/chosen": 0.08549543470144272, "logits/rejected": 0.17756062746047974, "logps/chosen": -323.5517272949219, "logps/rejected": -260.97430419921875, "loss": 0.334, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.009697018191218376, "rewards/margins": 0.006611070595681667, "rewards/rejected": 0.003085947362706065, "step": 50 }, { "epoch": 0.07, "learning_rate": 3.592814371257485e-07, "logits/chosen": 0.06589554995298386, "logits/rejected": 0.1306503564119339, "logps/chosen": -298.95758056640625, "logps/rejected": -270.60491943359375, "loss": 0.3363, "rewards/accuracies": 0.53125, "rewards/chosen": 0.013030062429606915, "rewards/margins": 0.015959080308675766, "rewards/rejected": -0.002929018810391426, "step": 60 }, { "epoch": 0.08, "learning_rate": 4.191616766467065e-07, "logits/chosen": 0.12031495571136475, "logits/rejected": 0.1805482804775238, "logps/chosen": -324.9089050292969, "logps/rejected": -258.1722106933594, "loss": 0.35, "rewards/accuracies": 0.59375, "rewards/chosen": 0.01291816495358944, "rewards/margins": 0.024499254301190376, "rewards/rejected": -0.011581086553633213, "step": 70 }, { "epoch": 0.1, "learning_rate": 4.790419161676647e-07, "logits/chosen": 0.07129839807748795, "logits/rejected": 0.06257729232311249, "logps/chosen": -348.3440856933594, "logps/rejected": -301.13201904296875, "loss": 0.3534, "rewards/accuracies": 0.5625, "rewards/chosen": 0.020938226953148842, "rewards/margins": 0.04414420947432518, "rewards/rejected": -0.023205982521176338, "step": 80 }, { "epoch": 0.11, "learning_rate": 5.389221556886228e-07, "logits/chosen": 0.08929282426834106, "logits/rejected": 0.17325028777122498, "logps/chosen": -316.1700439453125, "logps/rejected": -239.6669158935547, "loss": 0.3401, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.029331078752875328, "rewards/margins": 0.04176971688866615, "rewards/rejected": -0.012438638135790825, "step": 90 }, { "epoch": 0.12, "learning_rate": 5.988023952095807e-07, "logits/chosen": 0.08235646784305573, "logits/rejected": 0.20089676976203918, "logps/chosen": -346.5120849609375, "logps/rejected": -244.7434844970703, "loss": 0.3598, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.024305399507284164, "rewards/margins": 0.05784136801958084, "rewards/rejected": -0.03353596478700638, "step": 100 }, { "epoch": 0.13, "learning_rate": 6.586826347305389e-07, "logits/chosen": 0.09147349745035172, "logits/rejected": 0.14871755242347717, "logps/chosen": -320.4288024902344, "logps/rejected": -296.7561950683594, "loss": 0.3659, "rewards/accuracies": 0.59375, "rewards/chosen": 0.02884674072265625, "rewards/margins": 0.07738146930932999, "rewards/rejected": -0.04853471741080284, "step": 110 }, { "epoch": 0.14, "learning_rate": 7.18562874251497e-07, "logits/chosen": 0.11941643804311752, "logits/rejected": 0.19315120577812195, "logps/chosen": -307.4304504394531, "logps/rejected": -267.32354736328125, "loss": 0.3478, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.017546942457556725, "rewards/margins": 0.04876334220170975, "rewards/rejected": -0.031216394156217575, "step": 120 }, { "epoch": 0.16, "learning_rate": 7.784431137724551e-07, "logits/chosen": 0.029125332832336426, "logits/rejected": 0.13452430069446564, "logps/chosen": -312.5350646972656, "logps/rejected": -282.9906921386719, "loss": 0.3476, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.00010833759006345645, "rewards/margins": 0.07449211925268173, "rewards/rejected": -0.074600450694561, "step": 130 }, { "epoch": 0.17, "learning_rate": 8.38323353293413e-07, "logits/chosen": 0.07874087244272232, "logits/rejected": 0.1653667837381363, "logps/chosen": -323.27349853515625, "logps/rejected": -291.1534729003906, "loss": 0.3508, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.02968321368098259, "rewards/margins": 0.08127189427614212, "rewards/rejected": -0.05158866569399834, "step": 140 }, { "epoch": 0.18, "learning_rate": 8.982035928143712e-07, "logits/chosen": 0.08668439090251923, "logits/rejected": 0.2019130289554596, "logps/chosen": -331.7438049316406, "logps/rejected": -282.2138671875, "loss": 0.3408, "rewards/accuracies": 0.65625, "rewards/chosen": 0.01081451028585434, "rewards/margins": 0.11802507936954498, "rewards/rejected": -0.10721053928136826, "step": 150 }, { "epoch": 0.19, "learning_rate": 9.580838323353293e-07, "logits/chosen": 0.17280864715576172, "logits/rejected": 0.2055545598268509, "logps/chosen": -274.8600769042969, "logps/rejected": -270.06292724609375, "loss": 0.3387, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0733034759759903, "rewards/margins": 0.10196540504693985, "rewards/rejected": -0.17526885867118835, "step": 160 }, { "epoch": 0.2, "learning_rate": 9.999901172555115e-07, "logits/chosen": 0.11693236976861954, "logits/rejected": 0.22318188846111298, "logps/chosen": -295.5730895996094, "logps/rejected": -241.427978515625, "loss": 0.3278, "rewards/accuracies": 0.59375, "rewards/chosen": -0.10353277623653412, "rewards/margins": 0.08885890245437622, "rewards/rejected": -0.19239167869091034, "step": 170 }, { "epoch": 0.22, "learning_rate": 9.998144348880984e-07, "logits/chosen": 0.13397441804409027, "logits/rejected": 0.17609907686710358, "logps/chosen": -315.9989013671875, "logps/rejected": -284.2611999511719, "loss": 0.2989, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.13113589584827423, "rewards/margins": 0.20865917205810547, "rewards/rejected": -0.3397950530052185, "step": 180 }, { "epoch": 0.23, "learning_rate": 9.994192247951515e-07, "logits/chosen": 0.0798071026802063, "logits/rejected": 0.19820377230644226, "logps/chosen": -370.124755859375, "logps/rejected": -300.3252258300781, "loss": 0.2864, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1878439486026764, "rewards/margins": 0.19493091106414795, "rewards/rejected": -0.38277485966682434, "step": 190 }, { "epoch": 0.24, "learning_rate": 9.988046605602389e-07, "logits/chosen": 0.062232185155153275, "logits/rejected": 0.10917310416698456, "logps/chosen": -371.1097106933594, "logps/rejected": -357.1819763183594, "loss": 0.2606, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.22029490768909454, "rewards/margins": 0.28260841965675354, "rewards/rejected": -0.5029032230377197, "step": 200 }, { "epoch": 0.25, "learning_rate": 9.979710121113161e-07, "logits/chosen": 0.034885063767433167, "logits/rejected": 0.21861211955547333, "logps/chosen": -342.1944885253906, "logps/rejected": -289.22650146484375, "loss": 0.2492, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.2956531047821045, "rewards/margins": 0.1267615407705307, "rewards/rejected": -0.422414630651474, "step": 210 }, { "epoch": 0.26, "learning_rate": 9.969186456021698e-07, "logits/chosen": 0.16665050387382507, "logits/rejected": 0.2314036637544632, "logps/chosen": -332.11944580078125, "logps/rejected": -352.77984619140625, "loss": 0.2453, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5235787630081177, "rewards/margins": 0.11343353986740112, "rewards/rejected": -0.637012243270874, "step": 220 }, { "epoch": 0.28, "learning_rate": 9.956480232515958e-07, "logits/chosen": 0.08058114349842072, "logits/rejected": 0.1627165824174881, "logps/chosen": -358.8059997558594, "logps/rejected": -323.8888854980469, "loss": 0.2318, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4380015432834625, "rewards/margins": 0.17821909487247467, "rewards/rejected": -0.6162205934524536, "step": 230 }, { "epoch": 0.29, "learning_rate": 9.941597031403838e-07, "logits/chosen": 0.006361488252878189, "logits/rejected": 0.14310383796691895, "logps/chosen": -389.48565673828125, "logps/rejected": -322.10211181640625, "loss": 0.2383, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4093681275844574, "rewards/margins": 0.3130263388156891, "rewards/rejected": -0.7223945260047913, "step": 240 }, { "epoch": 0.3, "learning_rate": 9.924543389661986e-07, "logits/chosen": -0.02250669337809086, "logits/rejected": 0.07530005276203156, "logps/chosen": -340.83258056640625, "logps/rejected": -296.439208984375, "loss": 0.2014, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5589783787727356, "rewards/margins": 0.13796114921569824, "rewards/rejected": -0.6969395279884338, "step": 250 }, { "epoch": 0.31, "learning_rate": 9.905326797564637e-07, "logits/chosen": 0.0301060788333416, "logits/rejected": 0.040543533861637115, "logps/chosen": -358.6849365234375, "logps/rejected": -356.5751037597656, "loss": 0.2045, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4870468080043793, "rewards/margins": 0.35052934288978577, "rewards/rejected": -0.8375760912895203, "step": 260 }, { "epoch": 0.32, "learning_rate": 9.883955695393743e-07, "logits/chosen": 0.000649239111226052, "logits/rejected": 0.022925155237317085, "logps/chosen": -414.7745666503906, "logps/rejected": -382.8134765625, "loss": 0.1942, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5404749512672424, "rewards/margins": 0.33123961091041565, "rewards/rejected": -0.8717145919799805, "step": 270 }, { "epoch": 0.34, "learning_rate": 9.860439469731857e-07, "logits/chosen": -0.09281344711780548, "logits/rejected": -0.03658398985862732, "logps/chosen": -405.03375244140625, "logps/rejected": -366.6494140625, "loss": 0.185, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6410386562347412, "rewards/margins": 0.3488469123840332, "rewards/rejected": -0.9898855090141296, "step": 280 }, { "epoch": 0.35, "learning_rate": 9.834788449339357e-07, "logits/chosen": -0.08095192164182663, "logits/rejected": 0.01947859302163124, "logps/chosen": -394.8659362792969, "logps/rejected": -363.68896484375, "loss": 0.1967, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6716563701629639, "rewards/margins": 0.35823893547058105, "rewards/rejected": -1.029895305633545, "step": 290 }, { "epoch": 0.36, "learning_rate": 9.807013900617874e-07, "logits/chosen": -0.0883181020617485, "logits/rejected": -0.04594338685274124, "logps/chosen": -368.65673828125, "logps/rejected": -391.3207092285156, "loss": 0.172, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6309670805931091, "rewards/margins": 0.35369187593460083, "rewards/rejected": -0.9846588969230652, "step": 300 }, { "epoch": 0.37, "learning_rate": 9.777128022661876e-07, "logits/chosen": -0.09762546420097351, "logits/rejected": -0.0024875595699995756, "logps/chosen": -399.7887878417969, "logps/rejected": -358.4689025878906, "loss": 0.1668, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8196405172348022, "rewards/margins": 0.29697737097740173, "rewards/rejected": -1.1166179180145264, "step": 310 }, { "epoch": 0.38, "learning_rate": 9.745143941900607e-07, "logits/chosen": 0.008178139105439186, "logits/rejected": 0.06296094506978989, "logps/chosen": -318.7600402832031, "logps/rejected": -331.10064697265625, "loss": 0.1882, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7780025005340576, "rewards/margins": 0.26487836241722107, "rewards/rejected": -1.0428807735443115, "step": 320 }, { "epoch": 0.4, "learning_rate": 9.711075706332709e-07, "logits/chosen": 0.013791674748063087, "logits/rejected": 0.0160694383084774, "logps/chosen": -388.42034912109375, "logps/rejected": -389.6010437011719, "loss": 0.2253, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7162211537361145, "rewards/margins": 0.36372461915016174, "rewards/rejected": -1.0799458026885986, "step": 330 }, { "epoch": 0.41, "learning_rate": 9.674938279356085e-07, "logits/chosen": -0.03776586428284645, "logits/rejected": 0.06259050965309143, "logps/chosen": -374.6064453125, "logps/rejected": -347.88470458984375, "loss": 0.205, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.534817099571228, "rewards/margins": 0.38740724325180054, "rewards/rejected": -0.9222243428230286, "step": 340 }, { "epoch": 0.42, "learning_rate": 9.636747533195696e-07, "logits/chosen": -0.01885460875928402, "logits/rejected": 0.02142554149031639, "logps/chosen": -394.37554931640625, "logps/rejected": -375.2349853515625, "loss": 0.1826, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6455094218254089, "rewards/margins": 0.3467075526714325, "rewards/rejected": -0.992216944694519, "step": 350 }, { "epoch": 0.43, "learning_rate": 9.596520241932198e-07, "logits/chosen": -0.017878394573926926, "logits/rejected": 0.017517492175102234, "logps/chosen": -352.70306396484375, "logps/rejected": -338.069091796875, "loss": 0.19, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.691541314125061, "rewards/margins": 0.2788214087486267, "rewards/rejected": -0.970362663269043, "step": 360 }, { "epoch": 0.44, "learning_rate": 9.554274074134438e-07, "logits/chosen": 0.021682120859622955, "logits/rejected": -0.029350418597459793, "logps/chosen": -371.61688232421875, "logps/rejected": -414.383056640625, "loss": 0.1841, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7872886657714844, "rewards/margins": 0.4845043122768402, "rewards/rejected": -1.271793007850647, "step": 370 }, { "epoch": 0.46, "learning_rate": 9.510027585099106e-07, "logits/chosen": -0.12592732906341553, "logits/rejected": -0.046247534453868866, "logps/chosen": -406.028564453125, "logps/rejected": -401.7203674316406, "loss": 0.163, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9996460676193237, "rewards/margins": 0.5138468742370605, "rewards/rejected": -1.5134929418563843, "step": 380 }, { "epoch": 0.47, "learning_rate": 9.463800208700903e-07, "logits/chosen": -0.08930721133947372, "logits/rejected": -0.05070197582244873, "logps/chosen": -451.42279052734375, "logps/rejected": -448.52239990234375, "loss": 0.1745, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8742043375968933, "rewards/margins": 0.6940380334854126, "rewards/rejected": -1.5682423114776611, "step": 390 }, { "epoch": 0.48, "learning_rate": 9.415612248856824e-07, "logits/chosen": -0.05381837487220764, "logits/rejected": 0.003485634922981262, "logps/chosen": -368.0176086425781, "logps/rejected": -357.1371765136719, "loss": 0.1836, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7229073643684387, "rewards/margins": 0.3449392020702362, "rewards/rejected": -1.067846655845642, "step": 400 }, { "epoch": 0.49, "learning_rate": 9.365484870608296e-07, "logits/chosen": 0.029063940048217773, "logits/rejected": 0.097762331366539, "logps/chosen": -411.65362548828125, "logps/rejected": -373.55780029296875, "loss": 0.1899, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.8145805597305298, "rewards/margins": 0.2862567603588104, "rewards/rejected": -1.100837230682373, "step": 410 }, { "epoch": 0.5, "learning_rate": 9.313440090825118e-07, "logits/chosen": -0.05664276331663132, "logits/rejected": -0.03467511385679245, "logps/chosen": -398.04229736328125, "logps/rejected": -385.92242431640625, "loss": 0.1805, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8589332699775696, "rewards/margins": 0.4059682786464691, "rewards/rejected": -1.2649013996124268, "step": 420 }, { "epoch": 0.52, "learning_rate": 9.259500768535226e-07, "logits/chosen": -0.08377309143543243, "logits/rejected": -0.07356628775596619, "logps/chosen": -447.7481994628906, "logps/rejected": -414.64691162109375, "loss": 0.1551, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9274501800537109, "rewards/margins": 0.39995989203453064, "rewards/rejected": -1.3274099826812744, "step": 430 }, { "epoch": 0.53, "learning_rate": 9.203690594884599e-07, "logits/chosen": -0.12356811761856079, "logits/rejected": -0.022806715220212936, "logps/chosen": -456.0494689941406, "logps/rejected": -365.2657165527344, "loss": 0.1655, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7116681337356567, "rewards/margins": 0.41433924436569214, "rewards/rejected": -1.126007318496704, "step": 440 }, { "epoch": 0.54, "learning_rate": 9.146034082731666e-07, "logits/chosen": -0.09179284423589706, "logits/rejected": -0.0003077193978242576, "logps/chosen": -437.27130126953125, "logps/rejected": -412.7220764160156, "loss": 0.1777, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8441268801689148, "rewards/margins": 0.415422260761261, "rewards/rejected": -1.2595491409301758, "step": 450 }, { "epoch": 0.55, "learning_rate": 9.086556555880808e-07, "logits/chosen": -0.06583790481090546, "logits/rejected": -0.04182542487978935, "logps/chosen": -418.4779357910156, "logps/rejected": -353.83837890625, "loss": 0.1726, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9192927479743958, "rewards/margins": 0.37663576006889343, "rewards/rejected": -1.295928716659546, "step": 460 }, { "epoch": 0.56, "learning_rate": 9.025284137959672e-07, "logits/chosen": -0.06473040580749512, "logits/rejected": -0.0440254732966423, "logps/chosen": -362.92718505859375, "logps/rejected": -377.1036071777344, "loss": 0.1812, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7919570803642273, "rewards/margins": 0.5987037420272827, "rewards/rejected": -1.3906608819961548, "step": 470 }, { "epoch": 0.58, "learning_rate": 8.962243740945193e-07, "logits/chosen": -0.16925401985645294, "logits/rejected": -0.08770541846752167, "logps/chosen": -388.9159240722656, "logps/rejected": -359.47113037109375, "loss": 0.1672, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6586478352546692, "rewards/margins": 0.4439294934272766, "rewards/rejected": -1.1025774478912354, "step": 480 }, { "epoch": 0.59, "learning_rate": 8.897463053343362e-07, "logits/chosen": -0.15098915994167328, "logits/rejected": -0.09551471471786499, "logps/chosen": -375.2362976074219, "logps/rejected": -377.40374755859375, "loss": 0.1654, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7528899312019348, "rewards/margins": 0.4329261779785156, "rewards/rejected": -1.1858160495758057, "step": 490 }, { "epoch": 0.6, "learning_rate": 8.83097052802791e-07, "logits/chosen": -0.19105754792690277, "logits/rejected": -0.20415177941322327, "logps/chosen": -438.22900390625, "logps/rejected": -413.8765563964844, "loss": 0.1546, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1042355298995972, "rewards/margins": 0.324155330657959, "rewards/rejected": -1.4283908605575562, "step": 500 }, { "epoch": 0.61, "learning_rate": 8.762795369743302e-07, "logits/chosen": -0.1732911914587021, "logits/rejected": -0.14861652255058289, "logps/chosen": -430.3330078125, "logps/rejected": -412.541748046875, "loss": 0.1798, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.980789303779602, "rewards/margins": 0.3685452342033386, "rewards/rejected": -1.349334478378296, "step": 510 }, { "epoch": 0.62, "learning_rate": 8.692967522277452e-07, "logits/chosen": -0.2620302140712738, "logits/rejected": -0.23215556144714355, "logps/chosen": -397.580810546875, "logps/rejected": -401.218017578125, "loss": 0.1675, "rewards/accuracies": 0.65625, "rewards/chosen": -0.922555148601532, "rewards/margins": 0.5658455491065979, "rewards/rejected": -1.4884006977081299, "step": 520 }, { "epoch": 0.64, "learning_rate": 8.621517655309871e-07, "logits/chosen": -0.22505703568458557, "logits/rejected": -0.21353265643119812, "logps/chosen": -351.1321105957031, "logps/rejected": -402.13812255859375, "loss": 0.1616, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9518992304801941, "rewards/margins": 0.576888918876648, "rewards/rejected": -1.5287882089614868, "step": 530 }, { "epoch": 0.65, "learning_rate": 8.548477150940976e-07, "logits/chosen": -0.2001039981842041, "logits/rejected": -0.11327888816595078, "logps/chosen": -493.7047424316406, "logps/rejected": -436.589111328125, "loss": 0.1636, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9714105725288391, "rewards/margins": 0.33609670400619507, "rewards/rejected": -1.3075072765350342, "step": 540 }, { "epoch": 0.66, "learning_rate": 8.473878089908488e-07, "logits/chosen": -0.21582035720348358, "logits/rejected": -0.1645367294549942, "logps/chosen": -398.8905029296875, "logps/rejected": -387.6484680175781, "loss": 0.1616, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.990231990814209, "rewards/margins": 0.44936639070510864, "rewards/rejected": -1.4395983219146729, "step": 550 }, { "epoch": 0.67, "learning_rate": 8.397753237496989e-07, "logits/chosen": -0.14817172288894653, "logits/rejected": -0.1120673194527626, "logps/chosen": -411.04547119140625, "logps/rejected": -383.22772216796875, "loss": 0.1901, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7595073580741882, "rewards/margins": 0.40615516901016235, "rewards/rejected": -1.1656625270843506, "step": 560 }, { "epoch": 0.68, "learning_rate": 8.320136029146792e-07, "logits/chosen": -0.16047896444797516, "logits/rejected": -0.07018055766820908, "logps/chosen": -400.98187255859375, "logps/rejected": -362.17718505859375, "loss": 0.2001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6892865896224976, "rewards/margins": 0.28408753871917725, "rewards/rejected": -0.97337406873703, "step": 570 }, { "epoch": 0.7, "learning_rate": 8.241060555768485e-07, "logits/chosen": -0.0704733356833458, "logits/rejected": -0.06501320749521255, "logps/chosen": -396.9364318847656, "logps/rejected": -431.4285583496094, "loss": 0.1785, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8727092742919922, "rewards/margins": 0.596550464630127, "rewards/rejected": -1.4692598581314087, "step": 580 }, { "epoch": 0.71, "learning_rate": 8.160561548769579e-07, "logits/chosen": -0.28295397758483887, "logits/rejected": -0.14965271949768066, "logps/chosen": -452.00927734375, "logps/rejected": -374.82147216796875, "loss": 0.1728, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9421501159667969, "rewards/margins": 0.42659133672714233, "rewards/rejected": -1.3687413930892944, "step": 590 }, { "epoch": 0.72, "learning_rate": 8.078674364799822e-07, "logits/chosen": -0.14142009615898132, "logits/rejected": -0.04986618459224701, "logps/chosen": -406.5389709472656, "logps/rejected": -386.47332763671875, "loss": 0.177, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.805712878704071, "rewards/margins": 0.3872813880443573, "rewards/rejected": -1.1929943561553955, "step": 600 }, { "epoch": 0.73, "learning_rate": 7.995434970221915e-07, "logits/chosen": -0.17559921741485596, "logits/rejected": -0.11661572754383087, "logps/chosen": -403.9606018066406, "logps/rejected": -384.85552978515625, "loss": 0.1623, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0488908290863037, "rewards/margins": 0.48383307456970215, "rewards/rejected": -1.5327237844467163, "step": 610 }, { "epoch": 0.74, "learning_rate": 7.910879925314412e-07, "logits/chosen": -0.17368339002132416, "logits/rejected": -0.14526596665382385, "logps/chosen": -452.844482421875, "logps/rejected": -468.4239196777344, "loss": 0.1568, "rewards/accuracies": 0.625, "rewards/chosen": -1.206775426864624, "rewards/margins": 0.4726572036743164, "rewards/rejected": -1.6794328689575195, "step": 620 }, { "epoch": 0.76, "learning_rate": 7.825046368213781e-07, "logits/chosen": -0.0924375057220459, "logits/rejected": -0.050202567130327225, "logps/chosen": -395.1549377441406, "logps/rejected": -428.9422302246094, "loss": 0.1588, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0802223682403564, "rewards/margins": 0.5186144113540649, "rewards/rejected": -1.5988366603851318, "step": 630 }, { "epoch": 0.77, "learning_rate": 7.737971998602646e-07, "logits/chosen": -0.12860114872455597, "logits/rejected": -0.08621262013912201, "logps/chosen": -436.1148376464844, "logps/rejected": -423.77197265625, "loss": 0.1665, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0132167339324951, "rewards/margins": 0.6689234972000122, "rewards/rejected": -1.6821401119232178, "step": 640 }, { "epoch": 0.78, "learning_rate": 7.649695061151383e-07, "logits/chosen": -0.10626170784235, "logits/rejected": 0.04015485942363739, "logps/chosen": -422.88482666015625, "logps/rejected": -401.40399169921875, "loss": 0.1549, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.095649003982544, "rewards/margins": 0.6243374943733215, "rewards/rejected": -1.7199863195419312, "step": 650 }, { "epoch": 0.79, "learning_rate": 7.560254328720362e-07, "logits/chosen": -0.06364638358354568, "logits/rejected": -0.053037650883197784, "logps/chosen": -399.8140563964844, "logps/rejected": -377.3336181640625, "loss": 0.1513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9602818489074707, "rewards/margins": 0.49895793199539185, "rewards/rejected": -1.4592397212982178, "step": 660 }, { "epoch": 0.8, "learning_rate": 7.469689085330195e-07, "logits/chosen": -0.10155004262924194, "logits/rejected": -0.053514860570430756, "logps/chosen": -433.72393798828125, "logps/rejected": -445.9632873535156, "loss": 0.1623, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0109504461288452, "rewards/margins": 0.7182562351226807, "rewards/rejected": -1.7292066812515259, "step": 670 }, { "epoch": 0.82, "learning_rate": 7.37803910890746e-07, "logits/chosen": 0.04309063404798508, "logits/rejected": 0.05086972564458847, "logps/chosen": -364.6871643066406, "logps/rejected": -444.87615966796875, "loss": 0.1664, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9387286305427551, "rewards/margins": 0.5190192461013794, "rewards/rejected": -1.4577480554580688, "step": 680 }, { "epoch": 0.83, "learning_rate": 7.285344653813504e-07, "logits/chosen": 0.005590127781033516, "logits/rejected": 0.05527013540267944, "logps/chosen": -420.50421142578125, "logps/rejected": -392.6619567871094, "loss": 0.1652, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7855395078659058, "rewards/margins": 0.5350450873374939, "rewards/rejected": -1.3205845355987549, "step": 690 }, { "epoch": 0.84, "learning_rate": 7.19164643316399e-07, "logits/chosen": -0.14258572459220886, "logits/rejected": -0.07622213661670685, "logps/chosen": -450.7547302246094, "logps/rejected": -421.08740234375, "loss": 0.1606, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8017450571060181, "rewards/margins": 0.7106142044067383, "rewards/rejected": -1.5123592615127563, "step": 700 }, { "epoch": 0.85, "learning_rate": 7.096985600946937e-07, "logits/chosen": -0.07260491698980331, "logits/rejected": -0.03210210055112839, "logps/chosen": -395.57122802734375, "logps/rejected": -388.82244873046875, "loss": 0.1695, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0168393850326538, "rewards/margins": 0.4624442160129547, "rewards/rejected": -1.4792835712432861, "step": 710 }, { "epoch": 0.86, "learning_rate": 7.001403733947133e-07, "logits/chosen": -0.07182411104440689, "logits/rejected": -0.03923854976892471, "logps/chosen": -382.0772399902344, "logps/rejected": -392.26947021484375, "loss": 0.1581, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0459500551223755, "rewards/margins": 0.514189600944519, "rewards/rejected": -1.560139536857605, "step": 720 }, { "epoch": 0.88, "learning_rate": 6.904942813484846e-07, "logits/chosen": -0.011922065168619156, "logits/rejected": -0.011497074738144875, "logps/chosen": -387.7651672363281, "logps/rejected": -401.37579345703125, "loss": 0.1647, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9897109270095825, "rewards/margins": 0.43441686034202576, "rewards/rejected": -1.4241278171539307, "step": 730 }, { "epoch": 0.89, "learning_rate": 6.807645206976847e-07, "logits/chosen": -0.10563385486602783, "logits/rejected": -0.026922887191176414, "logps/chosen": -463.51904296875, "logps/rejected": -396.99237060546875, "loss": 0.1522, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9849216341972351, "rewards/margins": 0.29621127247810364, "rewards/rejected": -1.2811329364776611, "step": 740 }, { "epoch": 0.9, "learning_rate": 6.709553649327864e-07, "logits/chosen": -0.12097591161727905, "logits/rejected": -0.08931994438171387, "logps/chosen": -408.4173889160156, "logps/rejected": -437.5682067871094, "loss": 0.1462, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0879547595977783, "rewards/margins": 0.5936441421508789, "rewards/rejected": -1.6815989017486572, "step": 750 }, { "epoch": 0.91, "learning_rate": 6.610711224160624e-07, "logits/chosen": -0.04705732315778732, "logits/rejected": -0.07320089638233185, "logps/chosen": -412.73016357421875, "logps/rejected": -459.07379150390625, "loss": 0.1555, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0964686870574951, "rewards/margins": 0.4577638506889343, "rewards/rejected": -1.5542323589324951, "step": 760 }, { "epoch": 0.92, "learning_rate": 6.51116134489272e-07, "logits/chosen": -0.1511673927307129, "logits/rejected": -0.09142941236495972, "logps/chosen": -437.65899658203125, "logps/rejected": -391.7604675292969, "loss": 0.1633, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8741817474365234, "rewards/margins": 0.5760532021522522, "rewards/rejected": -1.4502347707748413, "step": 770 }, { "epoch": 0.94, "learning_rate": 6.410947735668653e-07, "logits/chosen": -0.0881911963224411, "logits/rejected": -0.03626961261034012, "logps/chosen": -463.324951171875, "logps/rejected": -437.611328125, "loss": 0.1497, "rewards/accuracies": 0.71875, "rewards/chosen": -1.101100206375122, "rewards/margins": 0.6774808168411255, "rewards/rejected": -1.778580904006958, "step": 780 }, { "epoch": 0.95, "learning_rate": 6.310114412155368e-07, "logits/chosen": -0.12566931545734406, "logits/rejected": -0.05288320779800415, "logps/chosen": -433.12139892578125, "logps/rejected": -450.35491943359375, "loss": 0.1387, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1941142082214355, "rewards/margins": 0.65628582239151, "rewards/rejected": -1.8503999710083008, "step": 790 }, { "epoch": 0.96, "learning_rate": 6.208705662209762e-07, "logits/chosen": -0.15951304137706757, "logits/rejected": -0.08058343082666397, "logps/chosen": -434.4078063964844, "logps/rejected": -420.0638122558594, "loss": 0.1476, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1902692317962646, "rewards/margins": 0.41559162735939026, "rewards/rejected": -1.605860948562622, "step": 800 }, { "epoch": 0.97, "learning_rate": 6.106766026426648e-07, "logits/chosen": -0.17886283993721008, "logits/rejected": -0.07015601545572281, "logps/chosen": -441.0155334472656, "logps/rejected": -408.666748046875, "loss": 0.1398, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0972322225570679, "rewards/margins": 0.6500229239463806, "rewards/rejected": -1.7472550868988037, "step": 810 }, { "epoch": 0.98, "learning_rate": 6.004340278575695e-07, "logits/chosen": -0.11742790788412094, "logits/rejected": -0.08097358047962189, "logps/chosen": -481.71258544921875, "logps/rejected": -490.98876953125, "loss": 0.13, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1779887676239014, "rewards/margins": 0.5510334372520447, "rewards/rejected": -1.7290220260620117, "step": 820 }, { "epoch": 1.0, "learning_rate": 5.901473405935966e-07, "logits/chosen": -0.07999231666326523, "logits/rejected": -0.0074430713430047035, "logps/chosen": -432.86981201171875, "logps/rejected": -416.0740661621094, "loss": 0.1415, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1524440050125122, "rewards/margins": 0.4403459429740906, "rewards/rejected": -1.592789888381958, "step": 830 }, { "epoch": 1.01, "learning_rate": 5.798210589536672e-07, "logits/chosen": -0.06455966830253601, "logits/rejected": -0.09206490218639374, "logps/chosen": -372.489501953125, "logps/rejected": -415.30987548828125, "loss": 0.1047, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.02300226688385, "rewards/margins": 0.6669188737869263, "rewards/rejected": -1.6899211406707764, "step": 840 }, { "epoch": 1.02, "learning_rate": 5.694597184312832e-07, "logits/chosen": -0.1633467972278595, "logits/rejected": -0.11246392875909805, "logps/chosen": -488.162353515625, "logps/rejected": -509.6139221191406, "loss": 0.0701, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3693019151687622, "rewards/margins": 1.0748904943466187, "rewards/rejected": -2.444192409515381, "step": 850 }, { "epoch": 1.03, "learning_rate": 5.590678699184552e-07, "logits/chosen": -0.16048532724380493, "logits/rejected": -0.1280759871006012, "logps/chosen": -512.06103515625, "logps/rejected": -648.7377319335938, "loss": 0.0457, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.176934242248535, "rewards/margins": 1.2973973751068115, "rewards/rejected": -3.4743316173553467, "step": 860 }, { "epoch": 1.04, "learning_rate": 5.486500777068659e-07, "logits/chosen": -0.09887855499982834, "logits/rejected": -0.06003720685839653, "logps/chosen": -567.089111328125, "logps/rejected": -624.2326049804688, "loss": 0.0408, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2174699306488037, "rewards/margins": 1.4321911334991455, "rewards/rejected": -3.6496613025665283, "step": 870 }, { "epoch": 1.06, "learning_rate": 5.382109174831493e-07, "logits/chosen": -0.08137073367834091, "logits/rejected": -0.01235194131731987, "logps/chosen": -535.8220825195312, "logps/rejected": -577.6448974609375, "loss": 0.0384, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.318791389465332, "rewards/margins": 0.9558296203613281, "rewards/rejected": -3.2746212482452393, "step": 880 }, { "epoch": 1.07, "learning_rate": 5.277549743191652e-07, "logits/chosen": -0.05193132162094116, "logits/rejected": -0.004759219475090504, "logps/chosen": -552.9955444335938, "logps/rejected": -598.2587890625, "loss": 0.0363, "rewards/accuracies": 0.75, "rewards/chosen": -2.1983516216278076, "rewards/margins": 1.213283896446228, "rewards/rejected": -3.411635637283325, "step": 890 }, { "epoch": 1.08, "learning_rate": 5.172868406581501e-07, "logits/chosen": -0.03779071569442749, "logits/rejected": 0.056744299829006195, "logps/chosen": -592.2105712890625, "logps/rejected": -628.3460693359375, "loss": 0.0302, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.635134220123291, "rewards/margins": 1.1256375312805176, "rewards/rejected": -3.760772228240967, "step": 900 }, { "epoch": 1.09, "learning_rate": 5.068111142976319e-07, "logits/chosen": -0.12420773506164551, "logits/rejected": -0.029382145032286644, "logps/chosen": -597.504638671875, "logps/rejected": -654.6985473632812, "loss": 0.0294, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.694098949432373, "rewards/margins": 1.2260013818740845, "rewards/rejected": -3.920100450515747, "step": 910 }, { "epoch": 1.1, "learning_rate": 4.963323963699926e-07, "logits/chosen": -0.023588549345731735, "logits/rejected": 0.0034168108832091093, "logps/chosen": -603.0902099609375, "logps/rejected": -727.82861328125, "loss": 0.0216, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0236687660217285, "rewards/margins": 1.304715633392334, "rewards/rejected": -4.3283843994140625, "step": 920 }, { "epoch": 1.12, "learning_rate": 4.858552893215655e-07, "logits/chosen": -0.007192631717771292, "logits/rejected": 0.07475622743368149, "logps/chosen": -670.5064697265625, "logps/rejected": -776.722900390625, "loss": 0.0176, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2482478618621826, "rewards/margins": 1.7695789337158203, "rewards/rejected": -5.017827033996582, "step": 930 }, { "epoch": 1.13, "learning_rate": 4.753843948911556e-07, "logits/chosen": 0.12182103097438812, "logits/rejected": 0.03238976001739502, "logps/chosen": -625.9619750976562, "logps/rejected": -750.8270263671875, "loss": 0.0191, "rewards/accuracies": 0.75, "rewards/chosen": -3.034609317779541, "rewards/margins": 1.7803478240966797, "rewards/rejected": -4.814957141876221, "step": 940 }, { "epoch": 1.14, "learning_rate": 4.649243120888722e-07, "logits/chosen": 0.011549100279808044, "logits/rejected": 0.03488563746213913, "logps/chosen": -563.0631103515625, "logps/rejected": -704.4915161132812, "loss": 0.0259, "rewards/accuracies": 0.78125, "rewards/chosen": -2.4957776069641113, "rewards/margins": 1.694275140762329, "rewards/rejected": -4.190052509307861, "step": 950 }, { "epoch": 1.15, "learning_rate": 4.544796351761574e-07, "logits/chosen": -0.055170875042676926, "logits/rejected": 0.028366830199956894, "logps/chosen": -607.5286865234375, "logps/rejected": -650.486572265625, "loss": 0.0332, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.490236282348633, "rewards/margins": 1.102222204208374, "rewards/rejected": -3.592458724975586, "step": 960 }, { "epoch": 1.16, "learning_rate": 4.440549516479022e-07, "logits/chosen": -0.015740562230348587, "logits/rejected": 0.13129273056983948, "logps/chosen": -615.1241455078125, "logps/rejected": -619.5276489257812, "loss": 0.0316, "rewards/accuracies": 0.78125, "rewards/chosen": -2.452130079269409, "rewards/margins": 1.2859001159667969, "rewards/rejected": -3.738029956817627, "step": 970 }, { "epoch": 1.18, "learning_rate": 4.336548402175345e-07, "logits/chosen": 0.025289198383688927, "logits/rejected": 0.15218539535999298, "logps/chosen": -558.3017578125, "logps/rejected": -641.06494140625, "loss": 0.0211, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.584613800048828, "rewards/margins": 1.562873125076294, "rewards/rejected": -4.147486686706543, "step": 980 }, { "epoch": 1.19, "learning_rate": 4.232838688059627e-07, "logits/chosen": 0.1965368539094925, "logits/rejected": 0.2399691641330719, "logps/chosen": -523.7330322265625, "logps/rejected": -633.6004028320312, "loss": 0.0231, "rewards/accuracies": 0.75, "rewards/chosen": -2.6117119789123535, "rewards/margins": 1.3616979122161865, "rewards/rejected": -3.973410129547119, "step": 990 }, { "epoch": 1.2, "learning_rate": 4.129465925352618e-07, "logits/chosen": 0.19317726790905, "logits/rejected": 0.25055578351020813, "logps/chosen": -577.004150390625, "logps/rejected": -663.4541015625, "loss": 0.0246, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.840785503387451, "rewards/margins": 1.2757389545440674, "rewards/rejected": -4.1165242195129395, "step": 1000 }, { "epoch": 1.21, "learning_rate": 4.0264755172797837e-07, "logits/chosen": 0.12826010584831238, "logits/rejected": 0.30189377069473267, "logps/chosen": -596.4738159179688, "logps/rejected": -651.32666015625, "loss": 0.0217, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.827807664871216, "rewards/margins": 1.3351318836212158, "rewards/rejected": -4.162939071655273, "step": 1010 }, { "epoch": 1.22, "learning_rate": 3.9239126991293775e-07, "logits/chosen": 0.059239018708467484, "logits/rejected": 0.23136253654956818, "logps/chosen": -616.50341796875, "logps/rejected": -693.9109497070312, "loss": 0.021, "rewards/accuracies": 0.75, "rewards/chosen": -2.953068494796753, "rewards/margins": 1.526259183883667, "rewards/rejected": -4.479328155517578, "step": 1020 }, { "epoch": 1.24, "learning_rate": 3.82182251838427e-07, "logits/chosen": 0.25944507122039795, "logits/rejected": 0.26485711336135864, "logps/chosen": -576.2420043945312, "logps/rejected": -677.7175903320312, "loss": 0.025, "rewards/accuracies": 0.78125, "rewards/chosen": -2.817415237426758, "rewards/margins": 1.3848437070846558, "rewards/rejected": -4.202259063720703, "step": 1030 }, { "epoch": 1.25, "learning_rate": 3.720249814936255e-07, "logits/chosen": 0.2337515652179718, "logits/rejected": 0.26738518476486206, "logps/chosen": -571.0965576171875, "logps/rejected": -680.7943725585938, "loss": 0.0222, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.5748653411865234, "rewards/margins": 1.5103648900985718, "rewards/rejected": -4.085230350494385, "step": 1040 }, { "epoch": 1.26, "learning_rate": 3.6192392013915473e-07, "logits/chosen": 0.19163131713867188, "logits/rejected": 0.31179821491241455, "logps/chosen": -614.9716796875, "logps/rejected": -675.076904296875, "loss": 0.0192, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.0146121978759766, "rewards/margins": 1.1077396869659424, "rewards/rejected": -4.12235164642334, "step": 1050 }, { "epoch": 1.27, "learning_rate": 3.5188350434761025e-07, "logits/chosen": 0.28871843218803406, "logits/rejected": 0.36171257495880127, "logps/chosen": -610.6722412109375, "logps/rejected": -736.9325561523438, "loss": 0.0176, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.029247999191284, "rewards/margins": 1.671494483947754, "rewards/rejected": -4.700742244720459, "step": 1060 }, { "epoch": 1.28, "learning_rate": 3.419081440549368e-07, "logits/chosen": 0.3570582866668701, "logits/rejected": 0.33581867814064026, "logps/chosen": -561.0586547851562, "logps/rejected": -679.4564819335938, "loss": 0.0187, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.8346028327941895, "rewards/margins": 1.4775307178497314, "rewards/rejected": -4.3121337890625, "step": 1070 }, { "epoch": 1.3, "learning_rate": 3.3200222062350324e-07, "logits/chosen": 0.3482256233692169, "logits/rejected": 0.41126948595046997, "logps/chosen": -585.4675903320312, "logps/rejected": -710.6563720703125, "loss": 0.0179, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.067091703414917, "rewards/margins": 1.3588765859603882, "rewards/rejected": -4.425968647003174, "step": 1080 }, { "epoch": 1.31, "learning_rate": 3.2217008491772724e-07, "logits/chosen": 0.18900027871131897, "logits/rejected": 0.39844125509262085, "logps/chosen": -633.6024780273438, "logps/rejected": -654.0338745117188, "loss": 0.0193, "rewards/accuracies": 0.75, "rewards/chosen": -2.8086068630218506, "rewards/margins": 1.3810145854949951, "rewards/rejected": -4.189621925354004, "step": 1090 }, { "epoch": 1.32, "learning_rate": 3.124160553930953e-07, "logits/chosen": 0.3631289005279541, "logits/rejected": 0.4466603398323059, "logps/chosen": -614.293701171875, "logps/rejected": -700.2027587890625, "loss": 0.0195, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.86740779876709, "rewards/margins": 1.5678730010986328, "rewards/rejected": -4.435280799865723, "step": 1100 }, { "epoch": 1.33, "learning_rate": 3.027444161994178e-07, "logits/chosen": 0.28234532475471497, "logits/rejected": 0.46483272314071655, "logps/chosen": -621.5269775390625, "logps/rejected": -659.30078125, "loss": 0.0199, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8128793239593506, "rewards/margins": 1.1673749685287476, "rewards/rejected": -3.9802539348602295, "step": 1110 }, { "epoch": 1.34, "learning_rate": 2.9315941529915055e-07, "logits/chosen": 0.3489723205566406, "logits/rejected": 0.4230921268463135, "logps/chosen": -590.6229248046875, "logps/rejected": -712.2610473632812, "loss": 0.0191, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7181546688079834, "rewards/margins": 1.7558250427246094, "rewards/rejected": -4.4739789962768555, "step": 1120 }, { "epoch": 1.36, "learning_rate": 2.8366526260161205e-07, "logits/chosen": 0.33894267678260803, "logits/rejected": 0.39979317784309387, "logps/chosen": -658.9381103515625, "logps/rejected": -717.6463623046875, "loss": 0.0177, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9368855953216553, "rewards/margins": 1.5531091690063477, "rewards/rejected": -4.489995002746582, "step": 1130 }, { "epoch": 1.37, "learning_rate": 2.742661281139129e-07, "logits/chosen": 0.2938409745693207, "logits/rejected": 0.5151162147521973, "logps/chosen": -633.9880981445312, "logps/rejected": -735.9833984375, "loss": 0.0179, "rewards/accuracies": 0.75, "rewards/chosen": -3.052492618560791, "rewards/margins": 1.437638759613037, "rewards/rejected": -4.490131378173828, "step": 1140 }, { "epoch": 1.38, "learning_rate": 2.6496614010941214e-07, "logits/chosen": 0.4524363577365875, "logits/rejected": 0.5436291098594666, "logps/chosen": -619.8587646484375, "logps/rejected": -703.4625244140625, "loss": 0.0157, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.3662285804748535, "rewards/margins": 1.1978504657745361, "rewards/rejected": -4.5640788078308105, "step": 1150 }, { "epoch": 1.39, "learning_rate": 2.557693833145038e-07, "logits/chosen": 0.3934541940689087, "logits/rejected": 0.5870059132575989, "logps/chosen": -661.9193115234375, "logps/rejected": -741.275634765625, "loss": 0.0142, "rewards/accuracies": 0.75, "rewards/chosen": -3.4275155067443848, "rewards/margins": 1.533154845237732, "rewards/rejected": -4.9606709480285645, "step": 1160 }, { "epoch": 1.4, "learning_rate": 2.4667989711452873e-07, "logits/chosen": 0.34201499819755554, "logits/rejected": 0.42435866594314575, "logps/chosen": -641.9842529296875, "logps/rejected": -678.3228759765625, "loss": 0.0154, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.073949098587036, "rewards/margins": 1.3109055757522583, "rewards/rejected": -4.384854316711426, "step": 1170 }, { "epoch": 1.42, "learning_rate": 2.3770167377960237e-07, "logits/chosen": 0.48571348190307617, "logits/rejected": 0.6182373762130737, "logps/chosen": -640.8084716796875, "logps/rejected": -753.9522705078125, "loss": 0.0167, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.2477283477783203, "rewards/margins": 1.6778684854507446, "rewards/rejected": -4.925596714019775, "step": 1180 }, { "epoch": 1.43, "learning_rate": 2.2883865671113633e-07, "logits/chosen": 0.4899370074272156, "logits/rejected": 0.5625158548355103, "logps/chosen": -604.4632568359375, "logps/rejected": -694.26708984375, "loss": 0.0158, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.1623740196228027, "rewards/margins": 1.3024260997772217, "rewards/rejected": -4.464799880981445, "step": 1190 }, { "epoch": 1.44, "learning_rate": 2.200947387098232e-07, "logits/chosen": 0.4543294310569763, "logits/rejected": 0.4793587625026703, "logps/chosen": -605.4785766601562, "logps/rejected": -694.0240478515625, "loss": 0.0134, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.003962993621826, "rewards/margins": 1.3972245454788208, "rewards/rejected": -4.401187419891357, "step": 1200 }, { "epoch": 1.45, "learning_rate": 2.1147376026584757e-07, "logits/chosen": 0.4088858664035797, "logits/rejected": 0.4953531324863434, "logps/chosen": -677.3485717773438, "logps/rejected": -794.863525390625, "loss": 0.0143, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.3876795768737793, "rewards/margins": 1.775921106338501, "rewards/rejected": -5.163600921630859, "step": 1210 }, { "epoch": 1.46, "learning_rate": 2.0297950787207047e-07, "logits/chosen": 0.36632710695266724, "logits/rejected": 0.48743313550949097, "logps/chosen": -643.0319213867188, "logps/rejected": -722.9007568359375, "loss": 0.0147, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.1978135108947754, "rewards/margins": 1.263356328010559, "rewards/rejected": -4.461170196533203, "step": 1220 }, { "epoch": 1.48, "learning_rate": 1.9461571236093288e-07, "logits/chosen": 0.4280903935432434, "logits/rejected": 0.6009167432785034, "logps/chosen": -613.9682006835938, "logps/rejected": -689.1741333007812, "loss": 0.0176, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.1748690605163574, "rewards/margins": 1.0791981220245361, "rewards/rejected": -4.2540669441223145, "step": 1230 }, { "epoch": 1.49, "learning_rate": 1.8638604726580476e-07, "logits/chosen": 0.417907178401947, "logits/rejected": 0.6247476935386658, "logps/chosen": -657.6587524414062, "logps/rejected": -703.07470703125, "loss": 0.0182, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0835118293762207, "rewards/margins": 1.2861461639404297, "rewards/rejected": -4.369658470153809, "step": 1240 }, { "epoch": 1.5, "learning_rate": 1.782941272075017e-07, "logits/chosen": 0.4170072674751282, "logits/rejected": 0.6365597248077393, "logps/chosen": -678.8447265625, "logps/rejected": -731.3723754882812, "loss": 0.0154, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.4483695030212402, "rewards/margins": 1.1840156316757202, "rewards/rejected": -4.63238525390625, "step": 1250 }, { "epoch": 1.51, "learning_rate": 1.7034350630667626e-07, "logits/chosen": 0.3995654582977295, "logits/rejected": 0.5743976831436157, "logps/chosen": -584.9893798828125, "logps/rejected": -697.0750732421875, "loss": 0.0175, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.018463611602783, "rewards/margins": 1.5104460716247559, "rewards/rejected": -4.528909683227539, "step": 1260 }, { "epoch": 1.52, "learning_rate": 1.6253767662278345e-07, "logits/chosen": 0.4882670044898987, "logits/rejected": 0.5395201444625854, "logps/chosen": -603.7424926757812, "logps/rejected": -686.8632202148438, "loss": 0.0182, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8431849479675293, "rewards/margins": 1.2739489078521729, "rewards/rejected": -4.117133617401123, "step": 1270 }, { "epoch": 1.54, "learning_rate": 1.548800666203028e-07, "logits/chosen": 0.5481593608856201, "logits/rejected": 0.5967272520065308, "logps/chosen": -568.0536499023438, "logps/rejected": -666.0924072265625, "loss": 0.0186, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.577122926712036, "rewards/margins": 1.5425646305084229, "rewards/rejected": -4.119688034057617, "step": 1280 }, { "epoch": 1.55, "learning_rate": 1.4737403966289385e-07, "logits/chosen": 0.4185335636138916, "logits/rejected": 0.558224618434906, "logps/chosen": -628.1019897460938, "logps/rejected": -670.2452392578125, "loss": 0.0206, "rewards/accuracies": 0.78125, "rewards/chosen": -2.917240619659424, "rewards/margins": 1.3834682703018188, "rewards/rejected": -4.300709247589111, "step": 1290 }, { "epoch": 1.56, "learning_rate": 1.400228925361449e-07, "logits/chosen": 0.456881046295166, "logits/rejected": 0.577447772026062, "logps/chosen": -643.3248901367188, "logps/rejected": -714.2346801757812, "loss": 0.0143, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.2684836387634277, "rewards/margins": 1.3436448574066162, "rewards/rejected": -4.612128257751465, "step": 1300 }, { "epoch": 1.57, "learning_rate": 1.328298539995637e-07, "logits/chosen": 0.5743575096130371, "logits/rejected": 0.5451524257659912, "logps/chosen": -630.0186767578125, "logps/rejected": -750.6807861328125, "loss": 0.0148, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1200015544891357, "rewards/margins": 1.702667236328125, "rewards/rejected": -4.82266902923584, "step": 1310 }, { "epoch": 1.58, "learning_rate": 1.257980833684471e-07, "logits/chosen": 0.606614351272583, "logits/rejected": 0.6577982902526855, "logps/chosen": -708.2349853515625, "logps/rejected": -840.3240966796875, "loss": 0.0138, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.5392088890075684, "rewards/margins": 1.9698632955551147, "rewards/rejected": -5.509071350097656, "step": 1320 }, { "epoch": 1.6, "learning_rate": 1.1893066912625078e-07, "logits/chosen": 0.4186869263648987, "logits/rejected": 0.5709268450737, "logps/chosen": -712.13427734375, "logps/rejected": -783.5457763671875, "loss": 0.0152, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.4307665824890137, "rewards/margins": 1.4622032642364502, "rewards/rejected": -4.892970085144043, "step": 1330 }, { "epoch": 1.61, "learning_rate": 1.1223062756807078e-07, "logits/chosen": 0.5084559917449951, "logits/rejected": 0.6343249678611755, "logps/chosen": -659.8488159179688, "logps/rejected": -722.3294067382812, "loss": 0.0153, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.1651368141174316, "rewards/margins": 1.3635410070419312, "rewards/rejected": -4.528677940368652, "step": 1340 }, { "epoch": 1.62, "learning_rate": 1.0570090147583088e-07, "logits/chosen": 0.4348738193511963, "logits/rejected": 0.6137397289276123, "logps/chosen": -639.1541748046875, "logps/rejected": -713.1842651367188, "loss": 0.0168, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.3276889324188232, "rewards/margins": 1.2510149478912354, "rewards/rejected": -4.578703880310059, "step": 1350 }, { "epoch": 1.63, "learning_rate": 9.934435882575848e-08, "logits/chosen": 0.4675541818141937, "logits/rejected": 0.7556678056716919, "logps/chosen": -631.5349731445312, "logps/rejected": -652.1285400390625, "loss": 0.0157, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.182419538497925, "rewards/margins": 0.9836376309394836, "rewards/rejected": -4.166057586669922, "step": 1360 }, { "epoch": 1.64, "learning_rate": 9.316379152871668e-08, "logits/chosen": 0.7302010655403137, "logits/rejected": 0.8742607831954956, "logps/chosen": -614.2385864257812, "logps/rejected": -697.8746948242188, "loss": 0.0143, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9696247577667236, "rewards/margins": 1.4906537532806396, "rewards/rejected": -4.460278511047363, "step": 1370 }, { "epoch": 1.66, "learning_rate": 8.716191420394509e-08, "logits/chosen": 0.5582844018936157, "logits/rejected": 0.7243419289588928, "logps/chosen": -625.010009765625, "logps/rejected": -712.3367919921875, "loss": 0.0134, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.13081431388855, "rewards/margins": 1.5529491901397705, "rewards/rejected": -4.6837639808654785, "step": 1380 }, { "epoch": 1.67, "learning_rate": 8.134136298674931e-08, "logits/chosen": 0.48102036118507385, "logits/rejected": 0.6001688838005066, "logps/chosen": -685.7195434570312, "logps/rejected": -809.7654418945312, "loss": 0.014, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.4015579223632812, "rewards/margins": 1.7538063526153564, "rewards/rejected": -5.155364036560059, "step": 1390 }, { "epoch": 1.68, "learning_rate": 7.570469437066146e-08, "logits/chosen": 0.5783728957176208, "logits/rejected": 0.5673348307609558, "logps/chosen": -617.6305541992188, "logps/rejected": -682.7364501953125, "loss": 0.016, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.1451942920684814, "rewards/margins": 1.0919114351272583, "rewards/rejected": -4.237105369567871, "step": 1400 }, { "epoch": 1.69, "learning_rate": 7.025438408458106e-08, "logits/chosen": 0.5475814938545227, "logits/rejected": 0.6235382556915283, "logps/chosen": -555.6197509765625, "logps/rejected": -664.3532104492188, "loss": 0.0146, "rewards/accuracies": 0.78125, "rewards/chosen": -2.915714740753174, "rewards/margins": 1.3652766942977905, "rewards/rejected": -4.280991554260254, "step": 1410 }, { "epoch": 1.7, "learning_rate": 6.49928260053893e-08, "logits/chosen": 0.5400280952453613, "logits/rejected": 0.5789826512336731, "logps/chosen": -614.4274291992188, "logps/rejected": -710.9558715820312, "loss": 0.0147, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1186587810516357, "rewards/margins": 1.3737398386001587, "rewards/rejected": -4.492398262023926, "step": 1420 }, { "epoch": 1.72, "learning_rate": 5.992233110651412e-08, "logits/chosen": 0.5550512075424194, "logits/rejected": 0.7411925196647644, "logps/chosen": -650.88330078125, "logps/rejected": -781.0955200195312, "loss": 0.015, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.1585283279418945, "rewards/margins": 1.8404815196990967, "rewards/rejected": -4.999009609222412, "step": 1430 }, { "epoch": 1.73, "learning_rate": 5.504512644290787e-08, "logits/chosen": 0.46736687421798706, "logits/rejected": 0.5480989217758179, "logps/chosen": -680.0697021484375, "logps/rejected": -811.5567626953125, "loss": 0.0137, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.3890538215637207, "rewards/margins": 1.7540241479873657, "rewards/rejected": -5.143077373504639, "step": 1440 }, { "epoch": 1.74, "learning_rate": 5.036335417288373e-08, "logits/chosen": 0.6140528917312622, "logits/rejected": 0.7194653749465942, "logps/chosen": -698.3580322265625, "logps/rejected": -771.7311401367188, "loss": 0.0156, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.3486709594726562, "rewards/margins": 1.5369219779968262, "rewards/rejected": -4.885592460632324, "step": 1450 }, { "epoch": 1.75, "learning_rate": 4.587907061724033e-08, "logits/chosen": 0.5339714288711548, "logits/rejected": 0.7165490984916687, "logps/chosen": -711.3111572265625, "logps/rejected": -760.9708251953125, "loss": 0.0133, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.3206558227539062, "rewards/margins": 1.4652854204177856, "rewards/rejected": -4.785941123962402, "step": 1460 }, { "epoch": 1.76, "learning_rate": 4.1594245356087467e-08, "logits/chosen": 0.699637234210968, "logits/rejected": 0.6114022135734558, "logps/chosen": -597.2353515625, "logps/rejected": -700.9934692382812, "loss": 0.0135, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.9018421173095703, "rewards/margins": 1.5124543905258179, "rewards/rejected": -4.4142961502075195, "step": 1470 }, { "epoch": 1.78, "learning_rate": 3.751076036377071e-08, "logits/chosen": 0.5522537231445312, "logits/rejected": 0.6186084151268005, "logps/chosen": -587.6602783203125, "logps/rejected": -697.8819580078125, "loss": 0.0129, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0739331245422363, "rewards/margins": 1.5802268981933594, "rewards/rejected": -4.654160022735596, "step": 1480 }, { "epoch": 1.79, "learning_rate": 3.363040918227289e-08, "logits/chosen": 0.6091222763061523, "logits/rejected": 0.6912073493003845, "logps/chosen": -643.3858032226562, "logps/rejected": -762.7769775390625, "loss": 0.0144, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.5035293102264404, "rewards/margins": 1.3396353721618652, "rewards/rejected": -4.843164920806885, "step": 1490 }, { "epoch": 1.8, "learning_rate": 2.995489613345753e-08, "logits/chosen": 0.5477187037467957, "logits/rejected": 0.7094759345054626, "logps/chosen": -591.9749755859375, "logps/rejected": -670.8548583984375, "loss": 0.0136, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.1552910804748535, "rewards/margins": 1.3263123035430908, "rewards/rejected": -4.481603622436523, "step": 1500 }, { "epoch": 1.81, "learning_rate": 2.6485835570499494e-08, "logits/chosen": 0.5902107954025269, "logits/rejected": 0.5623574256896973, "logps/chosen": -645.7799072265625, "logps/rejected": -759.1571044921875, "loss": 0.0126, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.3200345039367676, "rewards/margins": 1.6903254985809326, "rewards/rejected": -5.010359764099121, "step": 1510 }, { "epoch": 1.82, "learning_rate": 2.3224751168831048e-08, "logits/chosen": 0.5822694301605225, "logits/rejected": 0.6582841873168945, "logps/chosen": -657.9237670898438, "logps/rejected": -722.8333740234375, "loss": 0.0138, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3156898021698, "rewards/margins": 1.7110404968261719, "rewards/rejected": -5.026730537414551, "step": 1520 }, { "epoch": 1.84, "learning_rate": 2.0173075256915418e-08, "logits/chosen": 0.571466326713562, "logits/rejected": 0.6854621767997742, "logps/chosen": -628.5067138671875, "logps/rejected": -758.9906005859375, "loss": 0.014, "rewards/accuracies": 0.75, "rewards/chosen": -3.305952548980713, "rewards/margins": 1.6585476398468018, "rewards/rejected": -4.964500427246094, "step": 1530 }, { "epoch": 1.85, "learning_rate": 1.7332148187142126e-08, "logits/chosen": 0.42277950048446655, "logits/rejected": 0.6087282299995422, "logps/chosen": -669.0001220703125, "logps/rejected": -746.3592529296875, "loss": 0.0136, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.435054063796997, "rewards/margins": 1.4669480323791504, "rewards/rejected": -4.902002334594727, "step": 1540 }, { "epoch": 1.86, "learning_rate": 1.4703217747118746e-08, "logits/chosen": 0.5524539947509766, "logits/rejected": 0.6938909292221069, "logps/chosen": -664.91796875, "logps/rejected": -753.2789306640625, "loss": 0.0145, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3968822956085205, "rewards/margins": 1.6205644607543945, "rewards/rejected": -5.017446517944336, "step": 1550 }, { "epoch": 1.87, "learning_rate": 1.2287438611620182e-08, "logits/chosen": 0.5591127872467041, "logits/rejected": 0.598191499710083, "logps/chosen": -639.44921875, "logps/rejected": -717.9773559570312, "loss": 0.0126, "rewards/accuracies": 0.75, "rewards/chosen": -3.239208698272705, "rewards/margins": 1.4245617389678955, "rewards/rejected": -4.6637701988220215, "step": 1560 }, { "epoch": 1.88, "learning_rate": 1.0085871835434023e-08, "logits/chosen": 0.5225650072097778, "logits/rejected": 0.7359067797660828, "logps/chosen": -657.2638549804688, "logps/rejected": -744.9007568359375, "loss": 0.0137, "rewards/accuracies": 0.71875, "rewards/chosen": -3.3361191749572754, "rewards/margins": 1.434823751449585, "rewards/rejected": -4.7709431648254395, "step": 1570 }, { "epoch": 1.9, "learning_rate": 8.099484387325494e-09, "logits/chosen": 0.4896848797798157, "logits/rejected": 0.623939037322998, "logps/chosen": -687.8892822265625, "logps/rejected": -697.3175048828125, "loss": 0.0135, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.2713406085968018, "rewards/margins": 1.3368250131607056, "rewards/rejected": -4.608165740966797, "step": 1580 }, { "epoch": 1.91, "learning_rate": 6.3291487253271936e-09, "logits/chosen": 0.5212961435317993, "logits/rejected": 0.6600114107131958, "logps/chosen": -685.01806640625, "logps/rejected": -809.1822509765625, "loss": 0.0146, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.535987377166748, "rewards/margins": 1.684024453163147, "rewards/rejected": -5.2200117111206055, "step": 1590 }, { "epoch": 1.92, "learning_rate": 4.775642413539338e-09, "logits/chosen": 0.5968783497810364, "logits/rejected": 0.6918438076972961, "logps/chosen": -659.1408081054688, "logps/rejected": -735.6487426757812, "loss": 0.0143, "rewards/accuracies": 0.71875, "rewards/chosen": -3.4331729412078857, "rewards/margins": 1.3792767524719238, "rewards/rejected": -4.8124494552612305, "step": 1600 }, { "epoch": 1.93, "learning_rate": 3.4396477806090674e-09, "logits/chosen": 0.4734949469566345, "logits/rejected": 0.6298776268959045, "logps/chosen": -700.484375, "logps/rejected": -780.0468139648438, "loss": 0.0149, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.4217820167541504, "rewards/margins": 1.7249329090118408, "rewards/rejected": -5.146714687347412, "step": 1610 }, { "epoch": 1.94, "learning_rate": 2.321751620039447e-09, "logits/chosen": 0.5043723583221436, "logits/rejected": 0.6294914484024048, "logps/chosen": -652.2603149414062, "logps/rejected": -759.38818359375, "loss": 0.0138, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.322589874267578, "rewards/margins": 1.9566528797149658, "rewards/rejected": -5.279242992401123, "step": 1620 }, { "epoch": 1.96, "learning_rate": 1.422444932458633e-09, "logits/chosen": 0.5357510447502136, "logits/rejected": 0.6437471508979797, "logps/chosen": -695.9456176757812, "logps/rejected": -788.6571044921875, "loss": 0.0133, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4196677207946777, "rewards/margins": 1.7799441814422607, "rewards/rejected": -5.199612617492676, "step": 1630 }, { "epoch": 1.97, "learning_rate": 7.421227099634886e-10, "logits/chosen": 0.5378649234771729, "logits/rejected": 0.6602402925491333, "logps/chosen": -664.1305541992188, "logps/rejected": -798.1599731445312, "loss": 0.0119, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.2800285816192627, "rewards/margins": 1.8218231201171875, "rewards/rejected": -5.1018524169921875, "step": 1640 }, { "epoch": 1.98, "learning_rate": 2.8108376263175083e-10, "logits/chosen": 0.6657751202583313, "logits/rejected": 0.7019415497779846, "logps/chosen": -635.3778076171875, "logps/rejected": -713.1087036132812, "loss": 0.0138, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.1780152320861816, "rewards/margins": 1.308586835861206, "rewards/rejected": -4.486601829528809, "step": 1650 }, { "epoch": 1.99, "learning_rate": 3.953058727912406e-11, "logits/chosen": 0.5478759407997131, "logits/rejected": 0.7440884113311768, "logps/chosen": -651.7216796875, "logps/rejected": -715.89599609375, "loss": 0.0144, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.3715739250183105, "rewards/margins": 1.3552334308624268, "rewards/rejected": -4.726807594299316, "step": 1660 }, { "epoch": 2.0, "step": 1666, "total_flos": 0.0, "train_loss": 0.11590367368682951, "train_runtime": 24766.7088, "train_samples_per_second": 8.615, "train_steps_per_second": 0.067 } ], "logging_steps": 10, "max_steps": 1666, "num_train_epochs": 2, "save_steps": 10000, "total_flos": 0.0, "trial_name": null, "trial_params": null }