{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 4164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007204610951008645, "grad_norm": 14.58157977905889, "learning_rate": 1.199040767386091e-10, "logits/chosen": -1.901450514793396, "logits/rejected": -1.9076323509216309, "logps/chosen": -0.8524526953697205, "logps/rejected": -0.9626365900039673, "loss": 1.1927, "rewards/accuracies": 0.5, "rewards/chosen": -1.704905390739441, "rewards/margins": 0.22036786377429962, "rewards/rejected": -1.9252731800079346, "step": 1 }, { "epoch": 0.007204610951008645, "grad_norm": 17.709463159121455, "learning_rate": 1.199040767386091e-09, "logits/chosen": -2.020684242248535, "logits/rejected": -2.0064282417297363, "logps/chosen": -1.0048482418060303, "logps/rejected": -1.1098697185516357, "loss": 1.216, "rewards/accuracies": 0.5208333134651184, "rewards/chosen": -2.0096964836120605, "rewards/margins": 0.21004274487495422, "rewards/rejected": -2.2197394371032715, "step": 10 }, { "epoch": 0.01440922190201729, "grad_norm": 22.640302051500377, "learning_rate": 2.398081534772182e-09, "logits/chosen": -2.021089792251587, "logits/rejected": -2.0176689624786377, "logps/chosen": -1.0516496896743774, "logps/rejected": -1.1834802627563477, "loss": 1.1858, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.103299379348755, "rewards/margins": 0.26366108655929565, "rewards/rejected": -2.3669605255126953, "step": 20 }, { "epoch": 0.021613832853025938, "grad_norm": 17.8606028438409, "learning_rate": 3.597122302158273e-09, "logits/chosen": -1.9866092205047607, "logits/rejected": -1.9793494939804077, "logps/chosen": -1.0540482997894287, "logps/rejected": -1.1519711017608643, "loss": 1.2346, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1080965995788574, "rewards/margins": 0.19584545493125916, "rewards/rejected": -2.3039422035217285, "step": 30 }, { "epoch": 0.02881844380403458, "grad_norm": 19.245572130250604, "learning_rate": 4.796163069544364e-09, "logits/chosen": -2.0317888259887695, "logits/rejected": -2.031811475753784, "logps/chosen": -1.0351777076721191, "logps/rejected": -1.136722207069397, "loss": 1.2355, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0703554153442383, "rewards/margins": 0.20308911800384521, "rewards/rejected": -2.273444414138794, "step": 40 }, { "epoch": 0.03602305475504323, "grad_norm": 14.943806509066846, "learning_rate": 5.995203836930456e-09, "logits/chosen": -1.9625627994537354, "logits/rejected": -1.9631847143173218, "logps/chosen": -0.9414892196655273, "logps/rejected": -1.007533311843872, "loss": 1.2547, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8829784393310547, "rewards/margins": 0.13208839297294617, "rewards/rejected": -2.015066623687744, "step": 50 }, { "epoch": 0.043227665706051875, "grad_norm": 21.528231741291215, "learning_rate": 7.194244604316546e-09, "logits/chosen": -2.033930778503418, "logits/rejected": -2.0294690132141113, "logps/chosen": -1.0896106958389282, "logps/rejected": -1.1459602117538452, "loss": 1.2679, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.1792213916778564, "rewards/margins": 0.1126992255449295, "rewards/rejected": -2.2919204235076904, "step": 60 }, { "epoch": 0.05043227665706052, "grad_norm": 20.70296936549822, "learning_rate": 8.393285371702639e-09, "logits/chosen": -2.0241129398345947, "logits/rejected": -2.0117270946502686, "logps/chosen": -1.1098978519439697, "logps/rejected": -1.204820156097412, "loss": 1.2271, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2197957038879395, "rewards/margins": 0.1898445188999176, "rewards/rejected": -2.409640312194824, "step": 70 }, { "epoch": 0.05763688760806916, "grad_norm": 24.40623296093575, "learning_rate": 9.592326139088728e-09, "logits/chosen": -2.0398144721984863, "logits/rejected": -2.036891222000122, "logps/chosen": -1.1656566858291626, "logps/rejected": -1.237831473350525, "loss": 1.2527, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.331313371658325, "rewards/margins": 0.14434944093227386, "rewards/rejected": -2.47566294670105, "step": 80 }, { "epoch": 0.06484149855907781, "grad_norm": 15.525751311455734, "learning_rate": 1.0791366906474819e-08, "logits/chosen": -2.0057613849639893, "logits/rejected": -2.0072615146636963, "logps/chosen": -1.0418776273727417, "logps/rejected": -1.1488852500915527, "loss": 1.215, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.0837552547454834, "rewards/margins": 0.21401505172252655, "rewards/rejected": -2.2977705001831055, "step": 90 }, { "epoch": 0.07204610951008646, "grad_norm": 19.01739570575657, "learning_rate": 1.1990407673860912e-08, "logits/chosen": -2.0440549850463867, "logits/rejected": -2.038007974624634, "logps/chosen": -1.0073726177215576, "logps/rejected": -1.114424467086792, "loss": 1.2172, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0147452354431152, "rewards/margins": 0.21410349011421204, "rewards/rejected": -2.228848934173584, "step": 100 }, { "epoch": 0.0792507204610951, "grad_norm": 16.468864603689383, "learning_rate": 1.3189448441247003e-08, "logits/chosen": -1.986783742904663, "logits/rejected": -1.975547194480896, "logps/chosen": -1.0294089317321777, "logps/rejected": -1.1291263103485107, "loss": 1.2279, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0588178634643555, "rewards/margins": 0.19943459331989288, "rewards/rejected": -2.2582526206970215, "step": 110 }, { "epoch": 0.08645533141210375, "grad_norm": 18.27069220463476, "learning_rate": 1.4388489208633092e-08, "logits/chosen": -1.9731948375701904, "logits/rejected": -1.9713401794433594, "logps/chosen": -0.9640307426452637, "logps/rejected": -1.0653537511825562, "loss": 1.2087, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9280614852905273, "rewards/margins": 0.2026461362838745, "rewards/rejected": -2.1307075023651123, "step": 120 }, { "epoch": 0.0936599423631124, "grad_norm": 17.232187953156046, "learning_rate": 1.5587529976019183e-08, "logits/chosen": -2.066575527191162, "logits/rejected": -2.065995931625366, "logps/chosen": -1.0801920890808105, "logps/rejected": -1.1521753072738647, "loss": 1.2549, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.160384178161621, "rewards/margins": 0.14396657049655914, "rewards/rejected": -2.3043506145477295, "step": 130 }, { "epoch": 0.10086455331412104, "grad_norm": 20.847348575081657, "learning_rate": 1.6786570743405277e-08, "logits/chosen": -1.9832985401153564, "logits/rejected": -1.9769630432128906, "logps/chosen": -0.9781940579414368, "logps/rejected": -1.122657060623169, "loss": 1.1694, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9563881158828735, "rewards/margins": 0.2889261543750763, "rewards/rejected": -2.245314121246338, "step": 140 }, { "epoch": 0.10806916426512968, "grad_norm": 19.95238793204191, "learning_rate": 1.7985611510791365e-08, "logits/chosen": -1.9963840246200562, "logits/rejected": -1.9920928478240967, "logps/chosen": -1.0187867879867554, "logps/rejected": -1.136918306350708, "loss": 1.2067, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0375735759735107, "rewards/margins": 0.23626303672790527, "rewards/rejected": -2.273836612701416, "step": 150 }, { "epoch": 0.11527377521613832, "grad_norm": 17.4507491502089, "learning_rate": 1.9184652278177456e-08, "logits/chosen": -2.00455904006958, "logits/rejected": -1.9985454082489014, "logps/chosen": -0.9479260444641113, "logps/rejected": -1.0970423221588135, "loss": 1.1509, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8958520889282227, "rewards/margins": 0.2982328534126282, "rewards/rejected": -2.194084644317627, "step": 160 }, { "epoch": 0.12247838616714697, "grad_norm": 22.64495005377011, "learning_rate": 2.038369304556355e-08, "logits/chosen": -2.0030527114868164, "logits/rejected": -1.995448350906372, "logps/chosen": -1.0368740558624268, "logps/rejected": -1.1604634523391724, "loss": 1.2057, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0737481117248535, "rewards/margins": 0.24717874825000763, "rewards/rejected": -2.3209269046783447, "step": 170 }, { "epoch": 0.12968299711815562, "grad_norm": 23.590437364971006, "learning_rate": 2.1582733812949638e-08, "logits/chosen": -2.0346579551696777, "logits/rejected": -2.027749538421631, "logps/chosen": -1.020750641822815, "logps/rejected": -1.1084620952606201, "loss": 1.2476, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.04150128364563, "rewards/margins": 0.17542308568954468, "rewards/rejected": -2.2169241905212402, "step": 180 }, { "epoch": 0.13688760806916425, "grad_norm": 22.966874261128403, "learning_rate": 2.278177458033573e-08, "logits/chosen": -2.073704719543457, "logits/rejected": -2.0714824199676514, "logps/chosen": -0.9697921872138977, "logps/rejected": -1.065453290939331, "loss": 1.212, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9395843744277954, "rewards/margins": 0.19132229685783386, "rewards/rejected": -2.130906581878662, "step": 190 }, { "epoch": 0.1440922190201729, "grad_norm": 22.638490791764895, "learning_rate": 2.3980815347721823e-08, "logits/chosen": -2.0427424907684326, "logits/rejected": -2.0397419929504395, "logps/chosen": -1.0259110927581787, "logps/rejected": -1.1529022455215454, "loss": 1.1871, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0518221855163574, "rewards/margins": 0.2539823353290558, "rewards/rejected": -2.305804491043091, "step": 200 }, { "epoch": 0.15129682997118155, "grad_norm": 21.113736148839788, "learning_rate": 2.517985611510791e-08, "logits/chosen": -2.0403473377227783, "logits/rejected": -2.037600040435791, "logps/chosen": -1.0739350318908691, "logps/rejected": -1.150781273841858, "loss": 1.2504, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1478700637817383, "rewards/margins": 0.15369237959384918, "rewards/rejected": -2.301562547683716, "step": 210 }, { "epoch": 0.1585014409221902, "grad_norm": 15.482070000655302, "learning_rate": 2.6378896882494006e-08, "logits/chosen": -1.9863160848617554, "logits/rejected": -1.982267141342163, "logps/chosen": -1.0080206394195557, "logps/rejected": -1.176837682723999, "loss": 1.1505, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0160412788391113, "rewards/margins": 0.3376340866088867, "rewards/rejected": -2.353675365447998, "step": 220 }, { "epoch": 0.16570605187319884, "grad_norm": 17.014637756082593, "learning_rate": 2.7577937649880097e-08, "logits/chosen": -2.021378993988037, "logits/rejected": -2.021695613861084, "logps/chosen": -1.0124410390853882, "logps/rejected": -1.12635338306427, "loss": 1.2019, "rewards/accuracies": 0.625, "rewards/chosen": -2.0248820781707764, "rewards/margins": 0.22782447934150696, "rewards/rejected": -2.25270676612854, "step": 230 }, { "epoch": 0.1729106628242075, "grad_norm": 22.32772580016105, "learning_rate": 2.8776978417266184e-08, "logits/chosen": -2.0529181957244873, "logits/rejected": -2.0477967262268066, "logps/chosen": -1.0616161823272705, "logps/rejected": -1.1394503116607666, "loss": 1.2614, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.123232364654541, "rewards/margins": 0.15566802024841309, "rewards/rejected": -2.278900623321533, "step": 240 }, { "epoch": 0.18011527377521613, "grad_norm": 19.079728631088813, "learning_rate": 2.997601918465228e-08, "logits/chosen": -1.9696033000946045, "logits/rejected": -1.9657630920410156, "logps/chosen": -1.0835182666778564, "logps/rejected": -1.1734166145324707, "loss": 1.2393, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.167036533355713, "rewards/margins": 0.17979690432548523, "rewards/rejected": -2.3468332290649414, "step": 250 }, { "epoch": 0.1873198847262248, "grad_norm": 21.30398020890557, "learning_rate": 3.1175059952038366e-08, "logits/chosen": -1.9843509197235107, "logits/rejected": -1.9924278259277344, "logps/chosen": -1.1062877178192139, "logps/rejected": -1.2165796756744385, "loss": 1.2142, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.2125754356384277, "rewards/margins": 0.22058391571044922, "rewards/rejected": -2.433159351348877, "step": 260 }, { "epoch": 0.19452449567723343, "grad_norm": 20.993622960377618, "learning_rate": 3.237410071942446e-08, "logits/chosen": -2.0651376247406006, "logits/rejected": -2.0571722984313965, "logps/chosen": -1.0719540119171143, "logps/rejected": -1.2004284858703613, "loss": 1.181, "rewards/accuracies": 0.59375, "rewards/chosen": -2.1439080238342285, "rewards/margins": 0.2569490075111389, "rewards/rejected": -2.4008569717407227, "step": 270 }, { "epoch": 0.2017291066282421, "grad_norm": 25.067055659781758, "learning_rate": 3.3573141486810555e-08, "logits/chosen": -2.014195680618286, "logits/rejected": -2.012540102005005, "logps/chosen": -0.935396671295166, "logps/rejected": -1.049852967262268, "loss": 1.1977, "rewards/accuracies": 0.53125, "rewards/chosen": -1.870793342590332, "rewards/margins": 0.2289123237133026, "rewards/rejected": -2.099705934524536, "step": 280 }, { "epoch": 0.20893371757925072, "grad_norm": 21.777264205916122, "learning_rate": 3.477218225419664e-08, "logits/chosen": -2.044172763824463, "logits/rejected": -2.0461270809173584, "logps/chosen": -1.0135643482208252, "logps/rejected": -1.1082309484481812, "loss": 1.2343, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0271286964416504, "rewards/margins": 0.18933361768722534, "rewards/rejected": -2.2164618968963623, "step": 290 }, { "epoch": 0.21613832853025935, "grad_norm": 20.318543545834533, "learning_rate": 3.597122302158273e-08, "logits/chosen": -2.0240025520324707, "logits/rejected": -2.0156774520874023, "logps/chosen": -1.0902036428451538, "logps/rejected": -1.1914021968841553, "loss": 1.2135, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1804072856903076, "rewards/margins": 0.20239713788032532, "rewards/rejected": -2.3828043937683105, "step": 300 }, { "epoch": 0.22334293948126802, "grad_norm": 18.50470861360763, "learning_rate": 3.717026378896883e-08, "logits/chosen": -1.9557920694351196, "logits/rejected": -1.955775260925293, "logps/chosen": -1.0874634981155396, "logps/rejected": -1.1727240085601807, "loss": 1.2381, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.174926996231079, "rewards/margins": 0.17052076756954193, "rewards/rejected": -2.3454480171203613, "step": 310 }, { "epoch": 0.23054755043227665, "grad_norm": 15.935054480540096, "learning_rate": 3.836930455635491e-08, "logits/chosen": -2.031646966934204, "logits/rejected": -2.0232386589050293, "logps/chosen": -1.0084177255630493, "logps/rejected": -1.1408658027648926, "loss": 1.1926, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0168354511260986, "rewards/margins": 0.2648962140083313, "rewards/rejected": -2.281731605529785, "step": 320 }, { "epoch": 0.2377521613832853, "grad_norm": 15.808626134367197, "learning_rate": 3.9568345323741003e-08, "logits/chosen": -2.019885778427124, "logits/rejected": -2.022150754928589, "logps/chosen": -1.0463831424713135, "logps/rejected": -1.069990873336792, "loss": 1.3364, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -2.092766284942627, "rewards/margins": 0.04721563309431076, "rewards/rejected": -2.139981746673584, "step": 330 }, { "epoch": 0.24495677233429394, "grad_norm": 18.32115617252851, "learning_rate": 4.07673860911271e-08, "logits/chosen": -2.0614123344421387, "logits/rejected": -2.055767297744751, "logps/chosen": -1.0877503156661987, "logps/rejected": -1.16796875, "loss": 1.2366, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1755006313323975, "rewards/margins": 0.16043710708618164, "rewards/rejected": -2.3359375, "step": 340 }, { "epoch": 0.2521613832853026, "grad_norm": 19.369790564686102, "learning_rate": 4.1966426858513185e-08, "logits/chosen": -1.9940099716186523, "logits/rejected": -1.9883639812469482, "logps/chosen": -0.9887149930000305, "logps/rejected": -1.1154861450195312, "loss": 1.1858, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.977429986000061, "rewards/margins": 0.25354230403900146, "rewards/rejected": -2.2309722900390625, "step": 350 }, { "epoch": 0.25936599423631124, "grad_norm": 21.686526135721945, "learning_rate": 4.3165467625899276e-08, "logits/chosen": -1.9959064722061157, "logits/rejected": -1.9917312860488892, "logps/chosen": -1.0866310596466064, "logps/rejected": -1.2025970220565796, "loss": 1.1977, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.173262119293213, "rewards/margins": 0.23193176090717316, "rewards/rejected": -2.405194044113159, "step": 360 }, { "epoch": 0.2665706051873199, "grad_norm": 18.21919999535183, "learning_rate": 4.4364508393285374e-08, "logits/chosen": -2.0002856254577637, "logits/rejected": -2.000253200531006, "logps/chosen": -1.0520254373550415, "logps/rejected": -1.180267572402954, "loss": 1.1778, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.104050874710083, "rewards/margins": 0.2564844489097595, "rewards/rejected": -2.360535144805908, "step": 370 }, { "epoch": 0.2737752161383285, "grad_norm": 16.536106044001812, "learning_rate": 4.556354916067146e-08, "logits/chosen": -2.028313398361206, "logits/rejected": -2.032285213470459, "logps/chosen": -1.0125257968902588, "logps/rejected": -1.0858430862426758, "loss": 1.2682, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0250515937805176, "rewards/margins": 0.14663462340831757, "rewards/rejected": -2.1716861724853516, "step": 380 }, { "epoch": 0.28097982708933716, "grad_norm": 15.31773608533987, "learning_rate": 4.676258992805755e-08, "logits/chosen": -2.0320096015930176, "logits/rejected": -2.0257675647735596, "logps/chosen": -1.0224783420562744, "logps/rejected": -1.1486625671386719, "loss": 1.1819, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.044956684112549, "rewards/margins": 0.25236865878105164, "rewards/rejected": -2.2973251342773438, "step": 390 }, { "epoch": 0.2881844380403458, "grad_norm": 18.995537958721503, "learning_rate": 4.796163069544365e-08, "logits/chosen": -2.034123420715332, "logits/rejected": -2.034450054168701, "logps/chosen": -0.9964189529418945, "logps/rejected": -1.0486726760864258, "loss": 1.2726, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.992837905883789, "rewards/margins": 0.10450725257396698, "rewards/rejected": -2.0973453521728516, "step": 400 }, { "epoch": 0.2953890489913545, "grad_norm": 18.624392586338367, "learning_rate": 4.916067146282973e-08, "logits/chosen": -2.0277891159057617, "logits/rejected": -2.0259571075439453, "logps/chosen": -1.0748345851898193, "logps/rejected": -1.1457411050796509, "loss": 1.262, "rewards/accuracies": 0.53125, "rewards/chosen": -2.1496691703796387, "rewards/margins": 0.14181289076805115, "rewards/rejected": -2.2914822101593018, "step": 410 }, { "epoch": 0.3025936599423631, "grad_norm": 16.76581954495512, "learning_rate": 4.999992091672379e-08, "logits/chosen": -2.011078119277954, "logits/rejected": -2.0153493881225586, "logps/chosen": -1.0450259447097778, "logps/rejected": -1.1236448287963867, "loss": 1.2425, "rewards/accuracies": 0.53125, "rewards/chosen": -2.0900518894195557, "rewards/margins": 0.15723773837089539, "rewards/rejected": -2.2472896575927734, "step": 420 }, { "epoch": 0.30979827089337175, "grad_norm": 17.72733209255425, "learning_rate": 4.999851500573209e-08, "logits/chosen": -1.9903459548950195, "logits/rejected": -1.991233229637146, "logps/chosen": -1.0592777729034424, "logps/rejected": -1.0997775793075562, "loss": 1.3022, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -2.1185555458068848, "rewards/margins": 0.08099973201751709, "rewards/rejected": -2.1995551586151123, "step": 430 }, { "epoch": 0.3170028818443804, "grad_norm": 15.96665018689344, "learning_rate": 4.999535180235972e-08, "logits/chosen": -1.990563988685608, "logits/rejected": -1.9907207489013672, "logps/chosen": -1.0212013721466064, "logps/rejected": -1.1435030698776245, "loss": 1.1959, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.042402744293213, "rewards/margins": 0.2446034699678421, "rewards/rejected": -2.287006139755249, "step": 440 }, { "epoch": 0.3242074927953891, "grad_norm": 17.84897470512453, "learning_rate": 4.9990431528966836e-08, "logits/chosen": -2.010443925857544, "logits/rejected": -2.006673574447632, "logps/chosen": -1.1450097560882568, "logps/rejected": -1.1849489212036133, "loss": 1.3018, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.2900195121765137, "rewards/margins": 0.07987822592258453, "rewards/rejected": -2.3698978424072266, "step": 450 }, { "epoch": 0.3314121037463977, "grad_norm": 24.49190807066052, "learning_rate": 4.9983754531428326e-08, "logits/chosen": -2.006472110748291, "logits/rejected": -2.00079083442688, "logps/chosen": -1.1708580255508423, "logps/rejected": -1.2872368097305298, "loss": 1.2012, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.3417160511016846, "rewards/margins": 0.23275737464427948, "rewards/rejected": -2.5744736194610596, "step": 460 }, { "epoch": 0.33861671469740634, "grad_norm": 23.024434569130843, "learning_rate": 4.997532127910954e-08, "logits/chosen": -2.0429301261901855, "logits/rejected": -2.0308475494384766, "logps/chosen": -1.100434422492981, "logps/rejected": -1.2019624710083008, "loss": 1.2198, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.200868844985962, "rewards/margins": 0.20305626094341278, "rewards/rejected": -2.4039249420166016, "step": 470 }, { "epoch": 0.345821325648415, "grad_norm": 21.129827787614413, "learning_rate": 4.996513236483331e-08, "logits/chosen": -2.101729154586792, "logits/rejected": -2.091571569442749, "logps/chosen": -0.9851749539375305, "logps/rejected": -1.106676459312439, "loss": 1.185, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.970349907875061, "rewards/margins": 0.2430029660463333, "rewards/rejected": -2.213352918624878, "step": 480 }, { "epoch": 0.3530259365994236, "grad_norm": 18.94655048736081, "learning_rate": 4.9953188504838225e-08, "logits/chosen": -2.0206782817840576, "logits/rejected": -2.0197720527648926, "logps/chosen": -0.9880903959274292, "logps/rejected": -1.1017425060272217, "loss": 1.1937, "rewards/accuracies": 0.625, "rewards/chosen": -1.9761807918548584, "rewards/margins": 0.2273043841123581, "rewards/rejected": -2.2034850120544434, "step": 490 }, { "epoch": 0.36023054755043227, "grad_norm": 18.60846892662722, "learning_rate": 4.993949053872834e-08, "logits/chosen": -2.019057035446167, "logits/rejected": -2.0055313110351562, "logps/chosen": -1.0131161212921143, "logps/rejected": -1.139453649520874, "loss": 1.1821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0262322425842285, "rewards/margins": 0.2526749074459076, "rewards/rejected": -2.278907299041748, "step": 500 }, { "epoch": 0.36743515850144093, "grad_norm": 19.18531858517567, "learning_rate": 4.9924039429414086e-08, "logits/chosen": -2.0883572101593018, "logits/rejected": -2.0818283557891846, "logps/chosen": -1.0440417528152466, "logps/rejected": -1.1581791639328003, "loss": 1.2079, "rewards/accuracies": 0.65625, "rewards/chosen": -2.088083505630493, "rewards/margins": 0.22827525436878204, "rewards/rejected": -2.3163583278656006, "step": 510 }, { "epoch": 0.3746397694524496, "grad_norm": 16.068632795684866, "learning_rate": 4.990683626304467e-08, "logits/chosen": -2.010894775390625, "logits/rejected": -2.0092484951019287, "logps/chosen": -1.1070988178253174, "logps/rejected": -1.2031704187393188, "loss": 1.2198, "rewards/accuracies": 0.5625, "rewards/chosen": -2.2141976356506348, "rewards/margins": 0.19214320182800293, "rewards/rejected": -2.4063408374786377, "step": 520 }, { "epoch": 0.3818443804034582, "grad_norm": 17.727178124609676, "learning_rate": 4.9887882248931646e-08, "logits/chosen": -1.97884202003479, "logits/rejected": -1.968973159790039, "logps/chosen": -0.9846093058586121, "logps/rejected": -1.0614283084869385, "loss": 1.2503, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9692186117172241, "rewards/margins": 0.15363821387290955, "rewards/rejected": -2.122856616973877, "step": 530 }, { "epoch": 0.38904899135446686, "grad_norm": 22.67722196494781, "learning_rate": 4.986717871946393e-08, "logits/chosen": -2.004068374633789, "logits/rejected": -1.99717116355896, "logps/chosen": -1.0308892726898193, "logps/rejected": -1.1323744058609009, "loss": 1.2209, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0617785453796387, "rewards/margins": 0.20297034084796906, "rewards/rejected": -2.2647488117218018, "step": 540 }, { "epoch": 0.3962536023054755, "grad_norm": 17.281352424891857, "learning_rate": 4.984472713001416e-08, "logits/chosen": -1.9620494842529297, "logits/rejected": -1.962517499923706, "logps/chosen": -1.0005210638046265, "logps/rejected": -1.0776532888412476, "loss": 1.2683, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.001042127609253, "rewards/margins": 0.1542646884918213, "rewards/rejected": -2.155306577682495, "step": 550 }, { "epoch": 0.4034582132564842, "grad_norm": 17.117159642375974, "learning_rate": 4.982052905883637e-08, "logits/chosen": -2.031991481781006, "logits/rejected": -2.0326719284057617, "logps/chosen": -1.080214262008667, "logps/rejected": -1.181120753288269, "loss": 1.224, "rewards/accuracies": 0.5625, "rewards/chosen": -2.160428524017334, "rewards/margins": 0.2018129527568817, "rewards/rejected": -2.362241506576538, "step": 560 }, { "epoch": 0.4106628242074928, "grad_norm": 16.328895540705197, "learning_rate": 4.979458620695505e-08, "logits/chosen": -2.029468059539795, "logits/rejected": -2.0152204036712646, "logps/chosen": -1.0948221683502197, "logps/rejected": -1.208194613456726, "loss": 1.2094, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.1896443367004395, "rewards/margins": 0.22674505412578583, "rewards/rejected": -2.416389226913452, "step": 570 }, { "epoch": 0.41786743515850144, "grad_norm": 19.61140460251683, "learning_rate": 4.976690039804555e-08, "logits/chosen": -2.033027172088623, "logits/rejected": -2.0314948558807373, "logps/chosen": -0.9877282381057739, "logps/rejected": -1.0673277378082275, "loss": 1.2473, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9754564762115479, "rewards/margins": 0.1591992825269699, "rewards/rejected": -2.134655475616455, "step": 580 }, { "epoch": 0.4250720461095101, "grad_norm": 21.430631009789273, "learning_rate": 4.973747357830592e-08, "logits/chosen": -2.0215108394622803, "logits/rejected": -2.021780490875244, "logps/chosen": -1.0275431871414185, "logps/rejected": -1.1647249460220337, "loss": 1.1677, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.055086374282837, "rewards/margins": 0.2743634283542633, "rewards/rejected": -2.3294498920440674, "step": 590 }, { "epoch": 0.4322766570605187, "grad_norm": 19.463998303694815, "learning_rate": 4.970630781632009e-08, "logits/chosen": -2.0801994800567627, "logits/rejected": -2.076254367828369, "logps/chosen": -1.0327340364456177, "logps/rejected": -1.1751863956451416, "loss": 1.1681, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0654680728912354, "rewards/margins": 0.28490471839904785, "rewards/rejected": -2.350372791290283, "step": 600 }, { "epoch": 0.43948126801152737, "grad_norm": 21.00995063503415, "learning_rate": 4.967340530291242e-08, "logits/chosen": -2.027909517288208, "logits/rejected": -2.0180211067199707, "logps/chosen": -1.0928115844726562, "logps/rejected": -1.1507136821746826, "loss": 1.2682, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.1856231689453125, "rewards/margins": 0.11580429971218109, "rewards/rejected": -2.3014273643493652, "step": 610 }, { "epoch": 0.44668587896253603, "grad_norm": 24.905225792062406, "learning_rate": 4.9638768350993755e-08, "logits/chosen": -2.0285048484802246, "logits/rejected": -2.021249771118164, "logps/chosen": -0.9952943921089172, "logps/rejected": -1.0829205513000488, "loss": 1.2345, "rewards/accuracies": 0.53125, "rewards/chosen": -1.9905887842178345, "rewards/margins": 0.17525213956832886, "rewards/rejected": -2.1658411026000977, "step": 620 }, { "epoch": 0.4538904899135447, "grad_norm": 20.771750563160076, "learning_rate": 4.9602399395398786e-08, "logits/chosen": -2.0377490520477295, "logits/rejected": -2.037675380706787, "logps/chosen": -1.027521014213562, "logps/rejected": -1.1547839641571045, "loss": 1.183, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.055042028427124, "rewards/margins": 0.2545255422592163, "rewards/rejected": -2.309567928314209, "step": 630 }, { "epoch": 0.4610951008645533, "grad_norm": 16.17835710154515, "learning_rate": 4.9564300992714914e-08, "logits/chosen": -1.9597883224487305, "logits/rejected": -1.9607963562011719, "logps/chosen": -1.0108855962753296, "logps/rejected": -1.116549015045166, "loss": 1.2101, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.021771192550659, "rewards/margins": 0.21132683753967285, "rewards/rejected": -2.233098030090332, "step": 640 }, { "epoch": 0.46829971181556196, "grad_norm": 21.86769715087536, "learning_rate": 4.952447582110253e-08, "logits/chosen": -2.0587735176086426, "logits/rejected": -2.044377565383911, "logps/chosen": -1.0383652448654175, "logps/rejected": -1.1178988218307495, "loss": 1.2479, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.076730489730835, "rewards/margins": 0.15906734764575958, "rewards/rejected": -2.235797643661499, "step": 650 }, { "epoch": 0.4755043227665706, "grad_norm": 23.755054747254476, "learning_rate": 4.948292668010676e-08, "logits/chosen": -2.031721353530884, "logits/rejected": -2.032727003097534, "logps/chosen": -1.0880773067474365, "logps/rejected": -1.1748898029327393, "loss": 1.2449, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.176154613494873, "rewards/margins": 0.17362497746944427, "rewards/rejected": -2.3497796058654785, "step": 660 }, { "epoch": 0.4827089337175792, "grad_norm": 20.474460354625247, "learning_rate": 4.943965649046064e-08, "logits/chosen": -2.0048508644104004, "logits/rejected": -1.9955081939697266, "logps/chosen": -1.062713384628296, "logps/rejected": -1.1663198471069336, "loss": 1.2154, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.125426769256592, "rewards/margins": 0.2072126865386963, "rewards/rejected": -2.332639694213867, "step": 670 }, { "epoch": 0.4899135446685879, "grad_norm": 19.048186528049722, "learning_rate": 4.9394668293879835e-08, "logits/chosen": -1.959315538406372, "logits/rejected": -1.9503145217895508, "logps/chosen": -1.0368311405181885, "logps/rejected": -1.1063206195831299, "loss": 1.2624, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.073662281036377, "rewards/margins": 0.13897888362407684, "rewards/rejected": -2.2126412391662598, "step": 680 }, { "epoch": 0.49711815561959655, "grad_norm": 24.933354819026505, "learning_rate": 4.93479652528488e-08, "logits/chosen": -2.020735502243042, "logits/rejected": -2.0154590606689453, "logps/chosen": -1.1052331924438477, "logps/rejected": -1.209161639213562, "loss": 1.2262, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2104663848876953, "rewards/margins": 0.2078566551208496, "rewards/rejected": -2.418323278427124, "step": 690 }, { "epoch": 0.5043227665706052, "grad_norm": 20.317629206968732, "learning_rate": 4.929955065039848e-08, "logits/chosen": -2.0213494300842285, "logits/rejected": -2.0158300399780273, "logps/chosen": -1.0192697048187256, "logps/rejected": -1.1514381170272827, "loss": 1.1829, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.038539409637451, "rewards/margins": 0.2643369436264038, "rewards/rejected": -2.3028762340545654, "step": 700 }, { "epoch": 0.5115273775216138, "grad_norm": 19.004922715885144, "learning_rate": 4.92494278898755e-08, "logits/chosen": -1.985918402671814, "logits/rejected": -1.982656717300415, "logps/chosen": -0.8973722457885742, "logps/rejected": -1.0216716527938843, "loss": 1.1973, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.7947444915771484, "rewards/margins": 0.24859857559204102, "rewards/rejected": -2.0433433055877686, "step": 710 }, { "epoch": 0.5187319884726225, "grad_norm": 18.960064654240945, "learning_rate": 4.9197600494702955e-08, "logits/chosen": -2.007420539855957, "logits/rejected": -2.001126289367676, "logps/chosen": -1.0426667928695679, "logps/rejected": -1.1658456325531006, "loss": 1.1852, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0853335857391357, "rewards/margins": 0.2463577687740326, "rewards/rejected": -2.331691265106201, "step": 720 }, { "epoch": 0.5259365994236311, "grad_norm": 20.53343043509484, "learning_rate": 4.9144072108132725e-08, "logits/chosen": -2.0134854316711426, "logits/rejected": -2.0023691654205322, "logps/chosen": -1.0226707458496094, "logps/rejected": -1.1051828861236572, "loss": 1.2518, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0453414916992188, "rewards/margins": 0.16502413153648376, "rewards/rejected": -2.2103657722473145, "step": 730 }, { "epoch": 0.5331412103746398, "grad_norm": 17.758862211588106, "learning_rate": 4.908884649298937e-08, "logits/chosen": -1.9972114562988281, "logits/rejected": -2.004119634628296, "logps/chosen": -1.0192463397979736, "logps/rejected": -1.0796899795532227, "loss": 1.2835, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.0384926795959473, "rewards/margins": 0.12088724225759506, "rewards/rejected": -2.1593799591064453, "step": 740 }, { "epoch": 0.5403458213256485, "grad_norm": 23.124810759913256, "learning_rate": 4.903192753140557e-08, "logits/chosen": -2.0147690773010254, "logits/rejected": -2.009342908859253, "logps/chosen": -1.1004369258880615, "logps/rejected": -1.1906808614730835, "loss": 1.2378, "rewards/accuracies": 0.53125, "rewards/chosen": -2.200873851776123, "rewards/margins": 0.18048794567584991, "rewards/rejected": -2.381361722946167, "step": 750 }, { "epoch": 0.547550432276657, "grad_norm": 19.72534726379729, "learning_rate": 4.897331922454931e-08, "logits/chosen": -1.9795690774917603, "logits/rejected": -1.9833734035491943, "logps/chosen": -1.0041850805282593, "logps/rejected": -1.1136337518692017, "loss": 1.2165, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0083701610565186, "rewards/margins": 0.2188970297574997, "rewards/rejected": -2.2272675037384033, "step": 760 }, { "epoch": 0.5547550432276657, "grad_norm": 20.539097658978797, "learning_rate": 4.891302569234256e-08, "logits/chosen": -1.9727134704589844, "logits/rejected": -1.9754774570465088, "logps/chosen": -0.9772794842720032, "logps/rejected": -1.1290626525878906, "loss": 1.1643, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.9545589685440063, "rewards/margins": 0.3035663962364197, "rewards/rejected": -2.2581253051757812, "step": 770 }, { "epoch": 0.5619596541786743, "grad_norm": 22.07597844396349, "learning_rate": 4.8851051173171656e-08, "logits/chosen": -1.9940038919448853, "logits/rejected": -1.9926246404647827, "logps/chosen": -1.0405977964401245, "logps/rejected": -1.1220670938491821, "loss": 1.2393, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.081195592880249, "rewards/margins": 0.1629386693239212, "rewards/rejected": -2.2441341876983643, "step": 780 }, { "epoch": 0.569164265129683, "grad_norm": 17.470111374688827, "learning_rate": 4.87874000235894e-08, "logits/chosen": -2.013667106628418, "logits/rejected": -2.0078587532043457, "logps/chosen": -1.0763031244277954, "logps/rejected": -1.233242154121399, "loss": 1.1596, "rewards/accuracies": 0.625, "rewards/chosen": -2.152606248855591, "rewards/margins": 0.3138778507709503, "rewards/rejected": -2.466484308242798, "step": 790 }, { "epoch": 0.5763688760806917, "grad_norm": 19.520543671943127, "learning_rate": 4.872207671800876e-08, "logits/chosen": -2.0354135036468506, "logits/rejected": -2.0318105220794678, "logps/chosen": -1.0444949865341187, "logps/rejected": -1.1220977306365967, "loss": 1.2567, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0889899730682373, "rewards/margins": 0.15520496666431427, "rewards/rejected": -2.2441954612731934, "step": 800 }, { "epoch": 0.5835734870317003, "grad_norm": 15.931565272235597, "learning_rate": 4.865508584838841e-08, "logits/chosen": -2.0230934619903564, "logits/rejected": -2.025510311126709, "logps/chosen": -1.0136370658874512, "logps/rejected": -1.1028186082839966, "loss": 1.2343, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0272741317749023, "rewards/margins": 0.17836324870586395, "rewards/rejected": -2.205637216567993, "step": 810 }, { "epoch": 0.590778097982709, "grad_norm": 21.101696006896514, "learning_rate": 4.858643212390985e-08, "logits/chosen": -2.0232460498809814, "logits/rejected": -2.0134730339050293, "logps/chosen": -1.0298725366592407, "logps/rejected": -1.1151840686798096, "loss": 1.2504, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0597450733184814, "rewards/margins": 0.1706230342388153, "rewards/rejected": -2.230368137359619, "step": 820 }, { "epoch": 0.5979827089337176, "grad_norm": 18.384683685983724, "learning_rate": 4.851612037064643e-08, "logits/chosen": -2.0008656978607178, "logits/rejected": -1.9988391399383545, "logps/chosen": -0.96119225025177, "logps/rejected": -1.0799301862716675, "loss": 1.2051, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.92238450050354, "rewards/margins": 0.2374759167432785, "rewards/rejected": -2.159860372543335, "step": 830 }, { "epoch": 0.6051873198847262, "grad_norm": 15.976129382373403, "learning_rate": 4.8444155531224065e-08, "logits/chosen": -2.0319008827209473, "logits/rejected": -2.031928539276123, "logps/chosen": -1.0886750221252441, "logps/rejected": -1.1605113744735718, "loss": 1.2625, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.1773500442504883, "rewards/margins": 0.143672913312912, "rewards/rejected": -2.3210227489471436, "step": 840 }, { "epoch": 0.6123919308357348, "grad_norm": 15.372106337343025, "learning_rate": 4.8370542664473805e-08, "logits/chosen": -2.03184175491333, "logits/rejected": -2.0259571075439453, "logps/chosen": -1.0505023002624512, "logps/rejected": -1.15494704246521, "loss": 1.2248, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1010046005249023, "rewards/margins": 0.20888929069042206, "rewards/rejected": -2.30989408493042, "step": 850 }, { "epoch": 0.6195965417867435, "grad_norm": 17.833021138756298, "learning_rate": 4.829528694507624e-08, "logits/chosen": -2.011185646057129, "logits/rejected": -2.0070912837982178, "logps/chosen": -1.161972999572754, "logps/rejected": -1.218332290649414, "loss": 1.28, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.323945999145508, "rewards/margins": 0.11271880567073822, "rewards/rejected": -2.436664581298828, "step": 860 }, { "epoch": 0.6268011527377522, "grad_norm": 20.10043591744987, "learning_rate": 4.821839366319768e-08, "logits/chosen": -2.0453944206237793, "logits/rejected": -2.0392508506774902, "logps/chosen": -1.0053439140319824, "logps/rejected": -1.12282395362854, "loss": 1.1973, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.010687828063965, "rewards/margins": 0.23495987057685852, "rewards/rejected": -2.24564790725708, "step": 870 }, { "epoch": 0.6340057636887608, "grad_norm": 19.633475514009838, "learning_rate": 4.813986822411833e-08, "logits/chosen": -2.037318706512451, "logits/rejected": -2.035334825515747, "logps/chosen": -1.0152684450149536, "logps/rejected": -1.0797330141067505, "loss": 1.2669, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.0305368900299072, "rewards/margins": 0.12892897427082062, "rewards/rejected": -2.159466028213501, "step": 880 }, { "epoch": 0.6412103746397695, "grad_norm": 19.900627573984437, "learning_rate": 4.805971614785231e-08, "logits/chosen": -2.0658364295959473, "logits/rejected": -2.0642929077148438, "logps/chosen": -1.0170501470565796, "logps/rejected": -1.11166250705719, "loss": 1.2213, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.034100294113159, "rewards/margins": 0.1892244815826416, "rewards/rejected": -2.22332501411438, "step": 890 }, { "epoch": 0.6484149855907781, "grad_norm": 20.046745017622534, "learning_rate": 4.797794306875963e-08, "logits/chosen": -1.9768317937850952, "logits/rejected": -1.9782040119171143, "logps/chosen": -1.1424155235290527, "logps/rejected": -1.2143452167510986, "loss": 1.2686, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.2848310470581055, "rewards/margins": 0.14385904371738434, "rewards/rejected": -2.4286904335021973, "step": 900 }, { "epoch": 0.6556195965417867, "grad_norm": 20.156486798671747, "learning_rate": 4.7894554735150076e-08, "logits/chosen": -1.979318618774414, "logits/rejected": -1.9829566478729248, "logps/chosen": -1.042389154434204, "logps/rejected": -1.108424186706543, "loss": 1.2626, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.084778308868408, "rewards/margins": 0.1320703774690628, "rewards/rejected": -2.216848373413086, "step": 910 }, { "epoch": 0.6628242074927954, "grad_norm": 23.296556306421977, "learning_rate": 4.7809557008879185e-08, "logits/chosen": -2.017183780670166, "logits/rejected": -2.0119588375091553, "logps/chosen": -0.9740872383117676, "logps/rejected": -1.0616848468780518, "loss": 1.2388, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.9481744766235352, "rewards/margins": 0.1751951277256012, "rewards/rejected": -2.1233696937561035, "step": 920 }, { "epoch": 0.670028818443804, "grad_norm": 18.069785801871536, "learning_rate": 4.772295586493613e-08, "logits/chosen": -2.057365894317627, "logits/rejected": -2.054624080657959, "logps/chosen": -1.0349071025848389, "logps/rejected": -1.1510379314422607, "loss": 1.193, "rewards/accuracies": 0.625, "rewards/chosen": -2.0698142051696777, "rewards/margins": 0.23226144909858704, "rewards/rejected": -2.3020758628845215, "step": 930 }, { "epoch": 0.6772334293948127, "grad_norm": 19.620026043686646, "learning_rate": 4.763475739102374e-08, "logits/chosen": -2.00927472114563, "logits/rejected": -2.015021562576294, "logps/chosen": -1.1269561052322388, "logps/rejected": -1.1944589614868164, "loss": 1.2561, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.2539122104644775, "rewards/margins": 0.13500596582889557, "rewards/rejected": -2.388917922973633, "step": 940 }, { "epoch": 0.6844380403458213, "grad_norm": 15.430566823053855, "learning_rate": 4.754496778713054e-08, "logits/chosen": -1.9693466424942017, "logits/rejected": -1.9732694625854492, "logps/chosen": -1.0118048191070557, "logps/rejected": -1.1344263553619385, "loss": 1.2008, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0236096382141113, "rewards/margins": 0.24524304270744324, "rewards/rejected": -2.268852710723877, "step": 950 }, { "epoch": 0.69164265129683, "grad_norm": 21.25135809120288, "learning_rate": 4.7453593365094926e-08, "logits/chosen": -2.04045033454895, "logits/rejected": -2.039541244506836, "logps/chosen": -1.049902319908142, "logps/rejected": -1.1590924263000488, "loss": 1.2091, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.099804639816284, "rewards/margins": 0.21838030219078064, "rewards/rejected": -2.3181848526000977, "step": 960 }, { "epoch": 0.6988472622478387, "grad_norm": 21.39072451404026, "learning_rate": 4.736064054816145e-08, "logits/chosen": -2.042609691619873, "logits/rejected": -2.0387399196624756, "logps/chosen": -0.9685258865356445, "logps/rejected": -1.0943108797073364, "loss": 1.1795, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.937051773071289, "rewards/margins": 0.25157004594802856, "rewards/rejected": -2.188621759414673, "step": 970 }, { "epoch": 0.7060518731988472, "grad_norm": 17.20168162072602, "learning_rate": 4.726611587052933e-08, "logits/chosen": -1.9772526025772095, "logits/rejected": -1.9768762588500977, "logps/chosen": -1.1084102392196655, "logps/rejected": -1.2353932857513428, "loss": 1.1801, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.216820478439331, "rewards/margins": 0.2539660334587097, "rewards/rejected": -2.4707865715026855, "step": 980 }, { "epoch": 0.7132564841498559, "grad_norm": 22.219628346195623, "learning_rate": 4.71700259768931e-08, "logits/chosen": -2.0274641513824463, "logits/rejected": -2.0244956016540527, "logps/chosen": -1.109879732131958, "logps/rejected": -1.2068617343902588, "loss": 1.2336, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.219759464263916, "rewards/margins": 0.19396351277828217, "rewards/rejected": -2.4137234687805176, "step": 990 }, { "epoch": 0.7204610951008645, "grad_norm": 19.81819744621828, "learning_rate": 4.707237762197549e-08, "logits/chosen": -2.013184070587158, "logits/rejected": -2.0100245475769043, "logps/chosen": -1.0080925226211548, "logps/rejected": -1.1278679370880127, "loss": 1.2121, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0161850452423096, "rewards/margins": 0.23955106735229492, "rewards/rejected": -2.2557358741760254, "step": 1000 }, { "epoch": 0.7276657060518732, "grad_norm": 23.320316952087914, "learning_rate": 4.697317767005265e-08, "logits/chosen": -2.0245862007141113, "logits/rejected": -2.0211946964263916, "logps/chosen": -1.002010703086853, "logps/rejected": -1.0942790508270264, "loss": 1.2568, "rewards/accuracies": 0.53125, "rewards/chosen": -2.004021406173706, "rewards/margins": 0.1845366507768631, "rewards/rejected": -2.1885581016540527, "step": 1010 }, { "epoch": 0.7348703170028819, "grad_norm": 17.35614684932965, "learning_rate": 4.6872433094471577e-08, "logits/chosen": -2.0214576721191406, "logits/rejected": -2.01664137840271, "logps/chosen": -1.0324314832687378, "logps/rejected": -1.127612590789795, "loss": 1.2122, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0648629665374756, "rewards/margins": 0.19036227464675903, "rewards/rejected": -2.25522518157959, "step": 1020 }, { "epoch": 0.7420749279538905, "grad_norm": 16.487356163413914, "learning_rate": 4.677015097715994e-08, "logits/chosen": -1.9668807983398438, "logits/rejected": -1.9662902355194092, "logps/chosen": -1.0229971408843994, "logps/rejected": -1.1552445888519287, "loss": 1.1997, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.045994281768799, "rewards/margins": 0.2644946873188019, "rewards/rejected": -2.3104891777038574, "step": 1030 }, { "epoch": 0.7492795389048992, "grad_norm": 17.492033929105126, "learning_rate": 4.666633850812825e-08, "logits/chosen": -2.0216238498687744, "logits/rejected": -2.0157718658447266, "logps/chosen": -1.0129607915878296, "logps/rejected": -1.0947651863098145, "loss": 1.2367, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.025921583175659, "rewards/margins": 0.16360855102539062, "rewards/rejected": -2.189530372619629, "step": 1040 }, { "epoch": 0.7564841498559077, "grad_norm": 17.49180259130834, "learning_rate": 4.656100298496439e-08, "logits/chosen": -1.971518874168396, "logits/rejected": -1.9679629802703857, "logps/chosen": -0.9385242462158203, "logps/rejected": -1.0688835382461548, "loss": 1.1859, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8770484924316406, "rewards/margins": 0.26071876287460327, "rewards/rejected": -2.1377670764923096, "step": 1050 }, { "epoch": 0.7636887608069164, "grad_norm": 17.905832545876255, "learning_rate": 4.6454151812320715e-08, "logits/chosen": -2.0001180171966553, "logits/rejected": -1.9940083026885986, "logps/chosen": -1.03890061378479, "logps/rejected": -1.1473093032836914, "loss": 1.2178, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.07780122756958, "rewards/margins": 0.21681778132915497, "rewards/rejected": -2.294618606567383, "step": 1060 }, { "epoch": 0.770893371757925, "grad_norm": 20.829212072329433, "learning_rate": 4.6345792501393434e-08, "logits/chosen": -2.0026588439941406, "logits/rejected": -2.0007362365722656, "logps/chosen": -1.0745230913162231, "logps/rejected": -1.201542615890503, "loss": 1.2046, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1490461826324463, "rewards/margins": 0.2540392279624939, "rewards/rejected": -2.403085231781006, "step": 1070 }, { "epoch": 0.7780979827089337, "grad_norm": 20.734671350383845, "learning_rate": 4.6235932669394676e-08, "logits/chosen": -2.0293848514556885, "logits/rejected": -2.030176877975464, "logps/chosen": -1.0878403186798096, "logps/rejected": -1.196656584739685, "loss": 1.2186, "rewards/accuracies": 0.625, "rewards/chosen": -2.175680637359619, "rewards/margins": 0.21763241291046143, "rewards/rejected": -2.39331316947937, "step": 1080 }, { "epoch": 0.7853025936599424, "grad_norm": 24.317214064629283, "learning_rate": 4.612458003901698e-08, "logits/chosen": -2.041074514389038, "logits/rejected": -2.0332765579223633, "logps/chosen": -1.109058141708374, "logps/rejected": -1.2108246088027954, "loss": 1.2286, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.218116283416748, "rewards/margins": 0.20353302359580994, "rewards/rejected": -2.421649217605591, "step": 1090 }, { "epoch": 0.792507204610951, "grad_norm": 23.34024566936978, "learning_rate": 4.6011742437890476e-08, "logits/chosen": -2.028428077697754, "logits/rejected": -2.023019790649414, "logps/chosen": -1.0458049774169922, "logps/rejected": -1.1794006824493408, "loss": 1.1775, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0916099548339844, "rewards/margins": 0.2671913504600525, "rewards/rejected": -2.3588013648986816, "step": 1100 }, { "epoch": 0.7997118155619597, "grad_norm": 16.933139927466357, "learning_rate": 4.589742779803259e-08, "logits/chosen": -2.025526523590088, "logits/rejected": -2.018397569656372, "logps/chosen": -1.0093412399291992, "logps/rejected": -1.129741907119751, "loss": 1.1948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0186824798583984, "rewards/margins": 0.2408013790845871, "rewards/rejected": -2.259483814239502, "step": 1110 }, { "epoch": 0.8069164265129684, "grad_norm": 18.433386982266423, "learning_rate": 4.5781644155290486e-08, "logits/chosen": -1.9837512969970703, "logits/rejected": -1.9759635925292969, "logps/chosen": -1.047893762588501, "logps/rejected": -1.1082303524017334, "loss": 1.2713, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.095787525177002, "rewards/margins": 0.12067310512065887, "rewards/rejected": -2.216460704803467, "step": 1120 }, { "epoch": 0.8141210374639769, "grad_norm": 18.152544924178944, "learning_rate": 4.566439964877613e-08, "logits/chosen": -2.0132524967193604, "logits/rejected": -2.0092389583587646, "logps/chosen": -0.9992140531539917, "logps/rejected": -1.0850255489349365, "loss": 1.2443, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9984281063079834, "rewards/margins": 0.17162318527698517, "rewards/rejected": -2.170051097869873, "step": 1130 }, { "epoch": 0.8213256484149856, "grad_norm": 16.195560643437258, "learning_rate": 4.554570252029421e-08, "logits/chosen": -2.0523180961608887, "logits/rejected": -2.0510833263397217, "logps/chosen": -1.0488303899765015, "logps/rejected": -1.1647334098815918, "loss": 1.201, "rewards/accuracies": 0.59375, "rewards/chosen": -2.097660779953003, "rewards/margins": 0.23180584609508514, "rewards/rejected": -2.3294668197631836, "step": 1140 }, { "epoch": 0.8285302593659942, "grad_norm": 17.9745846350065, "learning_rate": 4.542556111376274e-08, "logits/chosen": -2.0492236614227295, "logits/rejected": -2.0428290367126465, "logps/chosen": -1.0749974250793457, "logps/rejected": -1.166634202003479, "loss": 1.2395, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.1499948501586914, "rewards/margins": 0.18327349424362183, "rewards/rejected": -2.333268404006958, "step": 1150 }, { "epoch": 0.8357348703170029, "grad_norm": 23.066926614034124, "learning_rate": 4.5303983874626506e-08, "logits/chosen": -1.9916588068008423, "logits/rejected": -1.9900974035263062, "logps/chosen": -1.0387059450149536, "logps/rejected": -1.115934133529663, "loss": 1.2651, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.0774118900299072, "rewards/margins": 0.15445652604103088, "rewards/rejected": -2.231868267059326, "step": 1160 }, { "epoch": 0.8429394812680115, "grad_norm": 20.106291828506194, "learning_rate": 4.518097934926339e-08, "logits/chosen": -1.995008111000061, "logits/rejected": -1.9863529205322266, "logps/chosen": -1.0165393352508545, "logps/rejected": -1.1260240077972412, "loss": 1.2057, "rewards/accuracies": 0.625, "rewards/chosen": -2.033078670501709, "rewards/margins": 0.21896927058696747, "rewards/rejected": -2.2520480155944824, "step": 1170 }, { "epoch": 0.8501440922190202, "grad_norm": 22.8857527390999, "learning_rate": 4.505655618438363e-08, "logits/chosen": -1.9628753662109375, "logits/rejected": -1.9588840007781982, "logps/chosen": -1.0615794658660889, "logps/rejected": -1.1646844148635864, "loss": 1.2307, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1231589317321777, "rewards/margins": 0.20620973408222198, "rewards/rejected": -2.329368829727173, "step": 1180 }, { "epoch": 0.8573487031700289, "grad_norm": 17.434899766590377, "learning_rate": 4.4930723126421945e-08, "logits/chosen": -2.0546653270721436, "logits/rejected": -2.047938585281372, "logps/chosen": -1.0720479488372803, "logps/rejected": -1.1471359729766846, "loss": 1.252, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.1440958976745605, "rewards/margins": 0.15017575025558472, "rewards/rejected": -2.294271945953369, "step": 1190 }, { "epoch": 0.8645533141210374, "grad_norm": 22.14075015263452, "learning_rate": 4.48034890209227e-08, "logits/chosen": -1.983888030052185, "logits/rejected": -1.9716113805770874, "logps/chosen": -1.0871121883392334, "logps/rejected": -1.1737545728683472, "loss": 1.23, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.174224376678467, "rewards/margins": 0.1732848584651947, "rewards/rejected": -2.3475091457366943, "step": 1200 }, { "epoch": 0.8717579250720461, "grad_norm": 18.53077050982448, "learning_rate": 4.4674862811918155e-08, "logits/chosen": -1.971573829650879, "logits/rejected": -1.980055570602417, "logps/chosen": -0.9388012886047363, "logps/rejected": -1.091797113418579, "loss": 1.1596, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8776025772094727, "rewards/margins": 0.30599164962768555, "rewards/rejected": -2.183594226837158, "step": 1210 }, { "epoch": 0.8789625360230547, "grad_norm": 17.441252552193376, "learning_rate": 4.454485354129966e-08, "logits/chosen": -1.9985713958740234, "logits/rejected": -1.994210958480835, "logps/chosen": -1.0104329586029053, "logps/rejected": -1.11543869972229, "loss": 1.2194, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.0208659172058105, "rewards/margins": 0.21001139283180237, "rewards/rejected": -2.23087739944458, "step": 1220 }, { "epoch": 0.8861671469740634, "grad_norm": 17.20275284474546, "learning_rate": 4.4413470348182124e-08, "logits/chosen": -1.9755537509918213, "logits/rejected": -1.9634100198745728, "logps/chosen": -0.9853811264038086, "logps/rejected": -1.076774001121521, "loss": 1.2316, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9707622528076172, "rewards/margins": 0.18278571963310242, "rewards/rejected": -2.153548002243042, "step": 1230 }, { "epoch": 0.8933717579250721, "grad_norm": 21.253905408711432, "learning_rate": 4.42807224682615e-08, "logits/chosen": -1.9815731048583984, "logits/rejected": -1.9793331623077393, "logps/chosen": -0.9373159408569336, "logps/rejected": -1.0729162693023682, "loss": 1.181, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.8746318817138672, "rewards/margins": 0.2712007462978363, "rewards/rejected": -2.1458325386047363, "step": 1240 }, { "epoch": 0.9005763688760807, "grad_norm": 18.803145183231678, "learning_rate": 4.4146619233165604e-08, "logits/chosen": -2.0230329036712646, "logits/rejected": -2.025296688079834, "logps/chosen": -1.0652821063995361, "logps/rejected": -1.2190508842468262, "loss": 1.1677, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1305642127990723, "rewards/margins": 0.3075374960899353, "rewards/rejected": -2.4381017684936523, "step": 1250 }, { "epoch": 0.9077809798270894, "grad_norm": 25.018490567837954, "learning_rate": 4.4011170069798126e-08, "logits/chosen": -2.020940065383911, "logits/rejected": -2.025850296020508, "logps/chosen": -1.1181256771087646, "logps/rejected": -1.2433640956878662, "loss": 1.1932, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.2362513542175293, "rewards/margins": 0.25047701597213745, "rewards/rejected": -2.4867281913757324, "step": 1260 }, { "epoch": 0.9149855907780979, "grad_norm": 18.123087760553187, "learning_rate": 4.387438449967594e-08, "logits/chosen": -1.982254981994629, "logits/rejected": -1.97560715675354, "logps/chosen": -0.9658223986625671, "logps/rejected": -1.085925579071045, "loss": 1.1909, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9316447973251343, "rewards/margins": 0.24020643532276154, "rewards/rejected": -2.17185115814209, "step": 1270 }, { "epoch": 0.9221902017291066, "grad_norm": 21.17056826903978, "learning_rate": 4.373627213825983e-08, "logits/chosen": -2.0719313621520996, "logits/rejected": -2.0676798820495605, "logps/chosen": -1.0272830724716187, "logps/rejected": -1.1627672910690308, "loss": 1.1829, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0545661449432373, "rewards/margins": 0.27096837759017944, "rewards/rejected": -2.3255345821380615, "step": 1280 }, { "epoch": 0.9293948126801153, "grad_norm": 16.73743221772608, "learning_rate": 4.359684269427848e-08, "logits/chosen": -2.034970760345459, "logits/rejected": -2.0339713096618652, "logps/chosen": -0.9956309199333191, "logps/rejected": -1.0993244647979736, "loss": 1.2107, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9912618398666382, "rewards/margins": 0.20738673210144043, "rewards/rejected": -2.1986489295959473, "step": 1290 }, { "epoch": 0.9365994236311239, "grad_norm": 23.479698749807888, "learning_rate": 4.34561059690461e-08, "logits/chosen": -2.079378843307495, "logits/rejected": -2.0813305377960205, "logps/chosen": -1.047837495803833, "logps/rejected": -1.112128496170044, "loss": 1.2707, "rewards/accuracies": 0.53125, "rewards/chosen": -2.095674991607666, "rewards/margins": 0.12858203053474426, "rewards/rejected": -2.224256992340088, "step": 1300 }, { "epoch": 0.9438040345821326, "grad_norm": 21.373476828454745, "learning_rate": 4.3314071855773314e-08, "logits/chosen": -2.044544219970703, "logits/rejected": -2.0450897216796875, "logps/chosen": -0.9845747947692871, "logps/rejected": -1.0792670249938965, "loss": 1.2235, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.9691495895385742, "rewards/margins": 0.18938450515270233, "rewards/rejected": -2.158534049987793, "step": 1310 }, { "epoch": 0.9510086455331412, "grad_norm": 20.390675123621403, "learning_rate": 4.3170750338871806e-08, "logits/chosen": -2.0153450965881348, "logits/rejected": -2.008953094482422, "logps/chosen": -1.0770236253738403, "logps/rejected": -1.2197729349136353, "loss": 1.1662, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1540472507476807, "rewards/margins": 0.2854984402656555, "rewards/rejected": -2.4395458698272705, "step": 1320 }, { "epoch": 0.9582132564841499, "grad_norm": 14.835531781677203, "learning_rate": 4.3026151493252414e-08, "logits/chosen": -2.039367437362671, "logits/rejected": -2.0349154472351074, "logps/chosen": -1.0609954595565796, "logps/rejected": -1.1818583011627197, "loss": 1.2003, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.121990919113159, "rewards/margins": 0.24172568321228027, "rewards/rejected": -2.3637166023254395, "step": 1330 }, { "epoch": 0.9654178674351584, "grad_norm": 25.71038185604989, "learning_rate": 4.2880285483616895e-08, "logits/chosen": -2.0069704055786133, "logits/rejected": -2.007664680480957, "logps/chosen": -1.0175052881240845, "logps/rejected": -1.1325770616531372, "loss": 1.2093, "rewards/accuracies": 0.59375, "rewards/chosen": -2.035010576248169, "rewards/margins": 0.2301437109708786, "rewards/rejected": -2.2651541233062744, "step": 1340 }, { "epoch": 0.9726224783861671, "grad_norm": 16.092608904878997, "learning_rate": 4.273316256374342e-08, "logits/chosen": -1.9464366436004639, "logits/rejected": -1.9446899890899658, "logps/chosen": -1.01396644115448, "logps/rejected": -1.0869011878967285, "loss": 1.264, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.02793288230896, "rewards/margins": 0.1458693891763687, "rewards/rejected": -2.173802375793457, "step": 1350 }, { "epoch": 0.9798270893371758, "grad_norm": 16.212857235886922, "learning_rate": 4.258479307576576e-08, "logits/chosen": -1.9840402603149414, "logits/rejected": -1.9818894863128662, "logps/chosen": -0.9638694524765015, "logps/rejected": -1.0546468496322632, "loss": 1.24, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.927738904953003, "rewards/margins": 0.18155473470687866, "rewards/rejected": -2.1092936992645264, "step": 1360 }, { "epoch": 0.9870317002881844, "grad_norm": 21.341000872382455, "learning_rate": 4.243518744944626e-08, "logits/chosen": -2.0093555450439453, "logits/rejected": -2.0047600269317627, "logps/chosen": -1.0009874105453491, "logps/rejected": -1.1216598749160767, "loss": 1.1889, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0019748210906982, "rewards/margins": 0.24134452641010284, "rewards/rejected": -2.2433197498321533, "step": 1370 }, { "epoch": 0.9942363112391931, "grad_norm": 20.83793747644969, "learning_rate": 4.22843562014427e-08, "logits/chosen": -1.9709367752075195, "logits/rejected": -1.9672348499298096, "logps/chosen": -1.0514830350875854, "logps/rejected": -1.1256954669952393, "loss": 1.2503, "rewards/accuracies": 0.5625, "rewards/chosen": -2.102966070175171, "rewards/margins": 0.14842486381530762, "rewards/rejected": -2.2513909339904785, "step": 1380 }, { "epoch": 1.0014409221902016, "grad_norm": 28.539886189287515, "learning_rate": 4.2132309934569e-08, "logits/chosen": -2.051409959793091, "logits/rejected": -2.051856517791748, "logps/chosen": -1.015867829322815, "logps/rejected": -1.128615379333496, "loss": 1.2103, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.03173565864563, "rewards/margins": 0.22549493610858917, "rewards/rejected": -2.257230758666992, "step": 1390 }, { "epoch": 1.0086455331412103, "grad_norm": 18.4548159325349, "learning_rate": 4.197905933704989e-08, "logits/chosen": -1.9460541009902954, "logits/rejected": -1.9434579610824585, "logps/chosen": -1.0608787536621094, "logps/rejected": -1.1942651271820068, "loss": 1.2017, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1217575073242188, "rewards/margins": 0.2667728066444397, "rewards/rejected": -2.3885302543640137, "step": 1400 }, { "epoch": 1.015850144092219, "grad_norm": 23.677146712392545, "learning_rate": 4.1824615181769577e-08, "logits/chosen": -1.992706060409546, "logits/rejected": -1.9971202611923218, "logps/chosen": -1.0128545761108398, "logps/rejected": -1.138115644454956, "loss": 1.2035, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0257091522216797, "rewards/margins": 0.2505221366882324, "rewards/rejected": -2.276231288909912, "step": 1410 }, { "epoch": 1.0230547550432276, "grad_norm": 18.466913113268376, "learning_rate": 4.1668988325514434e-08, "logits/chosen": -2.0149149894714355, "logits/rejected": -2.0098109245300293, "logps/chosen": -1.1170905828475952, "logps/rejected": -1.2321112155914307, "loss": 1.2252, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.2341811656951904, "rewards/margins": 0.2300410270690918, "rewards/rejected": -2.4642224311828613, "step": 1420 }, { "epoch": 1.0302593659942363, "grad_norm": 20.916480925982736, "learning_rate": 4.1512189708209844e-08, "logits/chosen": -2.0576863288879395, "logits/rejected": -2.0563559532165527, "logps/chosen": -0.9412269592285156, "logps/rejected": -1.0276962518692017, "loss": 1.2464, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.8824539184570312, "rewards/margins": 0.17293845117092133, "rewards/rejected": -2.0553925037384033, "step": 1430 }, { "epoch": 1.037463976945245, "grad_norm": 22.10230375057076, "learning_rate": 4.1354230352151143e-08, "logits/chosen": -2.009265661239624, "logits/rejected": -2.002540111541748, "logps/chosen": -1.138351559638977, "logps/rejected": -1.2199509143829346, "loss": 1.2585, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.276703119277954, "rewards/margins": 0.16319862008094788, "rewards/rejected": -2.439901828765869, "step": 1440 }, { "epoch": 1.0446685878962536, "grad_norm": 16.842031017248782, "learning_rate": 4.119512136122882e-08, "logits/chosen": -2.0774807929992676, "logits/rejected": -2.086643695831299, "logps/chosen": -0.9951038360595703, "logps/rejected": -1.1451139450073242, "loss": 1.1708, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9902076721191406, "rewards/margins": 0.30002015829086304, "rewards/rejected": -2.2902278900146484, "step": 1450 }, { "epoch": 1.0518731988472623, "grad_norm": 15.427164808054908, "learning_rate": 4.103487392014795e-08, "logits/chosen": -1.992767095565796, "logits/rejected": -1.980544090270996, "logps/chosen": -1.0006814002990723, "logps/rejected": -1.15886390209198, "loss": 1.1455, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0013628005981445, "rewards/margins": 0.31636515259742737, "rewards/rejected": -2.31772780418396, "step": 1460 }, { "epoch": 1.059077809798271, "grad_norm": 16.81042888795935, "learning_rate": 4.087349929364192e-08, "logits/chosen": -2.034682273864746, "logits/rejected": -2.0252864360809326, "logps/chosen": -0.9601753354072571, "logps/rejected": -1.0913857221603394, "loss": 1.1863, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.9203506708145142, "rewards/margins": 0.26242080330848694, "rewards/rejected": -2.1827714443206787, "step": 1470 }, { "epoch": 1.0662824207492796, "grad_norm": 17.584619579081235, "learning_rate": 4.0711008825680645e-08, "logits/chosen": -1.979069709777832, "logits/rejected": -1.97795832157135, "logps/chosen": -1.0063145160675049, "logps/rejected": -1.1248080730438232, "loss": 1.2064, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0126290321350098, "rewards/margins": 0.2369869500398636, "rewards/rejected": -2.2496161460876465, "step": 1480 }, { "epoch": 1.0734870317002883, "grad_norm": 19.98068478862068, "learning_rate": 4.054741393867306e-08, "logits/chosen": -1.99558424949646, "logits/rejected": -1.9926925897598267, "logps/chosen": -1.1117796897888184, "logps/rejected": -1.1623036861419678, "loss": 1.2882, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.2235593795776367, "rewards/margins": 0.10104763507843018, "rewards/rejected": -2.3246073722839355, "step": 1490 }, { "epoch": 1.080691642651297, "grad_norm": 18.569188294062595, "learning_rate": 4.038272613266419e-08, "logits/chosen": -1.9959461688995361, "logits/rejected": -1.9826500415802002, "logps/chosen": -1.0095350742340088, "logps/rejected": -1.1202278137207031, "loss": 1.2023, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0190701484680176, "rewards/margins": 0.22138550877571106, "rewards/rejected": -2.2404556274414062, "step": 1500 }, { "epoch": 1.0878962536023056, "grad_norm": 18.129783454014866, "learning_rate": 4.0216956984526784e-08, "logits/chosen": -2.04606032371521, "logits/rejected": -2.047947406768799, "logps/chosen": -1.0161449909210205, "logps/rejected": -1.124267339706421, "loss": 1.2167, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.032289981842041, "rewards/margins": 0.21624493598937988, "rewards/rejected": -2.248534679412842, "step": 1510 }, { "epoch": 1.0951008645533142, "grad_norm": 16.171374987629033, "learning_rate": 4.0050118147147446e-08, "logits/chosen": -1.9890464544296265, "logits/rejected": -1.989335298538208, "logps/chosen": -1.0982977151870728, "logps/rejected": -1.110621690750122, "loss": 1.3393, "rewards/accuracies": 0.4375, "rewards/chosen": -2.1965954303741455, "rewards/margins": 0.02464829757809639, "rewards/rejected": -2.221243381500244, "step": 1520 }, { "epoch": 1.1023054755043227, "grad_norm": 17.66132069219183, "learning_rate": 3.988222134860755e-08, "logits/chosen": -2.0323548316955566, "logits/rejected": -2.0236430168151855, "logps/chosen": -0.9508152008056641, "logps/rejected": -1.1158647537231445, "loss": 1.1407, "rewards/accuracies": 0.625, "rewards/chosen": -1.9016304016113281, "rewards/margins": 0.33009934425354004, "rewards/rejected": -2.231729507446289, "step": 1530 }, { "epoch": 1.1095100864553313, "grad_norm": 23.676130358664636, "learning_rate": 3.9713278391358724e-08, "logits/chosen": -2.0360183715820312, "logits/rejected": -2.0298333168029785, "logps/chosen": -1.025137186050415, "logps/rejected": -1.1484403610229492, "loss": 1.1877, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.05027437210083, "rewards/margins": 0.24660632014274597, "rewards/rejected": -2.2968807220458984, "step": 1540 }, { "epoch": 1.11671469740634, "grad_norm": 17.77840056029204, "learning_rate": 3.954330115139328e-08, "logits/chosen": -2.0122570991516113, "logits/rejected": -2.0070974826812744, "logps/chosen": -1.0277431011199951, "logps/rejected": -1.1330978870391846, "loss": 1.2216, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0554862022399902, "rewards/margins": 0.21070995926856995, "rewards/rejected": -2.266195774078369, "step": 1550 }, { "epoch": 1.1239193083573487, "grad_norm": 25.812098081681867, "learning_rate": 3.937230157740931e-08, "logits/chosen": -2.067347764968872, "logits/rejected": -2.0611376762390137, "logps/chosen": -1.0478734970092773, "logps/rejected": -1.1832859516143799, "loss": 1.1824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0957469940185547, "rewards/margins": 0.2708250880241394, "rewards/rejected": -2.3665719032287598, "step": 1560 }, { "epoch": 1.1311239193083573, "grad_norm": 16.22328310375803, "learning_rate": 3.920029168997077e-08, "logits/chosen": -2.04835844039917, "logits/rejected": -2.04648494720459, "logps/chosen": -1.0037837028503418, "logps/rejected": -1.131502628326416, "loss": 1.1863, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0075674057006836, "rewards/margins": 0.25543779134750366, "rewards/rejected": -2.263005256652832, "step": 1570 }, { "epoch": 1.138328530259366, "grad_norm": 29.81353401958458, "learning_rate": 3.9027283580662476e-08, "logits/chosen": -2.0225307941436768, "logits/rejected": -2.0166521072387695, "logps/chosen": -1.0478241443634033, "logps/rejected": -1.1936235427856445, "loss": 1.1765, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.0956482887268066, "rewards/margins": 0.2915985882282257, "rewards/rejected": -2.387247085571289, "step": 1580 }, { "epoch": 1.1455331412103746, "grad_norm": 16.941588748106863, "learning_rate": 3.885328941124014e-08, "logits/chosen": -1.991965889930725, "logits/rejected": -1.9873225688934326, "logps/chosen": -0.9666957855224609, "logps/rejected": -1.1006277799606323, "loss": 1.1706, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.9333915710449219, "rewards/margins": 0.26786428689956665, "rewards/rejected": -2.2012555599212646, "step": 1590 }, { "epoch": 1.1527377521613833, "grad_norm": 20.82364621838478, "learning_rate": 3.867832141277539e-08, "logits/chosen": -2.0321202278137207, "logits/rejected": -2.0232601165771484, "logps/chosen": -1.0682156085968018, "logps/rejected": -1.180410623550415, "loss": 1.2096, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.1364312171936035, "rewards/margins": 0.2243901491165161, "rewards/rejected": -2.36082124710083, "step": 1600 }, { "epoch": 1.159942363112392, "grad_norm": 20.912686096120964, "learning_rate": 3.850239188479606e-08, "logits/chosen": -1.9847033023834229, "logits/rejected": -1.9881378412246704, "logps/chosen": -1.0096313953399658, "logps/rejected": -1.1002733707427979, "loss": 1.2372, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0192627906799316, "rewards/margins": 0.18128342926502228, "rewards/rejected": -2.2005467414855957, "step": 1610 }, { "epoch": 1.1671469740634006, "grad_norm": 21.899733424702635, "learning_rate": 3.832551319442151e-08, "logits/chosen": -2.0586349964141846, "logits/rejected": -2.059906482696533, "logps/chosen": -1.057755708694458, "logps/rejected": -1.184890627861023, "loss": 1.1897, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.115511417388916, "rewards/margins": 0.254270076751709, "rewards/rejected": -2.369781255722046, "step": 1620 }, { "epoch": 1.1743515850144093, "grad_norm": 17.2910410178799, "learning_rate": 3.81476977754933e-08, "logits/chosen": -1.9560763835906982, "logits/rejected": -1.9524368047714233, "logps/chosen": -1.0269958972930908, "logps/rejected": -1.0969812870025635, "loss": 1.2579, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0539917945861816, "rewards/margins": 0.13997015357017517, "rewards/rejected": -2.193962574005127, "step": 1630 }, { "epoch": 1.181556195965418, "grad_norm": 16.85537517324203, "learning_rate": 3.796895812770114e-08, "logits/chosen": -1.9784526824951172, "logits/rejected": -1.9793262481689453, "logps/chosen": -1.0173704624176025, "logps/rejected": -1.1091585159301758, "loss": 1.2408, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.034740924835205, "rewards/margins": 0.1835760474205017, "rewards/rejected": -2.2183170318603516, "step": 1640 }, { "epoch": 1.1887608069164266, "grad_norm": 22.175531020521074, "learning_rate": 3.7789306815704216e-08, "logits/chosen": -2.009108781814575, "logits/rejected": -2.006824254989624, "logps/chosen": -1.0072455406188965, "logps/rejected": -1.0781667232513428, "loss": 1.2618, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.014491081237793, "rewards/margins": 0.14184223115444183, "rewards/rejected": -2.1563334465026855, "step": 1650 }, { "epoch": 1.195965417867435, "grad_norm": 18.89404553225258, "learning_rate": 3.760875646824795e-08, "logits/chosen": -1.932428002357483, "logits/rejected": -1.9363291263580322, "logps/chosen": -0.9747514724731445, "logps/rejected": -1.0793020725250244, "loss": 1.2232, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.949502944946289, "rewards/margins": 0.20910124480724335, "rewards/rejected": -2.158604145050049, "step": 1660 }, { "epoch": 1.2031700288184437, "grad_norm": 22.227342153467788, "learning_rate": 3.742731977727623e-08, "logits/chosen": -2.0331270694732666, "logits/rejected": -2.0301709175109863, "logps/chosen": -1.0413671731948853, "logps/rejected": -1.1772552728652954, "loss": 1.1795, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0827343463897705, "rewards/margins": 0.27177631855010986, "rewards/rejected": -2.354510545730591, "step": 1670 }, { "epoch": 1.2103746397694524, "grad_norm": 19.70333261721218, "learning_rate": 3.7245009497039244e-08, "logits/chosen": -1.970715880393982, "logits/rejected": -1.9627761840820312, "logps/chosen": -1.0116103887557983, "logps/rejected": -1.1484659910202026, "loss": 1.1716, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.0232207775115967, "rewards/margins": 0.2737112045288086, "rewards/rejected": -2.2969319820404053, "step": 1680 }, { "epoch": 1.217579250720461, "grad_norm": 18.86128397711634, "learning_rate": 3.7061838443196886e-08, "logits/chosen": -2.0151665210723877, "logits/rejected": -2.016679048538208, "logps/chosen": -1.026761531829834, "logps/rejected": -1.150320053100586, "loss": 1.1888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.053523063659668, "rewards/margins": 0.2471170723438263, "rewards/rejected": -2.300640106201172, "step": 1690 }, { "epoch": 1.2247838616714697, "grad_norm": 22.70930817597516, "learning_rate": 3.68778194919179e-08, "logits/chosen": -1.983304738998413, "logits/rejected": -1.984287977218628, "logps/chosen": -1.0792837142944336, "logps/rejected": -1.2009527683258057, "loss": 1.1955, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.158567428588867, "rewards/margins": 0.24333825707435608, "rewards/rejected": -2.4019055366516113, "step": 1700 }, { "epoch": 1.2319884726224783, "grad_norm": 20.280682845222326, "learning_rate": 3.66929655789747e-08, "logits/chosen": -2.0337467193603516, "logits/rejected": -2.0225701332092285, "logps/chosen": -0.9402590990066528, "logps/rejected": -1.0919773578643799, "loss": 1.1634, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8805181980133057, "rewards/margins": 0.30343663692474365, "rewards/rejected": -2.1839547157287598, "step": 1710 }, { "epoch": 1.239193083573487, "grad_norm": 16.359613747281564, "learning_rate": 3.6507289698834064e-08, "logits/chosen": -1.9774224758148193, "logits/rejected": -1.973842978477478, "logps/chosen": -0.98408442735672, "logps/rejected": -1.1161837577819824, "loss": 1.196, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.96816885471344, "rewards/margins": 0.2641984820365906, "rewards/rejected": -2.232367515563965, "step": 1720 }, { "epoch": 1.2463976945244957, "grad_norm": 25.191044914408238, "learning_rate": 3.6320804903743684e-08, "logits/chosen": -2.026642322540283, "logits/rejected": -2.0262598991394043, "logps/chosen": -1.0340476036071777, "logps/rejected": -1.1598145961761475, "loss": 1.1976, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0680952072143555, "rewards/margins": 0.251534104347229, "rewards/rejected": -2.319629192352295, "step": 1730 }, { "epoch": 1.2536023054755043, "grad_norm": 17.23248745457562, "learning_rate": 3.61335243028146e-08, "logits/chosen": -2.0114941596984863, "logits/rejected": -2.016153573989868, "logps/chosen": -1.092045783996582, "logps/rejected": -1.2228668928146362, "loss": 1.1916, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.184091567993164, "rewards/margins": 0.2616419494152069, "rewards/rejected": -2.4457337856292725, "step": 1740 }, { "epoch": 1.260806916426513, "grad_norm": 18.437527072676268, "learning_rate": 3.5945461061099736e-08, "logits/chosen": -1.972100853919983, "logits/rejected": -1.9586093425750732, "logps/chosen": -1.0443975925445557, "logps/rejected": -1.1218501329421997, "loss": 1.2706, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.0887951850891113, "rewards/margins": 0.15490522980690002, "rewards/rejected": -2.2437002658843994, "step": 1750 }, { "epoch": 1.2680115273775217, "grad_norm": 19.695402848445642, "learning_rate": 3.5756628398668446e-08, "logits/chosen": -2.0573649406433105, "logits/rejected": -2.0625429153442383, "logps/chosen": -1.1325814723968506, "logps/rejected": -1.232399821281433, "loss": 1.2403, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.265162944793701, "rewards/margins": 0.19963672757148743, "rewards/rejected": -2.464799642562866, "step": 1760 }, { "epoch": 1.2752161383285303, "grad_norm": 17.758331420145563, "learning_rate": 3.556703958967716e-08, "logits/chosen": -2.041581630706787, "logits/rejected": -2.036958694458008, "logps/chosen": -1.0513150691986084, "logps/rejected": -1.1853423118591309, "loss": 1.1882, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.102630138397217, "rewards/margins": 0.2680542469024658, "rewards/rejected": -2.3706846237182617, "step": 1770 }, { "epoch": 1.282420749279539, "grad_norm": 24.11832525210908, "learning_rate": 3.5376707961436297e-08, "logits/chosen": -2.025054454803467, "logits/rejected": -2.019120693206787, "logps/chosen": -1.1408239603042603, "logps/rejected": -1.202470064163208, "loss": 1.2726, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.2816479206085205, "rewards/margins": 0.12329187244176865, "rewards/rejected": -2.404940128326416, "step": 1780 }, { "epoch": 1.2896253602305476, "grad_norm": 12.89708045158757, "learning_rate": 3.51856468934734e-08, "logits/chosen": -1.9773098230361938, "logits/rejected": -1.9786754846572876, "logps/chosen": -0.9762522578239441, "logps/rejected": -1.0697864294052124, "loss": 1.2234, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9525045156478882, "rewards/margins": 0.18706828355789185, "rewards/rejected": -2.139572858810425, "step": 1790 }, { "epoch": 1.2968299711815563, "grad_norm": 20.10190857160128, "learning_rate": 3.499386981659262e-08, "logits/chosen": -2.0595974922180176, "logits/rejected": -2.0540311336517334, "logps/chosen": -1.0190843343734741, "logps/rejected": -1.208898901939392, "loss": 1.1243, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.0381686687469482, "rewards/margins": 0.3796289563179016, "rewards/rejected": -2.417797803878784, "step": 1800 }, { "epoch": 1.304034582132565, "grad_norm": 20.878615577501385, "learning_rate": 3.480139021193057e-08, "logits/chosen": -1.9839977025985718, "logits/rejected": -1.9858938455581665, "logps/chosen": -0.9964865446090698, "logps/rejected": -1.1170662641525269, "loss": 1.212, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9929730892181396, "rewards/margins": 0.24115952849388123, "rewards/rejected": -2.2341325283050537, "step": 1810 }, { "epoch": 1.3112391930835736, "grad_norm": 28.47013732688272, "learning_rate": 3.4608221610008666e-08, "logits/chosen": -2.0153214931488037, "logits/rejected": -2.010758876800537, "logps/chosen": -0.9736091494560242, "logps/rejected": -1.120499849319458, "loss": 1.1707, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.9472182989120483, "rewards/margins": 0.29378125071525574, "rewards/rejected": -2.240999698638916, "step": 1820 }, { "epoch": 1.318443804034582, "grad_norm": 15.221657015785182, "learning_rate": 3.4414377589782e-08, "logits/chosen": -1.9868896007537842, "logits/rejected": -1.9957456588745117, "logps/chosen": -1.0180634260177612, "logps/rejected": -1.150467872619629, "loss": 1.1966, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0361268520355225, "rewards/margins": 0.26480910181999207, "rewards/rejected": -2.300935745239258, "step": 1830 }, { "epoch": 1.3256484149855907, "grad_norm": 18.190653029469026, "learning_rate": 3.4219871777684745e-08, "logits/chosen": -1.9982116222381592, "logits/rejected": -1.9859825372695923, "logps/chosen": -0.9929243922233582, "logps/rejected": -1.1142441034317017, "loss": 1.2076, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9858487844467163, "rewards/margins": 0.24263925850391388, "rewards/rejected": -2.2284882068634033, "step": 1840 }, { "epoch": 1.3328530259365994, "grad_norm": 17.791029774645512, "learning_rate": 3.4024717846672364e-08, "logits/chosen": -2.0318691730499268, "logits/rejected": -2.025087833404541, "logps/chosen": -0.9934013485908508, "logps/rejected": -1.1215975284576416, "loss": 1.1959, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.9868026971817017, "rewards/margins": 0.25639256834983826, "rewards/rejected": -2.243195056915283, "step": 1850 }, { "epoch": 1.340057636887608, "grad_norm": 17.7783196169273, "learning_rate": 3.382892951526036e-08, "logits/chosen": -2.0219979286193848, "logits/rejected": -2.0191798210144043, "logps/chosen": -1.0518848896026611, "logps/rejected": -1.20078444480896, "loss": 1.1628, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1037697792053223, "rewards/margins": 0.2977990508079529, "rewards/rejected": -2.40156888961792, "step": 1860 }, { "epoch": 1.3472622478386167, "grad_norm": 20.2114199388819, "learning_rate": 3.3632520546559974e-08, "logits/chosen": -1.9855273962020874, "logits/rejected": -1.9737205505371094, "logps/chosen": -0.926679253578186, "logps/rejected": -1.0955464839935303, "loss": 1.1271, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.853358507156372, "rewards/margins": 0.3377344310283661, "rewards/rejected": -2.1910929679870605, "step": 1870 }, { "epoch": 1.3544668587896254, "grad_norm": 19.579421951203443, "learning_rate": 3.34355047473107e-08, "logits/chosen": -1.9991194009780884, "logits/rejected": -1.9949671030044556, "logps/chosen": -1.0290135145187378, "logps/rejected": -1.1186621189117432, "loss": 1.2445, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0580270290374756, "rewards/margins": 0.1792970895767212, "rewards/rejected": -2.2373242378234863, "step": 1880 }, { "epoch": 1.361671469740634, "grad_norm": 22.936331468503273, "learning_rate": 3.323789596690971e-08, "logits/chosen": -1.966144323348999, "logits/rejected": -1.9670454263687134, "logps/chosen": -1.0209920406341553, "logps/rejected": -1.1551681756973267, "loss": 1.1787, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0419840812683105, "rewards/margins": 0.26835212111473083, "rewards/rejected": -2.3103363513946533, "step": 1890 }, { "epoch": 1.3688760806916427, "grad_norm": 15.801647380635032, "learning_rate": 3.303970809643828e-08, "logits/chosen": -1.998286247253418, "logits/rejected": -2.0028045177459717, "logps/chosen": -1.0353937149047852, "logps/rejected": -1.1643174886703491, "loss": 1.1927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0707874298095703, "rewards/margins": 0.25784778594970703, "rewards/rejected": -2.3286349773406982, "step": 1900 }, { "epoch": 1.3760806916426513, "grad_norm": 20.693514419325513, "learning_rate": 3.2840955067685356e-08, "logits/chosen": -2.031480312347412, "logits/rejected": -2.035548686981201, "logps/chosen": -1.0550123453140259, "logps/rejected": -1.2029016017913818, "loss": 1.1631, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.1100246906280518, "rewards/margins": 0.29577863216400146, "rewards/rejected": -2.4058032035827637, "step": 1910 }, { "epoch": 1.38328530259366, "grad_norm": 16.858093329362955, "learning_rate": 3.264165085216817e-08, "logits/chosen": -2.038879871368408, "logits/rejected": -2.0388429164886475, "logps/chosen": -0.9352089166641235, "logps/rejected": -1.10355544090271, "loss": 1.1401, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.870417833328247, "rewards/margins": 0.33669325709342957, "rewards/rejected": -2.20711088180542, "step": 1920 }, { "epoch": 1.3904899135446687, "grad_norm": 18.690349536010206, "learning_rate": 3.244180946015008e-08, "logits/chosen": -1.966835618019104, "logits/rejected": -1.967462182044983, "logps/chosen": -1.0351486206054688, "logps/rejected": -1.0991723537445068, "loss": 1.273, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.0702972412109375, "rewards/margins": 0.12804751098155975, "rewards/rejected": -2.1983447074890137, "step": 1930 }, { "epoch": 1.397694524495677, "grad_norm": 15.348372078288971, "learning_rate": 3.224144493965578e-08, "logits/chosen": -2.0523886680603027, "logits/rejected": -2.0558857917785645, "logps/chosen": -0.9908173680305481, "logps/rejected": -1.0954809188842773, "loss": 1.2177, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9816347360610962, "rewards/margins": 0.20932729542255402, "rewards/rejected": -2.1909618377685547, "step": 1940 }, { "epoch": 1.4048991354466858, "grad_norm": 17.879874010257755, "learning_rate": 3.204057137548371e-08, "logits/chosen": -2.014993667602539, "logits/rejected": -2.0096094608306885, "logps/chosen": -0.9776601791381836, "logps/rejected": -1.0827131271362305, "loss": 1.2165, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9553203582763672, "rewards/margins": 0.2101059854030609, "rewards/rejected": -2.165426254272461, "step": 1950 }, { "epoch": 1.4121037463976944, "grad_norm": 19.498418734777132, "learning_rate": 3.183920288821597e-08, "logits/chosen": -1.9974403381347656, "logits/rejected": -1.9938675165176392, "logps/chosen": -1.002251386642456, "logps/rejected": -1.1633012294769287, "loss": 1.1473, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.004502773284912, "rewards/margins": 0.32209956645965576, "rewards/rejected": -2.3266024589538574, "step": 1960 }, { "epoch": 1.419308357348703, "grad_norm": 23.526801289262714, "learning_rate": 3.1637353633225735e-08, "logits/chosen": -2.0382392406463623, "logits/rejected": -2.0321145057678223, "logps/chosen": -1.0285365581512451, "logps/rejected": -1.1749916076660156, "loss": 1.1708, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0570731163024902, "rewards/margins": 0.2929099202156067, "rewards/rejected": -2.3499832153320312, "step": 1970 }, { "epoch": 1.4265129682997117, "grad_norm": 19.581143803282398, "learning_rate": 3.143503779968213e-08, "logits/chosen": -2.0113444328308105, "logits/rejected": -2.011580467224121, "logps/chosen": -1.0159164667129517, "logps/rejected": -1.1518559455871582, "loss": 1.196, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0318329334259033, "rewards/margins": 0.2718789875507355, "rewards/rejected": -2.3037118911743164, "step": 1980 }, { "epoch": 1.4337175792507204, "grad_norm": 18.05404403193421, "learning_rate": 3.1232269609552875e-08, "logits/chosen": -1.9945173263549805, "logits/rejected": -1.9919058084487915, "logps/chosen": -0.9980915188789368, "logps/rejected": -1.1186559200286865, "loss": 1.2014, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9961830377578735, "rewards/margins": 0.24112899601459503, "rewards/rejected": -2.237311840057373, "step": 1990 }, { "epoch": 1.440922190201729, "grad_norm": 16.09307467422962, "learning_rate": 3.102906331660444e-08, "logits/chosen": -2.0536019802093506, "logits/rejected": -2.045327663421631, "logps/chosen": -0.9929038882255554, "logps/rejected": -1.1643650531768799, "loss": 1.1353, "rewards/accuracies": 0.625, "rewards/chosen": -1.9858077764511108, "rewards/margins": 0.34292247891426086, "rewards/rejected": -2.3287301063537598, "step": 2000 }, { "epoch": 1.4481268011527377, "grad_norm": 16.081259631225404, "learning_rate": 3.082543320540015e-08, "logits/chosen": -1.9962193965911865, "logits/rejected": -1.9890375137329102, "logps/chosen": -1.0065879821777344, "logps/rejected": -1.1499404907226562, "loss": 1.1679, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0131759643554688, "rewards/margins": 0.2867050766944885, "rewards/rejected": -2.2998809814453125, "step": 2010 }, { "epoch": 1.4553314121037464, "grad_norm": 18.028275293304183, "learning_rate": 3.062139359029599e-08, "logits/chosen": -2.031736373901367, "logits/rejected": -2.0316202640533447, "logps/chosen": -1.0291239023208618, "logps/rejected": -1.1133326292037964, "loss": 1.2476, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.0582478046417236, "rewards/margins": 0.16841746866703033, "rewards/rejected": -2.2266652584075928, "step": 2020 }, { "epoch": 1.462536023054755, "grad_norm": 18.739986191205507, "learning_rate": 3.041695881443437e-08, "logits/chosen": -2.0472700595855713, "logits/rejected": -2.0425424575805664, "logps/chosen": -0.9730477333068848, "logps/rejected": -1.1086480617523193, "loss": 1.1771, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.9460954666137695, "rewards/margins": 0.271200567483902, "rewards/rejected": -2.2172961235046387, "step": 2030 }, { "epoch": 1.4697406340057637, "grad_norm": 22.13845084834241, "learning_rate": 3.0212143248735886e-08, "logits/chosen": -2.0294270515441895, "logits/rejected": -2.029846668243408, "logps/chosen": -0.9991294741630554, "logps/rejected": -1.1360986232757568, "loss": 1.1761, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9982589483261108, "rewards/margins": 0.27393826842308044, "rewards/rejected": -2.2721972465515137, "step": 2040 }, { "epoch": 1.4769452449567724, "grad_norm": 19.822466302624346, "learning_rate": 3.0006961290889077e-08, "logits/chosen": -2.0190889835357666, "logits/rejected": -2.0099661350250244, "logps/chosen": -1.1185331344604492, "logps/rejected": -1.286892056465149, "loss": 1.1647, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.2370662689208984, "rewards/margins": 0.33671754598617554, "rewards/rejected": -2.573784112930298, "step": 2050 }, { "epoch": 1.484149855907781, "grad_norm": 21.925715491881135, "learning_rate": 2.980142736433833e-08, "logits/chosen": -2.01119327545166, "logits/rejected": -2.004316806793213, "logps/chosen": -1.0309051275253296, "logps/rejected": -1.0949158668518066, "loss": 1.2751, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.061810255050659, "rewards/margins": 0.12802138924598694, "rewards/rejected": -2.1898317337036133, "step": 2060 }, { "epoch": 1.4913544668587897, "grad_norm": 24.46772736032293, "learning_rate": 2.9595555917269997e-08, "logits/chosen": -2.03961181640625, "logits/rejected": -2.0247714519500732, "logps/chosen": -1.1411329507827759, "logps/rejected": -1.2373685836791992, "loss": 1.2153, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.2822659015655518, "rewards/margins": 0.19247153401374817, "rewards/rejected": -2.4747371673583984, "step": 2070 }, { "epoch": 1.4985590778097984, "grad_norm": 18.449968646671344, "learning_rate": 2.9389361421596725e-08, "logits/chosen": -1.9533805847167969, "logits/rejected": -1.9556515216827393, "logps/chosen": -1.0595835447311401, "logps/rejected": -1.1933083534240723, "loss": 1.1832, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1191670894622803, "rewards/margins": 0.2674497365951538, "rewards/rejected": -2.3866167068481445, "step": 2080 }, { "epoch": 1.505763688760807, "grad_norm": 20.34455177562933, "learning_rate": 2.9182858371940126e-08, "logits/chosen": -2.0372543334960938, "logits/rejected": -2.031832218170166, "logps/chosen": -1.0473064184188843, "logps/rejected": -1.1757621765136719, "loss": 1.188, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0946128368377686, "rewards/margins": 0.25691163539886475, "rewards/rejected": -2.3515243530273438, "step": 2090 }, { "epoch": 1.5129682997118157, "grad_norm": 18.90759740416456, "learning_rate": 2.8976061284611908e-08, "logits/chosen": -1.9889026880264282, "logits/rejected": -1.9977174997329712, "logps/chosen": -0.9364235997200012, "logps/rejected": -1.0655431747436523, "loss": 1.1944, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8728471994400024, "rewards/margins": 0.2582393288612366, "rewards/rejected": -2.1310863494873047, "step": 2100 }, { "epoch": 1.5201729106628243, "grad_norm": 21.434032214198695, "learning_rate": 2.8768984696593384e-08, "logits/chosen": -1.9844554662704468, "logits/rejected": -1.974907636642456, "logps/chosen": -1.0168864727020264, "logps/rejected": -1.1343626976013184, "loss": 1.2154, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0337729454040527, "rewards/margins": 0.23495233058929443, "rewards/rejected": -2.2687253952026367, "step": 2110 }, { "epoch": 1.527377521613833, "grad_norm": 18.047284778863265, "learning_rate": 2.8561643164513637e-08, "logits/chosen": -1.9067440032958984, "logits/rejected": -1.9029529094696045, "logps/chosen": -1.0492842197418213, "logps/rejected": -1.1676268577575684, "loss": 1.2013, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0985684394836426, "rewards/margins": 0.23668520152568817, "rewards/rejected": -2.3352537155151367, "step": 2120 }, { "epoch": 1.5345821325648417, "grad_norm": 18.95635925899202, "learning_rate": 2.8354051263626227e-08, "logits/chosen": -1.9887434244155884, "logits/rejected": -1.994476556777954, "logps/chosen": -1.0596574544906616, "logps/rejected": -1.1733436584472656, "loss": 1.206, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1193149089813232, "rewards/margins": 0.22737233340740204, "rewards/rejected": -2.3466873168945312, "step": 2130 }, { "epoch": 1.54178674351585, "grad_norm": 19.584229312796637, "learning_rate": 2.8146223586784573e-08, "logits/chosen": -1.9805008172988892, "logits/rejected": -1.9726234674453735, "logps/chosen": -1.0646823644638062, "logps/rejected": -1.1987252235412598, "loss": 1.1873, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1293647289276123, "rewards/margins": 0.26808565855026245, "rewards/rejected": -2.3974504470825195, "step": 2140 }, { "epoch": 1.5489913544668588, "grad_norm": 25.50415369546022, "learning_rate": 2.7938174743416205e-08, "logits/chosen": -1.9369735717773438, "logits/rejected": -1.933683156967163, "logps/chosen": -1.050445795059204, "logps/rejected": -1.1607348918914795, "loss": 1.2135, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.100891590118408, "rewards/margins": 0.220577672123909, "rewards/rejected": -2.321469783782959, "step": 2150 }, { "epoch": 1.5561959654178674, "grad_norm": 19.684619038178205, "learning_rate": 2.7729919358495728e-08, "logits/chosen": -2.005277395248413, "logits/rejected": -2.0062077045440674, "logps/chosen": -1.1121950149536133, "logps/rejected": -1.19098699092865, "loss": 1.2586, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.2243900299072266, "rewards/margins": 0.1575840413570404, "rewards/rejected": -2.3819739818573, "step": 2160 }, { "epoch": 1.563400576368876, "grad_norm": 19.293166467927325, "learning_rate": 2.7521472071516772e-08, "logits/chosen": -2.0027170181274414, "logits/rejected": -2.0016961097717285, "logps/chosen": -0.9449695348739624, "logps/rejected": -1.0605154037475586, "loss": 1.2076, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.8899390697479248, "rewards/margins": 0.2310914546251297, "rewards/rejected": -2.121030807495117, "step": 2170 }, { "epoch": 1.5706051873198847, "grad_norm": 22.062496687144794, "learning_rate": 2.731284753546289e-08, "logits/chosen": -1.9856891632080078, "logits/rejected": -1.9836666584014893, "logps/chosen": -1.081839919090271, "logps/rejected": -1.2224990129470825, "loss": 1.1741, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.163679838180542, "rewards/margins": 0.2813180387020111, "rewards/rejected": -2.444998025894165, "step": 2180 }, { "epoch": 1.5778097982708934, "grad_norm": 21.803351526445823, "learning_rate": 2.710406041577751e-08, "logits/chosen": -2.04976224899292, "logits/rejected": -2.0463500022888184, "logps/chosen": -1.0325794219970703, "logps/rejected": -1.1858645677566528, "loss": 1.1631, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0651588439941406, "rewards/margins": 0.3065701127052307, "rewards/rejected": -2.3717291355133057, "step": 2190 }, { "epoch": 1.585014409221902, "grad_norm": 18.0281741107113, "learning_rate": 2.6895125389333017e-08, "logits/chosen": -2.0131421089172363, "logits/rejected": -2.0089025497436523, "logps/chosen": -1.0270203351974487, "logps/rejected": -1.177971363067627, "loss": 1.1622, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0540406703948975, "rewards/margins": 0.3019018769264221, "rewards/rejected": -2.355942726135254, "step": 2200 }, { "epoch": 1.5922190201729105, "grad_norm": 17.129921710950377, "learning_rate": 2.6686057143399028e-08, "logits/chosen": -2.010429620742798, "logits/rejected": -2.0119571685791016, "logps/chosen": -1.0614731311798096, "logps/rejected": -1.1593468189239502, "loss": 1.2433, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.122946262359619, "rewards/margins": 0.1957472264766693, "rewards/rejected": -2.3186936378479004, "step": 2210 }, { "epoch": 1.5994236311239192, "grad_norm": 19.402506516811066, "learning_rate": 2.647687037460996e-08, "logits/chosen": -2.0160350799560547, "logits/rejected": -2.0153958797454834, "logps/chosen": -1.0873758792877197, "logps/rejected": -1.2827941179275513, "loss": 1.1246, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1747517585754395, "rewards/margins": 0.3908364176750183, "rewards/rejected": -2.5655882358551025, "step": 2220 }, { "epoch": 1.6066282420749278, "grad_norm": 20.370963061014333, "learning_rate": 2.626757978793187e-08, "logits/chosen": -2.0244648456573486, "logits/rejected": -2.0181853771209717, "logps/chosen": -1.0852004289627075, "logps/rejected": -1.2089064121246338, "loss": 1.2036, "rewards/accuracies": 0.625, "rewards/chosen": -2.170400857925415, "rewards/margins": 0.24741193652153015, "rewards/rejected": -2.4178128242492676, "step": 2230 }, { "epoch": 1.6138328530259365, "grad_norm": 23.538795309630903, "learning_rate": 2.6058200095628797e-08, "logits/chosen": -1.9968335628509521, "logits/rejected": -2.000123977661133, "logps/chosen": -0.9178045392036438, "logps/rejected": -1.086455225944519, "loss": 1.1446, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.8356090784072876, "rewards/margins": 0.3373013734817505, "rewards/rejected": -2.172910451889038, "step": 2240 }, { "epoch": 1.6210374639769451, "grad_norm": 18.7834477811749, "learning_rate": 2.584874601622854e-08, "logits/chosen": -2.0577359199523926, "logits/rejected": -2.0486464500427246, "logps/chosen": -1.0842779874801636, "logps/rejected": -1.2169630527496338, "loss": 1.2055, "rewards/accuracies": 0.5625, "rewards/chosen": -2.168555974960327, "rewards/margins": 0.26537027955055237, "rewards/rejected": -2.4339261054992676, "step": 2250 }, { "epoch": 1.6282420749279538, "grad_norm": 21.50129735883824, "learning_rate": 2.5639232273487993e-08, "logits/chosen": -1.9792057275772095, "logits/rejected": -1.9694305658340454, "logps/chosen": -0.9786102175712585, "logps/rejected": -1.0999042987823486, "loss": 1.2022, "rewards/accuracies": 0.59375, "rewards/chosen": -1.957220435142517, "rewards/margins": 0.24258823692798615, "rewards/rejected": -2.1998085975646973, "step": 2260 }, { "epoch": 1.6354466858789625, "grad_norm": 20.836699972853967, "learning_rate": 2.5429673595358142e-08, "logits/chosen": -2.0185582637786865, "logits/rejected": -2.0170459747314453, "logps/chosen": -1.043128490447998, "logps/rejected": -1.165533185005188, "loss": 1.1967, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.086256980895996, "rewards/margins": 0.24480919539928436, "rewards/rejected": -2.331066370010376, "step": 2270 }, { "epoch": 1.6426512968299711, "grad_norm": 23.73080611195804, "learning_rate": 2.5220084712948764e-08, "logits/chosen": -1.9826107025146484, "logits/rejected": -1.9717817306518555, "logps/chosen": -1.1198116540908813, "logps/rejected": -1.2381196022033691, "loss": 1.1906, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.2396233081817627, "rewards/margins": 0.23661574721336365, "rewards/rejected": -2.4762392044067383, "step": 2280 }, { "epoch": 1.6498559077809798, "grad_norm": 19.691578624312058, "learning_rate": 2.5010480359492838e-08, "logits/chosen": -1.9650003910064697, "logits/rejected": -1.9621715545654297, "logps/chosen": -1.0505197048187256, "logps/rejected": -1.1106680631637573, "loss": 1.2861, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.101039409637451, "rewards/margins": 0.12029679119586945, "rewards/rejected": -2.2213361263275146, "step": 2290 }, { "epoch": 1.6570605187319885, "grad_norm": 21.239713120458195, "learning_rate": 2.480087526931091e-08, "logits/chosen": -2.0088658332824707, "logits/rejected": -1.9966083765029907, "logps/chosen": -1.0031955242156982, "logps/rejected": -1.1170064210891724, "loss": 1.2166, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0063910484313965, "rewards/margins": 0.22762183845043182, "rewards/rejected": -2.2340128421783447, "step": 2300 }, { "epoch": 1.6642651296829971, "grad_norm": 19.4121166793283, "learning_rate": 2.4591284176775326e-08, "logits/chosen": -1.9742721319198608, "logits/rejected": -1.9704244136810303, "logps/chosen": -1.07572340965271, "logps/rejected": -1.159128189086914, "loss": 1.2565, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.15144681930542, "rewards/margins": 0.16680975258350372, "rewards/rejected": -2.318256378173828, "step": 2310 }, { "epoch": 1.6714697406340058, "grad_norm": 21.976079747343572, "learning_rate": 2.4381721815274443e-08, "logits/chosen": -2.0400168895721436, "logits/rejected": -2.0402297973632812, "logps/chosen": -1.019706130027771, "logps/rejected": -1.1512401103973389, "loss": 1.1928, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.039412260055542, "rewards/margins": 0.26306766271591187, "rewards/rejected": -2.3024802207946777, "step": 2320 }, { "epoch": 1.6786743515850144, "grad_norm": 19.583839102475277, "learning_rate": 2.4172202916176936e-08, "logits/chosen": -2.0487046241760254, "logits/rejected": -2.0510191917419434, "logps/chosen": -0.9676868319511414, "logps/rejected": -1.1354566812515259, "loss": 1.1567, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.9353736639022827, "rewards/margins": 0.3355395197868347, "rewards/rejected": -2.2709133625030518, "step": 2330 }, { "epoch": 1.685878962536023, "grad_norm": 19.19182662272249, "learning_rate": 2.3962742207796268e-08, "logits/chosen": -1.9858787059783936, "logits/rejected": -1.9837026596069336, "logps/chosen": -0.9570139050483704, "logps/rejected": -1.1192009449005127, "loss": 1.1603, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9140278100967407, "rewards/margins": 0.32437413930892944, "rewards/rejected": -2.2384018898010254, "step": 2340 }, { "epoch": 1.6930835734870318, "grad_norm": 22.691977894194924, "learning_rate": 2.3753354414355334e-08, "logits/chosen": -1.9461901187896729, "logits/rejected": -1.9355932474136353, "logps/chosen": -1.0649149417877197, "logps/rejected": -1.1817948818206787, "loss": 1.213, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1298298835754395, "rewards/margins": 0.2337600290775299, "rewards/rejected": -2.3635897636413574, "step": 2350 }, { "epoch": 1.7002881844380404, "grad_norm": 18.59945891396093, "learning_rate": 2.3544054254951408e-08, "logits/chosen": -1.9878515005111694, "logits/rejected": -1.9792087078094482, "logps/chosen": -0.9375497102737427, "logps/rejected": -1.1345303058624268, "loss": 1.114, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.8750994205474854, "rewards/margins": 0.393961638212204, "rewards/rejected": -2.2690606117248535, "step": 2360 }, { "epoch": 1.707492795389049, "grad_norm": 18.203541895462912, "learning_rate": 2.3334856442521435e-08, "logits/chosen": -2.0370235443115234, "logits/rejected": -2.0295424461364746, "logps/chosen": -1.0964655876159668, "logps/rejected": -1.166515588760376, "loss": 1.27, "rewards/accuracies": 0.53125, "rewards/chosen": -2.1929311752319336, "rewards/margins": 0.1401001363992691, "rewards/rejected": -2.333031177520752, "step": 2370 }, { "epoch": 1.7146974063400577, "grad_norm": 19.123513495613718, "learning_rate": 2.3125775682807826e-08, "logits/chosen": -2.0507147312164307, "logits/rejected": -2.0506680011749268, "logps/chosen": -1.1658060550689697, "logps/rejected": -1.2665237188339233, "loss": 1.2309, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.3316121101379395, "rewards/margins": 0.2014356553554535, "rewards/rejected": -2.5330474376678467, "step": 2380 }, { "epoch": 1.7219020172910664, "grad_norm": 20.583955091856193, "learning_rate": 2.291682667332464e-08, "logits/chosen": -2.0658912658691406, "logits/rejected": -2.0607848167419434, "logps/chosen": -1.0484416484832764, "logps/rejected": -1.1794416904449463, "loss": 1.1918, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0968832969665527, "rewards/margins": 0.2620001435279846, "rewards/rejected": -2.3588833808898926, "step": 2390 }, { "epoch": 1.729106628242075, "grad_norm": 15.255925002553854, "learning_rate": 2.2708024102324454e-08, "logits/chosen": -2.0251784324645996, "logits/rejected": -2.0195064544677734, "logps/chosen": -1.0335527658462524, "logps/rejected": -1.2097657918930054, "loss": 1.1498, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.067105531692505, "rewards/margins": 0.35242635011672974, "rewards/rejected": -2.4195315837860107, "step": 2400 }, { "epoch": 1.7363112391930837, "grad_norm": 22.44593573299748, "learning_rate": 2.2499382647765797e-08, "logits/chosen": -2.0198001861572266, "logits/rejected": -2.016092538833618, "logps/chosen": -1.0722882747650146, "logps/rejected": -1.161583662033081, "loss": 1.2463, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1445765495300293, "rewards/margins": 0.17859075963497162, "rewards/rejected": -2.323167324066162, "step": 2410 }, { "epoch": 1.7435158501440924, "grad_norm": 20.996477598226324, "learning_rate": 2.2290916976281427e-08, "logits/chosen": -1.997984528541565, "logits/rejected": -1.991624116897583, "logps/chosen": -0.9992947578430176, "logps/rejected": -1.1312335729599, "loss": 1.2149, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.9985895156860352, "rewards/margins": 0.26387742161750793, "rewards/rejected": -2.2624671459198, "step": 2420 }, { "epoch": 1.7507204610951008, "grad_norm": 18.145146158512926, "learning_rate": 2.2082641742147238e-08, "logits/chosen": -1.9863611459732056, "logits/rejected": -1.9797251224517822, "logps/chosen": -1.0165367126464844, "logps/rejected": -1.2077693939208984, "loss": 1.115, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0330734252929688, "rewards/margins": 0.38246554136276245, "rewards/rejected": -2.415538787841797, "step": 2430 }, { "epoch": 1.7579250720461095, "grad_norm": 20.511354788346416, "learning_rate": 2.1874571586252177e-08, "logits/chosen": -2.0291788578033447, "logits/rejected": -2.0222790241241455, "logps/chosen": -1.0278832912445068, "logps/rejected": -1.1068981885910034, "loss": 1.256, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0557665824890137, "rewards/margins": 0.15802964568138123, "rewards/rejected": -2.213796377182007, "step": 2440 }, { "epoch": 1.7651296829971181, "grad_norm": 20.78736849578736, "learning_rate": 2.1666721135069037e-08, "logits/chosen": -2.022594928741455, "logits/rejected": -2.019284725189209, "logps/chosen": -1.1104170083999634, "logps/rejected": -1.2043354511260986, "loss": 1.2436, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.2208340167999268, "rewards/margins": 0.18783698976039886, "rewards/rejected": -2.4086709022521973, "step": 2450 }, { "epoch": 1.7723342939481268, "grad_norm": 15.559026450288725, "learning_rate": 2.145910499962628e-08, "logits/chosen": -2.0644400119781494, "logits/rejected": -2.0565133094787598, "logps/chosen": -0.9585247039794922, "logps/rejected": -1.101301908493042, "loss": 1.1824, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9170494079589844, "rewards/margins": 0.28555426001548767, "rewards/rejected": -2.202603816986084, "step": 2460 }, { "epoch": 1.7795389048991355, "grad_norm": 23.88258329458798, "learning_rate": 2.1251737774480915e-08, "logits/chosen": -2.0456204414367676, "logits/rejected": -2.036010265350342, "logps/chosen": -1.1689999103546143, "logps/rejected": -1.2592300176620483, "loss": 1.2704, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.3379998207092285, "rewards/margins": 0.18046024441719055, "rewards/rejected": -2.5184600353240967, "step": 2470 }, { "epoch": 1.7867435158501441, "grad_norm": 17.478397647824718, "learning_rate": 2.104463403669264e-08, "logits/chosen": -1.9978790283203125, "logits/rejected": -1.9951884746551514, "logps/chosen": -1.0451444387435913, "logps/rejected": -1.189968466758728, "loss": 1.1806, "rewards/accuracies": 0.625, "rewards/chosen": -2.0902888774871826, "rewards/margins": 0.2896478772163391, "rewards/rejected": -2.379936933517456, "step": 2480 }, { "epoch": 1.7939481268011528, "grad_norm": 17.10039588248249, "learning_rate": 2.0837808344799028e-08, "logits/chosen": -1.9799407720565796, "logits/rejected": -1.9754537343978882, "logps/chosen": -0.9404538869857788, "logps/rejected": -1.0731335878372192, "loss": 1.1826, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8809077739715576, "rewards/margins": 0.26535919308662415, "rewards/rejected": -2.1462671756744385, "step": 2490 }, { "epoch": 1.8011527377521612, "grad_norm": 18.18583469521082, "learning_rate": 2.063127523779219e-08, "logits/chosen": -1.9833685159683228, "logits/rejected": -1.9792015552520752, "logps/chosen": -1.0076165199279785, "logps/rejected": -1.1942096948623657, "loss": 1.1139, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.015233039855957, "rewards/margins": 0.373186320066452, "rewards/rejected": -2.3884193897247314, "step": 2500 }, { "epoch": 1.8083573487031699, "grad_norm": 19.97417842705391, "learning_rate": 2.0425049234096737e-08, "logits/chosen": -1.9911282062530518, "logits/rejected": -1.9853017330169678, "logps/chosen": -1.0088120698928833, "logps/rejected": -1.1258007287979126, "loss": 1.2158, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0176241397857666, "rewards/margins": 0.2339775562286377, "rewards/rejected": -2.251601457595825, "step": 2510 }, { "epoch": 1.8155619596541785, "grad_norm": 19.435490123277745, "learning_rate": 2.0219144830549163e-08, "logits/chosen": -1.9644883871078491, "logits/rejected": -1.9634536504745483, "logps/chosen": -1.0153688192367554, "logps/rejected": -1.161481261253357, "loss": 1.1831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0307376384735107, "rewards/margins": 0.2922249436378479, "rewards/rejected": -2.322962522506714, "step": 2520 }, { "epoch": 1.8227665706051872, "grad_norm": 19.09312194813426, "learning_rate": 2.0013576501378823e-08, "logits/chosen": -1.9830167293548584, "logits/rejected": -1.9765300750732422, "logps/chosen": -1.0100529193878174, "logps/rejected": -1.144884705543518, "loss": 1.1939, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0201058387756348, "rewards/margins": 0.2696635127067566, "rewards/rejected": -2.289769411087036, "step": 2530 }, { "epoch": 1.8299711815561959, "grad_norm": 20.224925594213033, "learning_rate": 1.9808358697190426e-08, "logits/chosen": -1.9704053401947021, "logits/rejected": -1.966780662536621, "logps/chosen": -0.9303935766220093, "logps/rejected": -1.0650821924209595, "loss": 1.1986, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8607871532440186, "rewards/margins": 0.26937711238861084, "rewards/rejected": -2.130164384841919, "step": 2540 }, { "epoch": 1.8371757925072045, "grad_norm": 21.09688980967129, "learning_rate": 1.9603505843948214e-08, "logits/chosen": -2.01230525970459, "logits/rejected": -2.002260208129883, "logps/chosen": -0.948139488697052, "logps/rejected": -1.1198240518569946, "loss": 1.1395, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.896278977394104, "rewards/margins": 0.3433689475059509, "rewards/rejected": -2.2396481037139893, "step": 2550 }, { "epoch": 1.8443804034582132, "grad_norm": 20.232449119924333, "learning_rate": 1.9399032341961886e-08, "logits/chosen": -1.9766803979873657, "logits/rejected": -1.960636854171753, "logps/chosen": -0.9899090528488159, "logps/rejected": -1.0627111196517944, "loss": 1.2699, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.9798181056976318, "rewards/margins": 0.1456039845943451, "rewards/rejected": -2.125422239303589, "step": 2560 }, { "epoch": 1.8515850144092219, "grad_norm": 26.201248917968616, "learning_rate": 1.9194952564874323e-08, "logits/chosen": -2.0239641666412354, "logits/rejected": -2.0179200172424316, "logps/chosen": -1.0649644136428833, "logps/rejected": -1.2079579830169678, "loss": 1.1683, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1299288272857666, "rewards/margins": 0.2859875559806824, "rewards/rejected": -2.4159159660339355, "step": 2570 }, { "epoch": 1.8587896253602305, "grad_norm": 20.644198497609576, "learning_rate": 1.8991280858651157e-08, "logits/chosen": -1.9820836782455444, "logits/rejected": -1.9764404296875, "logps/chosen": -1.0636051893234253, "logps/rejected": -1.14960777759552, "loss": 1.2503, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.1272103786468506, "rewards/margins": 0.1720050871372223, "rewards/rejected": -2.29921555519104, "step": 2580 }, { "epoch": 1.8659942363112392, "grad_norm": 16.854142688708556, "learning_rate": 1.8788031540572327e-08, "logits/chosen": -1.9858763217926025, "logits/rejected": -1.977818489074707, "logps/chosen": -0.9995776414871216, "logps/rejected": -1.1453144550323486, "loss": 1.1718, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.9991552829742432, "rewards/margins": 0.29147323966026306, "rewards/rejected": -2.2906289100646973, "step": 2590 }, { "epoch": 1.8731988472622478, "grad_norm": 16.996398857656907, "learning_rate": 1.858521889822565e-08, "logits/chosen": -2.0046029090881348, "logits/rejected": -2.007223129272461, "logps/chosen": -0.9735875129699707, "logps/rejected": -1.0832773447036743, "loss": 1.2233, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.9471750259399414, "rewards/margins": 0.21937978267669678, "rewards/rejected": -2.1665546894073486, "step": 2600 }, { "epoch": 1.8804034582132565, "grad_norm": 16.42143731996496, "learning_rate": 1.8382857188502422e-08, "logits/chosen": -1.9885772466659546, "logits/rejected": -1.9836734533309937, "logps/chosen": -0.9854310750961304, "logps/rejected": -1.1128942966461182, "loss": 1.1824, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9708621501922607, "rewards/margins": 0.25492629408836365, "rewards/rejected": -2.2257885932922363, "step": 2610 }, { "epoch": 1.8876080691642652, "grad_norm": 22.507165910966208, "learning_rate": 1.8180960636595234e-08, "logits/chosen": -1.9683783054351807, "logits/rejected": -1.966205358505249, "logps/chosen": -1.0359306335449219, "logps/rejected": -1.1797659397125244, "loss": 1.1791, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0718612670898438, "rewards/margins": 0.28767016530036926, "rewards/rejected": -2.359531879425049, "step": 2620 }, { "epoch": 1.8948126801152738, "grad_norm": 20.53168247865903, "learning_rate": 1.7979543434998015e-08, "logits/chosen": -2.038526773452759, "logits/rejected": -2.0433452129364014, "logps/chosen": -1.1234701871871948, "logps/rejected": -1.2116920948028564, "loss": 1.2381, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.2469403743743896, "rewards/margins": 0.17644372582435608, "rewards/rejected": -2.423384189605713, "step": 2630 }, { "epoch": 1.9020172910662825, "grad_norm": 26.15291556775582, "learning_rate": 1.7778619742508345e-08, "logits/chosen": -1.9968883991241455, "logits/rejected": -1.9899314641952515, "logps/chosen": -1.0930571556091309, "logps/rejected": -1.1869739294052124, "loss": 1.2543, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1861143112182617, "rewards/margins": 0.18783339858055115, "rewards/rejected": -2.373947858810425, "step": 2640 }, { "epoch": 1.9092219020172911, "grad_norm": 23.18245485008842, "learning_rate": 1.757820368323213e-08, "logits/chosen": -1.9929345846176147, "logits/rejected": -1.9831962585449219, "logps/chosen": -1.1062356233596802, "logps/rejected": -1.2650859355926514, "loss": 1.161, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.2124712467193604, "rewards/margins": 0.3177003860473633, "rewards/rejected": -2.5301718711853027, "step": 2650 }, { "epoch": 1.9164265129682998, "grad_norm": 22.34671676050883, "learning_rate": 1.7378309345590803e-08, "logits/chosen": -2.006321907043457, "logits/rejected": -2.015603542327881, "logps/chosen": -1.0863068103790283, "logps/rejected": -1.2286168336868286, "loss": 1.1821, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1726136207580566, "rewards/margins": 0.2846204340457916, "rewards/rejected": -2.4572336673736572, "step": 2660 }, { "epoch": 1.9236311239193085, "grad_norm": 20.09934555506027, "learning_rate": 1.717895078133088e-08, "logits/chosen": -2.059466600418091, "logits/rejected": -2.0556395053863525, "logps/chosen": -1.0593311786651611, "logps/rejected": -1.2005198001861572, "loss": 1.1828, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1186623573303223, "rewards/margins": 0.28237712383270264, "rewards/rejected": -2.4010396003723145, "step": 2670 }, { "epoch": 1.9308357348703171, "grad_norm": 21.005378635461394, "learning_rate": 1.698014200453624e-08, "logits/chosen": -2.0109024047851562, "logits/rejected": -2.0184760093688965, "logps/chosen": -1.031286597251892, "logps/rejected": -1.1622233390808105, "loss": 1.1776, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.062573194503784, "rewards/margins": 0.26187336444854736, "rewards/rejected": -2.324446678161621, "step": 2680 }, { "epoch": 1.9380403458213258, "grad_norm": 24.456240122864646, "learning_rate": 1.6781896990642964e-08, "logits/chosen": -1.9447215795516968, "logits/rejected": -1.942016839981079, "logps/chosen": -1.1477292776107788, "logps/rejected": -1.2380485534667969, "loss": 1.2441, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.2954585552215576, "rewards/margins": 0.18063834309577942, "rewards/rejected": -2.4760971069335938, "step": 2690 }, { "epoch": 1.9452449567723344, "grad_norm": 24.379018095612878, "learning_rate": 1.658422967545693e-08, "logits/chosen": -2.0516204833984375, "logits/rejected": -2.0385327339172363, "logps/chosen": -1.0048881769180298, "logps/rejected": -1.1195095777511597, "loss": 1.2189, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.0097763538360596, "rewards/margins": 0.2292429655790329, "rewards/rejected": -2.2390191555023193, "step": 2700 }, { "epoch": 1.952449567723343, "grad_norm": 20.606423235238847, "learning_rate": 1.638715395417418e-08, "logits/chosen": -2.0263454914093018, "logits/rejected": -2.024291515350342, "logps/chosen": -1.069252610206604, "logps/rejected": -1.2053518295288086, "loss": 1.1848, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.138505220413208, "rewards/margins": 0.27219831943511963, "rewards/rejected": -2.410703659057617, "step": 2710 }, { "epoch": 1.9596541786743515, "grad_norm": 22.535979632799137, "learning_rate": 1.619068368040416e-08, "logits/chosen": -2.024005174636841, "logits/rejected": -2.0195693969726562, "logps/chosen": -1.0005989074707031, "logps/rejected": -1.178637981414795, "loss": 1.1296, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.0011978149414062, "rewards/margins": 0.356078177690506, "rewards/rejected": -2.35727596282959, "step": 2720 }, { "epoch": 1.9668587896253602, "grad_norm": 17.493969053743083, "learning_rate": 1.5994832665195853e-08, "logits/chosen": -1.9611831903457642, "logits/rejected": -1.9615755081176758, "logps/chosen": -1.0340797901153564, "logps/rejected": -1.146831750869751, "loss": 1.2119, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.068159580230713, "rewards/margins": 0.22550389170646667, "rewards/rejected": -2.293663501739502, "step": 2730 }, { "epoch": 1.9740634005763689, "grad_norm": 20.261024993446156, "learning_rate": 1.5799614676066906e-08, "logits/chosen": -2.068851947784424, "logits/rejected": -2.065795421600342, "logps/chosen": -0.9484384655952454, "logps/rejected": -1.0868208408355713, "loss": 1.1744, "rewards/accuracies": 0.625, "rewards/chosen": -1.8968769311904907, "rewards/margins": 0.2767646610736847, "rewards/rejected": -2.1736416816711426, "step": 2740 }, { "epoch": 1.9812680115273775, "grad_norm": 16.03971358941223, "learning_rate": 1.560504343603587e-08, "logits/chosen": -1.9830427169799805, "logits/rejected": -1.983306884765625, "logps/chosen": -1.0689435005187988, "logps/rejected": -1.224401593208313, "loss": 1.1606, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1378870010375977, "rewards/margins": 0.3109160363674164, "rewards/rejected": -2.448803186416626, "step": 2750 }, { "epoch": 1.9884726224783862, "grad_norm": 18.729955235435014, "learning_rate": 1.541113262265748e-08, "logits/chosen": -2.0666756629943848, "logits/rejected": -2.0645081996917725, "logps/chosen": -1.0288206338882446, "logps/rejected": -1.1466666460037231, "loss": 1.2071, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0576412677764893, "rewards/margins": 0.23569221794605255, "rewards/rejected": -2.2933332920074463, "step": 2760 }, { "epoch": 1.9956772334293948, "grad_norm": 25.946584240501473, "learning_rate": 1.5217895867061227e-08, "logits/chosen": -2.00740385055542, "logits/rejected": -2.0015203952789307, "logps/chosen": -1.0842344760894775, "logps/rejected": -1.1836035251617432, "loss": 1.2464, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.168468952178955, "rewards/margins": 0.19873787462711334, "rewards/rejected": -2.3672070503234863, "step": 2770 }, { "epoch": 2.0028818443804033, "grad_norm": 22.724338628633177, "learning_rate": 1.5025346752993098e-08, "logits/chosen": -1.9985382556915283, "logits/rejected": -2.000462532043457, "logps/chosen": -1.071683406829834, "logps/rejected": -1.1988445520401, "loss": 1.1999, "rewards/accuracies": 0.5625, "rewards/chosen": -2.143366813659668, "rewards/margins": 0.25432220101356506, "rewards/rejected": -2.3976891040802, "step": 2780 }, { "epoch": 2.010086455331412, "grad_norm": 23.240965924702092, "learning_rate": 1.4833498815860756e-08, "logits/chosen": -2.053358554840088, "logits/rejected": -2.055558443069458, "logps/chosen": -0.9996848106384277, "logps/rejected": -1.1848304271697998, "loss": 1.149, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9993696212768555, "rewards/margins": 0.3702912926673889, "rewards/rejected": -2.3696608543395996, "step": 2790 }, { "epoch": 2.0172910662824206, "grad_norm": 18.225560415881105, "learning_rate": 1.4642365541781993e-08, "logits/chosen": -1.9646400213241577, "logits/rejected": -1.9561887979507446, "logps/chosen": -1.0267730951309204, "logps/rejected": -1.1928188800811768, "loss": 1.1511, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.053546190261841, "rewards/margins": 0.3320915699005127, "rewards/rejected": -2.3856377601623535, "step": 2800 }, { "epoch": 2.0244956772334293, "grad_norm": 17.931282922261985, "learning_rate": 1.4451960366636745e-08, "logits/chosen": -2.026698589324951, "logits/rejected": -2.0378384590148926, "logps/chosen": -1.0406183004379272, "logps/rejected": -1.1752769947052002, "loss": 1.1819, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0812366008758545, "rewards/margins": 0.26931747794151306, "rewards/rejected": -2.3505539894104004, "step": 2810 }, { "epoch": 2.031700288184438, "grad_norm": 19.290431128690432, "learning_rate": 1.4262296675122592e-08, "logits/chosen": -2.0173158645629883, "logits/rejected": -2.0136220455169678, "logps/chosen": -1.030659556388855, "logps/rejected": -1.1910489797592163, "loss": 1.1523, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.06131911277771, "rewards/margins": 0.32077842950820923, "rewards/rejected": -2.3820979595184326, "step": 2820 }, { "epoch": 2.0389048991354466, "grad_norm": 17.902852888321604, "learning_rate": 1.407338779981389e-08, "logits/chosen": -1.9934546947479248, "logits/rejected": -1.9913368225097656, "logps/chosen": -0.9143481254577637, "logps/rejected": -1.0957781076431274, "loss": 1.1116, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8286962509155273, "rewards/margins": 0.36285993456840515, "rewards/rejected": -2.191556215286255, "step": 2830 }, { "epoch": 2.0461095100864553, "grad_norm": 21.306516095869544, "learning_rate": 1.3885247020224534e-08, "logits/chosen": -2.0094637870788574, "logits/rejected": -2.004822015762329, "logps/chosen": -1.0016566514968872, "logps/rejected": -1.1311957836151123, "loss": 1.1913, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0033133029937744, "rewards/margins": 0.25907841324806213, "rewards/rejected": -2.2623915672302246, "step": 2840 }, { "epoch": 2.053314121037464, "grad_norm": 17.105370578566056, "learning_rate": 1.369788756187445e-08, "logits/chosen": -2.008868455886841, "logits/rejected": -2.0054876804351807, "logps/chosen": -1.0270612239837646, "logps/rejected": -1.1226084232330322, "loss": 1.2343, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.0541224479675293, "rewards/margins": 0.19109439849853516, "rewards/rejected": -2.2452168464660645, "step": 2850 }, { "epoch": 2.0605187319884726, "grad_norm": 18.695157813530198, "learning_rate": 1.3511322595359925e-08, "logits/chosen": -2.033163547515869, "logits/rejected": -2.0247857570648193, "logps/chosen": -0.9382683634757996, "logps/rejected": -1.1057795286178589, "loss": 1.14, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8765367269515991, "rewards/margins": 0.3350227475166321, "rewards/rejected": -2.2115590572357178, "step": 2860 }, { "epoch": 2.0677233429394812, "grad_norm": 17.33211536858926, "learning_rate": 1.3325565235427716e-08, "logits/chosen": -2.028552770614624, "logits/rejected": -2.0268807411193848, "logps/chosen": -0.9831819534301758, "logps/rejected": -1.1274645328521729, "loss": 1.177, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9663639068603516, "rewards/margins": 0.288565069437027, "rewards/rejected": -2.2549290657043457, "step": 2870 }, { "epoch": 2.07492795389049, "grad_norm": 17.173051243263835, "learning_rate": 1.3140628540053218e-08, "logits/chosen": -1.9946448802947998, "logits/rejected": -1.9966709613800049, "logps/chosen": -0.9759955406188965, "logps/rejected": -1.1114940643310547, "loss": 1.1833, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.951991081237793, "rewards/margins": 0.2709970772266388, "rewards/rejected": -2.2229881286621094, "step": 2880 }, { "epoch": 2.0821325648414986, "grad_norm": 19.25779046293631, "learning_rate": 1.2956525509522451e-08, "logits/chosen": -1.9791204929351807, "logits/rejected": -1.97879159450531, "logps/chosen": -1.1120542287826538, "logps/rejected": -1.2156860828399658, "loss": 1.234, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.2241084575653076, "rewards/margins": 0.20726370811462402, "rewards/rejected": -2.4313721656799316, "step": 2890 }, { "epoch": 2.089337175792507, "grad_norm": 19.696970893217582, "learning_rate": 1.2773269085518267e-08, "logits/chosen": -2.011164426803589, "logits/rejected": -2.0127670764923096, "logps/chosen": -1.0766938924789429, "logps/rejected": -1.2073553800582886, "loss": 1.1837, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1533877849578857, "rewards/margins": 0.2613227963447571, "rewards/rejected": -2.414710760116577, "step": 2900 }, { "epoch": 2.096541786743516, "grad_norm": 20.176935063380885, "learning_rate": 1.2590872150210574e-08, "logits/chosen": -2.0675017833709717, "logits/rejected": -2.0605273246765137, "logps/chosen": -1.057830810546875, "logps/rejected": -1.1687963008880615, "loss": 1.2247, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.11566162109375, "rewards/margins": 0.22193074226379395, "rewards/rejected": -2.337592601776123, "step": 2910 }, { "epoch": 2.1037463976945245, "grad_norm": 20.572020361191345, "learning_rate": 1.2409347525350775e-08, "logits/chosen": -2.0273613929748535, "logits/rejected": -2.0172836780548096, "logps/chosen": -1.1093732118606567, "logps/rejected": -1.2572507858276367, "loss": 1.166, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2187464237213135, "rewards/margins": 0.2957550585269928, "rewards/rejected": -2.5145015716552734, "step": 2920 }, { "epoch": 2.110951008645533, "grad_norm": 22.752346590850024, "learning_rate": 1.2228707971370421e-08, "logits/chosen": -2.018433094024658, "logits/rejected": -2.01145601272583, "logps/chosen": -0.9928643107414246, "logps/rejected": -1.1063158512115479, "loss": 1.2256, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9857286214828491, "rewards/margins": 0.22690317034721375, "rewards/rejected": -2.2126317024230957, "step": 2930 }, { "epoch": 2.118155619596542, "grad_norm": 21.233577131341413, "learning_rate": 1.2048966186484282e-08, "logits/chosen": -2.015031337738037, "logits/rejected": -1.9982612133026123, "logps/chosen": -1.117865800857544, "logps/rejected": -1.232062578201294, "loss": 1.2127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.235731601715088, "rewards/margins": 0.2283933460712433, "rewards/rejected": -2.464125156402588, "step": 2940 }, { "epoch": 2.1253602305475505, "grad_norm": 28.100094599633593, "learning_rate": 1.187013480579762e-08, "logits/chosen": -2.010659694671631, "logits/rejected": -2.0132699012756348, "logps/chosen": -1.0425684452056885, "logps/rejected": -1.1762912273406982, "loss": 1.201, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.085136890411377, "rewards/margins": 0.26744550466537476, "rewards/rejected": -2.3525824546813965, "step": 2950 }, { "epoch": 2.132564841498559, "grad_norm": 40.79478390193748, "learning_rate": 1.1692226400418073e-08, "logits/chosen": -1.9510002136230469, "logits/rejected": -1.9495048522949219, "logps/chosen": -1.0822184085845947, "logps/rejected": -1.212081789970398, "loss": 1.2188, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1644368171691895, "rewards/margins": 0.2597268521785736, "rewards/rejected": -2.424163579940796, "step": 2960 }, { "epoch": 2.139769452449568, "grad_norm": 16.326675723252357, "learning_rate": 1.1515253476571923e-08, "logits/chosen": -1.9815658330917358, "logits/rejected": -1.975783109664917, "logps/chosen": -1.0105996131896973, "logps/rejected": -1.192287564277649, "loss": 1.1208, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.0211992263793945, "rewards/margins": 0.36337584257125854, "rewards/rejected": -2.384575128555298, "step": 2970 }, { "epoch": 2.1469740634005765, "grad_norm": 19.921977505309442, "learning_rate": 1.133922847472496e-08, "logits/chosen": -1.9953126907348633, "logits/rejected": -1.9960988759994507, "logps/chosen": -1.110705018043518, "logps/rejected": -1.209160566329956, "loss": 1.2518, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.221410036087036, "rewards/margins": 0.19691102206707, "rewards/rejected": -2.418321132659912, "step": 2980 }, { "epoch": 2.154178674351585, "grad_norm": 22.89240067306987, "learning_rate": 1.1164163768707952e-08, "logits/chosen": -2.003279209136963, "logits/rejected": -1.9980405569076538, "logps/chosen": -1.0043981075286865, "logps/rejected": -1.1427768468856812, "loss": 1.1843, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.008796215057373, "rewards/margins": 0.27675721049308777, "rewards/rejected": -2.2855536937713623, "step": 2990 }, { "epoch": 2.161383285302594, "grad_norm": 17.89862895130365, "learning_rate": 1.0990071664846861e-08, "logits/chosen": -1.9780519008636475, "logits/rejected": -1.9771487712860107, "logps/chosen": -1.0197398662567139, "logps/rejected": -1.1996923685073853, "loss": 1.1581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0394797325134277, "rewards/margins": 0.35990482568740845, "rewards/rejected": -2.3993847370147705, "step": 3000 }, { "epoch": 2.1685878962536025, "grad_norm": 18.90659171579793, "learning_rate": 1.0816964401097739e-08, "logits/chosen": -1.964535117149353, "logits/rejected": -1.9613316059112549, "logps/chosen": -0.9563964009284973, "logps/rejected": -1.0800330638885498, "loss": 1.2052, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.9127928018569946, "rewards/margins": 0.24727335572242737, "rewards/rejected": -2.1600661277770996, "step": 3010 }, { "epoch": 2.175792507204611, "grad_norm": 19.878484764331017, "learning_rate": 1.0644854146186406e-08, "logits/chosen": -2.0236928462982178, "logits/rejected": -2.0173866748809814, "logps/chosen": -1.0241403579711914, "logps/rejected": -1.1835166215896606, "loss": 1.1624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.048280715942383, "rewards/margins": 0.318752646446228, "rewards/rejected": -2.3670332431793213, "step": 3020 }, { "epoch": 2.18299711815562, "grad_norm": 19.088076538610206, "learning_rate": 1.0473752998753114e-08, "logits/chosen": -2.004102945327759, "logits/rejected": -1.9954335689544678, "logps/chosen": -1.0195882320404053, "logps/rejected": -1.1798489093780518, "loss": 1.1535, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0391764640808105, "rewards/margins": 0.3205214738845825, "rewards/rejected": -2.3596978187561035, "step": 3030 }, { "epoch": 2.1902017291066285, "grad_norm": 19.31361091042759, "learning_rate": 1.030367298650201e-08, "logits/chosen": -2.023881196975708, "logits/rejected": -2.0238354206085205, "logps/chosen": -1.0392138957977295, "logps/rejected": -1.19191312789917, "loss": 1.1578, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.078427791595459, "rewards/margins": 0.3053986132144928, "rewards/rejected": -2.38382625579834, "step": 3040 }, { "epoch": 2.1974063400576367, "grad_norm": 22.093759972479646, "learning_rate": 1.0134626065355675e-08, "logits/chosen": -2.0746548175811768, "logits/rejected": -2.0715177059173584, "logps/chosen": -1.0234037637710571, "logps/rejected": -1.1665077209472656, "loss": 1.1883, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0468075275421143, "rewards/margins": 0.2862081527709961, "rewards/rejected": -2.3330154418945312, "step": 3050 }, { "epoch": 2.2046109510086453, "grad_norm": 19.64286406855496, "learning_rate": 9.966624118614611e-09, "logits/chosen": -2.006706476211548, "logits/rejected": -2.0016489028930664, "logps/chosen": -1.0631506443023682, "logps/rejected": -1.2085468769073486, "loss": 1.1876, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.1263012886047363, "rewards/margins": 0.29079198837280273, "rewards/rejected": -2.4170937538146973, "step": 3060 }, { "epoch": 2.211815561959654, "grad_norm": 14.856155733229528, "learning_rate": 9.799678956121976e-09, "logits/chosen": -1.9717843532562256, "logits/rejected": -1.9674240350723267, "logps/chosen": -1.0307656526565552, "logps/rejected": -1.1394712924957275, "loss": 1.2006, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0615313053131104, "rewards/margins": 0.21741144359111786, "rewards/rejected": -2.278942584991455, "step": 3070 }, { "epoch": 2.2190201729106627, "grad_norm": 23.633018389781732, "learning_rate": 9.633802313433314e-09, "logits/chosen": -1.9454095363616943, "logits/rejected": -1.9511306285858154, "logps/chosen": -1.0190519094467163, "logps/rejected": -1.1248835325241089, "loss": 1.2055, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0381038188934326, "rewards/margins": 0.21166305243968964, "rewards/rejected": -2.2497670650482178, "step": 3080 }, { "epoch": 2.2262247838616713, "grad_norm": 20.794315619142072, "learning_rate": 9.469005850991705e-09, "logits/chosen": -2.0088305473327637, "logits/rejected": -2.003154754638672, "logps/chosen": -1.0144597291946411, "logps/rejected": -1.1316652297973633, "loss": 1.2343, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.0289194583892822, "rewards/margins": 0.23441116511821747, "rewards/rejected": -2.2633304595947266, "step": 3090 }, { "epoch": 2.23342939481268, "grad_norm": 18.67038535819961, "learning_rate": 9.305301153307949e-09, "logits/chosen": -2.0057482719421387, "logits/rejected": -2.0133614540100098, "logps/chosen": -0.9462668299674988, "logps/rejected": -1.1108109951019287, "loss": 1.1573, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.8925336599349976, "rewards/margins": 0.3290883004665375, "rewards/rejected": -2.2216219902038574, "step": 3100 }, { "epoch": 2.2406340057636887, "grad_norm": 18.08187045245269, "learning_rate": 9.142699728146336e-09, "logits/chosen": -1.9763425588607788, "logits/rejected": -1.9695403575897217, "logps/chosen": -1.0319360494613647, "logps/rejected": -1.1644192934036255, "loss": 1.2014, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0638720989227295, "rewards/margins": 0.26496636867523193, "rewards/rejected": -2.328838586807251, "step": 3110 }, { "epoch": 2.2478386167146973, "grad_norm": 16.765059853307356, "learning_rate": 8.981213005715627e-09, "logits/chosen": -2.002300500869751, "logits/rejected": -2.005335569381714, "logps/chosen": -0.9925374984741211, "logps/rejected": -1.1654067039489746, "loss": 1.1484, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9850749969482422, "rewards/margins": 0.34573858976364136, "rewards/rejected": -2.330813407897949, "step": 3120 }, { "epoch": 2.255043227665706, "grad_norm": 21.826707648017194, "learning_rate": 8.820852337865611e-09, "logits/chosen": -2.0354738235473633, "logits/rejected": -2.031705617904663, "logps/chosen": -0.9956668019294739, "logps/rejected": -1.1440733671188354, "loss": 1.1717, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9913336038589478, "rewards/margins": 0.296813428401947, "rewards/rejected": -2.288146734237671, "step": 3130 }, { "epoch": 2.2622478386167146, "grad_norm": 17.005533531498173, "learning_rate": 8.661628997289044e-09, "logits/chosen": -1.9752384424209595, "logits/rejected": -1.97113835811615, "logps/chosen": -1.0153406858444214, "logps/rejected": -1.1710517406463623, "loss": 1.1687, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0306813716888428, "rewards/margins": 0.3114221394062042, "rewards/rejected": -2.3421034812927246, "step": 3140 }, { "epoch": 2.2694524495677233, "grad_norm": 16.411029038337308, "learning_rate": 8.503554176729341e-09, "logits/chosen": -1.976362943649292, "logits/rejected": -1.974590539932251, "logps/chosen": -1.026755928993225, "logps/rejected": -1.1865880489349365, "loss": 1.1696, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.05351185798645, "rewards/margins": 0.3196641802787781, "rewards/rejected": -2.373176097869873, "step": 3150 }, { "epoch": 2.276657060518732, "grad_norm": 24.940392400474, "learning_rate": 8.346638988193636e-09, "logits/chosen": -2.0030248165130615, "logits/rejected": -1.9979517459869385, "logps/chosen": -0.9251815676689148, "logps/rejected": -1.0761079788208008, "loss": 1.1768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8503631353378296, "rewards/margins": 0.30185258388519287, "rewards/rejected": -2.1522159576416016, "step": 3160 }, { "epoch": 2.2838616714697406, "grad_norm": 23.2363909978899, "learning_rate": 8.19089446217176e-09, "logits/chosen": -1.9777719974517822, "logits/rejected": -1.9676783084869385, "logps/chosen": -1.0022261142730713, "logps/rejected": -1.1919556856155396, "loss": 1.1202, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0044522285461426, "rewards/margins": 0.3794591426849365, "rewards/rejected": -2.383911371231079, "step": 3170 }, { "epoch": 2.2910662824207493, "grad_norm": 17.14618373707155, "learning_rate": 8.036331546860777e-09, "logits/chosen": -1.982940435409546, "logits/rejected": -1.982465386390686, "logps/chosen": -0.9494163393974304, "logps/rejected": -1.039945125579834, "loss": 1.2442, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.8988326787948608, "rewards/margins": 0.18105748295783997, "rewards/rejected": -2.079890251159668, "step": 3180 }, { "epoch": 2.298270893371758, "grad_norm": 23.5198522631464, "learning_rate": 7.882961107395416e-09, "logits/chosen": -1.9984643459320068, "logits/rejected": -1.9926011562347412, "logps/chosen": -1.130748987197876, "logps/rejected": -1.177819848060608, "loss": 1.315, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.261497974395752, "rewards/margins": 0.09414196014404297, "rewards/rejected": -2.355639696121216, "step": 3190 }, { "epoch": 2.3054755043227666, "grad_norm": 25.590315233089598, "learning_rate": 7.73079392508428e-09, "logits/chosen": -1.9712879657745361, "logits/rejected": -1.9705880880355835, "logps/chosen": -1.0907241106033325, "logps/rejected": -1.2799861431121826, "loss": 1.1523, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.181448221206665, "rewards/margins": 0.37852445244789124, "rewards/rejected": -2.5599722862243652, "step": 3200 }, { "epoch": 2.3126801152737753, "grad_norm": 21.478168268234054, "learning_rate": 7.579840696651938e-09, "logits/chosen": -1.9963871240615845, "logits/rejected": -1.9932276010513306, "logps/chosen": -1.0474956035614014, "logps/rejected": -1.172515869140625, "loss": 1.2079, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0949912071228027, "rewards/margins": 0.2500404119491577, "rewards/rejected": -2.34503173828125, "step": 3210 }, { "epoch": 2.319884726224784, "grad_norm": 20.9972814315902, "learning_rate": 7.43011203348704e-09, "logits/chosen": -1.9149713516235352, "logits/rejected": -1.9116861820220947, "logps/chosen": -1.0514217615127563, "logps/rejected": -1.1269280910491943, "loss": 1.2686, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.1028435230255127, "rewards/margins": 0.1510128229856491, "rewards/rejected": -2.2538561820983887, "step": 3220 }, { "epoch": 2.3270893371757926, "grad_norm": 18.760848272652197, "learning_rate": 7.281618460896344e-09, "logits/chosen": -1.995486855506897, "logits/rejected": -1.9929373264312744, "logps/chosen": -0.9654563665390015, "logps/rejected": -1.1074378490447998, "loss": 1.1729, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.930912733078003, "rewards/margins": 0.2839628756046295, "rewards/rejected": -2.2148756980895996, "step": 3230 }, { "epoch": 2.3342939481268012, "grad_norm": 20.42845258559301, "learning_rate": 7.134370417364849e-09, "logits/chosen": -1.9683917760849, "logits/rejected": -1.9679603576660156, "logps/chosen": -1.001461386680603, "logps/rejected": -1.1403329372406006, "loss": 1.2027, "rewards/accuracies": 0.5625, "rewards/chosen": -2.002922773361206, "rewards/margins": 0.2777433395385742, "rewards/rejected": -2.280665874481201, "step": 3240 }, { "epoch": 2.34149855907781, "grad_norm": 23.164358986342677, "learning_rate": 6.988378253821981e-09, "logits/chosen": -1.9697679281234741, "logits/rejected": -1.9687258005142212, "logps/chosen": -1.0258748531341553, "logps/rejected": -1.143920660018921, "loss": 1.209, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0517497062683105, "rewards/margins": 0.23609168827533722, "rewards/rejected": -2.287841320037842, "step": 3250 }, { "epoch": 2.3487031700288186, "grad_norm": 20.30991552682094, "learning_rate": 6.8436522329140186e-09, "logits/chosen": -1.9788017272949219, "logits/rejected": -1.985569715499878, "logps/chosen": -1.0339092016220093, "logps/rejected": -1.1592271327972412, "loss": 1.2106, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.0678184032440186, "rewards/margins": 0.2506362795829773, "rewards/rejected": -2.3184542655944824, "step": 3260 }, { "epoch": 2.3559077809798272, "grad_norm": 21.894995604840652, "learning_rate": 6.700202528282603e-09, "logits/chosen": -1.978734016418457, "logits/rejected": -1.969061255455017, "logps/chosen": -1.0286333560943604, "logps/rejected": -1.1439108848571777, "loss": 1.2158, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0572667121887207, "rewards/margins": 0.23055517673492432, "rewards/rejected": -2.2878217697143555, "step": 3270 }, { "epoch": 2.363112391930836, "grad_norm": 21.434677454334327, "learning_rate": 6.558039223849668e-09, "logits/chosen": -2.0271782875061035, "logits/rejected": -2.0174622535705566, "logps/chosen": -1.0365641117095947, "logps/rejected": -1.2443287372589111, "loss": 1.1155, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0731282234191895, "rewards/margins": 0.41552942991256714, "rewards/rejected": -2.4886574745178223, "step": 3280 }, { "epoch": 2.3703170028818445, "grad_norm": 22.134638764373964, "learning_rate": 6.417172313108471e-09, "logits/chosen": -1.95876944065094, "logits/rejected": -1.9533073902130127, "logps/chosen": -0.9859912991523743, "logps/rejected": -1.1151400804519653, "loss": 1.1979, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9719825983047485, "rewards/margins": 0.2582974135875702, "rewards/rejected": -2.2302801609039307, "step": 3290 }, { "epoch": 2.377521613832853, "grad_norm": 21.658570611710445, "learning_rate": 6.277611698421179e-09, "logits/chosen": -2.017606735229492, "logits/rejected": -2.0095458030700684, "logps/chosen": -0.9050453305244446, "logps/rejected": -1.098288893699646, "loss": 1.1252, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.8100906610488892, "rewards/margins": 0.3864876627922058, "rewards/rejected": -2.196577787399292, "step": 3300 }, { "epoch": 2.3847262247838614, "grad_norm": 22.8559892529762, "learning_rate": 6.139367190322714e-09, "logits/chosen": -2.0034892559051514, "logits/rejected": -2.0032081604003906, "logps/chosen": -1.0592529773712158, "logps/rejected": -1.2185790538787842, "loss": 1.1609, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1185059547424316, "rewards/margins": 0.3186524510383606, "rewards/rejected": -2.4371581077575684, "step": 3310 }, { "epoch": 2.39193083573487, "grad_norm": 17.198608533100995, "learning_rate": 6.002448506831171e-09, "logits/chosen": -2.0061838626861572, "logits/rejected": -2.0014090538024902, "logps/chosen": -0.9808699488639832, "logps/rejected": -1.1246802806854248, "loss": 1.1731, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9617398977279663, "rewards/margins": 0.2876203954219818, "rewards/rejected": -2.2493605613708496, "step": 3320 }, { "epoch": 2.3991354466858787, "grad_norm": 18.199025209277288, "learning_rate": 5.866865272764607e-09, "logits/chosen": -2.023648262023926, "logits/rejected": -2.0234923362731934, "logps/chosen": -1.0167878866195679, "logps/rejected": -1.1612762212753296, "loss": 1.1772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0335757732391357, "rewards/margins": 0.28897663950920105, "rewards/rejected": -2.322552442550659, "step": 3330 }, { "epoch": 2.4063400576368874, "grad_norm": 23.302252487813124, "learning_rate": 5.7326270190645595e-09, "logits/chosen": -1.896691918373108, "logits/rejected": -1.8979320526123047, "logps/chosen": -1.0594362020492554, "logps/rejected": -1.1698405742645264, "loss": 1.2168, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1188724040985107, "rewards/margins": 0.2208089381456375, "rewards/rejected": -2.3396811485290527, "step": 3340 }, { "epoch": 2.413544668587896, "grad_norm": 18.446092862588884, "learning_rate": 5.599743182125938e-09, "logits/chosen": -2.043023109436035, "logits/rejected": -2.043013095855713, "logps/chosen": -1.0480068922042847, "logps/rejected": -1.1850215196609497, "loss": 1.179, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0960137844085693, "rewards/margins": 0.2740294933319092, "rewards/rejected": -2.3700430393218994, "step": 3350 }, { "epoch": 2.4207492795389047, "grad_norm": 20.220307143059344, "learning_rate": 5.46822310313379e-09, "logits/chosen": -2.0473732948303223, "logits/rejected": -2.0569522380828857, "logps/chosen": -1.09086012840271, "logps/rejected": -1.1955832242965698, "loss": 1.2353, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.18172025680542, "rewards/margins": 0.2094462662935257, "rewards/rejected": -2.3911664485931396, "step": 3360 }, { "epoch": 2.4279538904899134, "grad_norm": 20.780990431383444, "learning_rate": 5.33807602740658e-09, "logits/chosen": -2.022789478302002, "logits/rejected": -2.0159573554992676, "logps/chosen": -0.9560559988021851, "logps/rejected": -1.160628318786621, "loss": 1.111, "rewards/accuracies": 0.65625, "rewards/chosen": -1.9121119976043701, "rewards/margins": 0.40914446115493774, "rewards/rejected": -2.321256637573242, "step": 3370 }, { "epoch": 2.435158501440922, "grad_norm": 21.245348975655457, "learning_rate": 5.209311103746334e-09, "logits/chosen": -2.0008084774017334, "logits/rejected": -2.0011303424835205, "logps/chosen": -1.0523884296417236, "logps/rejected": -1.224974274635315, "loss": 1.1587, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1047768592834473, "rewards/margins": 0.3451715409755707, "rewards/rejected": -2.44994854927063, "step": 3380 }, { "epoch": 2.4423631123919307, "grad_norm": 24.352598699910715, "learning_rate": 5.081937383795484e-09, "logits/chosen": -1.972608208656311, "logits/rejected": -1.9717302322387695, "logps/chosen": -0.9721845388412476, "logps/rejected": -1.137112021446228, "loss": 1.1484, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.9443690776824951, "rewards/margins": 0.32985490560531616, "rewards/rejected": -2.274224042892456, "step": 3390 }, { "epoch": 2.4495677233429394, "grad_norm": 18.574622449743107, "learning_rate": 4.955963821400599e-09, "logits/chosen": -2.0249781608581543, "logits/rejected": -2.019134759902954, "logps/chosen": -1.029394507408142, "logps/rejected": -1.167999505996704, "loss": 1.1922, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.058789014816284, "rewards/margins": 0.2772100567817688, "rewards/rejected": -2.335999011993408, "step": 3400 }, { "epoch": 2.456772334293948, "grad_norm": 15.429482416255146, "learning_rate": 4.831399271982928e-09, "logits/chosen": -1.9512850046157837, "logits/rejected": -1.9432300329208374, "logps/chosen": -1.0431114435195923, "logps/rejected": -1.1738238334655762, "loss": 1.2095, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0862228870391846, "rewards/margins": 0.26142507791519165, "rewards/rejected": -2.3476476669311523, "step": 3410 }, { "epoch": 2.4639769452449567, "grad_norm": 25.103110732614255, "learning_rate": 4.708252491915951e-09, "logits/chosen": -2.030299663543701, "logits/rejected": -2.024203300476074, "logps/chosen": -1.0459200143814087, "logps/rejected": -1.1930789947509766, "loss": 1.197, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0918400287628174, "rewards/margins": 0.2943178713321686, "rewards/rejected": -2.386157989501953, "step": 3420 }, { "epoch": 2.4711815561959654, "grad_norm": 25.280433628761124, "learning_rate": 4.58653213790981e-09, "logits/chosen": -2.006598472595215, "logits/rejected": -1.9983857870101929, "logps/chosen": -1.025721549987793, "logps/rejected": -1.1746145486831665, "loss": 1.1792, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.051443099975586, "rewards/margins": 0.29778599739074707, "rewards/rejected": -2.349229097366333, "step": 3430 }, { "epoch": 2.478386167146974, "grad_norm": 18.242083284353217, "learning_rate": 4.466246766402773e-09, "logits/chosen": -1.9907543659210205, "logits/rejected": -1.9845359325408936, "logps/chosen": -1.0393613576889038, "logps/rejected": -1.1935051679611206, "loss": 1.1827, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0787227153778076, "rewards/margins": 0.30828770995140076, "rewards/rejected": -2.387010335922241, "step": 3440 }, { "epoch": 2.4855907780979827, "grad_norm": 22.018503196573274, "learning_rate": 4.347404832959775e-09, "logits/chosen": -2.0336432456970215, "logits/rejected": -2.0338807106018066, "logps/chosen": -1.032832384109497, "logps/rejected": -1.1933454275131226, "loss": 1.1612, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.065664768218994, "rewards/margins": 0.32102587819099426, "rewards/rejected": -2.386690855026245, "step": 3450 }, { "epoch": 2.4927953890489913, "grad_norm": 33.037428693429234, "learning_rate": 4.230014691678016e-09, "logits/chosen": -1.9883922338485718, "logits/rejected": -1.9890626668930054, "logps/chosen": -1.0595102310180664, "logps/rejected": -1.126479148864746, "loss": 1.272, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.119020462036133, "rewards/margins": 0.1339379847049713, "rewards/rejected": -2.252958297729492, "step": 3460 }, { "epoch": 2.5, "grad_norm": 17.82843912451702, "learning_rate": 4.114084594599707e-09, "logits/chosen": -1.9903564453125, "logits/rejected": -1.9900470972061157, "logps/chosen": -1.0114375352859497, "logps/rejected": -1.229552984237671, "loss": 1.1011, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0228750705718994, "rewards/margins": 0.4362305998802185, "rewards/rejected": -2.459105968475342, "step": 3470 }, { "epoch": 2.5072046109510087, "grad_norm": 22.102059612075095, "learning_rate": 3.9996226911319546e-09, "logits/chosen": -1.993326187133789, "logits/rejected": -1.981066107749939, "logps/chosen": -1.0159366130828857, "logps/rejected": -1.1457350254058838, "loss": 1.1898, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0318732261657715, "rewards/margins": 0.25959664583206177, "rewards/rejected": -2.2914700508117676, "step": 3480 }, { "epoch": 2.5144092219020173, "grad_norm": 17.973603590541654, "learning_rate": 3.886637027473949e-09, "logits/chosen": -2.0013790130615234, "logits/rejected": -2.0035085678100586, "logps/chosen": -1.076293706893921, "logps/rejected": -1.2393258810043335, "loss": 1.1562, "rewards/accuracies": 0.625, "rewards/chosen": -2.152587413787842, "rewards/margins": 0.3260645270347595, "rewards/rejected": -2.478651762008667, "step": 3490 }, { "epoch": 2.521613832853026, "grad_norm": 19.37957776631117, "learning_rate": 3.775135546051295e-09, "logits/chosen": -1.9389715194702148, "logits/rejected": -1.9399712085723877, "logps/chosen": -1.0256609916687012, "logps/rejected": -1.1517935991287231, "loss": 1.1987, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0513219833374023, "rewards/margins": 0.252265065908432, "rewards/rejected": -2.3035871982574463, "step": 3500 }, { "epoch": 2.5288184438040346, "grad_norm": 23.33567215234884, "learning_rate": 3.665126084957723e-09, "logits/chosen": -1.984487533569336, "logits/rejected": -1.9886258840560913, "logps/chosen": -1.1339917182922363, "logps/rejected": -1.233039140701294, "loss": 1.2585, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.2679834365844727, "rewards/margins": 0.19809459149837494, "rewards/rejected": -2.466078281402588, "step": 3510 }, { "epoch": 2.5360230547550433, "grad_norm": 19.609640038869685, "learning_rate": 3.556616377404101e-09, "logits/chosen": -2.00850248336792, "logits/rejected": -2.006412982940674, "logps/chosen": -1.07861328125, "logps/rejected": -1.236485242843628, "loss": 1.154, "rewards/accuracies": 0.625, "rewards/chosen": -2.1572265625, "rewards/margins": 0.31574416160583496, "rewards/rejected": -2.472970485687256, "step": 3520 }, { "epoch": 2.543227665706052, "grad_norm": 19.767394228725337, "learning_rate": 3.4496140511748125e-09, "logits/chosen": -1.9994624853134155, "logits/rejected": -1.9942439794540405, "logps/chosen": -1.0551049709320068, "logps/rejected": -1.1994330883026123, "loss": 1.1752, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1102099418640137, "rewards/margins": 0.28865596652030945, "rewards/rejected": -2.3988661766052246, "step": 3530 }, { "epoch": 2.5504322766570606, "grad_norm": 31.22852578343729, "learning_rate": 3.3441266280915427e-09, "logits/chosen": -1.9827390909194946, "logits/rejected": -1.983473539352417, "logps/chosen": -1.093752145767212, "logps/rejected": -1.20872163772583, "loss": 1.2131, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.187504291534424, "rewards/margins": 0.229939267039299, "rewards/rejected": -2.41744327545166, "step": 3540 }, { "epoch": 2.5576368876080693, "grad_norm": 23.706412362537016, "learning_rate": 3.2401615234845693e-09, "logits/chosen": -2.0068211555480957, "logits/rejected": -2.0008292198181152, "logps/chosen": -1.0923867225646973, "logps/rejected": -1.2357128858566284, "loss": 1.1896, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1847734451293945, "rewards/margins": 0.28665226697921753, "rewards/rejected": -2.471425771713257, "step": 3550 }, { "epoch": 2.564841498559078, "grad_norm": 16.13728529223842, "learning_rate": 3.1377260456714375e-09, "logits/chosen": -1.901414155960083, "logits/rejected": -1.8929615020751953, "logps/chosen": -1.0596764087677002, "logps/rejected": -1.2023025751113892, "loss": 1.1686, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1193528175354004, "rewards/margins": 0.28525251150131226, "rewards/rejected": -2.4046051502227783, "step": 3560 }, { "epoch": 2.5720461095100866, "grad_norm": 18.028717215705484, "learning_rate": 3.0368273954432698e-09, "logits/chosen": -2.0307531356811523, "logits/rejected": -2.022324800491333, "logps/chosen": -1.0493528842926025, "logps/rejected": -1.1534329652786255, "loss": 1.2252, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.098705768585205, "rewards/margins": 0.20816004276275635, "rewards/rejected": -2.306865930557251, "step": 3570 }, { "epoch": 2.5792507204610953, "grad_norm": 17.367490930434325, "learning_rate": 2.937472665558541e-09, "logits/chosen": -2.019484281539917, "logits/rejected": -2.020643711090088, "logps/chosen": -1.036195993423462, "logps/rejected": -1.147991418838501, "loss": 1.2267, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.072391986846924, "rewards/margins": 0.22359101474285126, "rewards/rejected": -2.295982837677002, "step": 3580 }, { "epoch": 2.586455331412104, "grad_norm": 21.805325598847563, "learning_rate": 2.8396688402445053e-09, "logits/chosen": -2.0637335777282715, "logits/rejected": -2.0563552379608154, "logps/chosen": -1.0100147724151611, "logps/rejected": -1.2180942296981812, "loss": 1.1063, "rewards/accuracies": 0.65625, "rewards/chosen": -2.0200295448303223, "rewards/margins": 0.4161592125892639, "rewards/rejected": -2.4361884593963623, "step": 3590 }, { "epoch": 2.5936599423631126, "grad_norm": 24.439180591540023, "learning_rate": 2.7434227947062324e-09, "logits/chosen": -2.0062127113342285, "logits/rejected": -1.9998852014541626, "logps/chosen": -1.1317923069000244, "logps/rejected": -1.2398041486740112, "loss": 1.2326, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.263584613800049, "rewards/margins": 0.21602365374565125, "rewards/rejected": -2.4796082973480225, "step": 3600 }, { "epoch": 2.6008645533141213, "grad_norm": 18.265831934479376, "learning_rate": 2.6487412946432976e-09, "logits/chosen": -1.9716873168945312, "logits/rejected": -1.966560959815979, "logps/chosen": -1.0693469047546387, "logps/rejected": -1.2056225538253784, "loss": 1.1912, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.1386938095092773, "rewards/margins": 0.27255168557167053, "rewards/rejected": -2.411245107650757, "step": 3610 }, { "epoch": 2.60806916426513, "grad_norm": 22.910160520824302, "learning_rate": 2.5556309957742024e-09, "logits/chosen": -1.97675359249115, "logits/rejected": -1.9716304540634155, "logps/chosen": -1.0250674486160278, "logps/rejected": -1.2212371826171875, "loss": 1.1161, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0501348972320557, "rewards/margins": 0.3923397660255432, "rewards/rejected": -2.442474365234375, "step": 3620 }, { "epoch": 2.6152737752161386, "grad_norm": 22.671601957903725, "learning_rate": 2.4640984433684758e-09, "logits/chosen": -2.0380005836486816, "logits/rejected": -2.0387332439422607, "logps/chosen": -1.1190853118896484, "logps/rejected": -1.235012173652649, "loss": 1.2353, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.238170623779297, "rewards/margins": 0.231853649020195, "rewards/rejected": -2.470024347305298, "step": 3630 }, { "epoch": 2.6224783861671472, "grad_norm": 17.024567886757257, "learning_rate": 2.3741500717865987e-09, "logits/chosen": -1.9916216135025024, "logits/rejected": -2.0025291442871094, "logps/chosen": -1.0068811178207397, "logps/rejected": -1.1515626907348633, "loss": 1.1789, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0137622356414795, "rewards/margins": 0.2893627882003784, "rewards/rejected": -2.3031253814697266, "step": 3640 }, { "epoch": 2.629682997118156, "grad_norm": 17.494575910236158, "learning_rate": 2.285792204027678e-09, "logits/chosen": -1.9781382083892822, "logits/rejected": -1.9753141403198242, "logps/chosen": -1.013346791267395, "logps/rejected": -1.211428165435791, "loss": 1.1021, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.02669358253479, "rewards/margins": 0.39616289734840393, "rewards/rejected": -2.422856330871582, "step": 3650 }, { "epoch": 2.636887608069164, "grad_norm": 20.794166929263792, "learning_rate": 2.199031051284972e-09, "logits/chosen": -2.007935047149658, "logits/rejected": -2.0033650398254395, "logps/chosen": -1.069888710975647, "logps/rejected": -1.1960642337799072, "loss": 1.219, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.139777421951294, "rewards/margins": 0.25235068798065186, "rewards/rejected": -2.3921284675598145, "step": 3660 }, { "epoch": 2.6440922190201728, "grad_norm": 16.98386285768041, "learning_rate": 2.113872712509254e-09, "logits/chosen": -1.9919393062591553, "logits/rejected": -1.9845707416534424, "logps/chosen": -1.1297125816345215, "logps/rejected": -1.241287112236023, "loss": 1.2279, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.259425163269043, "rewards/margins": 0.22314925491809845, "rewards/rejected": -2.482574224472046, "step": 3670 }, { "epoch": 2.6512968299711814, "grad_norm": 14.064145090241722, "learning_rate": 2.0303231739801143e-09, "logits/chosen": -1.9741106033325195, "logits/rejected": -1.9633283615112305, "logps/chosen": -1.0185304880142212, "logps/rejected": -1.1587377786636353, "loss": 1.1825, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0370609760284424, "rewards/margins": 0.28041452169418335, "rewards/rejected": -2.3174755573272705, "step": 3680 }, { "epoch": 2.65850144092219, "grad_norm": 23.56396327392751, "learning_rate": 1.948388308885102e-09, "logits/chosen": -2.0366296768188477, "logits/rejected": -2.02805757522583, "logps/chosen": -1.0637743473052979, "logps/rejected": -1.1742548942565918, "loss": 1.2171, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1275486946105957, "rewards/margins": 0.22096149623394012, "rewards/rejected": -2.3485097885131836, "step": 3690 }, { "epoch": 2.6657060518731988, "grad_norm": 25.21273485809688, "learning_rate": 1.86807387690692e-09, "logits/chosen": -2.0631988048553467, "logits/rejected": -2.0600669384002686, "logps/chosen": -1.0889419317245483, "logps/rejected": -1.2770618200302124, "loss": 1.1162, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1778838634490967, "rewards/margins": 0.3762398660182953, "rewards/rejected": -2.554123640060425, "step": 3700 }, { "epoch": 2.6729106628242074, "grad_norm": 19.210332180481718, "learning_rate": 1.789385523818493e-09, "logits/chosen": -2.027967929840088, "logits/rejected": -2.0294251441955566, "logps/chosen": -1.0404349565505981, "logps/rejected": -1.209099531173706, "loss": 1.149, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.0808699131011963, "rewards/margins": 0.33732882142066956, "rewards/rejected": -2.418199062347412, "step": 3710 }, { "epoch": 2.680115273775216, "grad_norm": 25.919412237452388, "learning_rate": 1.712328781086131e-09, "logits/chosen": -2.0483665466308594, "logits/rejected": -2.0430164337158203, "logps/chosen": -1.1230162382125854, "logps/rejected": -1.2185190916061401, "loss": 1.2401, "rewards/accuracies": 0.5625, "rewards/chosen": -2.246032476425171, "rewards/margins": 0.191005676984787, "rewards/rejected": -2.4370381832122803, "step": 3720 }, { "epoch": 2.6873198847262247, "grad_norm": 21.180605350865044, "learning_rate": 1.6369090654806543e-09, "logits/chosen": -2.0540661811828613, "logits/rejected": -2.0474164485931396, "logps/chosen": -1.0206701755523682, "logps/rejected": -1.1645678281784058, "loss": 1.1684, "rewards/accuracies": 0.625, "rewards/chosen": -2.0413403511047363, "rewards/margins": 0.2877953350543976, "rewards/rejected": -2.3291356563568115, "step": 3730 }, { "epoch": 2.6945244956772334, "grad_norm": 19.151584962250723, "learning_rate": 1.5631316786966498e-09, "logits/chosen": -1.9853427410125732, "logits/rejected": -1.978816270828247, "logps/chosen": -1.0220520496368408, "logps/rejected": -1.1623871326446533, "loss": 1.1969, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0441040992736816, "rewards/margins": 0.2806701064109802, "rewards/rejected": -2.3247742652893066, "step": 3740 }, { "epoch": 2.701729106628242, "grad_norm": 18.499060326329523, "learning_rate": 1.491001806979772e-09, "logits/chosen": -2.035274028778076, "logits/rejected": -2.028480052947998, "logps/chosen": -1.077332854270935, "logps/rejected": -1.2263195514678955, "loss": 1.1742, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.15466570854187, "rewards/margins": 0.29797306656837463, "rewards/rejected": -2.452639102935791, "step": 3750 }, { "epoch": 2.7089337175792507, "grad_norm": 29.16839407251503, "learning_rate": 1.4205245207621508e-09, "logits/chosen": -1.9820353984832764, "logits/rejected": -1.9796836376190186, "logps/chosen": -1.1182725429534912, "logps/rejected": -1.2864872217178345, "loss": 1.1548, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.2365450859069824, "rewards/margins": 0.33642950654029846, "rewards/rejected": -2.572974443435669, "step": 3760 }, { "epoch": 2.7161383285302594, "grad_norm": 17.613059928527296, "learning_rate": 1.3517047743059978e-09, "logits/chosen": -2.0181725025177, "logits/rejected": -2.0215516090393066, "logps/chosen": -1.0734504461288452, "logps/rejected": -1.2342610359191895, "loss": 1.1565, "rewards/accuracies": 0.625, "rewards/chosen": -2.1469008922576904, "rewards/margins": 0.3216209411621094, "rewards/rejected": -2.468522071838379, "step": 3770 }, { "epoch": 2.723342939481268, "grad_norm": 17.0753116834011, "learning_rate": 1.2845474053553156e-09, "logits/chosen": -2.0134406089782715, "logits/rejected": -2.0093090534210205, "logps/chosen": -1.0317740440368652, "logps/rejected": -1.168919324874878, "loss": 1.2025, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0635480880737305, "rewards/margins": 0.2742905914783478, "rewards/rejected": -2.337838649749756, "step": 3780 }, { "epoch": 2.7305475504322767, "grad_norm": 22.583791154808193, "learning_rate": 1.2190571347958422e-09, "logits/chosen": -2.044787883758545, "logits/rejected": -2.046135187149048, "logps/chosen": -0.9667074084281921, "logps/rejected": -1.1686309576034546, "loss": 1.1099, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9334148168563843, "rewards/margins": 0.4038470387458801, "rewards/rejected": -2.337261915206909, "step": 3790 }, { "epoch": 2.7377521613832854, "grad_norm": 18.287187828533536, "learning_rate": 1.1552385663231634e-09, "logits/chosen": -1.9937756061553955, "logits/rejected": -1.9841327667236328, "logps/chosen": -1.0935721397399902, "logps/rejected": -1.190500020980835, "loss": 1.2381, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.1871442794799805, "rewards/margins": 0.19385603070259094, "rewards/rejected": -2.38100004196167, "step": 3800 }, { "epoch": 2.744956772334294, "grad_norm": 18.99456309056716, "learning_rate": 1.0930961861191302e-09, "logits/chosen": -1.9580612182617188, "logits/rejected": -1.9627430438995361, "logps/chosen": -1.0382286310195923, "logps/rejected": -1.1804331541061401, "loss": 1.2008, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0764572620391846, "rewards/margins": 0.2844088673591614, "rewards/rejected": -2.3608663082122803, "step": 3810 }, { "epoch": 2.7521613832853027, "grad_norm": 16.764883597440075, "learning_rate": 1.0326343625364608e-09, "logits/chosen": -1.9691221714019775, "logits/rejected": -1.9639511108398438, "logps/chosen": -1.0410795211791992, "logps/rejected": -1.2136642932891846, "loss": 1.138, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0821590423583984, "rewards/margins": 0.3451697826385498, "rewards/rejected": -2.427328586578369, "step": 3820 }, { "epoch": 2.7593659942363113, "grad_norm": 18.44911571731718, "learning_rate": 9.738573457917066e-10, "logits/chosen": -2.043980836868286, "logits/rejected": -2.042267084121704, "logps/chosen": -1.0499022006988525, "logps/rejected": -1.2412595748901367, "loss": 1.11, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.099804401397705, "rewards/margins": 0.3827148973941803, "rewards/rejected": -2.4825191497802734, "step": 3830 }, { "epoch": 2.76657060518732, "grad_norm": 18.764417824451066, "learning_rate": 9.16769267666434e-10, "logits/chosen": -2.0091567039489746, "logits/rejected": -2.0069632530212402, "logps/chosen": -1.074094295501709, "logps/rejected": -1.1498383283615112, "loss": 1.2637, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.148188591003418, "rewards/margins": 0.15148821473121643, "rewards/rejected": -2.2996766567230225, "step": 3840 }, { "epoch": 2.7737752161383287, "grad_norm": 20.046312375742783, "learning_rate": 8.613741412168113e-10, "logits/chosen": -2.027498245239258, "logits/rejected": -2.026846408843994, "logps/chosen": -1.0808565616607666, "logps/rejected": -1.2099745273590088, "loss": 1.1798, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.161713123321533, "rewards/margins": 0.2582358717918396, "rewards/rejected": -2.4199490547180176, "step": 3850 }, { "epoch": 2.7809798270893373, "grad_norm": 19.84763607582755, "learning_rate": 8.076758604914802e-10, "logits/chosen": -1.957332968711853, "logits/rejected": -1.9527628421783447, "logps/chosen": -0.9819733500480652, "logps/rejected": -1.114538550376892, "loss": 1.1997, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9639467000961304, "rewards/margins": 0.2651303708553314, "rewards/rejected": -2.229077100753784, "step": 3860 }, { "epoch": 2.7881844380403455, "grad_norm": 22.904658084781477, "learning_rate": 7.55678200257856e-10, "logits/chosen": -1.9844300746917725, "logits/rejected": -1.9776723384857178, "logps/chosen": -1.0327974557876587, "logps/rejected": -1.1763405799865723, "loss": 1.1751, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0655949115753174, "rewards/margins": 0.2870861887931824, "rewards/rejected": -2.3526811599731445, "step": 3870 }, { "epoch": 2.795389048991354, "grad_norm": 17.15291998943784, "learning_rate": 7.053848157367315e-10, "logits/chosen": -1.9995343685150146, "logits/rejected": -1.9940645694732666, "logps/chosen": -1.0412391424179077, "logps/rejected": -1.1907306909561157, "loss": 1.1831, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0824782848358154, "rewards/margins": 0.29898306727409363, "rewards/rejected": -2.3814613819122314, "step": 3880 }, { "epoch": 2.802593659942363, "grad_norm": 15.812884819551362, "learning_rate": 6.567992423453794e-10, "logits/chosen": -2.0206310749053955, "logits/rejected": -2.019430637359619, "logps/chosen": -0.9630235433578491, "logps/rejected": -1.0794202089309692, "loss": 1.2021, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9260470867156982, "rewards/margins": 0.23279304802417755, "rewards/rejected": -2.1588404178619385, "step": 3890 }, { "epoch": 2.8097982708933715, "grad_norm": 19.54993986750196, "learning_rate": 6.099248954489794e-10, "logits/chosen": -1.953507423400879, "logits/rejected": -1.9511181116104126, "logps/chosen": -1.0681465864181519, "logps/rejected": -1.229273796081543, "loss": 1.1568, "rewards/accuracies": 0.625, "rewards/chosen": -2.1362931728363037, "rewards/margins": 0.32225483655929565, "rewards/rejected": -2.458547592163086, "step": 3900 }, { "epoch": 2.81700288184438, "grad_norm": 22.672929732957467, "learning_rate": 5.647650701205653e-10, "logits/chosen": -2.026876449584961, "logits/rejected": -2.018667697906494, "logps/chosen": -1.1109135150909424, "logps/rejected": -1.2674014568328857, "loss": 1.1766, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.2218270301818848, "rewards/margins": 0.3129761219024658, "rewards/rejected": -2.5348029136657715, "step": 3910 }, { "epoch": 2.824207492795389, "grad_norm": 16.28695288206369, "learning_rate": 5.213229409093856e-10, "logits/chosen": -2.0310721397399902, "logits/rejected": -2.0254709720611572, "logps/chosen": -1.05387282371521, "logps/rejected": -1.1856187582015991, "loss": 1.2009, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.10774564743042, "rewards/margins": 0.263491690158844, "rewards/rejected": -2.3712375164031982, "step": 3920 }, { "epoch": 2.8314121037463975, "grad_norm": 20.975683447759703, "learning_rate": 4.796015616177401e-10, "logits/chosen": -2.0005106925964355, "logits/rejected": -1.994783639907837, "logps/chosen": -1.0665435791015625, "logps/rejected": -1.1785615682601929, "loss": 1.2143, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.133087158203125, "rewards/margins": 0.22403590381145477, "rewards/rejected": -2.3571231365203857, "step": 3930 }, { "epoch": 2.838616714697406, "grad_norm": 16.71900486734478, "learning_rate": 4.3960386508631595e-10, "logits/chosen": -1.937182068824768, "logits/rejected": -1.9297128915786743, "logps/chosen": -0.9666848182678223, "logps/rejected": -1.0865039825439453, "loss": 1.2256, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9333696365356445, "rewards/margins": 0.23963849246501923, "rewards/rejected": -2.1730079650878906, "step": 3940 }, { "epoch": 2.845821325648415, "grad_norm": 35.92105468101964, "learning_rate": 4.013326629880243e-10, "logits/chosen": -1.9777085781097412, "logits/rejected": -1.968076467514038, "logps/chosen": -1.1062713861465454, "logps/rejected": -1.2339928150177002, "loss": 1.2044, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.212542772293091, "rewards/margins": 0.25544288754463196, "rewards/rejected": -2.4679856300354004, "step": 3950 }, { "epoch": 2.8530259365994235, "grad_norm": 19.697159928360417, "learning_rate": 3.64790645630339e-10, "logits/chosen": -1.942647933959961, "logits/rejected": -1.9421268701553345, "logps/chosen": -1.0547170639038086, "logps/rejected": -1.1250708103179932, "loss": 1.2627, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.109434127807617, "rewards/margins": 0.1407076120376587, "rewards/rejected": -2.2501416206359863, "step": 3960 }, { "epoch": 2.860230547550432, "grad_norm": 21.359517678769173, "learning_rate": 3.2998038176619e-10, "logits/chosen": -1.9776138067245483, "logits/rejected": -1.9692010879516602, "logps/chosen": -1.0567617416381836, "logps/rejected": -1.1803498268127441, "loss": 1.2064, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.113523483276367, "rewards/margins": 0.24717645347118378, "rewards/rejected": -2.3606996536254883, "step": 3970 }, { "epoch": 2.867435158501441, "grad_norm": 20.96219918565088, "learning_rate": 2.969043184133907e-10, "logits/chosen": -2.046151638031006, "logits/rejected": -2.044818639755249, "logps/chosen": -0.9711786508560181, "logps/rejected": -1.1876708269119263, "loss": 1.0771, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9423573017120361, "rewards/margins": 0.43298429250717163, "rewards/rejected": -2.3753416538238525, "step": 3980 }, { "epoch": 2.8746397694524495, "grad_norm": 17.910920824523004, "learning_rate": 2.6556478068261447e-10, "logits/chosen": -1.9732444286346436, "logits/rejected": -1.9708236455917358, "logps/chosen": -0.9731259346008301, "logps/rejected": -1.1022650003433228, "loss": 1.2084, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9462518692016602, "rewards/margins": 0.2582783102989197, "rewards/rejected": -2.2045300006866455, "step": 3990 }, { "epoch": 2.881844380403458, "grad_norm": 20.734203195977276, "learning_rate": 2.3596397161395607e-10, "logits/chosen": -2.044921875, "logits/rejected": -2.0331034660339355, "logps/chosen": -1.0672800540924072, "logps/rejected": -1.2321101427078247, "loss": 1.1588, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1345601081848145, "rewards/margins": 0.32965999841690063, "rewards/rejected": -2.4642202854156494, "step": 4000 }, { "epoch": 2.889048991354467, "grad_norm": 25.7016655841959, "learning_rate": 2.0810397202206399e-10, "logits/chosen": -1.9520553350448608, "logits/rejected": -1.9573888778686523, "logps/chosen": -1.063836693763733, "logps/rejected": -1.193362832069397, "loss": 1.1905, "rewards/accuracies": 0.625, "rewards/chosen": -2.127673387527466, "rewards/margins": 0.2590521574020386, "rewards/rejected": -2.386725664138794, "step": 4010 }, { "epoch": 2.8962536023054755, "grad_norm": 22.599478343097772, "learning_rate": 1.819867403498737e-10, "logits/chosen": -2.0386157035827637, "logits/rejected": -2.036118984222412, "logps/chosen": -1.0686347484588623, "logps/rejected": -1.2007242441177368, "loss": 1.2018, "rewards/accuracies": 0.625, "rewards/chosen": -2.1372694969177246, "rewards/margins": 0.26417914032936096, "rewards/rejected": -2.4014484882354736, "step": 4020 }, { "epoch": 2.903458213256484, "grad_norm": 21.70269511981427, "learning_rate": 1.5761411253092382e-10, "logits/chosen": -1.964998483657837, "logits/rejected": -1.9548912048339844, "logps/chosen": -0.9872833490371704, "logps/rejected": -1.1099205017089844, "loss": 1.1994, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9745666980743408, "rewards/margins": 0.24527449905872345, "rewards/rejected": -2.2198410034179688, "step": 4030 }, { "epoch": 2.910662824207493, "grad_norm": 20.259335859045336, "learning_rate": 1.3498780186031455e-10, "logits/chosen": -2.010437488555908, "logits/rejected": -2.0069775581359863, "logps/chosen": -1.162232756614685, "logps/rejected": -1.281508207321167, "loss": 1.2266, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.32446551322937, "rewards/margins": 0.2385510504245758, "rewards/rejected": -2.563016414642334, "step": 4040 }, { "epoch": 2.9178674351585014, "grad_norm": 15.586122569686582, "learning_rate": 1.1410939887425141e-10, "logits/chosen": -1.9975839853286743, "logits/rejected": -1.9993999004364014, "logps/chosen": -1.045862078666687, "logps/rejected": -1.1738336086273193, "loss": 1.212, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.091724157333374, "rewards/margins": 0.25594305992126465, "rewards/rejected": -2.3476672172546387, "step": 4050 }, { "epoch": 2.92507204610951, "grad_norm": 18.548109162992386, "learning_rate": 9.498037123825686e-11, "logits/chosen": -2.0100975036621094, "logits/rejected": -2.0066418647766113, "logps/chosen": -1.0210684537887573, "logps/rejected": -1.1468260288238525, "loss": 1.1987, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0421369075775146, "rewards/margins": 0.2515150308609009, "rewards/rejected": -2.293652057647705, "step": 4060 }, { "epoch": 2.9322766570605188, "grad_norm": 21.54852206068809, "learning_rate": 7.760206364398614e-11, "logits/chosen": -2.0660743713378906, "logits/rejected": -2.063163995742798, "logps/chosen": -1.0767936706542969, "logps/rejected": -1.2189406156539917, "loss": 1.1849, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.1535873413085938, "rewards/margins": 0.2842939794063568, "rewards/rejected": -2.4378812313079834, "step": 4070 }, { "epoch": 2.9394812680115274, "grad_norm": 21.178294648611878, "learning_rate": 6.19756977147029e-11, "logits/chosen": -1.9951989650726318, "logits/rejected": -1.9918142557144165, "logps/chosen": -1.02787446975708, "logps/rejected": -1.2345163822174072, "loss": 1.1095, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.05574893951416, "rewards/margins": 0.41328415274620056, "rewards/rejected": -2.4690327644348145, "step": 4080 }, { "epoch": 2.946685878962536, "grad_norm": 20.81149789203122, "learning_rate": 4.810237191940625e-11, "logits/chosen": -1.974111557006836, "logits/rejected": -1.9727462530136108, "logps/chosen": -1.0376461744308472, "logps/rejected": -1.1693501472473145, "loss": 1.217, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.0752923488616943, "rewards/margins": 0.2634081244468689, "rewards/rejected": -2.338700294494629, "step": 4090 }, { "epoch": 2.9538904899135447, "grad_norm": 20.108876799029805, "learning_rate": 3.5983061495617476e-11, "logits/chosen": -2.032691240310669, "logits/rejected": -2.0327444076538086, "logps/chosen": -1.1233651638031006, "logps/rejected": -1.2714459896087646, "loss": 1.1825, "rewards/accuracies": 0.625, "rewards/chosen": -2.246730327606201, "rewards/margins": 0.2961619794368744, "rewards/rejected": -2.5428919792175293, "step": 4100 }, { "epoch": 2.9610951008645534, "grad_norm": 21.51546113795096, "learning_rate": 2.5618618380812694e-11, "logits/chosen": -2.0210888385772705, "logits/rejected": -2.0106148719787598, "logps/chosen": -1.002300500869751, "logps/rejected": -1.166154146194458, "loss": 1.1714, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.004601001739502, "rewards/margins": 0.32770711183547974, "rewards/rejected": -2.332308292388916, "step": 4110 }, { "epoch": 2.968299711815562, "grad_norm": 22.762605833671383, "learning_rate": 1.700977115254576e-11, "logits/chosen": -1.9953645467758179, "logits/rejected": -1.9921376705169678, "logps/chosen": -0.9968992471694946, "logps/rejected": -1.1455665826797485, "loss": 1.1674, "rewards/accuracies": 0.625, "rewards/chosen": -1.9937984943389893, "rewards/margins": 0.29733437299728394, "rewards/rejected": -2.291133165359497, "step": 4120 }, { "epoch": 2.9755043227665707, "grad_norm": 20.444100868277733, "learning_rate": 1.0157124977230868e-11, "logits/chosen": -1.9724935293197632, "logits/rejected": -1.9707790613174438, "logps/chosen": -0.9694275856018066, "logps/rejected": -1.117763876914978, "loss": 1.1687, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9388551712036133, "rewards/margins": 0.2966724932193756, "rewards/rejected": -2.235527753829956, "step": 4130 }, { "epoch": 2.9827089337175794, "grad_norm": 21.99215491997881, "learning_rate": 5.061161567596061e-12, "logits/chosen": -1.9936805963516235, "logits/rejected": -1.98941171169281, "logps/chosen": -1.0571701526641846, "logps/rejected": -1.1420987844467163, "loss": 1.2615, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.114340305328369, "rewards/margins": 0.16985730826854706, "rewards/rejected": -2.2841975688934326, "step": 4140 }, { "epoch": 2.989913544668588, "grad_norm": 20.878532080632212, "learning_rate": 1.7222391488297406e-12, "logits/chosen": -2.013947010040283, "logits/rejected": -2.010057210922241, "logps/chosen": -1.1070269346237183, "logps/rejected": -1.254369854927063, "loss": 1.1756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.2140538692474365, "rewards/margins": 0.2946857511997223, "rewards/rejected": -2.508739709854126, "step": 4150 }, { "epoch": 2.9971181556195967, "grad_norm": 19.732213045865922, "learning_rate": 1.4059243338693238e-13, "logits/chosen": -1.9882125854492188, "logits/rejected": -1.9810755252838135, "logps/chosen": -1.059184193611145, "logps/rejected": -1.1826164722442627, "loss": 1.1942, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.11836838722229, "rewards/margins": 0.2468646764755249, "rewards/rejected": -2.3652329444885254, "step": 4160 }, { "epoch": 3.0, "step": 4164, "total_flos": 0.0, "train_loss": 1.2025116606473236, "train_runtime": 6278.9508, "train_samples_per_second": 10.608, "train_steps_per_second": 0.663 } ], "logging_steps": 10, "max_steps": 4164, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }