{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 500, "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005234231876472127, "grad_norm": 312.0, "learning_rate": 1.0416666666666667e-06, "logits/chosen": 0.665995180606842, "logits/rejected": 0.7168087959289551, "logps/chosen": -331.14556884765625, "logps/rejected": -289.13482666015625, "loss": 0.6929, "rewards/accuracies": 0.40312498807907104, "rewards/chosen": -0.0007204435532912612, "rewards/margins": 0.0008210704545490444, "rewards/rejected": -0.0015415140660479665, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 284.0, "learning_rate": 2.0833333333333334e-06, "logits/chosen": 0.7776141166687012, "logits/rejected": 0.7684425115585327, "logps/chosen": -357.8346862792969, "logps/rejected": -317.8344421386719, "loss": 0.6936, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0012695642653852701, "rewards/margins": -0.0004827965167351067, "rewards/rejected": -0.0007867676904425025, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 322.0, "learning_rate": 3.125e-06, "logits/chosen": 0.795623779296875, "logits/rejected": 0.8778733015060425, "logps/chosen": -350.8582458496094, "logps/rejected": -318.2168884277344, "loss": 0.6904, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0022780809085816145, "rewards/margins": 0.0060666268691420555, "rewards/rejected": -0.008344708010554314, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 298.0, "learning_rate": 4.166666666666667e-06, "logits/chosen": 0.7740770578384399, "logits/rejected": 0.8167956471443176, "logps/chosen": -319.42022705078125, "logps/rejected": -278.17071533203125, "loss": 0.681, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.006301078014075756, "rewards/margins": 0.026611831039190292, "rewards/rejected": -0.02031075581908226, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 266.0, "learning_rate": 5.208333333333334e-06, "logits/chosen": 0.6866067051887512, "logits/rejected": 0.7555549144744873, "logps/chosen": -320.47479248046875, "logps/rejected": -284.809814453125, "loss": 0.6727, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005391906015574932, "rewards/margins": 0.04711990803480148, "rewards/rejected": -0.05251181870698929, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 288.0, "learning_rate": 6.25e-06, "logits/chosen": 0.6783124208450317, "logits/rejected": 0.7343226671218872, "logps/chosen": -337.063232421875, "logps/rejected": -299.95220947265625, "loss": 0.6656, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.011515669524669647, "rewards/margins": 0.07207117229700089, "rewards/rejected": -0.08358683437108994, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 262.0, "learning_rate": 7.291666666666667e-06, "logits/chosen": 0.8541976809501648, "logits/rejected": 0.9135104417800903, "logps/chosen": -324.9010925292969, "logps/rejected": -313.7442321777344, "loss": 0.6576, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.033813267946243286, "rewards/margins": 0.1023920327425003, "rewards/rejected": -0.1362052857875824, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 282.0, "learning_rate": 8.333333333333334e-06, "logits/chosen": 0.8204466700553894, "logits/rejected": 0.9971317052841187, "logps/chosen": -309.83648681640625, "logps/rejected": -280.83837890625, "loss": 0.649, "rewards/accuracies": 0.640625, "rewards/chosen": -0.051890332251787186, "rewards/margins": 0.13043811917304993, "rewards/rejected": -0.18232843279838562, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 290.0, "learning_rate": 9.375000000000001e-06, "logits/chosen": 0.8248814344406128, "logits/rejected": 0.798936128616333, "logps/chosen": -330.08660888671875, "logps/rejected": -278.1947937011719, "loss": 0.6422, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07527975738048553, "rewards/margins": 0.17323294281959534, "rewards/rejected": -0.24851271510124207, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 234.0, "learning_rate": 9.999880027023295e-06, "logits/chosen": 0.5035718083381653, "logits/rejected": 0.6347559094429016, "logps/chosen": -328.1427917480469, "logps/rejected": -298.84197998046875, "loss": 0.6071, "rewards/accuracies": 0.671875, "rewards/chosen": -0.14533154666423798, "rewards/margins": 0.28229671716690063, "rewards/rejected": -0.4276282787322998, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 246.0, "learning_rate": 9.998530397154684e-06, "logits/chosen": 0.5344328284263611, "logits/rejected": 0.6689058542251587, "logps/chosen": -325.33978271484375, "logps/rejected": -314.258544921875, "loss": 0.6253, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.20693711936473846, "rewards/margins": 0.24847209453582764, "rewards/rejected": -0.4554091989994049, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 284.0, "learning_rate": 9.995681577335256e-06, "logits/chosen": 0.4409152865409851, "logits/rejected": 0.5330216288566589, "logps/chosen": -340.72930908203125, "logps/rejected": -319.43524169921875, "loss": 0.6163, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3839780390262604, "rewards/margins": 0.3311913013458252, "rewards/rejected": -0.7151693105697632, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 446.0, "learning_rate": 9.99133442200056e-06, "logits/chosen": 0.30335044860839844, "logits/rejected": 0.46630460023880005, "logps/chosen": -353.91961669921875, "logps/rejected": -310.37689208984375, "loss": 0.6173, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.37794750928878784, "rewards/margins": 0.34848180413246155, "rewards/rejected": -0.7264293432235718, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 294.0, "learning_rate": 9.985490234976132e-06, "logits/chosen": 0.45180901885032654, "logits/rejected": 0.5098147392272949, "logps/chosen": -345.41558837890625, "logps/rejected": -291.6056823730469, "loss": 0.5936, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4161813259124756, "rewards/margins": 0.40652480721473694, "rewards/rejected": -0.8227061033248901, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 270.0, "learning_rate": 9.978150769086457e-06, "logits/chosen": 0.40853095054626465, "logits/rejected": 0.5322204828262329, "logps/chosen": -341.7745666503906, "logps/rejected": -305.63043212890625, "loss": 0.6364, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5094475746154785, "rewards/margins": 0.3103070855140686, "rewards/rejected": -0.8197546005249023, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 294.0, "learning_rate": 9.96931822562924e-06, "logits/chosen": 0.33679673075675964, "logits/rejected": 0.4187542498111725, "logps/chosen": -353.4910888671875, "logps/rejected": -340.1171875, "loss": 0.6111, "rewards/accuracies": 0.671875, "rewards/chosen": -0.3925863802433014, "rewards/margins": 0.3052482306957245, "rewards/rejected": -0.6978346109390259, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 288.0, "learning_rate": 9.958995253715193e-06, "logits/chosen": 0.36519330739974976, "logits/rejected": 0.3689248263835907, "logps/chosen": -358.2540283203125, "logps/rejected": -317.26116943359375, "loss": 0.6105, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.3615415096282959, "rewards/margins": 0.3146303594112396, "rewards/rejected": -0.6761718988418579, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 316.0, "learning_rate": 9.947184949473478e-06, "logits/chosen": 0.30113479495048523, "logits/rejected": 0.36322420835494995, "logps/chosen": -344.6726379394531, "logps/rejected": -300.2312316894531, "loss": 0.5862, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5419459342956543, "rewards/margins": 0.40768465399742126, "rewards/rejected": -0.949630618095398, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 322.0, "learning_rate": 9.933890855123114e-06, "logits/chosen": 0.16772204637527466, "logits/rejected": 0.20316286385059357, "logps/chosen": -375.6507263183594, "logps/rejected": -358.7842102050781, "loss": 0.6266, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.9694031476974487, "rewards/margins": 0.3656802773475647, "rewards/rejected": -1.3350833654403687, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 324.0, "learning_rate": 9.919116957910566e-06, "logits/chosen": 0.14172935485839844, "logits/rejected": 0.11142061650753021, "logps/chosen": -349.0318298339844, "logps/rejected": -289.46453857421875, "loss": 0.5972, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6366950273513794, "rewards/margins": 0.4041665494441986, "rewards/rejected": -1.0408614873886108, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 276.0, "learning_rate": 9.902867688913869e-06, "logits/chosen": 0.3469844162464142, "logits/rejected": 0.3956855535507202, "logps/chosen": -365.18756103515625, "logps/rejected": -319.1282958984375, "loss": 0.5693, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8478742837905884, "rewards/margins": 0.47847065329551697, "rewards/rejected": -1.3263448476791382, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 304.0, "learning_rate": 9.885147921713621e-06, "logits/chosen": 0.19320572912693024, "logits/rejected": 0.27649644017219543, "logps/chosen": -338.0028076171875, "logps/rejected": -322.33349609375, "loss": 0.5867, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8439861536026001, "rewards/margins": 0.49753037095069885, "rewards/rejected": -1.3415164947509766, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 256.0, "learning_rate": 9.865962970931287e-06, "logits/chosen": 0.397473007440567, "logits/rejected": 0.41920894384384155, "logps/chosen": -357.3067321777344, "logps/rejected": -311.8251037597656, "loss": 0.588, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2804175317287445, "rewards/margins": 0.4174894690513611, "rewards/rejected": -0.6979071497917175, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 280.0, "learning_rate": 9.845318590635186e-06, "logits/chosen": 0.4800747036933899, "logits/rejected": 0.6408789753913879, "logps/chosen": -353.8960876464844, "logps/rejected": -311.86163330078125, "loss": 0.5746, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.3724919259548187, "rewards/margins": 0.5004433989524841, "rewards/rejected": -0.8729352951049805, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 294.0, "learning_rate": 9.823220972614712e-06, "logits/chosen": 0.3700530230998993, "logits/rejected": 0.4417282044887543, "logps/chosen": -369.8304443359375, "logps/rejected": -302.2646789550781, "loss": 0.573, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5568550825119019, "rewards/margins": 0.5210874080657959, "rewards/rejected": -1.0779423713684082, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 312.0, "learning_rate": 9.79967674452324e-06, "logits/chosen": 0.3898628354072571, "logits/rejected": 0.4610047936439514, "logps/chosen": -347.03118896484375, "logps/rejected": -336.3883056640625, "loss": 0.593, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.400566428899765, "rewards/margins": 0.50266432762146, "rewards/rejected": -0.9032306671142578, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 294.0, "learning_rate": 9.774692967890332e-06, "logits/chosen": 0.17694059014320374, "logits/rejected": 0.22063255310058594, "logps/chosen": -356.22650146484375, "logps/rejected": -323.39617919921875, "loss": 0.5893, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.3973694443702698, "rewards/margins": 0.4625066816806793, "rewards/rejected": -0.8598760366439819, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 310.0, "learning_rate": 9.74827713600379e-06, "logits/chosen": 0.2775232195854187, "logits/rejected": 0.38801032304763794, "logps/chosen": -316.23944091796875, "logps/rejected": -285.94964599609375, "loss": 0.6144, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6175938844680786, "rewards/margins": 0.40199989080429077, "rewards/rejected": -1.0195937156677246, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 272.0, "learning_rate": 9.720437171662232e-06, "logits/chosen": 0.39185574650764465, "logits/rejected": 0.48235875368118286, "logps/chosen": -336.41375732421875, "logps/rejected": -312.65216064453125, "loss": 0.5847, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.23143287003040314, "rewards/margins": 0.4122442305088043, "rewards/rejected": -0.6436771154403687, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 286.0, "learning_rate": 9.691181424798825e-06, "logits/chosen": 0.30432984232902527, "logits/rejected": 0.2606360912322998, "logps/chosen": -320.1929626464844, "logps/rejected": -296.85626220703125, "loss": 0.5877, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.16091197729110718, "rewards/margins": 0.4196176528930664, "rewards/rejected": -0.5805296897888184, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 320.0, "learning_rate": 9.660518669976936e-06, "logits/chosen": 0.3142179250717163, "logits/rejected": 0.42381685972213745, "logps/chosen": -351.7626647949219, "logps/rejected": -305.1934509277344, "loss": 0.6083, "rewards/accuracies": 0.671875, "rewards/chosen": -0.3087378144264221, "rewards/margins": 0.41920194029808044, "rewards/rejected": -0.7279397249221802, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 302.0, "learning_rate": 9.628458103758403e-06, "logits/chosen": 0.33300966024398804, "logits/rejected": 0.37351295351982117, "logps/chosen": -366.12384033203125, "logps/rejected": -330.9198913574219, "loss": 0.5437, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4144722819328308, "rewards/margins": 0.5553442239761353, "rewards/rejected": -0.9698165655136108, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 314.0, "learning_rate": 9.595009341945246e-06, "logits/chosen": 0.22988705337047577, "logits/rejected": 0.2729721665382385, "logps/chosen": -334.3009338378906, "logps/rejected": -321.2250671386719, "loss": 0.6288, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.8995565176010132, "rewards/margins": 0.4780782163143158, "rewards/rejected": -1.3776347637176514, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 288.0, "learning_rate": 9.560182416695639e-06, "logits/chosen": 0.2716033458709717, "logits/rejected": 0.2932526171207428, "logps/chosen": -331.0648193359375, "logps/rejected": -329.04937744140625, "loss": 0.5775, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6781526803970337, "rewards/margins": 0.5396233201026917, "rewards/rejected": -1.2177760601043701, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 330.0, "learning_rate": 9.523987773514999e-06, "logits/chosen": 0.22474929690361023, "logits/rejected": 0.27910318970680237, "logps/chosen": -335.4002990722656, "logps/rejected": -297.45635986328125, "loss": 0.6094, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.4196121096611023, "rewards/margins": 0.37615495920181274, "rewards/rejected": -0.7957671284675598, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 314.0, "learning_rate": 9.486436268123112e-06, "logits/chosen": 0.1711244434118271, "logits/rejected": 0.24781985580921173, "logps/chosen": -365.4104919433594, "logps/rejected": -339.17510986328125, "loss": 0.5956, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.35211288928985596, "rewards/margins": 0.45698657631874084, "rewards/rejected": -0.8090993762016296, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 322.0, "learning_rate": 9.447539163198218e-06, "logits/chosen": 0.3507956266403198, "logits/rejected": 0.3582335114479065, "logps/chosen": -343.34564208984375, "logps/rejected": -306.3224182128906, "loss": 0.5941, "rewards/accuracies": 0.6875, "rewards/chosen": -0.42065954208374023, "rewards/margins": 0.4902091920375824, "rewards/rejected": -0.9108688235282898, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 201.0, "learning_rate": 9.407308124999031e-06, "logits/chosen": 0.535057544708252, "logits/rejected": 0.5568063855171204, "logps/chosen": -361.60809326171875, "logps/rejected": -340.9134826660156, "loss": 0.6044, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.8800666928291321, "rewards/margins": 0.5229350924491882, "rewards/rejected": -1.4030016660690308, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 253.0, "learning_rate": 9.365755219865733e-06, "logits/chosen": 0.5194161534309387, "logits/rejected": 0.6202957630157471, "logps/chosen": -356.85552978515625, "logps/rejected": -337.2948913574219, "loss": 0.5604, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6672223806381226, "rewards/margins": 0.6344886422157288, "rewards/rejected": -1.3017112016677856, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 358.0, "learning_rate": 9.322892910600959e-06, "logits/chosen": 0.539501965045929, "logits/rejected": 0.7477941513061523, "logps/chosen": -328.63128662109375, "logps/rejected": -295.7342529296875, "loss": 0.6114, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6417405009269714, "rewards/margins": 0.4066389203071594, "rewards/rejected": -1.0483794212341309, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 454.0, "learning_rate": 9.278734052731876e-06, "logits/chosen": 0.4946824014186859, "logits/rejected": 0.5248245596885681, "logps/chosen": -346.6952819824219, "logps/rejected": -322.2110595703125, "loss": 0.5717, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.4744420647621155, "rewards/margins": 0.5339844226837158, "rewards/rejected": -1.0084264278411865, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 344.0, "learning_rate": 9.233291890654477e-06, "logits/chosen": 0.1536872535943985, "logits/rejected": 0.21127930283546448, "logps/chosen": -349.3604736328125, "logps/rejected": -306.022216796875, "loss": 0.5426, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4529080390930176, "rewards/margins": 0.5982980728149414, "rewards/rejected": -1.051206111907959, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 312.0, "learning_rate": 9.186580053661238e-06, "logits/chosen": 0.2585577666759491, "logits/rejected": 0.21868768334388733, "logps/chosen": -345.9136962890625, "logps/rejected": -358.81427001953125, "loss": 0.6164, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6662707328796387, "rewards/margins": 0.5226248502731323, "rewards/rejected": -1.1888954639434814, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 260.0, "learning_rate": 9.138612551853334e-06, "logits/chosen": 0.14254237711429596, "logits/rejected": 0.29151830077171326, "logps/chosen": -357.33001708984375, "logps/rejected": -309.49505615234375, "loss": 0.5649, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3096122741699219, "rewards/margins": 0.5217168927192688, "rewards/rejected": -0.8313292264938354, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 268.0, "learning_rate": 9.089403771938651e-06, "logits/chosen": 0.2172239124774933, "logits/rejected": 0.3413824439048767, "logps/chosen": -343.6112365722656, "logps/rejected": -313.96453857421875, "loss": 0.5926, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2865334153175354, "rewards/margins": 0.4800845980644226, "rewards/rejected": -0.766618013381958, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 270.0, "learning_rate": 9.038968472916831e-06, "logits/chosen": 0.20296330749988556, "logits/rejected": 0.29848283529281616, "logps/chosen": -370.986328125, "logps/rejected": -362.0606384277344, "loss": 0.5738, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.44281673431396484, "rewards/margins": 0.5560091733932495, "rewards/rejected": -0.9988259077072144, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 284.0, "learning_rate": 8.987321781652663e-06, "logits/chosen": 0.3275991380214691, "logits/rejected": 0.31361979246139526, "logps/chosen": -329.7540588378906, "logps/rejected": -297.33807373046875, "loss": 0.56, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.5578715205192566, "rewards/margins": 0.5787358283996582, "rewards/rejected": -1.1366074085235596, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 348.0, "learning_rate": 8.93447918833914e-06, "logits/chosen": 0.24553251266479492, "logits/rejected": 0.24857684969902039, "logps/chosen": -366.5342102050781, "logps/rejected": -313.3810119628906, "loss": 0.5912, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.5592954754829407, "rewards/margins": 0.5262395143508911, "rewards/rejected": -1.0855350494384766, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 300.0, "learning_rate": 8.880456541851544e-06, "logits/chosen": 0.21284916996955872, "logits/rejected": 0.29197466373443604, "logps/chosen": -394.79217529296875, "logps/rejected": -337.88397216796875, "loss": 0.5492, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5321744680404663, "rewards/margins": 0.6479983925819397, "rewards/rejected": -1.1801728010177612, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 370.0, "learning_rate": 8.825270044993963e-06, "logits/chosen": 0.30395790934562683, "logits/rejected": 0.41689762473106384, "logps/chosen": -316.583251953125, "logps/rejected": -323.14434814453125, "loss": 0.577, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.4370909631252289, "rewards/margins": 0.5204497575759888, "rewards/rejected": -0.9575408101081848, "step": 500 }, { "epoch": 0.2669458257000785, "grad_norm": 350.0, "learning_rate": 8.768936249639632e-06, "logits/chosen": 0.1348932683467865, "logits/rejected": 0.26566845178604126, "logps/chosen": -331.4639587402344, "logps/rejected": -320.91461181640625, "loss": 0.597, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.4939492344856262, "rewards/margins": 0.47716912627220154, "rewards/rejected": -0.9711184501647949, "step": 510 }, { "epoch": 0.2721800575765506, "grad_norm": 312.0, "learning_rate": 8.711472051766606e-06, "logits/chosen": 0.19042043387889862, "logits/rejected": 0.2794221341609955, "logps/chosen": -354.65740966796875, "logps/rejected": -331.80267333984375, "loss": 0.552, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.4176334738731384, "rewards/margins": 0.569202184677124, "rewards/rejected": -0.986835777759552, "step": 520 }, { "epoch": 0.27741428945302277, "grad_norm": 292.0, "learning_rate": 8.652894686390205e-06, "logits/chosen": 0.2197699099779129, "logits/rejected": 0.2926613390445709, "logps/chosen": -357.4103698730469, "logps/rejected": -326.92315673828125, "loss": 0.5695, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5999321341514587, "rewards/margins": 0.5926289558410645, "rewards/rejected": -1.1925609111785889, "step": 530 }, { "epoch": 0.2826485213294949, "grad_norm": 326.0, "learning_rate": 8.593221722393789e-06, "logits/chosen": 0.17706915736198425, "logits/rejected": 0.2584270238876343, "logps/chosen": -358.02740478515625, "logps/rejected": -326.58367919921875, "loss": 0.5489, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.8800150156021118, "rewards/margins": 0.6341498494148254, "rewards/rejected": -1.514164686203003, "step": 540 }, { "epoch": 0.287882753205967, "grad_norm": 332.0, "learning_rate": 8.53247105725939e-06, "logits/chosen": 0.22675807774066925, "logits/rejected": 0.23919430375099182, "logps/chosen": -325.2323913574219, "logps/rejected": -299.59039306640625, "loss": 0.5546, "rewards/accuracies": 0.6875, "rewards/chosen": -0.658308207988739, "rewards/margins": 0.6446129083633423, "rewards/rejected": -1.302921175956726, "step": 550 }, { "epoch": 0.29311698508243916, "grad_norm": 330.0, "learning_rate": 8.470660911699783e-06, "logits/chosen": 0.09726688265800476, "logits/rejected": 0.15922322869300842, "logps/chosen": -337.0237121582031, "logps/rejected": -292.3273010253906, "loss": 0.5796, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.5876582860946655, "rewards/margins": 0.5548220276832581, "rewards/rejected": -1.1424801349639893, "step": 560 }, { "epoch": 0.29835121695891126, "grad_norm": 294.0, "learning_rate": 8.407809824193624e-06, "logits/chosen": 0.08568461239337921, "logits/rejected": 0.2500315308570862, "logps/chosen": -373.3963928222656, "logps/rejected": -339.5152282714844, "loss": 0.5893, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.7039493918418884, "rewards/margins": 0.5431109666824341, "rewards/rejected": -1.2470605373382568, "step": 570 }, { "epoch": 0.3035854488353834, "grad_norm": 274.0, "learning_rate": 8.343936645425277e-06, "logits/chosen": 0.3479989767074585, "logits/rejected": 0.3873814642429352, "logps/chosen": -325.4124450683594, "logps/rejected": -307.15423583984375, "loss": 0.5188, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.594230055809021, "rewards/margins": 0.7160730957984924, "rewards/rejected": -1.3103030920028687, "step": 580 }, { "epoch": 0.30881968071185556, "grad_norm": 286.0, "learning_rate": 8.279060532630991e-06, "logits/chosen": 0.3080836236476898, "logits/rejected": 0.4134043753147125, "logps/chosen": -358.85076904296875, "logps/rejected": -332.5126953125, "loss": 0.5926, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.7294015884399414, "rewards/margins": 0.5793807506561279, "rewards/rejected": -1.3087823390960693, "step": 590 }, { "epoch": 0.31405391258832765, "grad_norm": 332.0, "learning_rate": 8.21320094385316e-06, "logits/chosen": 0.3551139533519745, "logits/rejected": 0.38634929060935974, "logps/chosen": -369.88446044921875, "logps/rejected": -338.72308349609375, "loss": 0.6129, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.5936108231544495, "rewards/margins": 0.5011196732521057, "rewards/rejected": -1.0947306156158447, "step": 600 }, { "epoch": 0.3192881444647998, "grad_norm": 290.0, "learning_rate": 8.146377632104328e-06, "logits/chosen": 0.22317269444465637, "logits/rejected": 0.4244447648525238, "logps/chosen": -381.1096496582031, "logps/rejected": -322.06689453125, "loss": 0.5404, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.44336166977882385, "rewards/margins": 0.691791832447052, "rewards/rejected": -1.1351535320281982, "step": 610 }, { "epoch": 0.3245223763412719, "grad_norm": 338.0, "learning_rate": 8.078610639442761e-06, "logits/chosen": 0.23834876716136932, "logits/rejected": 0.2626754641532898, "logps/chosen": -367.4986267089844, "logps/rejected": -314.33697509765625, "loss": 0.5753, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.5537876486778259, "rewards/margins": 0.5256361365318298, "rewards/rejected": -1.0794237852096558, "step": 620 }, { "epoch": 0.32975660821774405, "grad_norm": 346.0, "learning_rate": 8.009920290961302e-06, "logits/chosen": 0.18554073572158813, "logits/rejected": 0.0812699943780899, "logps/chosen": -345.83111572265625, "logps/rejected": -332.9136047363281, "loss": 0.549, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.5027406215667725, "rewards/margins": 0.6489895582199097, "rewards/rejected": -1.1517301797866821, "step": 630 }, { "epoch": 0.33499084009421615, "grad_norm": 326.0, "learning_rate": 7.94032718869134e-06, "logits/chosen": 0.1383436620235443, "logits/rejected": 0.1128048524260521, "logps/chosen": -360.80792236328125, "logps/rejected": -321.8601379394531, "loss": 0.5403, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.42300111055374146, "rewards/margins": 0.6382675170898438, "rewards/rejected": -1.0612685680389404, "step": 640 }, { "epoch": 0.3402250719706883, "grad_norm": 270.0, "learning_rate": 7.869852205423738e-06, "logits/chosen": 0.062131352722644806, "logits/rejected": 0.09803047776222229, "logps/chosen": -344.85394287109375, "logps/rejected": -308.65142822265625, "loss": 0.5739, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.47032850980758667, "rewards/margins": 0.5552009344100952, "rewards/rejected": -1.0255295038223267, "step": 650 }, { "epoch": 0.34545930384716045, "grad_norm": 328.0, "learning_rate": 7.798516478448514e-06, "logits/chosen": 0.0817512795329094, "logits/rejected": 0.07857748121023178, "logps/chosen": -365.5155029296875, "logps/rejected": -317.58673095703125, "loss": 0.5679, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.4949292540550232, "rewards/margins": 0.571694016456604, "rewards/rejected": -1.066623330116272, "step": 660 }, { "epoch": 0.35069353572363254, "grad_norm": 272.0, "learning_rate": 7.726341403215237e-06, "logits/chosen": 0.16348454356193542, "logits/rejected": 0.16545890271663666, "logps/chosen": -343.07965087890625, "logps/rejected": -295.33526611328125, "loss": 0.5704, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.5133311152458191, "rewards/margins": 0.6606391668319702, "rewards/rejected": -1.173970341682434, "step": 670 }, { "epoch": 0.3559277676001047, "grad_norm": 288.0, "learning_rate": 7.653348626915957e-06, "logits/chosen": 0.21217510104179382, "logits/rejected": 0.2788509726524353, "logps/chosen": -338.8388977050781, "logps/rejected": -317.30279541015625, "loss": 0.5509, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.48806896805763245, "rewards/margins": 0.6682482957839966, "rewards/rejected": -1.1563172340393066, "step": 680 }, { "epoch": 0.3611619994765768, "grad_norm": 282.0, "learning_rate": 7.5795600419926595e-06, "logits/chosen": 0.3617590069770813, "logits/rejected": 0.3512483537197113, "logps/chosen": -350.26263427734375, "logps/rejected": -308.1812744140625, "loss": 0.5607, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.3443593382835388, "rewards/margins": 0.5753520727157593, "rewards/rejected": -0.9197114109992981, "step": 690 }, { "epoch": 0.36639623135304894, "grad_norm": 258.0, "learning_rate": 7.504997779571134e-06, "logits/chosen": 0.35462698340415955, "logits/rejected": 0.4073667526245117, "logps/chosen": -342.01727294921875, "logps/rejected": -312.5554504394531, "loss": 0.5957, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.38807958364486694, "rewards/margins": 0.4581042230129242, "rewards/rejected": -0.8461838960647583, "step": 700 }, { "epoch": 0.3716304632295211, "grad_norm": 360.0, "learning_rate": 7.429684202823284e-06, "logits/chosen": 0.3464614748954773, "logits/rejected": 0.26026487350463867, "logps/chosen": -367.31829833984375, "logps/rejected": -318.8564453125, "loss": 0.529, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.44659894704818726, "rewards/margins": 0.6956688165664673, "rewards/rejected": -1.1422678232192993, "step": 710 }, { "epoch": 0.3768646951059932, "grad_norm": 320.0, "learning_rate": 7.353641900259823e-06, "logits/chosen": 0.33798351883888245, "logits/rejected": 0.30416375398635864, "logps/chosen": -348.59454345703125, "logps/rejected": -317.1212158203125, "loss": 0.5682, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.6439448595046997, "rewards/margins": 0.6287893056869507, "rewards/rejected": -1.2727340459823608, "step": 720 }, { "epoch": 0.38209892698246534, "grad_norm": 344.0, "learning_rate": 7.276893678955387e-06, "logits/chosen": 0.15998776257038116, "logits/rejected": 0.35106360912323, "logps/chosen": -367.888671875, "logps/rejected": -336.6737060546875, "loss": 0.5989, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.7499735951423645, "rewards/margins": 0.6262648701667786, "rewards/rejected": -1.3762385845184326, "step": 730 }, { "epoch": 0.38733315885893743, "grad_norm": 264.0, "learning_rate": 7.199462557708098e-06, "logits/chosen": 0.1745099276304245, "logits/rejected": 0.2742732763290405, "logps/chosen": -310.071533203125, "logps/rejected": -299.4850769042969, "loss": 0.5774, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6706897020339966, "rewards/margins": 0.5428994297981262, "rewards/rejected": -1.2135891914367676, "step": 740 }, { "epoch": 0.3925673907354096, "grad_norm": 314.0, "learning_rate": 7.1213717601356245e-06, "logits/chosen": 0.1733260601758957, "logits/rejected": 0.1483219712972641, "logps/chosen": -360.5184020996094, "logps/rejected": -322.8306579589844, "loss": 0.5344, "rewards/accuracies": 0.71875, "rewards/chosen": -0.47234034538269043, "rewards/margins": 0.6140186190605164, "rewards/rejected": -1.086358904838562, "step": 750 }, { "epoch": 0.39780162261188173, "grad_norm": 340.0, "learning_rate": 7.042644707709816e-06, "logits/chosen": 0.17112216353416443, "logits/rejected": 0.19962458312511444, "logps/chosen": -351.1370544433594, "logps/rejected": -333.3719177246094, "loss": 0.5823, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.4944635331630707, "rewards/margins": 0.5449696779251099, "rewards/rejected": -1.039433240890503, "step": 760 }, { "epoch": 0.40303585448835383, "grad_norm": 370.0, "learning_rate": 6.963305012731984e-06, "logits/chosen": 0.198240727186203, "logits/rejected": 0.220525860786438, "logps/chosen": -305.16143798828125, "logps/rejected": -299.1192932128906, "loss": 0.6028, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.5226008296012878, "rewards/margins": 0.5330361127853394, "rewards/rejected": -1.055637001991272, "step": 770 }, { "epoch": 0.408270086364826, "grad_norm": 328.0, "learning_rate": 6.8833764712509554e-06, "logits/chosen": 0.19480012357234955, "logits/rejected": 0.2182588279247284, "logps/chosen": -317.003662109375, "logps/rejected": -302.61334228515625, "loss": 0.5657, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.3436005711555481, "rewards/margins": 0.5159841775894165, "rewards/rejected": -0.8595848083496094, "step": 780 }, { "epoch": 0.4135043182412981, "grad_norm": 364.0, "learning_rate": 6.802883055926026e-06, "logits/chosen": 0.15303662419319153, "logits/rejected": 0.21523818373680115, "logps/chosen": -333.7895812988281, "logps/rejected": -296.78851318359375, "loss": 0.5489, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.3119350075721741, "rewards/margins": 0.6673828959465027, "rewards/rejected": -0.979317843914032, "step": 790 }, { "epoch": 0.4187385501177702, "grad_norm": 366.0, "learning_rate": 6.721848908836921e-06, "logits/chosen": 0.11557696759700775, "logits/rejected": 0.14221954345703125, "logps/chosen": -379.16595458984375, "logps/rejected": -320.8966979980469, "loss": 0.5204, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.3791603744029999, "rewards/margins": 0.6833642721176147, "rewards/rejected": -1.0625245571136475, "step": 800 }, { "epoch": 0.4239727819942423, "grad_norm": 264.0, "learning_rate": 6.640298334242959e-06, "logits/chosen": 0.08530505001544952, "logits/rejected": 0.15681490302085876, "logps/chosen": -323.5985107421875, "logps/rejected": -319.4253234863281, "loss": 0.526, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5749447345733643, "rewards/margins": 0.6821728944778442, "rewards/rejected": -1.2571176290512085, "step": 810 }, { "epoch": 0.42920701387071447, "grad_norm": 350.0, "learning_rate": 6.558255791293572e-06, "logits/chosen": 0.0707249790430069, "logits/rejected": 0.15043438971042633, "logps/chosen": -357.5900573730469, "logps/rejected": -328.65118408203125, "loss": 0.5905, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.674008846282959, "rewards/margins": 0.6449601054191589, "rewards/rejected": -1.3189690113067627, "step": 820 }, { "epoch": 0.4344412457471866, "grad_norm": 282.0, "learning_rate": 6.475745886692361e-06, "logits/chosen": 0.1705600768327713, "logits/rejected": 0.156109020113945, "logps/chosen": -352.35650634765625, "logps/rejected": -340.0906677246094, "loss": 0.5481, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.580664873123169, "rewards/margins": 0.7169455289840698, "rewards/rejected": -1.2976105213165283, "step": 830 }, { "epoch": 0.4396754776236587, "grad_norm": 368.0, "learning_rate": 6.392793367316905e-06, "logits/chosen": 0.047196634113788605, "logits/rejected": 0.10491514205932617, "logps/chosen": -344.0426940917969, "logps/rejected": -328.0356140136719, "loss": 0.5316, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.6165002584457397, "rewards/margins": 0.6772734522819519, "rewards/rejected": -1.2937736511230469, "step": 840 }, { "epoch": 0.44490970950013087, "grad_norm": 390.0, "learning_rate": 6.309423112796529e-06, "logits/chosen": 0.08787860721349716, "logits/rejected": 0.29786446690559387, "logps/chosen": -330.8527526855469, "logps/rejected": -336.91888427734375, "loss": 0.5739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8546838760375977, "rewards/margins": 0.6589832305908203, "rewards/rejected": -1.513667106628418, "step": 850 }, { "epoch": 0.45014394137660296, "grad_norm": 400.0, "learning_rate": 6.225660128050248e-06, "logits/chosen": 0.1513369381427765, "logits/rejected": 0.18070130050182343, "logps/chosen": -346.57659912109375, "logps/rejected": -328.84130859375, "loss": 0.5638, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0357505083084106, "rewards/margins": 0.6643354892730713, "rewards/rejected": -1.700085997581482, "step": 860 }, { "epoch": 0.4553781732530751, "grad_norm": 354.0, "learning_rate": 6.141529535787139e-06, "logits/chosen": 0.23875145614147186, "logits/rejected": 0.28748852014541626, "logps/chosen": -382.6456604003906, "logps/rejected": -346.1590270996094, "loss": 0.5211, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7971159219741821, "rewards/margins": 0.7644892930984497, "rewards/rejected": -1.5616052150726318, "step": 870 }, { "epoch": 0.46061240512954726, "grad_norm": 318.0, "learning_rate": 6.057056568971383e-06, "logits/chosen": 0.1365332305431366, "logits/rejected": 0.18043069541454315, "logps/chosen": -365.5246887207031, "logps/rejected": -337.0705261230469, "loss": 0.5237, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.8823005557060242, "rewards/margins": 0.8230516314506531, "rewards/rejected": -1.7053521871566772, "step": 880 }, { "epoch": 0.46584663700601936, "grad_norm": 294.0, "learning_rate": 5.972266563254246e-06, "logits/chosen": 0.33021894097328186, "logits/rejected": 0.26815542578697205, "logps/chosen": -393.06182861328125, "logps/rejected": -348.9358825683594, "loss": 0.5696, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9404589533805847, "rewards/margins": 0.702355682849884, "rewards/rejected": -1.6428148746490479, "step": 890 }, { "epoch": 0.4710808688824915, "grad_norm": 292.0, "learning_rate": 5.887184949375242e-06, "logits/chosen": 0.24816791713237762, "logits/rejected": 0.3371456563472748, "logps/chosen": -343.94952392578125, "logps/rejected": -303.48870849609375, "loss": 0.5451, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5168992877006531, "rewards/margins": 0.7089965343475342, "rewards/rejected": -1.225895881652832, "step": 900 }, { "epoch": 0.4763151007589636, "grad_norm": 256.0, "learning_rate": 5.8018372455348e-06, "logits/chosen": 0.32140612602233887, "logits/rejected": 0.3305002748966217, "logps/chosen": -359.08538818359375, "logps/rejected": -316.7978515625, "loss": 0.5553, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.44215431809425354, "rewards/margins": 0.6605509519577026, "rewards/rejected": -1.1027053594589233, "step": 910 }, { "epoch": 0.48154933263543576, "grad_norm": 268.0, "learning_rate": 5.71624904974069e-06, "logits/chosen": 0.24679967761039734, "logits/rejected": 0.3497712016105652, "logps/chosen": -361.283935546875, "logps/rejected": -338.38153076171875, "loss": 0.5496, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.4841860234737396, "rewards/margins": 0.6799232959747314, "rewards/rejected": -1.1641093492507935, "step": 920 }, { "epoch": 0.48678356451190785, "grad_norm": 250.0, "learning_rate": 5.630446032130498e-06, "logits/chosen": 0.2412446290254593, "logits/rejected": 0.34884771704673767, "logps/chosen": -347.1498107910156, "logps/rejected": -331.9222412109375, "loss": 0.5455, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.5564799904823303, "rewards/margins": 0.6565181016921997, "rewards/rejected": -1.2129981517791748, "step": 930 }, { "epoch": 0.49201779638838, "grad_norm": 352.0, "learning_rate": 5.5444539272724925e-06, "logits/chosen": 0.22467438876628876, "logits/rejected": 0.42467164993286133, "logps/chosen": -348.5015869140625, "logps/rejected": -336.44342041015625, "loss": 0.534, "rewards/accuracies": 0.75, "rewards/chosen": -0.635161280632019, "rewards/margins": 0.7458114624023438, "rewards/rejected": -1.3809726238250732, "step": 940 }, { "epoch": 0.49725202826485215, "grad_norm": 370.0, "learning_rate": 5.458298526447155e-06, "logits/chosen": 0.277851402759552, "logits/rejected": 0.343191534280777, "logps/chosen": -348.0686950683594, "logps/rejected": -311.85992431640625, "loss": 0.6059, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.6322701573371887, "rewards/margins": 0.5943492650985718, "rewards/rejected": -1.2266194820404053, "step": 950 }, { "epoch": 0.5024862601413242, "grad_norm": 376.0, "learning_rate": 5.372005669911694e-06, "logits/chosen": 0.18535420298576355, "logits/rejected": 0.29728174209594727, "logps/chosen": -323.85418701171875, "logps/rejected": -302.52703857421875, "loss": 0.6372, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5875571966171265, "rewards/margins": 0.4703772962093353, "rewards/rejected": -1.0579345226287842, "step": 960 }, { "epoch": 0.5077204920177963, "grad_norm": 318.0, "learning_rate": 5.285601239149875e-06, "logits/chosen": 0.2485879361629486, "logits/rejected": 0.1934640109539032, "logps/chosen": -363.02557373046875, "logps/rejected": -332.2878112792969, "loss": 0.5864, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4688809812068939, "rewards/margins": 0.6029322743415833, "rewards/rejected": -1.0718133449554443, "step": 970 }, { "epoch": 0.5129547238942685, "grad_norm": 352.0, "learning_rate": 5.199111149109498e-06, "logits/chosen": 0.14167314767837524, "logits/rejected": 0.3146267533302307, "logps/chosen": -308.8305969238281, "logps/rejected": -299.807373046875, "loss": 0.5698, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5121305584907532, "rewards/margins": 0.6366390585899353, "rewards/rejected": -1.1487696170806885, "step": 980 }, { "epoch": 0.5181889557707406, "grad_norm": 324.0, "learning_rate": 5.112561340429817e-06, "logits/chosen": 0.30979007482528687, "logits/rejected": 0.24150173366069794, "logps/chosen": -337.63214111328125, "logps/rejected": -298.39111328125, "loss": 0.5672, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5298670530319214, "rewards/margins": 0.603586733341217, "rewards/rejected": -1.1334538459777832, "step": 990 }, { "epoch": 0.5234231876472127, "grad_norm": 320.0, "learning_rate": 5.0259777716612665e-06, "logits/chosen": 0.23533792793750763, "logits/rejected": 0.3287450671195984, "logps/chosen": -365.8298645019531, "logps/rejected": -339.43133544921875, "loss": 0.596, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4760667681694031, "rewards/margins": 0.5520576238632202, "rewards/rejected": -1.028124451637268, "step": 1000 }, { "epoch": 0.528657419523685, "grad_norm": 372.0, "learning_rate": 4.939386411479814e-06, "logits/chosen": 0.29721060395240784, "logits/rejected": 0.3717629909515381, "logps/chosen": -360.3513488769531, "logps/rejected": -350.74090576171875, "loss": 0.5676, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3760520815849304, "rewards/margins": 0.6218014359474182, "rewards/rejected": -0.9978535771369934, "step": 1010 }, { "epoch": 0.533891651400157, "grad_norm": 344.0, "learning_rate": 4.85281323089828e-06, "logits/chosen": 0.34880155324935913, "logits/rejected": 0.32166919112205505, "logps/chosen": -375.96368408203125, "logps/rejected": -341.3938293457031, "loss": 0.6075, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.4991912841796875, "rewards/margins": 0.5173603296279907, "rewards/rejected": -1.0165516138076782, "step": 1020 }, { "epoch": 0.5391258832766291, "grad_norm": 296.0, "learning_rate": 4.766284195476943e-06, "logits/chosen": 0.3899001479148865, "logits/rejected": 0.400698721408844, "logps/chosen": -355.5746154785156, "logps/rejected": -325.11474609375, "loss": 0.5121, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.29699790477752686, "rewards/margins": 0.7675551772117615, "rewards/rejected": -1.0645530223846436, "step": 1030 }, { "epoch": 0.5443601151531012, "grad_norm": 304.0, "learning_rate": 4.679825257535795e-06, "logits/chosen": 0.34135550260543823, "logits/rejected": 0.3182600736618042, "logps/chosen": -349.2203063964844, "logps/rejected": -308.77166748046875, "loss": 0.5413, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.34611597657203674, "rewards/margins": 0.6459987759590149, "rewards/rejected": -0.9921148419380188, "step": 1040 }, { "epoch": 0.5495943470295734, "grad_norm": 370.0, "learning_rate": 4.593462348370759e-06, "logits/chosen": 0.2810625433921814, "logits/rejected": 0.3787192404270172, "logps/chosen": -342.5027770996094, "logps/rejected": -316.65985107421875, "loss": 0.552, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3849152624607086, "rewards/margins": 0.6414963006973267, "rewards/rejected": -1.0264116525650024, "step": 1050 }, { "epoch": 0.5548285789060455, "grad_norm": 276.0, "learning_rate": 4.507221370476223e-06, "logits/chosen": 0.34859079122543335, "logits/rejected": 0.34620124101638794, "logps/chosen": -351.231689453125, "logps/rejected": -333.7611389160156, "loss": 0.5648, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3917597532272339, "rewards/margins": 0.6424742937088013, "rewards/rejected": -1.0342340469360352, "step": 1060 }, { "epoch": 0.5600628107825176, "grad_norm": 217.0, "learning_rate": 4.421128189776195e-06, "logits/chosen": 0.29422345757484436, "logits/rejected": 0.3956177234649658, "logps/chosen": -312.7371826171875, "logps/rejected": -271.85382080078125, "loss": 0.5427, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.39171385765075684, "rewards/margins": 0.6362006664276123, "rewards/rejected": -1.0279145240783691, "step": 1070 }, { "epoch": 0.5652970426589898, "grad_norm": 268.0, "learning_rate": 4.335208627866438e-06, "logits/chosen": 0.43888354301452637, "logits/rejected": 0.47600990533828735, "logps/chosen": -342.0023193359375, "logps/rejected": -299.25982666015625, "loss": 0.5153, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.41604313254356384, "rewards/margins": 0.7133996486663818, "rewards/rejected": -1.1294429302215576, "step": 1080 }, { "epoch": 0.5705312745354619, "grad_norm": 330.0, "learning_rate": 4.249488454269908e-06, "logits/chosen": 0.37367188930511475, "logits/rejected": 0.4846370816230774, "logps/chosen": -356.6741638183594, "logps/rejected": -335.75311279296875, "loss": 0.5415, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5590308904647827, "rewards/margins": 0.6862959861755371, "rewards/rejected": -1.2453268766403198, "step": 1090 }, { "epoch": 0.575765506411934, "grad_norm": 310.0, "learning_rate": 4.163993378707786e-06, "logits/chosen": 0.3371972143650055, "logits/rejected": 0.39444199204444885, "logps/chosen": -321.92254638671875, "logps/rejected": -300.8343505859375, "loss": 0.5644, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5255651473999023, "rewards/margins": 0.709193766117096, "rewards/rejected": -1.2347590923309326, "step": 1100 }, { "epoch": 0.5809997382884062, "grad_norm": 380.0, "learning_rate": 4.0787490433884685e-06, "logits/chosen": 0.3301977813243866, "logits/rejected": 0.4222096800804138, "logps/chosen": -329.1367492675781, "logps/rejected": -303.93341064453125, "loss": 0.5756, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5914889574050903, "rewards/margins": 0.5726789236068726, "rewards/rejected": -1.1641680002212524, "step": 1110 }, { "epoch": 0.5862339701648783, "grad_norm": 340.0, "learning_rate": 3.993781015316802e-06, "logits/chosen": 0.32290878891944885, "logits/rejected": 0.3790258765220642, "logps/chosen": -377.72662353515625, "logps/rejected": -331.83441162109375, "loss": 0.5863, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6014608144760132, "rewards/margins": 0.6531153917312622, "rewards/rejected": -1.2545760869979858, "step": 1120 }, { "epoch": 0.5914682020413504, "grad_norm": 356.0, "learning_rate": 3.909114778625861e-06, "logits/chosen": 0.33370259404182434, "logits/rejected": 0.3154350221157074, "logps/chosen": -382.1673278808594, "logps/rejected": -314.76025390625, "loss": 0.4867, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.493272602558136, "rewards/margins": 0.8312891125679016, "rewards/rejected": -1.3245617151260376, "step": 1130 }, { "epoch": 0.5967024339178225, "grad_norm": 342.0, "learning_rate": 3.824775726933596e-06, "logits/chosen": 0.3804655969142914, "logits/rejected": 0.4335269033908844, "logps/chosen": -347.4833984375, "logps/rejected": -302.07989501953125, "loss": 0.5511, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5972079038619995, "rewards/margins": 0.6958715319633484, "rewards/rejected": -1.2930794954299927, "step": 1140 }, { "epoch": 0.6019366657942947, "grad_norm": 452.0, "learning_rate": 3.7407891557266242e-06, "logits/chosen": 0.28930753469467163, "logits/rejected": 0.3638380169868469, "logps/chosen": -337.60076904296875, "logps/rejected": -327.3893127441406, "loss": 0.5637, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7492964863777161, "rewards/margins": 0.7008451819419861, "rewards/rejected": -1.4501416683197021, "step": 1150 }, { "epoch": 0.6071708976707668, "grad_norm": 340.0, "learning_rate": 3.6571802547734457e-06, "logits/chosen": 0.29493704438209534, "logits/rejected": 0.42343488335609436, "logps/chosen": -345.88421630859375, "logps/rejected": -324.6173095703125, "loss": 0.5322, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.7139222025871277, "rewards/margins": 0.7538946866989136, "rewards/rejected": -1.467816948890686, "step": 1160 }, { "epoch": 0.6124051295472389, "grad_norm": 340.0, "learning_rate": 3.5739741005693807e-06, "logits/chosen": 0.36524444818496704, "logits/rejected": 0.4817792475223541, "logps/chosen": -373.5266418457031, "logps/rejected": -342.4767150878906, "loss": 0.5557, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6850903630256653, "rewards/margins": 0.7452309727668762, "rewards/rejected": -1.430321455001831, "step": 1170 }, { "epoch": 0.6176393614237111, "grad_norm": 288.0, "learning_rate": 3.4911956488154696e-06, "logits/chosen": 0.2990756034851074, "logits/rejected": 0.3051765561103821, "logps/chosen": -341.70721435546875, "logps/rejected": -312.9698791503906, "loss": 0.6205, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.803567111492157, "rewards/margins": 0.5913842916488647, "rewards/rejected": -1.3949514627456665, "step": 1180 }, { "epoch": 0.6228735933001832, "grad_norm": 274.0, "learning_rate": 3.4088697269336045e-06, "logits/chosen": 0.3608161211013794, "logits/rejected": 0.40439486503601074, "logps/chosen": -363.10028076171875, "logps/rejected": -309.46038818359375, "loss": 0.4981, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5898820757865906, "rewards/margins": 0.8492467999458313, "rewards/rejected": -1.4391288757324219, "step": 1190 }, { "epoch": 0.6281078251766553, "grad_norm": 396.0, "learning_rate": 3.3270210266201373e-06, "logits/chosen": 0.39221999049186707, "logits/rejected": 0.44608980417251587, "logps/chosen": -349.74444580078125, "logps/rejected": -325.3342590332031, "loss": 0.5947, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7365375757217407, "rewards/margins": 0.6322949528694153, "rewards/rejected": -1.3688325881958008, "step": 1200 }, { "epoch": 0.6333420570531274, "grad_norm": 270.0, "learning_rate": 3.2456740964401977e-06, "logits/chosen": 0.39257878065109253, "logits/rejected": 0.5652514696121216, "logps/chosen": -351.83795166015625, "logps/rejected": -336.03948974609375, "loss": 0.5676, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.7251110672950745, "rewards/margins": 0.6608349084854126, "rewards/rejected": -1.3859459161758423, "step": 1210 }, { "epoch": 0.6385762889295996, "grad_norm": 352.0, "learning_rate": 3.1648533344649303e-06, "logits/chosen": 0.3172612190246582, "logits/rejected": 0.5099163055419922, "logps/chosen": -338.2564392089844, "logps/rejected": -346.206298828125, "loss": 0.5242, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7247421145439148, "rewards/margins": 0.7186325192451477, "rewards/rejected": -1.4433746337890625, "step": 1220 }, { "epoch": 0.6438105208060717, "grad_norm": 396.0, "learning_rate": 3.084582980953881e-06, "logits/chosen": 0.3695985674858093, "logits/rejected": 0.3975854814052582, "logps/chosen": -386.8086242675781, "logps/rejected": -308.7548828125, "loss": 0.5514, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7079992890357971, "rewards/margins": 0.7007894515991211, "rewards/rejected": -1.4087889194488525, "step": 1230 }, { "epoch": 0.6490447526825438, "grad_norm": 306.0, "learning_rate": 3.0048871110847043e-06, "logits/chosen": 0.36669978499412537, "logits/rejected": 0.3211643695831299, "logps/chosen": -364.41278076171875, "logps/rejected": -323.64862060546875, "loss": 0.507, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.6616480350494385, "rewards/margins": 0.8358721733093262, "rewards/rejected": -1.4975202083587646, "step": 1240 }, { "epoch": 0.654278984559016, "grad_norm": 314.0, "learning_rate": 2.925789627732395e-06, "logits/chosen": 0.30698102712631226, "logits/rejected": 0.35474127531051636, "logps/chosen": -357.7930603027344, "logps/rejected": -327.1461181640625, "loss": 0.5319, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.7104305624961853, "rewards/margins": 0.7990323901176453, "rewards/rejected": -1.5094630718231201, "step": 1250 }, { "epoch": 0.6595132164354881, "grad_norm": 278.0, "learning_rate": 2.8473142543001818e-06, "logits/chosen": 0.3213528096675873, "logits/rejected": 0.38778603076934814, "logps/chosen": -319.82452392578125, "logps/rejected": -299.6809387207031, "loss": 0.5615, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.7967512011528015, "rewards/margins": 0.6921517848968506, "rewards/rejected": -1.4889030456542969, "step": 1260 }, { "epoch": 0.6647474483119602, "grad_norm": 312.0, "learning_rate": 2.7694845276042714e-06, "logits/chosen": 0.3033554255962372, "logits/rejected": 0.31667545437812805, "logps/chosen": -360.24053955078125, "logps/rejected": -331.3898010253906, "loss": 0.5285, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6901473999023438, "rewards/margins": 0.8282734751701355, "rewards/rejected": -1.5184208154678345, "step": 1270 }, { "epoch": 0.6699816801884323, "grad_norm": 356.0, "learning_rate": 2.6923237908145227e-06, "logits/chosen": 0.3856261074542999, "logits/rejected": 0.39977845549583435, "logps/chosen": -324.6197204589844, "logps/rejected": -332.137939453125, "loss": 0.5253, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.6914088726043701, "rewards/margins": 0.7797143459320068, "rewards/rejected": -1.471123218536377, "step": 1280 }, { "epoch": 0.6752159120649045, "grad_norm": 290.0, "learning_rate": 2.615855186453241e-06, "logits/chosen": 0.3395998179912567, "logits/rejected": 0.4570327401161194, "logps/chosen": -357.72625732421875, "logps/rejected": -345.084716796875, "loss": 0.5233, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.630346417427063, "rewards/margins": 0.8085399866104126, "rewards/rejected": -1.438886284828186, "step": 1290 }, { "epoch": 0.6804501439413766, "grad_norm": 366.0, "learning_rate": 2.5401016494541193e-06, "logits/chosen": 0.29590579867362976, "logits/rejected": 0.41916173696517944, "logps/chosen": -340.41949462890625, "logps/rejected": -329.39324951171875, "loss": 0.5494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7232412099838257, "rewards/margins": 0.7066032886505127, "rewards/rejected": -1.429844617843628, "step": 1300 }, { "epoch": 0.6856843758178487, "grad_norm": 420.0, "learning_rate": 2.4650859002834465e-06, "logits/chosen": 0.33335959911346436, "logits/rejected": 0.5272048115730286, "logps/chosen": -347.9224548339844, "logps/rejected": -328.1867980957031, "loss": 0.5436, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6804947853088379, "rewards/margins": 0.6814740896224976, "rewards/rejected": -1.3619688749313354, "step": 1310 }, { "epoch": 0.6909186076943209, "grad_norm": 330.0, "learning_rate": 2.390830438125661e-06, "logits/chosen": 0.2588549256324768, "logits/rejected": 0.298136442899704, "logps/chosen": -352.1745910644531, "logps/rejected": -324.03460693359375, "loss": 0.595, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6992086172103882, "rewards/margins": 0.6043084263801575, "rewards/rejected": -1.3035171031951904, "step": 1320 }, { "epoch": 0.696152839570793, "grad_norm": 266.0, "learning_rate": 2.3173575341352457e-06, "logits/chosen": 0.31691044569015503, "logits/rejected": 0.4209415316581726, "logps/chosen": -344.1717529296875, "logps/rejected": -322.6068115234375, "loss": 0.5849, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.5810431241989136, "rewards/margins": 0.6289807558059692, "rewards/rejected": -1.2100238800048828, "step": 1330 }, { "epoch": 0.7013870714472651, "grad_norm": 316.0, "learning_rate": 2.2446892247570257e-06, "logits/chosen": 0.34166431427001953, "logits/rejected": 0.4205591678619385, "logps/chosen": -349.1050109863281, "logps/rejected": -328.892822265625, "loss": 0.5814, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.6503351926803589, "rewards/margins": 0.6150357723236084, "rewards/rejected": -1.2653712034225464, "step": 1340 }, { "epoch": 0.7066213033237373, "grad_norm": 302.0, "learning_rate": 2.172847305116872e-06, "logits/chosen": 0.3496546149253845, "logits/rejected": 0.339820921421051, "logps/chosen": -345.417236328125, "logps/rejected": -320.3566589355469, "loss": 0.5675, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.5050859451293945, "rewards/margins": 0.6172486543655396, "rewards/rejected": -1.122334599494934, "step": 1350 }, { "epoch": 0.7118555352002094, "grad_norm": 294.0, "learning_rate": 2.1018533224847638e-06, "logits/chosen": 0.36049994826316833, "logits/rejected": 0.3396483063697815, "logps/chosen": -375.68121337890625, "logps/rejected": -331.56939697265625, "loss": 0.5571, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5250564217567444, "rewards/margins": 0.7472478151321411, "rewards/rejected": -1.2723041772842407, "step": 1360 }, { "epoch": 0.7170897670766815, "grad_norm": 264.0, "learning_rate": 2.0317285698122035e-06, "logits/chosen": 0.23286870121955872, "logits/rejected": 0.4170301556587219, "logps/chosen": -337.4573059082031, "logps/rejected": -320.9345397949219, "loss": 0.5347, "rewards/accuracies": 0.71875, "rewards/chosen": -0.49074649810791016, "rewards/margins": 0.73365718126297, "rewards/rejected": -1.2244036197662354, "step": 1370 }, { "epoch": 0.7223239989531536, "grad_norm": 280.0, "learning_rate": 1.962494079345906e-06, "logits/chosen": 0.20548689365386963, "logits/rejected": 0.2943686544895172, "logps/chosen": -381.2185363769531, "logps/rejected": -330.40753173828125, "loss": 0.5545, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.536620557308197, "rewards/margins": 0.7038464546203613, "rewards/rejected": -1.2404670715332031, "step": 1380 }, { "epoch": 0.7275582308296258, "grad_norm": 254.0, "learning_rate": 1.8941706163196676e-06, "logits/chosen": 0.38386040925979614, "logits/rejected": 0.5160520672798157, "logps/chosen": -312.7783203125, "logps/rejected": -307.5871887207031, "loss": 0.5393, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5432096719741821, "rewards/margins": 0.667094349861145, "rewards/rejected": -1.2103040218353271, "step": 1390 }, { "epoch": 0.7327924627060979, "grad_norm": 255.0, "learning_rate": 1.8267786727263426e-06, "logits/chosen": 0.3777836263179779, "logits/rejected": 0.4588887691497803, "logps/chosen": -339.7103576660156, "logps/rejected": -315.67547607421875, "loss": 0.5454, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.5095704793930054, "rewards/margins": 0.6885578632354736, "rewards/rejected": -1.198128342628479, "step": 1400 }, { "epoch": 0.73802669458257, "grad_norm": 249.0, "learning_rate": 1.760338461171755e-06, "logits/chosen": 0.33284902572631836, "logits/rejected": 0.41128548979759216, "logps/chosen": -326.13421630859375, "logps/rejected": -321.0505065917969, "loss": 0.5956, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6211567521095276, "rewards/margins": 0.6012374758720398, "rewards/rejected": -1.2223942279815674, "step": 1410 }, { "epoch": 0.7432609264590422, "grad_norm": 278.0, "learning_rate": 1.6948699088123992e-06, "logits/chosen": 0.3391318917274475, "logits/rejected": 0.35012945532798767, "logps/chosen": -332.51690673828125, "logps/rejected": -305.8824768066406, "loss": 0.5685, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5398066639900208, "rewards/margins": 0.6431677341461182, "rewards/rejected": -1.1829744577407837, "step": 1420 }, { "epoch": 0.7484951583355143, "grad_norm": 342.0, "learning_rate": 1.6303926513787821e-06, "logits/chosen": 0.19832518696784973, "logits/rejected": 0.17641706764698029, "logps/chosen": -337.8302001953125, "logps/rejected": -308.45574951171875, "loss": 0.5347, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5528967976570129, "rewards/margins": 0.7190114855766296, "rewards/rejected": -1.2719082832336426, "step": 1430 }, { "epoch": 0.7537293902119864, "grad_norm": 292.0, "learning_rate": 1.5669260272861426e-06, "logits/chosen": 0.3353267014026642, "logits/rejected": 0.34371891617774963, "logps/chosen": -334.8343200683594, "logps/rejected": -333.1044006347656, "loss": 0.5223, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.514175534248352, "rewards/margins": 0.7716984748840332, "rewards/rejected": -1.2858738899230957, "step": 1440 }, { "epoch": 0.7589636220884585, "grad_norm": 296.0, "learning_rate": 1.5044890718343535e-06, "logits/chosen": 0.3490106463432312, "logits/rejected": 0.2644171118736267, "logps/chosen": -323.67138671875, "logps/rejected": -314.19647216796875, "loss": 0.5748, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6448107957839966, "rewards/margins": 0.6091581583023071, "rewards/rejected": -1.2539689540863037, "step": 1450 }, { "epoch": 0.7641978539649307, "grad_norm": 300.0, "learning_rate": 1.4431005114987485e-06, "logits/chosen": 0.37269195914268494, "logits/rejected": 0.3853556513786316, "logps/chosen": -394.0393371582031, "logps/rejected": -351.55548095703125, "loss": 0.5406, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.4938036799430847, "rewards/margins": 0.7198097109794617, "rewards/rejected": -1.2136132717132568, "step": 1460 }, { "epoch": 0.7694320858414028, "grad_norm": 302.0, "learning_rate": 1.3827787583135533e-06, "logits/chosen": 0.2608596086502075, "logits/rejected": 0.3895898461341858, "logps/chosen": -346.5641784667969, "logps/rejected": -333.06097412109375, "loss": 0.5955, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5579820871353149, "rewards/margins": 0.6600214838981628, "rewards/rejected": -1.218003511428833, "step": 1470 }, { "epoch": 0.7746663177178749, "grad_norm": 384.0, "learning_rate": 1.3235419043496362e-06, "logits/chosen": 0.4145224094390869, "logits/rejected": 0.5047595500946045, "logps/chosen": -339.38018798828125, "logps/rejected": -315.99114990234375, "loss": 0.5873, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5417883992195129, "rewards/margins": 0.6276899576187134, "rewards/rejected": -1.169478178024292, "step": 1480 }, { "epoch": 0.7799005495943471, "grad_norm": 308.0, "learning_rate": 1.2654077162882271e-06, "logits/chosen": 0.3089558482170105, "logits/rejected": 0.33582228422164917, "logps/chosen": -344.6625671386719, "logps/rejected": -318.9618835449219, "loss": 0.5562, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.5232676863670349, "rewards/margins": 0.7178744077682495, "rewards/rejected": -1.2411420345306396, "step": 1490 }, { "epoch": 0.7851347814708192, "grad_norm": 290.0, "learning_rate": 1.2083936300922238e-06, "logits/chosen": 0.45748963952064514, "logits/rejected": 0.5170606374740601, "logps/chosen": -360.7559814453125, "logps/rejected": -332.96112060546875, "loss": 0.59, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5006940364837646, "rewards/margins": 0.6657803058624268, "rewards/rejected": -1.1664743423461914, "step": 1500 }, { "epoch": 0.7903690133472913, "grad_norm": 242.0, "learning_rate": 1.1525167457766856e-06, "logits/chosen": 0.33311501145362854, "logits/rejected": 0.3369537889957428, "logps/chosen": -335.03680419921875, "logps/rejected": -313.3392333984375, "loss": 0.5675, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5090693831443787, "rewards/margins": 0.669906497001648, "rewards/rejected": -1.1789758205413818, "step": 1510 }, { "epoch": 0.7956032452237635, "grad_norm": 334.0, "learning_rate": 1.0977938222801004e-06, "logits/chosen": 0.36881986260414124, "logits/rejected": 0.45787104964256287, "logps/chosen": -338.68902587890625, "logps/rejected": -313.40631103515625, "loss": 0.5784, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.5590661764144897, "rewards/margins": 0.6329992413520813, "rewards/rejected": -1.1920652389526367, "step": 1520 }, { "epoch": 0.8008374771002356, "grad_norm": 322.0, "learning_rate": 1.0442412724379365e-06, "logits/chosen": 0.26200932264328003, "logits/rejected": 0.27576130628585815, "logps/chosen": -344.83148193359375, "logps/rejected": -293.1750793457031, "loss": 0.5813, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.629385769367218, "rewards/margins": 0.5875921845436096, "rewards/rejected": -1.216977834701538, "step": 1530 }, { "epoch": 0.8060717089767077, "grad_norm": 286.0, "learning_rate": 9.9187515806e-07, "logits/chosen": 0.44481024146080017, "logits/rejected": 0.47209352254867554, "logps/chosen": -366.44891357421875, "logps/rejected": -317.8438415527344, "loss": 0.565, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4901696741580963, "rewards/margins": 0.6533368825912476, "rewards/rejected": -1.1435067653656006, "step": 1540 }, { "epoch": 0.8113059408531798, "grad_norm": 282.0, "learning_rate": 9.407111851130879e-07, "logits/chosen": 0.43470582365989685, "logits/rejected": 0.3668103814125061, "logps/chosen": -337.2115783691406, "logps/rejected": -318.40155029296875, "loss": 0.5212, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.49172383546829224, "rewards/margins": 0.7541629672050476, "rewards/rejected": -1.2458868026733398, "step": 1550 }, { "epoch": 0.816540172729652, "grad_norm": 298.0, "learning_rate": 8.907646990103496e-07, "logits/chosen": 0.37122786045074463, "logits/rejected": 0.4722965657711029, "logps/chosen": -329.2878723144531, "logps/rejected": -302.6796569824219, "loss": 0.533, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5332542657852173, "rewards/margins": 0.6741858720779419, "rewards/rejected": -1.2074401378631592, "step": 1560 }, { "epoch": 0.821774404606124, "grad_norm": 234.0, "learning_rate": 8.42050680008798e-07, "logits/chosen": 0.2228083610534668, "logits/rejected": 0.2700883150100708, "logps/chosen": -343.5450744628906, "logps/rejected": -327.5457458496094, "loss": 0.561, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5294641256332397, "rewards/margins": 0.6504173278808594, "rewards/rejected": -1.1798814535140991, "step": 1570 }, { "epoch": 0.8270086364825961, "grad_norm": 322.0, "learning_rate": 7.945837387163424e-07, "logits/chosen": 0.42028242349624634, "logits/rejected": 0.4180065095424652, "logps/chosen": -353.06536865234375, "logps/rejected": -321.9817810058594, "loss": 0.5844, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.5255261659622192, "rewards/margins": 0.6547245979309082, "rewards/rejected": -1.1802507638931274, "step": 1580 }, { "epoch": 0.8322428683590684, "grad_norm": 380.0, "learning_rate": 7.483781117096828e-07, "logits/chosen": 0.38973018527030945, "logits/rejected": 0.43399643898010254, "logps/chosen": -370.450439453125, "logps/rejected": -342.24395751953125, "loss": 0.547, "rewards/accuracies": 0.6875, "rewards/chosen": -0.527172327041626, "rewards/margins": 0.7201731204986572, "rewards/rejected": -1.2473453283309937, "step": 1590 }, { "epoch": 0.8374771002355405, "grad_norm": 372.0, "learning_rate": 7.034476572643855e-07, "logits/chosen": 0.4103736877441406, "logits/rejected": 0.42073503136634827, "logps/chosen": -348.49932861328125, "logps/rejected": -322.942626953125, "loss": 0.5787, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5726643800735474, "rewards/margins": 0.6367040872573853, "rewards/rejected": -1.2093684673309326, "step": 1600 }, { "epoch": 0.8427113321120125, "grad_norm": 324.0, "learning_rate": 6.598058511984307e-07, "logits/chosen": 0.4105502665042877, "logits/rejected": 0.4338196814060211, "logps/chosen": -334.387451171875, "logps/rejected": -300.1074523925781, "loss": 0.556, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.640805721282959, "rewards/margins": 0.6867104768753052, "rewards/rejected": -1.3275163173675537, "step": 1610 }, { "epoch": 0.8479455639884846, "grad_norm": 246.0, "learning_rate": 6.174657828304543e-07, "logits/chosen": 0.3273460268974304, "logits/rejected": 0.38331374526023865, "logps/chosen": -332.36773681640625, "logps/rejected": -316.30072021484375, "loss": 0.6104, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.5995886325836182, "rewards/margins": 0.5190034508705139, "rewards/rejected": -1.1185920238494873, "step": 1620 }, { "epoch": 0.8531797958649568, "grad_norm": 256.0, "learning_rate": 5.764401510539253e-07, "logits/chosen": 0.39274919033050537, "logits/rejected": 0.30704575777053833, "logps/chosen": -358.00335693359375, "logps/rejected": -301.65203857421875, "loss": 0.5616, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.5209869742393494, "rewards/margins": 0.6791882514953613, "rewards/rejected": -1.2001752853393555, "step": 1630 }, { "epoch": 0.8584140277414289, "grad_norm": 230.0, "learning_rate": 5.36741260528415e-07, "logits/chosen": 0.2753371000289917, "logits/rejected": 0.3843556344509125, "logps/chosen": -372.30059814453125, "logps/rejected": -354.3190002441406, "loss": 0.4937, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.4627212584018707, "rewards/margins": 0.8342711329460144, "rewards/rejected": -1.296992540359497, "step": 1640 }, { "epoch": 0.863648259617901, "grad_norm": 253.0, "learning_rate": 4.98381017989103e-07, "logits/chosen": 0.25747784972190857, "logits/rejected": 0.3317343294620514, "logps/chosen": -345.8427734375, "logps/rejected": -307.8304748535156, "loss": 0.5145, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.4770580232143402, "rewards/margins": 0.7587519884109497, "rewards/rejected": -1.2358100414276123, "step": 1650 }, { "epoch": 0.8688824914943732, "grad_norm": 274.0, "learning_rate": 4.6137092867564127e-07, "logits/chosen": 0.3641647398471832, "logits/rejected": 0.42129549384117126, "logps/chosen": -317.44012451171875, "logps/rejected": -299.4899597167969, "loss": 0.5478, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5020959377288818, "rewards/margins": 0.6763127446174622, "rewards/rejected": -1.1784086227416992, "step": 1660 }, { "epoch": 0.8741167233708453, "grad_norm": 253.0, "learning_rate": 4.2572209288143095e-07, "logits/chosen": 0.35885730385780334, "logits/rejected": 0.3501403331756592, "logps/chosen": -347.42071533203125, "logps/rejected": -318.55279541015625, "loss": 0.5766, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.5235342383384705, "rewards/margins": 0.6213586926460266, "rewards/rejected": -1.144892930984497, "step": 1670 }, { "epoch": 0.8793509552473174, "grad_norm": 232.0, "learning_rate": 3.9144520262435094e-07, "logits/chosen": 0.35745617747306824, "logits/rejected": 0.4186561703681946, "logps/chosen": -373.4577941894531, "logps/rejected": -316.75909423828125, "loss": 0.5036, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.39865627884864807, "rewards/margins": 0.827733039855957, "rewards/rejected": -1.2263892889022827, "step": 1680 }, { "epoch": 0.8845851871237895, "grad_norm": 348.0, "learning_rate": 3.5855053843994625e-07, "logits/chosen": 0.3469906747341156, "logits/rejected": 0.3383873999118805, "logps/chosen": -330.7817687988281, "logps/rejected": -343.49249267578125, "loss": 0.5905, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5502803921699524, "rewards/margins": 0.5968191623687744, "rewards/rejected": -1.1470996141433716, "step": 1690 }, { "epoch": 0.8898194190002617, "grad_norm": 290.0, "learning_rate": 3.270479662980247e-07, "logits/chosen": 0.4720439314842224, "logits/rejected": 0.507573127746582, "logps/chosen": -340.6956787109375, "logps/rejected": -328.48541259765625, "loss": 0.5653, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5593769550323486, "rewards/margins": 0.6956008672714233, "rewards/rejected": -1.254977822303772, "step": 1700 }, { "epoch": 0.8950536508767338, "grad_norm": 320.0, "learning_rate": 2.9694693464359434e-07, "logits/chosen": 0.36417311429977417, "logits/rejected": 0.32077834010124207, "logps/chosen": -358.11871337890625, "logps/rejected": -348.39031982421875, "loss": 0.55, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.5245743989944458, "rewards/margins": 0.7115057706832886, "rewards/rejected": -1.2360801696777344, "step": 1710 }, { "epoch": 0.9002878827532059, "grad_norm": 302.0, "learning_rate": 2.682564715630287e-07, "logits/chosen": 0.33791905641555786, "logits/rejected": 0.3675960600376129, "logps/chosen": -351.0929870605469, "logps/rejected": -325.4019775390625, "loss": 0.5036, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5013250112533569, "rewards/margins": 0.8183242678642273, "rewards/rejected": -1.319649338722229, "step": 1720 }, { "epoch": 0.9055221146296781, "grad_norm": 384.0, "learning_rate": 2.4098518207630706e-07, "logits/chosen": 0.38433754444122314, "logits/rejected": 0.397840678691864, "logps/chosen": -345.4227294921875, "logps/rejected": -300.4522399902344, "loss": 0.5736, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.558509886264801, "rewards/margins": 0.6275274157524109, "rewards/rejected": -1.186037302017212, "step": 1730 }, { "epoch": 0.9107563465061502, "grad_norm": 374.0, "learning_rate": 2.1514124555614412e-07, "logits/chosen": 0.2413160502910614, "logits/rejected": 0.3199203610420227, "logps/chosen": -373.3514099121094, "logps/rejected": -336.98895263671875, "loss": 0.5558, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5669843554496765, "rewards/margins": 0.670451283454895, "rewards/rejected": -1.2374355792999268, "step": 1740 }, { "epoch": 0.9159905783826223, "grad_norm": 274.0, "learning_rate": 1.9073241327478287e-07, "logits/chosen": 0.2592464089393616, "logits/rejected": 0.2286282330751419, "logps/chosen": -335.2867736816406, "logps/rejected": -296.50677490234375, "loss": 0.5734, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.5148959159851074, "rewards/margins": 0.5882579684257507, "rewards/rejected": -1.103153944015503, "step": 1750 }, { "epoch": 0.9212248102590945, "grad_norm": 348.0, "learning_rate": 1.677660060791836e-07, "logits/chosen": 0.35103827714920044, "logits/rejected": 0.36917150020599365, "logps/chosen": -350.96551513671875, "logps/rejected": -313.03997802734375, "loss": 0.5246, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4882447123527527, "rewards/margins": 0.7595881819725037, "rewards/rejected": -1.247833013534546, "step": 1760 }, { "epoch": 0.9264590421355666, "grad_norm": 286.0, "learning_rate": 1.4624891219531256e-07, "logits/chosen": 0.3121943771839142, "logits/rejected": 0.3472541272640228, "logps/chosen": -350.4696044921875, "logps/rejected": -318.84222412109375, "loss": 0.5622, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5181047320365906, "rewards/margins": 0.6555663347244263, "rewards/rejected": -1.173671007156372, "step": 1770 }, { "epoch": 0.9316932740120387, "grad_norm": 251.0, "learning_rate": 1.2618758516218187e-07, "logits/chosen": 0.38791024684906006, "logits/rejected": 0.38494163751602173, "logps/chosen": -309.5002746582031, "logps/rejected": -289.1304016113281, "loss": 0.5638, "rewards/accuracies": 0.703125, "rewards/chosen": -0.49522829055786133, "rewards/margins": 0.622164249420166, "rewards/rejected": -1.1173925399780273, "step": 1780 }, { "epoch": 0.9369275058885108, "grad_norm": 336.0, "learning_rate": 1.0758804189626492e-07, "logits/chosen": 0.34429654479026794, "logits/rejected": 0.43823686242103577, "logps/chosen": -337.650146484375, "logps/rejected": -311.02020263671875, "loss": 0.5783, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5348609685897827, "rewards/margins": 0.6113255620002747, "rewards/rejected": -1.1461864709854126, "step": 1790 }, { "epoch": 0.942161737764983, "grad_norm": 360.0, "learning_rate": 9.045586088686497e-08, "logits/chosen": 0.35523998737335205, "logits/rejected": 0.29251137375831604, "logps/chosen": -362.7002868652344, "logps/rejected": -318.9178161621094, "loss": 0.5547, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5227453112602234, "rewards/margins": 0.6950558423995972, "rewards/rejected": -1.2178010940551758, "step": 1800 }, { "epoch": 0.9473959696414551, "grad_norm": 370.0, "learning_rate": 7.479618052298132e-08, "logits/chosen": 0.46711522340774536, "logits/rejected": 0.34828323125839233, "logps/chosen": -365.4064636230469, "logps/rejected": -346.60321044921875, "loss": 0.5483, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5453445315361023, "rewards/margins": 0.6945411562919617, "rewards/rejected": -1.2398855686187744, "step": 1810 }, { "epoch": 0.9526302015179272, "grad_norm": 272.0, "learning_rate": 6.06136975521715e-08, "logits/chosen": 0.21810802817344666, "logits/rejected": 0.33456215262413025, "logps/chosen": -362.4671630859375, "logps/rejected": -329.0609436035156, "loss": 0.5389, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.5769203305244446, "rewards/margins": 0.7337585687637329, "rewards/rejected": -1.3106788396835327, "step": 1820 }, { "epoch": 0.9578644333943994, "grad_norm": 280.0, "learning_rate": 4.7912665671874246e-08, "logits/chosen": 0.2832115590572357, "logits/rejected": 0.31732824444770813, "logps/chosen": -343.995361328125, "logps/rejected": -318.9495544433594, "loss": 0.5587, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5360749959945679, "rewards/margins": 0.6776861548423767, "rewards/rejected": -1.2137610912322998, "step": 1830 }, { "epoch": 0.9630986652708715, "grad_norm": 334.0, "learning_rate": 3.669689425361444e-08, "logits/chosen": 0.32280421257019043, "logits/rejected": 0.36163073778152466, "logps/chosen": -317.0058288574219, "logps/rejected": -308.7008972167969, "loss": 0.5632, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.4824526906013489, "rewards/margins": 0.6204373240470886, "rewards/rejected": -1.102890133857727, "step": 1840 }, { "epoch": 0.9683328971473436, "grad_norm": 368.0, "learning_rate": 2.6969747200472073e-08, "logits/chosen": 0.378648579120636, "logits/rejected": 0.5615746378898621, "logps/chosen": -327.52935791015625, "logps/rejected": -317.4973449707031, "loss": 0.588, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.559417188167572, "rewards/margins": 0.6587087512016296, "rewards/rejected": -1.2181260585784912, "step": 1850 }, { "epoch": 0.9735671290238157, "grad_norm": 340.0, "learning_rate": 1.873414193816092e-08, "logits/chosen": 0.3898963928222656, "logits/rejected": 0.37868732213974, "logps/chosen": -372.6197814941406, "logps/rejected": -348.3377380371094, "loss": 0.5311, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.5276554226875305, "rewards/margins": 0.7353495359420776, "rewards/rejected": -1.2630048990249634, "step": 1860 }, { "epoch": 0.9788013609002879, "grad_norm": 422.0, "learning_rate": 1.1992548540016858e-08, "logits/chosen": 0.31078967452049255, "logits/rejected": 0.2968718707561493, "logps/chosen": -372.1559143066406, "logps/rejected": -341.91424560546875, "loss": 0.5663, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.5914555788040161, "rewards/margins": 0.6402640342712402, "rewards/rejected": -1.2317196130752563, "step": 1870 }, { "epoch": 0.98403559277676, "grad_norm": 306.0, "learning_rate": 6.746988986156e-09, "logits/chosen": 0.37066927552223206, "logits/rejected": 0.4628186821937561, "logps/chosen": -331.226318359375, "logps/rejected": -304.57989501953125, "loss": 0.5334, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5163390636444092, "rewards/margins": 0.7293740510940552, "rewards/rejected": -1.245713233947754, "step": 1880 }, { "epoch": 0.9892698246532321, "grad_norm": 326.0, "learning_rate": 2.9990365570314874e-09, "logits/chosen": 0.37458479404449463, "logits/rejected": 0.33924776315689087, "logps/chosen": -363.9075012207031, "logps/rejected": -337.91070556640625, "loss": 0.5197, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.5259437561035156, "rewards/margins": 0.7279617190361023, "rewards/rejected": -1.2539054155349731, "step": 1890 }, { "epoch": 0.9945040565297043, "grad_norm": 338.0, "learning_rate": 7.498153615653758e-10, "logits/chosen": 0.327511191368103, "logits/rejected": 0.3362283408641815, "logps/chosen": -310.9585266113281, "logps/rejected": -305.3989562988281, "loss": 0.5745, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5321878790855408, "rewards/margins": 0.6392725706100464, "rewards/rejected": -1.171460509300232, "step": 1900 }, { "epoch": 0.9997382884061764, "grad_norm": 304.0, "learning_rate": 0.0, "logits/chosen": 0.31302136182785034, "logits/rejected": 0.31303030252456665, "logps/chosen": -355.60076904296875, "logps/rejected": -323.382568359375, "loss": 0.5713, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5852067470550537, "rewards/margins": 0.5970735549926758, "rewards/rejected": -1.1822803020477295, "step": 1910 } ], "logging_steps": 10, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }