diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,2898 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997382884061764, + "eval_steps": 500, + "global_step": 1910, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005234231876472127, + "grad_norm": 312.0, + "learning_rate": 1.0416666666666667e-06, + "logits/chosen": 0.665995180606842, + "logits/rejected": 0.7168087959289551, + "logps/chosen": -331.14556884765625, + "logps/rejected": -289.13482666015625, + "loss": 0.6929, + "rewards/accuracies": 0.40312498807907104, + "rewards/chosen": -0.0007204435532912612, + "rewards/margins": 0.0008210704545490444, + "rewards/rejected": -0.0015415140660479665, + "step": 10 + }, + { + "epoch": 0.010468463752944255, + "grad_norm": 284.0, + "learning_rate": 2.0833333333333334e-06, + "logits/chosen": 0.7776141166687012, + "logits/rejected": 0.7684425115585327, + "logps/chosen": -357.8346862792969, + "logps/rejected": -317.8344421386719, + "loss": 0.6936, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0012695642653852701, + "rewards/margins": -0.0004827965167351067, + "rewards/rejected": -0.0007867676904425025, + "step": 20 + }, + { + "epoch": 0.015702695629416383, + "grad_norm": 322.0, + "learning_rate": 3.125e-06, + "logits/chosen": 0.795623779296875, + "logits/rejected": 0.8778733015060425, + "logps/chosen": -350.8582458496094, + "logps/rejected": -318.2168884277344, + "loss": 0.6904, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0022780809085816145, + "rewards/margins": 0.0060666268691420555, + "rewards/rejected": -0.008344708010554314, + "step": 30 + }, + { + "epoch": 0.02093692750588851, + "grad_norm": 298.0, + "learning_rate": 4.166666666666667e-06, + "logits/chosen": 0.7740770578384399, + "logits/rejected": 0.8167956471443176, + "logps/chosen": -319.42022705078125, + "logps/rejected": -278.17071533203125, + "loss": 0.681, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.006301078014075756, + "rewards/margins": 0.026611831039190292, + "rewards/rejected": -0.02031075581908226, + "step": 40 + }, + { + "epoch": 0.02617115938236064, + "grad_norm": 266.0, + "learning_rate": 5.208333333333334e-06, + "logits/chosen": 0.6866067051887512, + "logits/rejected": 0.7555549144744873, + "logps/chosen": -320.47479248046875, + "logps/rejected": -284.809814453125, + "loss": 0.6727, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.005391906015574932, + "rewards/margins": 0.04711990803480148, + "rewards/rejected": -0.05251181870698929, + "step": 50 + }, + { + "epoch": 0.031405391258832765, + "grad_norm": 288.0, + "learning_rate": 6.25e-06, + "logits/chosen": 0.6783124208450317, + "logits/rejected": 0.7343226671218872, + "logps/chosen": -337.063232421875, + "logps/rejected": -299.95220947265625, + "loss": 0.6656, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.011515669524669647, + "rewards/margins": 0.07207117229700089, + "rewards/rejected": -0.08358683437108994, + "step": 60 + }, + { + "epoch": 0.036639623135304895, + "grad_norm": 262.0, + "learning_rate": 7.291666666666667e-06, + "logits/chosen": 0.8541976809501648, + "logits/rejected": 0.9135104417800903, + "logps/chosen": -324.9010925292969, + "logps/rejected": -313.7442321777344, + "loss": 0.6576, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.033813267946243286, + "rewards/margins": 0.1023920327425003, + "rewards/rejected": -0.1362052857875824, + "step": 70 + }, + { + "epoch": 0.04187385501177702, + "grad_norm": 282.0, + "learning_rate": 8.333333333333334e-06, + "logits/chosen": 0.8204466700553894, + "logits/rejected": 0.9971317052841187, + "logps/chosen": -309.83648681640625, + "logps/rejected": -280.83837890625, + "loss": 0.649, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.051890332251787186, + "rewards/margins": 0.13043811917304993, + "rewards/rejected": -0.18232843279838562, + "step": 80 + }, + { + "epoch": 0.04710808688824915, + "grad_norm": 290.0, + "learning_rate": 9.375000000000001e-06, + "logits/chosen": 0.8248814344406128, + "logits/rejected": 0.798936128616333, + "logps/chosen": -330.08660888671875, + "logps/rejected": -278.1947937011719, + "loss": 0.6422, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.07527975738048553, + "rewards/margins": 0.17323294281959534, + "rewards/rejected": -0.24851271510124207, + "step": 90 + }, + { + "epoch": 0.05234231876472128, + "grad_norm": 234.0, + "learning_rate": 9.999880027023295e-06, + "logits/chosen": 0.5035718083381653, + "logits/rejected": 0.6347559094429016, + "logps/chosen": -328.1427917480469, + "logps/rejected": -298.84197998046875, + "loss": 0.6071, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.14533154666423798, + "rewards/margins": 0.28229671716690063, + "rewards/rejected": -0.4276282787322998, + "step": 100 + }, + { + "epoch": 0.05757655064119341, + "grad_norm": 246.0, + "learning_rate": 9.998530397154684e-06, + "logits/chosen": 0.5344328284263611, + "logits/rejected": 0.6689058542251587, + "logps/chosen": -325.33978271484375, + "logps/rejected": -314.258544921875, + "loss": 0.6253, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.20693711936473846, + "rewards/margins": 0.24847209453582764, + "rewards/rejected": -0.4554091989994049, + "step": 110 + }, + { + "epoch": 0.06281078251766553, + "grad_norm": 284.0, + "learning_rate": 9.995681577335256e-06, + "logits/chosen": 0.4409152865409851, + "logits/rejected": 0.5330216288566589, + "logps/chosen": -340.72930908203125, + "logps/rejected": -319.43524169921875, + "loss": 0.6163, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3839780390262604, + "rewards/margins": 0.3311913013458252, + "rewards/rejected": -0.7151693105697632, + "step": 120 + }, + { + "epoch": 0.06804501439413765, + "grad_norm": 446.0, + "learning_rate": 9.99133442200056e-06, + "logits/chosen": 0.30335044860839844, + "logits/rejected": 0.46630460023880005, + "logps/chosen": -353.91961669921875, + "logps/rejected": -310.37689208984375, + "loss": 0.6173, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.37794750928878784, + "rewards/margins": 0.34848180413246155, + "rewards/rejected": -0.7264293432235718, + "step": 130 + }, + { + "epoch": 0.07327924627060979, + "grad_norm": 294.0, + "learning_rate": 9.985490234976132e-06, + "logits/chosen": 0.45180901885032654, + "logits/rejected": 0.5098147392272949, + "logps/chosen": -345.41558837890625, + "logps/rejected": -291.6056823730469, + "loss": 0.5936, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4161813259124756, + "rewards/margins": 0.40652480721473694, + "rewards/rejected": -0.8227061033248901, + "step": 140 + }, + { + "epoch": 0.07851347814708191, + "grad_norm": 270.0, + "learning_rate": 9.978150769086457e-06, + "logits/chosen": 0.40853095054626465, + "logits/rejected": 0.5322204828262329, + "logps/chosen": -341.7745666503906, + "logps/rejected": -305.63043212890625, + "loss": 0.6364, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5094475746154785, + "rewards/margins": 0.3103070855140686, + "rewards/rejected": -0.8197546005249023, + "step": 150 + }, + { + "epoch": 0.08374771002355404, + "grad_norm": 294.0, + "learning_rate": 9.96931822562924e-06, + "logits/chosen": 0.33679673075675964, + "logits/rejected": 0.4187542498111725, + "logps/chosen": -353.4910888671875, + "logps/rejected": -340.1171875, + "loss": 0.6111, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.3925863802433014, + "rewards/margins": 0.3052482306957245, + "rewards/rejected": -0.6978346109390259, + "step": 160 + }, + { + "epoch": 0.08898194190002617, + "grad_norm": 288.0, + "learning_rate": 9.958995253715193e-06, + "logits/chosen": 0.36519330739974976, + "logits/rejected": 0.3689248263835907, + "logps/chosen": -358.2540283203125, + "logps/rejected": -317.26116943359375, + "loss": 0.6105, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.3615415096282959, + "rewards/margins": 0.3146303594112396, + "rewards/rejected": -0.6761718988418579, + "step": 170 + }, + { + "epoch": 0.0942161737764983, + "grad_norm": 316.0, + "learning_rate": 9.947184949473478e-06, + "logits/chosen": 0.30113479495048523, + "logits/rejected": 0.36322420835494995, + "logps/chosen": -344.6726379394531, + "logps/rejected": -300.2312316894531, + "loss": 0.5862, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5419459342956543, + "rewards/margins": 0.40768465399742126, + "rewards/rejected": -0.949630618095398, + "step": 180 + }, + { + "epoch": 0.09945040565297043, + "grad_norm": 322.0, + "learning_rate": 9.933890855123114e-06, + "logits/chosen": 0.16772204637527466, + "logits/rejected": 0.20316286385059357, + "logps/chosen": -375.6507263183594, + "logps/rejected": -358.7842102050781, + "loss": 0.6266, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.9694031476974487, + "rewards/margins": 0.3656802773475647, + "rewards/rejected": -1.3350833654403687, + "step": 190 + }, + { + "epoch": 0.10468463752944256, + "grad_norm": 324.0, + "learning_rate": 9.919116957910566e-06, + "logits/chosen": 0.14172935485839844, + "logits/rejected": 0.11142061650753021, + "logps/chosen": -349.0318298339844, + "logps/rejected": -289.46453857421875, + "loss": 0.5972, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6366950273513794, + "rewards/margins": 0.4041665494441986, + "rewards/rejected": -1.0408614873886108, + "step": 200 + }, + { + "epoch": 0.10991886940591468, + "grad_norm": 276.0, + "learning_rate": 9.902867688913869e-06, + "logits/chosen": 0.3469844162464142, + "logits/rejected": 0.3956855535507202, + "logps/chosen": -365.18756103515625, + "logps/rejected": -319.1282958984375, + "loss": 0.5693, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8478742837905884, + "rewards/margins": 0.47847065329551697, + "rewards/rejected": -1.3263448476791382, + "step": 210 + }, + { + "epoch": 0.11515310128238682, + "grad_norm": 304.0, + "learning_rate": 9.885147921713621e-06, + "logits/chosen": 0.19320572912693024, + "logits/rejected": 0.27649644017219543, + "logps/chosen": -338.0028076171875, + "logps/rejected": -322.33349609375, + "loss": 0.5867, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8439861536026001, + "rewards/margins": 0.49753037095069885, + "rewards/rejected": -1.3415164947509766, + "step": 220 + }, + { + "epoch": 0.12038733315885894, + "grad_norm": 256.0, + "learning_rate": 9.865962970931287e-06, + "logits/chosen": 0.397473007440567, + "logits/rejected": 0.41920894384384155, + "logps/chosen": -357.3067321777344, + "logps/rejected": -311.8251037597656, + "loss": 0.588, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2804175317287445, + "rewards/margins": 0.4174894690513611, + "rewards/rejected": -0.6979071497917175, + "step": 230 + }, + { + "epoch": 0.12562156503533106, + "grad_norm": 280.0, + "learning_rate": 9.845318590635186e-06, + "logits/chosen": 0.4800747036933899, + "logits/rejected": 0.6408789753913879, + "logps/chosen": -353.8960876464844, + "logps/rejected": -311.86163330078125, + "loss": 0.5746, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.3724919259548187, + "rewards/margins": 0.5004433989524841, + "rewards/rejected": -0.8729352951049805, + "step": 240 + }, + { + "epoch": 0.13085579691180318, + "grad_norm": 294.0, + "learning_rate": 9.823220972614712e-06, + "logits/chosen": 0.3700530230998993, + "logits/rejected": 0.4417282044887543, + "logps/chosen": -369.8304443359375, + "logps/rejected": -302.2646789550781, + "loss": 0.573, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5568550825119019, + "rewards/margins": 0.5210874080657959, + "rewards/rejected": -1.0779423713684082, + "step": 250 + }, + { + "epoch": 0.1360900287882753, + "grad_norm": 312.0, + "learning_rate": 9.79967674452324e-06, + "logits/chosen": 0.3898628354072571, + "logits/rejected": 0.4610047936439514, + "logps/chosen": -347.03118896484375, + "logps/rejected": -336.3883056640625, + "loss": 0.593, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.400566428899765, + "rewards/margins": 0.50266432762146, + "rewards/rejected": -0.9032306671142578, + "step": 260 + }, + { + "epoch": 0.14132426066474746, + "grad_norm": 294.0, + "learning_rate": 9.774692967890332e-06, + "logits/chosen": 0.17694059014320374, + "logits/rejected": 0.22063255310058594, + "logps/chosen": -356.22650146484375, + "logps/rejected": -323.39617919921875, + "loss": 0.5893, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.3973694443702698, + "rewards/margins": 0.4625066816806793, + "rewards/rejected": -0.8598760366439819, + "step": 270 + }, + { + "epoch": 0.14655849254121958, + "grad_norm": 310.0, + "learning_rate": 9.74827713600379e-06, + "logits/chosen": 0.2775232195854187, + "logits/rejected": 0.38801032304763794, + "logps/chosen": -316.23944091796875, + "logps/rejected": -285.94964599609375, + "loss": 0.6144, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6175938844680786, + "rewards/margins": 0.40199989080429077, + "rewards/rejected": -1.0195937156677246, + "step": 280 + }, + { + "epoch": 0.1517927244176917, + "grad_norm": 272.0, + "learning_rate": 9.720437171662232e-06, + "logits/chosen": 0.39185574650764465, + "logits/rejected": 0.48235875368118286, + "logps/chosen": -336.41375732421875, + "logps/rejected": -312.65216064453125, + "loss": 0.5847, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.23143287003040314, + "rewards/margins": 0.4122442305088043, + "rewards/rejected": -0.6436771154403687, + "step": 290 + }, + { + "epoch": 0.15702695629416383, + "grad_norm": 286.0, + "learning_rate": 9.691181424798825e-06, + "logits/chosen": 0.30432984232902527, + "logits/rejected": 0.2606360912322998, + "logps/chosen": -320.1929626464844, + "logps/rejected": -296.85626220703125, + "loss": 0.5877, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.16091197729110718, + "rewards/margins": 0.4196176528930664, + "rewards/rejected": -0.5805296897888184, + "step": 300 + }, + { + "epoch": 0.16226118817063595, + "grad_norm": 320.0, + "learning_rate": 9.660518669976936e-06, + "logits/chosen": 0.3142179250717163, + "logits/rejected": 0.42381685972213745, + "logps/chosen": -351.7626647949219, + "logps/rejected": -305.1934509277344, + "loss": 0.6083, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.3087378144264221, + "rewards/margins": 0.41920194029808044, + "rewards/rejected": -0.7279397249221802, + "step": 310 + }, + { + "epoch": 0.16749542004710807, + "grad_norm": 302.0, + "learning_rate": 9.628458103758403e-06, + "logits/chosen": 0.33300966024398804, + "logits/rejected": 0.37351295351982117, + "logps/chosen": -366.12384033203125, + "logps/rejected": -330.9198913574219, + "loss": 0.5437, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.4144722819328308, + "rewards/margins": 0.5553442239761353, + "rewards/rejected": -0.9698165655136108, + "step": 320 + }, + { + "epoch": 0.17272965192358022, + "grad_norm": 314.0, + "learning_rate": 9.595009341945246e-06, + "logits/chosen": 0.22988705337047577, + "logits/rejected": 0.2729721665382385, + "logps/chosen": -334.3009338378906, + "logps/rejected": -321.2250671386719, + "loss": 0.6288, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.8995565176010132, + "rewards/margins": 0.4780782163143158, + "rewards/rejected": -1.3776347637176514, + "step": 330 + }, + { + "epoch": 0.17796388380005235, + "grad_norm": 288.0, + "learning_rate": 9.560182416695639e-06, + "logits/chosen": 0.2716033458709717, + "logits/rejected": 0.2932526171207428, + "logps/chosen": -331.0648193359375, + "logps/rejected": -329.04937744140625, + "loss": 0.5775, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6781526803970337, + "rewards/margins": 0.5396233201026917, + "rewards/rejected": -1.2177760601043701, + "step": 340 + }, + { + "epoch": 0.18319811567652447, + "grad_norm": 330.0, + "learning_rate": 9.523987773514999e-06, + "logits/chosen": 0.22474929690361023, + "logits/rejected": 0.27910318970680237, + "logps/chosen": -335.4002990722656, + "logps/rejected": -297.45635986328125, + "loss": 0.6094, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.4196121096611023, + "rewards/margins": 0.37615495920181274, + "rewards/rejected": -0.7957671284675598, + "step": 350 + }, + { + "epoch": 0.1884323475529966, + "grad_norm": 314.0, + "learning_rate": 9.486436268123112e-06, + "logits/chosen": 0.1711244434118271, + "logits/rejected": 0.24781985580921173, + "logps/chosen": -365.4104919433594, + "logps/rejected": -339.17510986328125, + "loss": 0.5956, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.35211288928985596, + "rewards/margins": 0.45698657631874084, + "rewards/rejected": -0.8090993762016296, + "step": 360 + }, + { + "epoch": 0.19366657942946872, + "grad_norm": 322.0, + "learning_rate": 9.447539163198218e-06, + "logits/chosen": 0.3507956266403198, + "logits/rejected": 0.3582335114479065, + "logps/chosen": -343.34564208984375, + "logps/rejected": -306.3224182128906, + "loss": 0.5941, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.42065954208374023, + "rewards/margins": 0.4902091920375824, + "rewards/rejected": -0.9108688235282898, + "step": 370 + }, + { + "epoch": 0.19890081130594087, + "grad_norm": 201.0, + "learning_rate": 9.407308124999031e-06, + "logits/chosen": 0.535057544708252, + "logits/rejected": 0.5568063855171204, + "logps/chosen": -361.60809326171875, + "logps/rejected": -340.9134826660156, + "loss": 0.6044, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.8800666928291321, + "rewards/margins": 0.5229350924491882, + "rewards/rejected": -1.4030016660690308, + "step": 380 + }, + { + "epoch": 0.204135043182413, + "grad_norm": 253.0, + "learning_rate": 9.365755219865733e-06, + "logits/chosen": 0.5194161534309387, + "logits/rejected": 0.6202957630157471, + "logps/chosen": -356.85552978515625, + "logps/rejected": -337.2948913574219, + "loss": 0.5604, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6672223806381226, + "rewards/margins": 0.6344886422157288, + "rewards/rejected": -1.3017112016677856, + "step": 390 + }, + { + "epoch": 0.2093692750588851, + "grad_norm": 358.0, + "learning_rate": 9.322892910600959e-06, + "logits/chosen": 0.539501965045929, + "logits/rejected": 0.7477941513061523, + "logps/chosen": -328.63128662109375, + "logps/rejected": -295.7342529296875, + "loss": 0.6114, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6417405009269714, + "rewards/margins": 0.4066389203071594, + "rewards/rejected": -1.0483794212341309, + "step": 400 + }, + { + "epoch": 0.21460350693535724, + "grad_norm": 454.0, + "learning_rate": 9.278734052731876e-06, + "logits/chosen": 0.4946824014186859, + "logits/rejected": 0.5248245596885681, + "logps/chosen": -346.6952819824219, + "logps/rejected": -322.2110595703125, + "loss": 0.5717, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.4744420647621155, + "rewards/margins": 0.5339844226837158, + "rewards/rejected": -1.0084264278411865, + "step": 410 + }, + { + "epoch": 0.21983773881182936, + "grad_norm": 344.0, + "learning_rate": 9.233291890654477e-06, + "logits/chosen": 0.1536872535943985, + "logits/rejected": 0.21127930283546448, + "logps/chosen": -349.3604736328125, + "logps/rejected": -306.022216796875, + "loss": 0.5426, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.4529080390930176, + "rewards/margins": 0.5982980728149414, + "rewards/rejected": -1.051206111907959, + "step": 420 + }, + { + "epoch": 0.22507197068830148, + "grad_norm": 312.0, + "learning_rate": 9.186580053661238e-06, + "logits/chosen": 0.2585577666759491, + "logits/rejected": 0.21868768334388733, + "logps/chosen": -345.9136962890625, + "logps/rejected": -358.81427001953125, + "loss": 0.6164, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6662707328796387, + "rewards/margins": 0.5226248502731323, + "rewards/rejected": -1.1888954639434814, + "step": 430 + }, + { + "epoch": 0.23030620256477363, + "grad_norm": 260.0, + "learning_rate": 9.138612551853334e-06, + "logits/chosen": 0.14254237711429596, + "logits/rejected": 0.29151830077171326, + "logps/chosen": -357.33001708984375, + "logps/rejected": -309.49505615234375, + "loss": 0.5649, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3096122741699219, + "rewards/margins": 0.5217168927192688, + "rewards/rejected": -0.8313292264938354, + "step": 440 + }, + { + "epoch": 0.23554043444124576, + "grad_norm": 268.0, + "learning_rate": 9.089403771938651e-06, + "logits/chosen": 0.2172239124774933, + "logits/rejected": 0.3413824439048767, + "logps/chosen": -343.6112365722656, + "logps/rejected": -313.96453857421875, + "loss": 0.5926, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2865334153175354, + "rewards/margins": 0.4800845980644226, + "rewards/rejected": -0.766618013381958, + "step": 450 + }, + { + "epoch": 0.24077466631771788, + "grad_norm": 270.0, + "learning_rate": 9.038968472916831e-06, + "logits/chosen": 0.20296330749988556, + "logits/rejected": 0.29848283529281616, + "logps/chosen": -370.986328125, + "logps/rejected": -362.0606384277344, + "loss": 0.5738, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.44281673431396484, + "rewards/margins": 0.5560091733932495, + "rewards/rejected": -0.9988259077072144, + "step": 460 + }, + { + "epoch": 0.24600889819419, + "grad_norm": 284.0, + "learning_rate": 8.987321781652663e-06, + "logits/chosen": 0.3275991380214691, + "logits/rejected": 0.31361979246139526, + "logps/chosen": -329.7540588378906, + "logps/rejected": -297.33807373046875, + "loss": 0.56, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.5578715205192566, + "rewards/margins": 0.5787358283996582, + "rewards/rejected": -1.1366074085235596, + "step": 470 + }, + { + "epoch": 0.2512431300706621, + "grad_norm": 348.0, + "learning_rate": 8.93447918833914e-06, + "logits/chosen": 0.24553251266479492, + "logits/rejected": 0.24857684969902039, + "logps/chosen": -366.5342102050781, + "logps/rejected": -313.3810119628906, + "loss": 0.5912, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.5592954754829407, + "rewards/margins": 0.5262395143508911, + "rewards/rejected": -1.0855350494384766, + "step": 480 + }, + { + "epoch": 0.2564773619471343, + "grad_norm": 300.0, + "learning_rate": 8.880456541851544e-06, + "logits/chosen": 0.21284916996955872, + "logits/rejected": 0.29197466373443604, + "logps/chosen": -394.79217529296875, + "logps/rejected": -337.88397216796875, + "loss": 0.5492, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5321744680404663, + "rewards/margins": 0.6479983925819397, + "rewards/rejected": -1.1801728010177612, + "step": 490 + }, + { + "epoch": 0.26171159382360637, + "grad_norm": 370.0, + "learning_rate": 8.825270044993963e-06, + "logits/chosen": 0.30395790934562683, + "logits/rejected": 0.41689762473106384, + "logps/chosen": -316.583251953125, + "logps/rejected": -323.14434814453125, + "loss": 0.577, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.4370909631252289, + "rewards/margins": 0.5204497575759888, + "rewards/rejected": -0.9575408101081848, + "step": 500 + }, + { + "epoch": 0.2669458257000785, + "grad_norm": 350.0, + "learning_rate": 8.768936249639632e-06, + "logits/chosen": 0.1348932683467865, + "logits/rejected": 0.26566845178604126, + "logps/chosen": -331.4639587402344, + "logps/rejected": -320.91461181640625, + "loss": 0.597, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.4939492344856262, + "rewards/margins": 0.47716912627220154, + "rewards/rejected": -0.9711184501647949, + "step": 510 + }, + { + "epoch": 0.2721800575765506, + "grad_norm": 312.0, + "learning_rate": 8.711472051766606e-06, + "logits/chosen": 0.19042043387889862, + "logits/rejected": 0.2794221341609955, + "logps/chosen": -354.65740966796875, + "logps/rejected": -331.80267333984375, + "loss": 0.552, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.4176334738731384, + "rewards/margins": 0.569202184677124, + "rewards/rejected": -0.986835777759552, + "step": 520 + }, + { + "epoch": 0.27741428945302277, + "grad_norm": 292.0, + "learning_rate": 8.652894686390205e-06, + "logits/chosen": 0.2197699099779129, + "logits/rejected": 0.2926613390445709, + "logps/chosen": -357.4103698730469, + "logps/rejected": -326.92315673828125, + "loss": 0.5695, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5999321341514587, + "rewards/margins": 0.5926289558410645, + "rewards/rejected": -1.1925609111785889, + "step": 530 + }, + { + "epoch": 0.2826485213294949, + "grad_norm": 326.0, + "learning_rate": 8.593221722393789e-06, + "logits/chosen": 0.17706915736198425, + "logits/rejected": 0.2584270238876343, + "logps/chosen": -358.02740478515625, + "logps/rejected": -326.58367919921875, + "loss": 0.5489, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.8800150156021118, + "rewards/margins": 0.6341498494148254, + "rewards/rejected": -1.514164686203003, + "step": 540 + }, + { + "epoch": 0.287882753205967, + "grad_norm": 332.0, + "learning_rate": 8.53247105725939e-06, + "logits/chosen": 0.22675807774066925, + "logits/rejected": 0.23919430375099182, + "logps/chosen": -325.2323913574219, + "logps/rejected": -299.59039306640625, + "loss": 0.5546, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.658308207988739, + "rewards/margins": 0.6446129083633423, + "rewards/rejected": -1.302921175956726, + "step": 550 + }, + { + "epoch": 0.29311698508243916, + "grad_norm": 330.0, + "learning_rate": 8.470660911699783e-06, + "logits/chosen": 0.09726688265800476, + "logits/rejected": 0.15922322869300842, + "logps/chosen": -337.0237121582031, + "logps/rejected": -292.3273010253906, + "loss": 0.5796, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.5876582860946655, + "rewards/margins": 0.5548220276832581, + "rewards/rejected": -1.1424801349639893, + "step": 560 + }, + { + "epoch": 0.29835121695891126, + "grad_norm": 294.0, + "learning_rate": 8.407809824193624e-06, + "logits/chosen": 0.08568461239337921, + "logits/rejected": 0.2500315308570862, + "logps/chosen": -373.3963928222656, + "logps/rejected": -339.5152282714844, + "loss": 0.5893, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.7039493918418884, + "rewards/margins": 0.5431109666824341, + "rewards/rejected": -1.2470605373382568, + "step": 570 + }, + { + "epoch": 0.3035854488353834, + "grad_norm": 274.0, + "learning_rate": 8.343936645425277e-06, + "logits/chosen": 0.3479989767074585, + "logits/rejected": 0.3873814642429352, + "logps/chosen": -325.4124450683594, + "logps/rejected": -307.15423583984375, + "loss": 0.5188, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.594230055809021, + "rewards/margins": 0.7160730957984924, + "rewards/rejected": -1.3103030920028687, + "step": 580 + }, + { + "epoch": 0.30881968071185556, + "grad_norm": 286.0, + "learning_rate": 8.279060532630991e-06, + "logits/chosen": 0.3080836236476898, + "logits/rejected": 0.4134043753147125, + "logps/chosen": -358.85076904296875, + "logps/rejected": -332.5126953125, + "loss": 0.5926, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.7294015884399414, + "rewards/margins": 0.5793807506561279, + "rewards/rejected": -1.3087823390960693, + "step": 590 + }, + { + "epoch": 0.31405391258832765, + "grad_norm": 332.0, + "learning_rate": 8.21320094385316e-06, + "logits/chosen": 0.3551139533519745, + "logits/rejected": 0.38634929060935974, + "logps/chosen": -369.88446044921875, + "logps/rejected": -338.72308349609375, + "loss": 0.6129, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.5936108231544495, + "rewards/margins": 0.5011196732521057, + "rewards/rejected": -1.0947306156158447, + "step": 600 + }, + { + "epoch": 0.3192881444647998, + "grad_norm": 290.0, + "learning_rate": 8.146377632104328e-06, + "logits/chosen": 0.22317269444465637, + "logits/rejected": 0.4244447648525238, + "logps/chosen": -381.1096496582031, + "logps/rejected": -322.06689453125, + "loss": 0.5404, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.44336166977882385, + "rewards/margins": 0.691791832447052, + "rewards/rejected": -1.1351535320281982, + "step": 610 + }, + { + "epoch": 0.3245223763412719, + "grad_norm": 338.0, + "learning_rate": 8.078610639442761e-06, + "logits/chosen": 0.23834876716136932, + "logits/rejected": 0.2626754641532898, + "logps/chosen": -367.4986267089844, + "logps/rejected": -314.33697509765625, + "loss": 0.5753, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.5537876486778259, + "rewards/margins": 0.5256361365318298, + "rewards/rejected": -1.0794237852096558, + "step": 620 + }, + { + "epoch": 0.32975660821774405, + "grad_norm": 346.0, + "learning_rate": 8.009920290961302e-06, + "logits/chosen": 0.18554073572158813, + "logits/rejected": 0.0812699943780899, + "logps/chosen": -345.83111572265625, + "logps/rejected": -332.9136047363281, + "loss": 0.549, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.5027406215667725, + "rewards/margins": 0.6489895582199097, + "rewards/rejected": -1.1517301797866821, + "step": 630 + }, + { + "epoch": 0.33499084009421615, + "grad_norm": 326.0, + "learning_rate": 7.94032718869134e-06, + "logits/chosen": 0.1383436620235443, + "logits/rejected": 0.1128048524260521, + "logps/chosen": -360.80792236328125, + "logps/rejected": -321.8601379394531, + "loss": 0.5403, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.42300111055374146, + "rewards/margins": 0.6382675170898438, + "rewards/rejected": -1.0612685680389404, + "step": 640 + }, + { + "epoch": 0.3402250719706883, + "grad_norm": 270.0, + "learning_rate": 7.869852205423738e-06, + "logits/chosen": 0.062131352722644806, + "logits/rejected": 0.09803047776222229, + "logps/chosen": -344.85394287109375, + "logps/rejected": -308.65142822265625, + "loss": 0.5739, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.47032850980758667, + "rewards/margins": 0.5552009344100952, + "rewards/rejected": -1.0255295038223267, + "step": 650 + }, + { + "epoch": 0.34545930384716045, + "grad_norm": 328.0, + "learning_rate": 7.798516478448514e-06, + "logits/chosen": 0.0817512795329094, + "logits/rejected": 0.07857748121023178, + "logps/chosen": -365.5155029296875, + "logps/rejected": -317.58673095703125, + "loss": 0.5679, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.4949292540550232, + "rewards/margins": 0.571694016456604, + "rewards/rejected": -1.066623330116272, + "step": 660 + }, + { + "epoch": 0.35069353572363254, + "grad_norm": 272.0, + "learning_rate": 7.726341403215237e-06, + "logits/chosen": 0.16348454356193542, + "logits/rejected": 0.16545890271663666, + "logps/chosen": -343.07965087890625, + "logps/rejected": -295.33526611328125, + "loss": 0.5704, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.5133311152458191, + "rewards/margins": 0.6606391668319702, + "rewards/rejected": -1.173970341682434, + "step": 670 + }, + { + "epoch": 0.3559277676001047, + "grad_norm": 288.0, + "learning_rate": 7.653348626915957e-06, + "logits/chosen": 0.21217510104179382, + "logits/rejected": 0.2788509726524353, + "logps/chosen": -338.8388977050781, + "logps/rejected": -317.30279541015625, + "loss": 0.5509, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.48806896805763245, + "rewards/margins": 0.6682482957839966, + "rewards/rejected": -1.1563172340393066, + "step": 680 + }, + { + "epoch": 0.3611619994765768, + "grad_norm": 282.0, + "learning_rate": 7.5795600419926595e-06, + "logits/chosen": 0.3617590069770813, + "logits/rejected": 0.3512483537197113, + "logps/chosen": -350.26263427734375, + "logps/rejected": -308.1812744140625, + "loss": 0.5607, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.3443593382835388, + "rewards/margins": 0.5753520727157593, + "rewards/rejected": -0.9197114109992981, + "step": 690 + }, + { + "epoch": 0.36639623135304894, + "grad_norm": 258.0, + "learning_rate": 7.504997779571134e-06, + "logits/chosen": 0.35462698340415955, + "logits/rejected": 0.4073667526245117, + "logps/chosen": -342.01727294921875, + "logps/rejected": -312.5554504394531, + "loss": 0.5957, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.38807958364486694, + "rewards/margins": 0.4581042230129242, + "rewards/rejected": -0.8461838960647583, + "step": 700 + }, + { + "epoch": 0.3716304632295211, + "grad_norm": 360.0, + "learning_rate": 7.429684202823284e-06, + "logits/chosen": 0.3464614748954773, + "logits/rejected": 0.26026487350463867, + "logps/chosen": -367.31829833984375, + "logps/rejected": -318.8564453125, + "loss": 0.529, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.44659894704818726, + "rewards/margins": 0.6956688165664673, + "rewards/rejected": -1.1422678232192993, + "step": 710 + }, + { + "epoch": 0.3768646951059932, + "grad_norm": 320.0, + "learning_rate": 7.353641900259823e-06, + "logits/chosen": 0.33798351883888245, + "logits/rejected": 0.30416375398635864, + "logps/chosen": -348.59454345703125, + "logps/rejected": -317.1212158203125, + "loss": 0.5682, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.6439448595046997, + "rewards/margins": 0.6287893056869507, + "rewards/rejected": -1.2727340459823608, + "step": 720 + }, + { + "epoch": 0.38209892698246534, + "grad_norm": 344.0, + "learning_rate": 7.276893678955387e-06, + "logits/chosen": 0.15998776257038116, + "logits/rejected": 0.35106360912323, + "logps/chosen": -367.888671875, + "logps/rejected": -336.6737060546875, + "loss": 0.5989, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.7499735951423645, + "rewards/margins": 0.6262648701667786, + "rewards/rejected": -1.3762385845184326, + "step": 730 + }, + { + "epoch": 0.38733315885893743, + "grad_norm": 264.0, + "learning_rate": 7.199462557708098e-06, + "logits/chosen": 0.1745099276304245, + "logits/rejected": 0.2742732763290405, + "logps/chosen": -310.071533203125, + "logps/rejected": -299.4850769042969, + "loss": 0.5774, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6706897020339966, + "rewards/margins": 0.5428994297981262, + "rewards/rejected": -1.2135891914367676, + "step": 740 + }, + { + "epoch": 0.3925673907354096, + "grad_norm": 314.0, + "learning_rate": 7.1213717601356245e-06, + "logits/chosen": 0.1733260601758957, + "logits/rejected": 0.1483219712972641, + "logps/chosen": -360.5184020996094, + "logps/rejected": -322.8306579589844, + "loss": 0.5344, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.47234034538269043, + "rewards/margins": 0.6140186190605164, + "rewards/rejected": -1.086358904838562, + "step": 750 + }, + { + "epoch": 0.39780162261188173, + "grad_norm": 340.0, + "learning_rate": 7.042644707709816e-06, + "logits/chosen": 0.17112216353416443, + "logits/rejected": 0.19962458312511444, + "logps/chosen": -351.1370544433594, + "logps/rejected": -333.3719177246094, + "loss": 0.5823, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.4944635331630707, + "rewards/margins": 0.5449696779251099, + "rewards/rejected": -1.039433240890503, + "step": 760 + }, + { + "epoch": 0.40303585448835383, + "grad_norm": 370.0, + "learning_rate": 6.963305012731984e-06, + "logits/chosen": 0.198240727186203, + "logits/rejected": 0.220525860786438, + "logps/chosen": -305.16143798828125, + "logps/rejected": -299.1192932128906, + "loss": 0.6028, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.5226008296012878, + "rewards/margins": 0.5330361127853394, + "rewards/rejected": -1.055637001991272, + "step": 770 + }, + { + "epoch": 0.408270086364826, + "grad_norm": 328.0, + "learning_rate": 6.8833764712509554e-06, + "logits/chosen": 0.19480012357234955, + "logits/rejected": 0.2182588279247284, + "logps/chosen": -317.003662109375, + "logps/rejected": -302.61334228515625, + "loss": 0.5657, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.3436005711555481, + "rewards/margins": 0.5159841775894165, + "rewards/rejected": -0.8595848083496094, + "step": 780 + }, + { + "epoch": 0.4135043182412981, + "grad_norm": 364.0, + "learning_rate": 6.802883055926026e-06, + "logits/chosen": 0.15303662419319153, + "logits/rejected": 0.21523818373680115, + "logps/chosen": -333.7895812988281, + "logps/rejected": -296.78851318359375, + "loss": 0.5489, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.3119350075721741, + "rewards/margins": 0.6673828959465027, + "rewards/rejected": -0.979317843914032, + "step": 790 + }, + { + "epoch": 0.4187385501177702, + "grad_norm": 366.0, + "learning_rate": 6.721848908836921e-06, + "logits/chosen": 0.11557696759700775, + "logits/rejected": 0.14221954345703125, + "logps/chosen": -379.16595458984375, + "logps/rejected": -320.8966979980469, + "loss": 0.5204, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.3791603744029999, + "rewards/margins": 0.6833642721176147, + "rewards/rejected": -1.0625245571136475, + "step": 800 + }, + { + "epoch": 0.4239727819942423, + "grad_norm": 264.0, + "learning_rate": 6.640298334242959e-06, + "logits/chosen": 0.08530505001544952, + "logits/rejected": 0.15681490302085876, + "logps/chosen": -323.5985107421875, + "logps/rejected": -319.4253234863281, + "loss": 0.526, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5749447345733643, + "rewards/margins": 0.6821728944778442, + "rewards/rejected": -1.2571176290512085, + "step": 810 + }, + { + "epoch": 0.42920701387071447, + "grad_norm": 350.0, + "learning_rate": 6.558255791293572e-06, + "logits/chosen": 0.0707249790430069, + "logits/rejected": 0.15043438971042633, + "logps/chosen": -357.5900573730469, + "logps/rejected": -328.65118408203125, + "loss": 0.5905, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.674008846282959, + "rewards/margins": 0.6449601054191589, + "rewards/rejected": -1.3189690113067627, + "step": 820 + }, + { + "epoch": 0.4344412457471866, + "grad_norm": 282.0, + "learning_rate": 6.475745886692361e-06, + "logits/chosen": 0.1705600768327713, + "logits/rejected": 0.156109020113945, + "logps/chosen": -352.35650634765625, + "logps/rejected": -340.0906677246094, + "loss": 0.5481, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.580664873123169, + "rewards/margins": 0.7169455289840698, + "rewards/rejected": -1.2976105213165283, + "step": 830 + }, + { + "epoch": 0.4396754776236587, + "grad_norm": 368.0, + "learning_rate": 6.392793367316905e-06, + "logits/chosen": 0.047196634113788605, + "logits/rejected": 0.10491514205932617, + "logps/chosen": -344.0426940917969, + "logps/rejected": -328.0356140136719, + "loss": 0.5316, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.6165002584457397, + "rewards/margins": 0.6772734522819519, + "rewards/rejected": -1.2937736511230469, + "step": 840 + }, + { + "epoch": 0.44490970950013087, + "grad_norm": 390.0, + "learning_rate": 6.309423112796529e-06, + "logits/chosen": 0.08787860721349716, + "logits/rejected": 0.29786446690559387, + "logps/chosen": -330.8527526855469, + "logps/rejected": -336.91888427734375, + "loss": 0.5739, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8546838760375977, + "rewards/margins": 0.6589832305908203, + "rewards/rejected": -1.513667106628418, + "step": 850 + }, + { + "epoch": 0.45014394137660296, + "grad_norm": 400.0, + "learning_rate": 6.225660128050248e-06, + "logits/chosen": 0.1513369381427765, + "logits/rejected": 0.18070130050182343, + "logps/chosen": -346.57659912109375, + "logps/rejected": -328.84130859375, + "loss": 0.5638, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0357505083084106, + "rewards/margins": 0.6643354892730713, + "rewards/rejected": -1.700085997581482, + "step": 860 + }, + { + "epoch": 0.4553781732530751, + "grad_norm": 354.0, + "learning_rate": 6.141529535787139e-06, + "logits/chosen": 0.23875145614147186, + "logits/rejected": 0.28748852014541626, + "logps/chosen": -382.6456604003906, + "logps/rejected": -346.1590270996094, + "loss": 0.5211, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7971159219741821, + "rewards/margins": 0.7644892930984497, + "rewards/rejected": -1.5616052150726318, + "step": 870 + }, + { + "epoch": 0.46061240512954726, + "grad_norm": 318.0, + "learning_rate": 6.057056568971383e-06, + "logits/chosen": 0.1365332305431366, + "logits/rejected": 0.18043069541454315, + "logps/chosen": -365.5246887207031, + "logps/rejected": -337.0705261230469, + "loss": 0.5237, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.8823005557060242, + "rewards/margins": 0.8230516314506531, + "rewards/rejected": -1.7053521871566772, + "step": 880 + }, + { + "epoch": 0.46584663700601936, + "grad_norm": 294.0, + "learning_rate": 5.972266563254246e-06, + "logits/chosen": 0.33021894097328186, + "logits/rejected": 0.26815542578697205, + "logps/chosen": -393.06182861328125, + "logps/rejected": -348.9358825683594, + "loss": 0.5696, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9404589533805847, + "rewards/margins": 0.702355682849884, + "rewards/rejected": -1.6428148746490479, + "step": 890 + }, + { + "epoch": 0.4710808688824915, + "grad_norm": 292.0, + "learning_rate": 5.887184949375242e-06, + "logits/chosen": 0.24816791713237762, + "logits/rejected": 0.3371456563472748, + "logps/chosen": -343.94952392578125, + "logps/rejected": -303.48870849609375, + "loss": 0.5451, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5168992877006531, + "rewards/margins": 0.7089965343475342, + "rewards/rejected": -1.225895881652832, + "step": 900 + }, + { + "epoch": 0.4763151007589636, + "grad_norm": 256.0, + "learning_rate": 5.8018372455348e-06, + "logits/chosen": 0.32140612602233887, + "logits/rejected": 0.3305002748966217, + "logps/chosen": -359.08538818359375, + "logps/rejected": -316.7978515625, + "loss": 0.5553, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.44215431809425354, + "rewards/margins": 0.6605509519577026, + "rewards/rejected": -1.1027053594589233, + "step": 910 + }, + { + "epoch": 0.48154933263543576, + "grad_norm": 268.0, + "learning_rate": 5.71624904974069e-06, + "logits/chosen": 0.24679967761039734, + "logits/rejected": 0.3497712016105652, + "logps/chosen": -361.283935546875, + "logps/rejected": -338.38153076171875, + "loss": 0.5496, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.4841860234737396, + "rewards/margins": 0.6799232959747314, + "rewards/rejected": -1.1641093492507935, + "step": 920 + }, + { + "epoch": 0.48678356451190785, + "grad_norm": 250.0, + "learning_rate": 5.630446032130498e-06, + "logits/chosen": 0.2412446290254593, + "logits/rejected": 0.34884771704673767, + "logps/chosen": -347.1498107910156, + "logps/rejected": -331.9222412109375, + "loss": 0.5455, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.5564799904823303, + "rewards/margins": 0.6565181016921997, + "rewards/rejected": -1.2129981517791748, + "step": 930 + }, + { + "epoch": 0.49201779638838, + "grad_norm": 352.0, + "learning_rate": 5.5444539272724925e-06, + "logits/chosen": 0.22467438876628876, + "logits/rejected": 0.42467164993286133, + "logps/chosen": -348.5015869140625, + "logps/rejected": -336.44342041015625, + "loss": 0.534, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.635161280632019, + "rewards/margins": 0.7458114624023438, + "rewards/rejected": -1.3809726238250732, + "step": 940 + }, + { + "epoch": 0.49725202826485215, + "grad_norm": 370.0, + "learning_rate": 5.458298526447155e-06, + "logits/chosen": 0.277851402759552, + "logits/rejected": 0.343191534280777, + "logps/chosen": -348.0686950683594, + "logps/rejected": -311.85992431640625, + "loss": 0.6059, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.6322701573371887, + "rewards/margins": 0.5943492650985718, + "rewards/rejected": -1.2266194820404053, + "step": 950 + }, + { + "epoch": 0.5024862601413242, + "grad_norm": 376.0, + "learning_rate": 5.372005669911694e-06, + "logits/chosen": 0.18535420298576355, + "logits/rejected": 0.29728174209594727, + "logps/chosen": -323.85418701171875, + "logps/rejected": -302.52703857421875, + "loss": 0.6372, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5875571966171265, + "rewards/margins": 0.4703772962093353, + "rewards/rejected": -1.0579345226287842, + "step": 960 + }, + { + "epoch": 0.5077204920177963, + "grad_norm": 318.0, + "learning_rate": 5.285601239149875e-06, + "logits/chosen": 0.2485879361629486, + "logits/rejected": 0.1934640109539032, + "logps/chosen": -363.02557373046875, + "logps/rejected": -332.2878112792969, + "loss": 0.5864, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.4688809812068939, + "rewards/margins": 0.6029322743415833, + "rewards/rejected": -1.0718133449554443, + "step": 970 + }, + { + "epoch": 0.5129547238942685, + "grad_norm": 352.0, + "learning_rate": 5.199111149109498e-06, + "logits/chosen": 0.14167314767837524, + "logits/rejected": 0.3146267533302307, + "logps/chosen": -308.8305969238281, + "logps/rejected": -299.807373046875, + "loss": 0.5698, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5121305584907532, + "rewards/margins": 0.6366390585899353, + "rewards/rejected": -1.1487696170806885, + "step": 980 + }, + { + "epoch": 0.5181889557707406, + "grad_norm": 324.0, + "learning_rate": 5.112561340429817e-06, + "logits/chosen": 0.30979007482528687, + "logits/rejected": 0.24150173366069794, + "logps/chosen": -337.63214111328125, + "logps/rejected": -298.39111328125, + "loss": 0.5672, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5298670530319214, + "rewards/margins": 0.603586733341217, + "rewards/rejected": -1.1334538459777832, + "step": 990 + }, + { + "epoch": 0.5234231876472127, + "grad_norm": 320.0, + "learning_rate": 5.0259777716612665e-06, + "logits/chosen": 0.23533792793750763, + "logits/rejected": 0.3287450671195984, + "logps/chosen": -365.8298645019531, + "logps/rejected": -339.43133544921875, + "loss": 0.596, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.4760667681694031, + "rewards/margins": 0.5520576238632202, + "rewards/rejected": -1.028124451637268, + "step": 1000 + }, + { + "epoch": 0.528657419523685, + "grad_norm": 372.0, + "learning_rate": 4.939386411479814e-06, + "logits/chosen": 0.29721060395240784, + "logits/rejected": 0.3717629909515381, + "logps/chosen": -360.3513488769531, + "logps/rejected": -350.74090576171875, + "loss": 0.5676, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.3760520815849304, + "rewards/margins": 0.6218014359474182, + "rewards/rejected": -0.9978535771369934, + "step": 1010 + }, + { + "epoch": 0.533891651400157, + "grad_norm": 344.0, + "learning_rate": 4.85281323089828e-06, + "logits/chosen": 0.34880155324935913, + "logits/rejected": 0.32166919112205505, + "logps/chosen": -375.96368408203125, + "logps/rejected": -341.3938293457031, + "loss": 0.6075, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.4991912841796875, + "rewards/margins": 0.5173603296279907, + "rewards/rejected": -1.0165516138076782, + "step": 1020 + }, + { + "epoch": 0.5391258832766291, + "grad_norm": 296.0, + "learning_rate": 4.766284195476943e-06, + "logits/chosen": 0.3899001479148865, + "logits/rejected": 0.400698721408844, + "logps/chosen": -355.5746154785156, + "logps/rejected": -325.11474609375, + "loss": 0.5121, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.29699790477752686, + "rewards/margins": 0.7675551772117615, + "rewards/rejected": -1.0645530223846436, + "step": 1030 + }, + { + "epoch": 0.5443601151531012, + "grad_norm": 304.0, + "learning_rate": 4.679825257535795e-06, + "logits/chosen": 0.34135550260543823, + "logits/rejected": 0.3182600736618042, + "logps/chosen": -349.2203063964844, + "logps/rejected": -308.77166748046875, + "loss": 0.5413, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.34611597657203674, + "rewards/margins": 0.6459987759590149, + "rewards/rejected": -0.9921148419380188, + "step": 1040 + }, + { + "epoch": 0.5495943470295734, + "grad_norm": 370.0, + "learning_rate": 4.593462348370759e-06, + "logits/chosen": 0.2810625433921814, + "logits/rejected": 0.3787192404270172, + "logps/chosen": -342.5027770996094, + "logps/rejected": -316.65985107421875, + "loss": 0.552, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3849152624607086, + "rewards/margins": 0.6414963006973267, + "rewards/rejected": -1.0264116525650024, + "step": 1050 + }, + { + "epoch": 0.5548285789060455, + "grad_norm": 276.0, + "learning_rate": 4.507221370476223e-06, + "logits/chosen": 0.34859079122543335, + "logits/rejected": 0.34620124101638794, + "logps/chosen": -351.231689453125, + "logps/rejected": -333.7611389160156, + "loss": 0.5648, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.3917597532272339, + "rewards/margins": 0.6424742937088013, + "rewards/rejected": -1.0342340469360352, + "step": 1060 + }, + { + "epoch": 0.5600628107825176, + "grad_norm": 217.0, + "learning_rate": 4.421128189776195e-06, + "logits/chosen": 0.29422345757484436, + "logits/rejected": 0.3956177234649658, + "logps/chosen": -312.7371826171875, + "logps/rejected": -271.85382080078125, + "loss": 0.5427, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.39171385765075684, + "rewards/margins": 0.6362006664276123, + "rewards/rejected": -1.0279145240783691, + "step": 1070 + }, + { + "epoch": 0.5652970426589898, + "grad_norm": 268.0, + "learning_rate": 4.335208627866438e-06, + "logits/chosen": 0.43888354301452637, + "logits/rejected": 0.47600990533828735, + "logps/chosen": -342.0023193359375, + "logps/rejected": -299.25982666015625, + "loss": 0.5153, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.41604313254356384, + "rewards/margins": 0.7133996486663818, + "rewards/rejected": -1.1294429302215576, + "step": 1080 + }, + { + "epoch": 0.5705312745354619, + "grad_norm": 330.0, + "learning_rate": 4.249488454269908e-06, + "logits/chosen": 0.37367188930511475, + "logits/rejected": 0.4846370816230774, + "logps/chosen": -356.6741638183594, + "logps/rejected": -335.75311279296875, + "loss": 0.5415, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5590308904647827, + "rewards/margins": 0.6862959861755371, + "rewards/rejected": -1.2453268766403198, + "step": 1090 + }, + { + "epoch": 0.575765506411934, + "grad_norm": 310.0, + "learning_rate": 4.163993378707786e-06, + "logits/chosen": 0.3371972143650055, + "logits/rejected": 0.39444199204444885, + "logps/chosen": -321.92254638671875, + "logps/rejected": -300.8343505859375, + "loss": 0.5644, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5255651473999023, + "rewards/margins": 0.709193766117096, + "rewards/rejected": -1.2347590923309326, + "step": 1100 + }, + { + "epoch": 0.5809997382884062, + "grad_norm": 380.0, + "learning_rate": 4.0787490433884685e-06, + "logits/chosen": 0.3301977813243866, + "logits/rejected": 0.4222096800804138, + "logps/chosen": -329.1367492675781, + "logps/rejected": -303.93341064453125, + "loss": 0.5756, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5914889574050903, + "rewards/margins": 0.5726789236068726, + "rewards/rejected": -1.1641680002212524, + "step": 1110 + }, + { + "epoch": 0.5862339701648783, + "grad_norm": 340.0, + "learning_rate": 3.993781015316802e-06, + "logits/chosen": 0.32290878891944885, + "logits/rejected": 0.3790258765220642, + "logps/chosen": -377.72662353515625, + "logps/rejected": -331.83441162109375, + "loss": 0.5863, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6014608144760132, + "rewards/margins": 0.6531153917312622, + "rewards/rejected": -1.2545760869979858, + "step": 1120 + }, + { + "epoch": 0.5914682020413504, + "grad_norm": 356.0, + "learning_rate": 3.909114778625861e-06, + "logits/chosen": 0.33370259404182434, + "logits/rejected": 0.3154350221157074, + "logps/chosen": -382.1673278808594, + "logps/rejected": -314.76025390625, + "loss": 0.4867, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.493272602558136, + "rewards/margins": 0.8312891125679016, + "rewards/rejected": -1.3245617151260376, + "step": 1130 + }, + { + "epoch": 0.5967024339178225, + "grad_norm": 342.0, + "learning_rate": 3.824775726933596e-06, + "logits/chosen": 0.3804655969142914, + "logits/rejected": 0.4335269033908844, + "logps/chosen": -347.4833984375, + "logps/rejected": -302.07989501953125, + "loss": 0.5511, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5972079038619995, + "rewards/margins": 0.6958715319633484, + "rewards/rejected": -1.2930794954299927, + "step": 1140 + }, + { + "epoch": 0.6019366657942947, + "grad_norm": 452.0, + "learning_rate": 3.7407891557266242e-06, + "logits/chosen": 0.28930753469467163, + "logits/rejected": 0.3638380169868469, + "logps/chosen": -337.60076904296875, + "logps/rejected": -327.3893127441406, + "loss": 0.5637, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7492964863777161, + "rewards/margins": 0.7008451819419861, + "rewards/rejected": -1.4501416683197021, + "step": 1150 + }, + { + "epoch": 0.6071708976707668, + "grad_norm": 340.0, + "learning_rate": 3.6571802547734457e-06, + "logits/chosen": 0.29493704438209534, + "logits/rejected": 0.42343488335609436, + "logps/chosen": -345.88421630859375, + "logps/rejected": -324.6173095703125, + "loss": 0.5322, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.7139222025871277, + "rewards/margins": 0.7538946866989136, + "rewards/rejected": -1.467816948890686, + "step": 1160 + }, + { + "epoch": 0.6124051295472389, + "grad_norm": 340.0, + "learning_rate": 3.5739741005693807e-06, + "logits/chosen": 0.36524444818496704, + "logits/rejected": 0.4817792475223541, + "logps/chosen": -373.5266418457031, + "logps/rejected": -342.4767150878906, + "loss": 0.5557, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6850903630256653, + "rewards/margins": 0.7452309727668762, + "rewards/rejected": -1.430321455001831, + "step": 1170 + }, + { + "epoch": 0.6176393614237111, + "grad_norm": 288.0, + "learning_rate": 3.4911956488154696e-06, + "logits/chosen": 0.2990756034851074, + "logits/rejected": 0.3051765561103821, + "logps/chosen": -341.70721435546875, + "logps/rejected": -312.9698791503906, + "loss": 0.6205, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.803567111492157, + "rewards/margins": 0.5913842916488647, + "rewards/rejected": -1.3949514627456665, + "step": 1180 + }, + { + "epoch": 0.6228735933001832, + "grad_norm": 274.0, + "learning_rate": 3.4088697269336045e-06, + "logits/chosen": 0.3608161211013794, + "logits/rejected": 0.40439486503601074, + "logps/chosen": -363.10028076171875, + "logps/rejected": -309.46038818359375, + "loss": 0.4981, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.5898820757865906, + "rewards/margins": 0.8492467999458313, + "rewards/rejected": -1.4391288757324219, + "step": 1190 + }, + { + "epoch": 0.6281078251766553, + "grad_norm": 396.0, + "learning_rate": 3.3270210266201373e-06, + "logits/chosen": 0.39221999049186707, + "logits/rejected": 0.44608980417251587, + "logps/chosen": -349.74444580078125, + "logps/rejected": -325.3342590332031, + "loss": 0.5947, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7365375757217407, + "rewards/margins": 0.6322949528694153, + "rewards/rejected": -1.3688325881958008, + "step": 1200 + }, + { + "epoch": 0.6333420570531274, + "grad_norm": 270.0, + "learning_rate": 3.2456740964401977e-06, + "logits/chosen": 0.39257878065109253, + "logits/rejected": 0.5652514696121216, + "logps/chosen": -351.83795166015625, + "logps/rejected": -336.03948974609375, + "loss": 0.5676, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.7251110672950745, + "rewards/margins": 0.6608349084854126, + "rewards/rejected": -1.3859459161758423, + "step": 1210 + }, + { + "epoch": 0.6385762889295996, + "grad_norm": 352.0, + "learning_rate": 3.1648533344649303e-06, + "logits/chosen": 0.3172612190246582, + "logits/rejected": 0.5099163055419922, + "logps/chosen": -338.2564392089844, + "logps/rejected": -346.206298828125, + "loss": 0.5242, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.7247421145439148, + "rewards/margins": 0.7186325192451477, + "rewards/rejected": -1.4433746337890625, + "step": 1220 + }, + { + "epoch": 0.6438105208060717, + "grad_norm": 396.0, + "learning_rate": 3.084582980953881e-06, + "logits/chosen": 0.3695985674858093, + "logits/rejected": 0.3975854814052582, + "logps/chosen": -386.8086242675781, + "logps/rejected": -308.7548828125, + "loss": 0.5514, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7079992890357971, + "rewards/margins": 0.7007894515991211, + "rewards/rejected": -1.4087889194488525, + "step": 1230 + }, + { + "epoch": 0.6490447526825438, + "grad_norm": 306.0, + "learning_rate": 3.0048871110847043e-06, + "logits/chosen": 0.36669978499412537, + "logits/rejected": 0.3211643695831299, + "logps/chosen": -364.41278076171875, + "logps/rejected": -323.64862060546875, + "loss": 0.507, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.6616480350494385, + "rewards/margins": 0.8358721733093262, + "rewards/rejected": -1.4975202083587646, + "step": 1240 + }, + { + "epoch": 0.654278984559016, + "grad_norm": 314.0, + "learning_rate": 2.925789627732395e-06, + "logits/chosen": 0.30698102712631226, + "logits/rejected": 0.35474127531051636, + "logps/chosen": -357.7930603027344, + "logps/rejected": -327.1461181640625, + "loss": 0.5319, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.7104305624961853, + "rewards/margins": 0.7990323901176453, + "rewards/rejected": -1.5094630718231201, + "step": 1250 + }, + { + "epoch": 0.6595132164354881, + "grad_norm": 278.0, + "learning_rate": 2.8473142543001818e-06, + "logits/chosen": 0.3213528096675873, + "logits/rejected": 0.38778603076934814, + "logps/chosen": -319.82452392578125, + "logps/rejected": -299.6809387207031, + "loss": 0.5615, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.7967512011528015, + "rewards/margins": 0.6921517848968506, + "rewards/rejected": -1.4889030456542969, + "step": 1260 + }, + { + "epoch": 0.6647474483119602, + "grad_norm": 312.0, + "learning_rate": 2.7694845276042714e-06, + "logits/chosen": 0.3033554255962372, + "logits/rejected": 0.31667545437812805, + "logps/chosen": -360.24053955078125, + "logps/rejected": -331.3898010253906, + "loss": 0.5285, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.6901473999023438, + "rewards/margins": 0.8282734751701355, + "rewards/rejected": -1.5184208154678345, + "step": 1270 + }, + { + "epoch": 0.6699816801884323, + "grad_norm": 356.0, + "learning_rate": 2.6923237908145227e-06, + "logits/chosen": 0.3856261074542999, + "logits/rejected": 0.39977845549583435, + "logps/chosen": -324.6197204589844, + "logps/rejected": -332.137939453125, + "loss": 0.5253, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.6914088726043701, + "rewards/margins": 0.7797143459320068, + "rewards/rejected": -1.471123218536377, + "step": 1280 + }, + { + "epoch": 0.6752159120649045, + "grad_norm": 290.0, + "learning_rate": 2.615855186453241e-06, + "logits/chosen": 0.3395998179912567, + "logits/rejected": 0.4570327401161194, + "logps/chosen": -357.72625732421875, + "logps/rejected": -345.084716796875, + "loss": 0.5233, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.630346417427063, + "rewards/margins": 0.8085399866104126, + "rewards/rejected": -1.438886284828186, + "step": 1290 + }, + { + "epoch": 0.6804501439413766, + "grad_norm": 366.0, + "learning_rate": 2.5401016494541193e-06, + "logits/chosen": 0.29590579867362976, + "logits/rejected": 0.41916173696517944, + "logps/chosen": -340.41949462890625, + "logps/rejected": -329.39324951171875, + "loss": 0.5494, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7232412099838257, + "rewards/margins": 0.7066032886505127, + "rewards/rejected": -1.429844617843628, + "step": 1300 + }, + { + "epoch": 0.6856843758178487, + "grad_norm": 420.0, + "learning_rate": 2.4650859002834465e-06, + "logits/chosen": 0.33335959911346436, + "logits/rejected": 0.5272048115730286, + "logps/chosen": -347.9224548339844, + "logps/rejected": -328.1867980957031, + "loss": 0.5436, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6804947853088379, + "rewards/margins": 0.6814740896224976, + "rewards/rejected": -1.3619688749313354, + "step": 1310 + }, + { + "epoch": 0.6909186076943209, + "grad_norm": 330.0, + "learning_rate": 2.390830438125661e-06, + "logits/chosen": 0.2588549256324768, + "logits/rejected": 0.298136442899704, + "logps/chosen": -352.1745910644531, + "logps/rejected": -324.03460693359375, + "loss": 0.595, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6992086172103882, + "rewards/margins": 0.6043084263801575, + "rewards/rejected": -1.3035171031951904, + "step": 1320 + }, + { + "epoch": 0.696152839570793, + "grad_norm": 266.0, + "learning_rate": 2.3173575341352457e-06, + "logits/chosen": 0.31691044569015503, + "logits/rejected": 0.4209415316581726, + "logps/chosen": -344.1717529296875, + "logps/rejected": -322.6068115234375, + "loss": 0.5849, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.5810431241989136, + "rewards/margins": 0.6289807558059692, + "rewards/rejected": -1.2100238800048828, + "step": 1330 + }, + { + "epoch": 0.7013870714472651, + "grad_norm": 316.0, + "learning_rate": 2.2446892247570257e-06, + "logits/chosen": 0.34166431427001953, + "logits/rejected": 0.4205591678619385, + "logps/chosen": -349.1050109863281, + "logps/rejected": -328.892822265625, + "loss": 0.5814, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.6503351926803589, + "rewards/margins": 0.6150357723236084, + "rewards/rejected": -1.2653712034225464, + "step": 1340 + }, + { + "epoch": 0.7066213033237373, + "grad_norm": 302.0, + "learning_rate": 2.172847305116872e-06, + "logits/chosen": 0.3496546149253845, + "logits/rejected": 0.339820921421051, + "logps/chosen": -345.417236328125, + "logps/rejected": -320.3566589355469, + "loss": 0.5675, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.5050859451293945, + "rewards/margins": 0.6172486543655396, + "rewards/rejected": -1.122334599494934, + "step": 1350 + }, + { + "epoch": 0.7118555352002094, + "grad_norm": 294.0, + "learning_rate": 2.1018533224847638e-06, + "logits/chosen": 0.36049994826316833, + "logits/rejected": 0.3396483063697815, + "logps/chosen": -375.68121337890625, + "logps/rejected": -331.56939697265625, + "loss": 0.5571, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5250564217567444, + "rewards/margins": 0.7472478151321411, + "rewards/rejected": -1.2723041772842407, + "step": 1360 + }, + { + "epoch": 0.7170897670766815, + "grad_norm": 264.0, + "learning_rate": 2.0317285698122035e-06, + "logits/chosen": 0.23286870121955872, + "logits/rejected": 0.4170301556587219, + "logps/chosen": -337.4573059082031, + "logps/rejected": -320.9345397949219, + "loss": 0.5347, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.49074649810791016, + "rewards/margins": 0.73365718126297, + "rewards/rejected": -1.2244036197662354, + "step": 1370 + }, + { + "epoch": 0.7223239989531536, + "grad_norm": 280.0, + "learning_rate": 1.962494079345906e-06, + "logits/chosen": 0.20548689365386963, + "logits/rejected": 0.2943686544895172, + "logps/chosen": -381.2185363769531, + "logps/rejected": -330.40753173828125, + "loss": 0.5545, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.536620557308197, + "rewards/margins": 0.7038464546203613, + "rewards/rejected": -1.2404670715332031, + "step": 1380 + }, + { + "epoch": 0.7275582308296258, + "grad_norm": 254.0, + "learning_rate": 1.8941706163196676e-06, + "logits/chosen": 0.38386040925979614, + "logits/rejected": 0.5160520672798157, + "logps/chosen": -312.7783203125, + "logps/rejected": -307.5871887207031, + "loss": 0.5393, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5432096719741821, + "rewards/margins": 0.667094349861145, + "rewards/rejected": -1.2103040218353271, + "step": 1390 + }, + { + "epoch": 0.7327924627060979, + "grad_norm": 255.0, + "learning_rate": 1.8267786727263426e-06, + "logits/chosen": 0.3777836263179779, + "logits/rejected": 0.4588887691497803, + "logps/chosen": -339.7103576660156, + "logps/rejected": -315.67547607421875, + "loss": 0.5454, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.5095704793930054, + "rewards/margins": 0.6885578632354736, + "rewards/rejected": -1.198128342628479, + "step": 1400 + }, + { + "epoch": 0.73802669458257, + "grad_norm": 249.0, + "learning_rate": 1.760338461171755e-06, + "logits/chosen": 0.33284902572631836, + "logits/rejected": 0.41128548979759216, + "logps/chosen": -326.13421630859375, + "logps/rejected": -321.0505065917969, + "loss": 0.5956, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6211567521095276, + "rewards/margins": 0.6012374758720398, + "rewards/rejected": -1.2223942279815674, + "step": 1410 + }, + { + "epoch": 0.7432609264590422, + "grad_norm": 278.0, + "learning_rate": 1.6948699088123992e-06, + "logits/chosen": 0.3391318917274475, + "logits/rejected": 0.35012945532798767, + "logps/chosen": -332.51690673828125, + "logps/rejected": -305.8824768066406, + "loss": 0.5685, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5398066639900208, + "rewards/margins": 0.6431677341461182, + "rewards/rejected": -1.1829744577407837, + "step": 1420 + }, + { + "epoch": 0.7484951583355143, + "grad_norm": 342.0, + "learning_rate": 1.6303926513787821e-06, + "logits/chosen": 0.19832518696784973, + "logits/rejected": 0.17641706764698029, + "logps/chosen": -337.8302001953125, + "logps/rejected": -308.45574951171875, + "loss": 0.5347, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5528967976570129, + "rewards/margins": 0.7190114855766296, + "rewards/rejected": -1.2719082832336426, + "step": 1430 + }, + { + "epoch": 0.7537293902119864, + "grad_norm": 292.0, + "learning_rate": 1.5669260272861426e-06, + "logits/chosen": 0.3353267014026642, + "logits/rejected": 0.34371891617774963, + "logps/chosen": -334.8343200683594, + "logps/rejected": -333.1044006347656, + "loss": 0.5223, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.514175534248352, + "rewards/margins": 0.7716984748840332, + "rewards/rejected": -1.2858738899230957, + "step": 1440 + }, + { + "epoch": 0.7589636220884585, + "grad_norm": 296.0, + "learning_rate": 1.5044890718343535e-06, + "logits/chosen": 0.3490106463432312, + "logits/rejected": 0.2644171118736267, + "logps/chosen": -323.67138671875, + "logps/rejected": -314.19647216796875, + "loss": 0.5748, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.6448107957839966, + "rewards/margins": 0.6091581583023071, + "rewards/rejected": -1.2539689540863037, + "step": 1450 + }, + { + "epoch": 0.7641978539649307, + "grad_norm": 300.0, + "learning_rate": 1.4431005114987485e-06, + "logits/chosen": 0.37269195914268494, + "logits/rejected": 0.3853556513786316, + "logps/chosen": -394.0393371582031, + "logps/rejected": -351.55548095703125, + "loss": 0.5406, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.4938036799430847, + "rewards/margins": 0.7198097109794617, + "rewards/rejected": -1.2136132717132568, + "step": 1460 + }, + { + "epoch": 0.7694320858414028, + "grad_norm": 302.0, + "learning_rate": 1.3827787583135533e-06, + "logits/chosen": 0.2608596086502075, + "logits/rejected": 0.3895898461341858, + "logps/chosen": -346.5641784667969, + "logps/rejected": -333.06097412109375, + "loss": 0.5955, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5579820871353149, + "rewards/margins": 0.6600214838981628, + "rewards/rejected": -1.218003511428833, + "step": 1470 + }, + { + "epoch": 0.7746663177178749, + "grad_norm": 384.0, + "learning_rate": 1.3235419043496362e-06, + "logits/chosen": 0.4145224094390869, + "logits/rejected": 0.5047595500946045, + "logps/chosen": -339.38018798828125, + "logps/rejected": -315.99114990234375, + "loss": 0.5873, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5417883992195129, + "rewards/margins": 0.6276899576187134, + "rewards/rejected": -1.169478178024292, + "step": 1480 + }, + { + "epoch": 0.7799005495943471, + "grad_norm": 308.0, + "learning_rate": 1.2654077162882271e-06, + "logits/chosen": 0.3089558482170105, + "logits/rejected": 0.33582228422164917, + "logps/chosen": -344.6625671386719, + "logps/rejected": -318.9618835449219, + "loss": 0.5562, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.5232676863670349, + "rewards/margins": 0.7178744077682495, + "rewards/rejected": -1.2411420345306396, + "step": 1490 + }, + { + "epoch": 0.7851347814708192, + "grad_norm": 290.0, + "learning_rate": 1.2083936300922238e-06, + "logits/chosen": 0.45748963952064514, + "logits/rejected": 0.5170606374740601, + "logps/chosen": -360.7559814453125, + "logps/rejected": -332.96112060546875, + "loss": 0.59, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5006940364837646, + "rewards/margins": 0.6657803058624268, + "rewards/rejected": -1.1664743423461914, + "step": 1500 + }, + { + "epoch": 0.7903690133472913, + "grad_norm": 242.0, + "learning_rate": 1.1525167457766856e-06, + "logits/chosen": 0.33311501145362854, + "logits/rejected": 0.3369537889957428, + "logps/chosen": -335.03680419921875, + "logps/rejected": -313.3392333984375, + "loss": 0.5675, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.5090693831443787, + "rewards/margins": 0.669906497001648, + "rewards/rejected": -1.1789758205413818, + "step": 1510 + }, + { + "epoch": 0.7956032452237635, + "grad_norm": 334.0, + "learning_rate": 1.0977938222801004e-06, + "logits/chosen": 0.36881986260414124, + "logits/rejected": 0.45787104964256287, + "logps/chosen": -338.68902587890625, + "logps/rejected": -313.40631103515625, + "loss": 0.5784, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.5590661764144897, + "rewards/margins": 0.6329992413520813, + "rewards/rejected": -1.1920652389526367, + "step": 1520 + }, + { + "epoch": 0.8008374771002356, + "grad_norm": 322.0, + "learning_rate": 1.0442412724379365e-06, + "logits/chosen": 0.26200932264328003, + "logits/rejected": 0.27576130628585815, + "logps/chosen": -344.83148193359375, + "logps/rejected": -293.1750793457031, + "loss": 0.5813, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.629385769367218, + "rewards/margins": 0.5875921845436096, + "rewards/rejected": -1.216977834701538, + "step": 1530 + }, + { + "epoch": 0.8060717089767077, + "grad_norm": 286.0, + "learning_rate": 9.9187515806e-07, + "logits/chosen": 0.44481024146080017, + "logits/rejected": 0.47209352254867554, + "logps/chosen": -366.44891357421875, + "logps/rejected": -317.8438415527344, + "loss": 0.565, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.4901696741580963, + "rewards/margins": 0.6533368825912476, + "rewards/rejected": -1.1435067653656006, + "step": 1540 + }, + { + "epoch": 0.8113059408531798, + "grad_norm": 282.0, + "learning_rate": 9.407111851130879e-07, + "logits/chosen": 0.43470582365989685, + "logits/rejected": 0.3668103814125061, + "logps/chosen": -337.2115783691406, + "logps/rejected": -318.40155029296875, + "loss": 0.5212, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -0.49172383546829224, + "rewards/margins": 0.7541629672050476, + "rewards/rejected": -1.2458868026733398, + "step": 1550 + }, + { + "epoch": 0.816540172729652, + "grad_norm": 298.0, + "learning_rate": 8.907646990103496e-07, + "logits/chosen": 0.37122786045074463, + "logits/rejected": 0.4722965657711029, + "logps/chosen": -329.2878723144531, + "logps/rejected": -302.6796569824219, + "loss": 0.533, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5332542657852173, + "rewards/margins": 0.6741858720779419, + "rewards/rejected": -1.2074401378631592, + "step": 1560 + }, + { + "epoch": 0.821774404606124, + "grad_norm": 234.0, + "learning_rate": 8.42050680008798e-07, + "logits/chosen": 0.2228083610534668, + "logits/rejected": 0.2700883150100708, + "logps/chosen": -343.5450744628906, + "logps/rejected": -327.5457458496094, + "loss": 0.561, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5294641256332397, + "rewards/margins": 0.6504173278808594, + "rewards/rejected": -1.1798814535140991, + "step": 1570 + }, + { + "epoch": 0.8270086364825961, + "grad_norm": 322.0, + "learning_rate": 7.945837387163424e-07, + "logits/chosen": 0.42028242349624634, + "logits/rejected": 0.4180065095424652, + "logps/chosen": -353.06536865234375, + "logps/rejected": -321.9817810058594, + "loss": 0.5844, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.5255261659622192, + "rewards/margins": 0.6547245979309082, + "rewards/rejected": -1.1802507638931274, + "step": 1580 + }, + { + "epoch": 0.8322428683590684, + "grad_norm": 380.0, + "learning_rate": 7.483781117096828e-07, + "logits/chosen": 0.38973018527030945, + "logits/rejected": 0.43399643898010254, + "logps/chosen": -370.450439453125, + "logps/rejected": -342.24395751953125, + "loss": 0.547, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.527172327041626, + "rewards/margins": 0.7201731204986572, + "rewards/rejected": -1.2473453283309937, + "step": 1590 + }, + { + "epoch": 0.8374771002355405, + "grad_norm": 372.0, + "learning_rate": 7.034476572643855e-07, + "logits/chosen": 0.4103736877441406, + "logits/rejected": 0.42073503136634827, + "logps/chosen": -348.49932861328125, + "logps/rejected": -322.942626953125, + "loss": 0.5787, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.5726643800735474, + "rewards/margins": 0.6367040872573853, + "rewards/rejected": -1.2093684673309326, + "step": 1600 + }, + { + "epoch": 0.8427113321120125, + "grad_norm": 324.0, + "learning_rate": 6.598058511984307e-07, + "logits/chosen": 0.4105502665042877, + "logits/rejected": 0.4338196814060211, + "logps/chosen": -334.387451171875, + "logps/rejected": -300.1074523925781, + "loss": 0.556, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.640805721282959, + "rewards/margins": 0.6867104768753052, + "rewards/rejected": -1.3275163173675537, + "step": 1610 + }, + { + "epoch": 0.8479455639884846, + "grad_norm": 246.0, + "learning_rate": 6.174657828304543e-07, + "logits/chosen": 0.3273460268974304, + "logits/rejected": 0.38331374526023865, + "logps/chosen": -332.36773681640625, + "logps/rejected": -316.30072021484375, + "loss": 0.6104, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.5995886325836182, + "rewards/margins": 0.5190034508705139, + "rewards/rejected": -1.1185920238494873, + "step": 1620 + }, + { + "epoch": 0.8531797958649568, + "grad_norm": 256.0, + "learning_rate": 5.764401510539253e-07, + "logits/chosen": 0.39274919033050537, + "logits/rejected": 0.30704575777053833, + "logps/chosen": -358.00335693359375, + "logps/rejected": -301.65203857421875, + "loss": 0.5616, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.5209869742393494, + "rewards/margins": 0.6791882514953613, + "rewards/rejected": -1.2001752853393555, + "step": 1630 + }, + { + "epoch": 0.8584140277414289, + "grad_norm": 230.0, + "learning_rate": 5.36741260528415e-07, + "logits/chosen": 0.2753371000289917, + "logits/rejected": 0.3843556344509125, + "logps/chosen": -372.30059814453125, + "logps/rejected": -354.3190002441406, + "loss": 0.4937, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.4627212584018707, + "rewards/margins": 0.8342711329460144, + "rewards/rejected": -1.296992540359497, + "step": 1640 + }, + { + "epoch": 0.863648259617901, + "grad_norm": 253.0, + "learning_rate": 4.98381017989103e-07, + "logits/chosen": 0.25747784972190857, + "logits/rejected": 0.3317343294620514, + "logps/chosen": -345.8427734375, + "logps/rejected": -307.8304748535156, + "loss": 0.5145, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -0.4770580232143402, + "rewards/margins": 0.7587519884109497, + "rewards/rejected": -1.2358100414276123, + "step": 1650 + }, + { + "epoch": 0.8688824914943732, + "grad_norm": 274.0, + "learning_rate": 4.6137092867564127e-07, + "logits/chosen": 0.3641647398471832, + "logits/rejected": 0.42129549384117126, + "logps/chosen": -317.44012451171875, + "logps/rejected": -299.4899597167969, + "loss": 0.5478, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5020959377288818, + "rewards/margins": 0.6763127446174622, + "rewards/rejected": -1.1784086227416992, + "step": 1660 + }, + { + "epoch": 0.8741167233708453, + "grad_norm": 253.0, + "learning_rate": 4.2572209288143095e-07, + "logits/chosen": 0.35885730385780334, + "logits/rejected": 0.3501403331756592, + "logps/chosen": -347.42071533203125, + "logps/rejected": -318.55279541015625, + "loss": 0.5766, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.5235342383384705, + "rewards/margins": 0.6213586926460266, + "rewards/rejected": -1.144892930984497, + "step": 1670 + }, + { + "epoch": 0.8793509552473174, + "grad_norm": 232.0, + "learning_rate": 3.9144520262435094e-07, + "logits/chosen": 0.35745617747306824, + "logits/rejected": 0.4186561703681946, + "logps/chosen": -373.4577941894531, + "logps/rejected": -316.75909423828125, + "loss": 0.5036, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.39865627884864807, + "rewards/margins": 0.827733039855957, + "rewards/rejected": -1.2263892889022827, + "step": 1680 + }, + { + "epoch": 0.8845851871237895, + "grad_norm": 348.0, + "learning_rate": 3.5855053843994625e-07, + "logits/chosen": 0.3469906747341156, + "logits/rejected": 0.3383873999118805, + "logps/chosen": -330.7817687988281, + "logps/rejected": -343.49249267578125, + "loss": 0.5905, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5502803921699524, + "rewards/margins": 0.5968191623687744, + "rewards/rejected": -1.1470996141433716, + "step": 1690 + }, + { + "epoch": 0.8898194190002617, + "grad_norm": 290.0, + "learning_rate": 3.270479662980247e-07, + "logits/chosen": 0.4720439314842224, + "logits/rejected": 0.507573127746582, + "logps/chosen": -340.6956787109375, + "logps/rejected": -328.48541259765625, + "loss": 0.5653, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5593769550323486, + "rewards/margins": 0.6956008672714233, + "rewards/rejected": -1.254977822303772, + "step": 1700 + }, + { + "epoch": 0.8950536508767338, + "grad_norm": 320.0, + "learning_rate": 2.9694693464359434e-07, + "logits/chosen": 0.36417311429977417, + "logits/rejected": 0.32077834010124207, + "logps/chosen": -358.11871337890625, + "logps/rejected": -348.39031982421875, + "loss": 0.55, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.5245743989944458, + "rewards/margins": 0.7115057706832886, + "rewards/rejected": -1.2360801696777344, + "step": 1710 + }, + { + "epoch": 0.9002878827532059, + "grad_norm": 302.0, + "learning_rate": 2.682564715630287e-07, + "logits/chosen": 0.33791905641555786, + "logits/rejected": 0.3675960600376129, + "logps/chosen": -351.0929870605469, + "logps/rejected": -325.4019775390625, + "loss": 0.5036, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.5013250112533569, + "rewards/margins": 0.8183242678642273, + "rewards/rejected": -1.319649338722229, + "step": 1720 + }, + { + "epoch": 0.9055221146296781, + "grad_norm": 384.0, + "learning_rate": 2.4098518207630706e-07, + "logits/chosen": 0.38433754444122314, + "logits/rejected": 0.397840678691864, + "logps/chosen": -345.4227294921875, + "logps/rejected": -300.4522399902344, + "loss": 0.5736, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.558509886264801, + "rewards/margins": 0.6275274157524109, + "rewards/rejected": -1.186037302017212, + "step": 1730 + }, + { + "epoch": 0.9107563465061502, + "grad_norm": 374.0, + "learning_rate": 2.1514124555614412e-07, + "logits/chosen": 0.2413160502910614, + "logits/rejected": 0.3199203610420227, + "logps/chosen": -373.3514099121094, + "logps/rejected": -336.98895263671875, + "loss": 0.5558, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.5669843554496765, + "rewards/margins": 0.670451283454895, + "rewards/rejected": -1.2374355792999268, + "step": 1740 + }, + { + "epoch": 0.9159905783826223, + "grad_norm": 274.0, + "learning_rate": 1.9073241327478287e-07, + "logits/chosen": 0.2592464089393616, + "logits/rejected": 0.2286282330751419, + "logps/chosen": -335.2867736816406, + "logps/rejected": -296.50677490234375, + "loss": 0.5734, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.5148959159851074, + "rewards/margins": 0.5882579684257507, + "rewards/rejected": -1.103153944015503, + "step": 1750 + }, + { + "epoch": 0.9212248102590945, + "grad_norm": 348.0, + "learning_rate": 1.677660060791836e-07, + "logits/chosen": 0.35103827714920044, + "logits/rejected": 0.36917150020599365, + "logps/chosen": -350.96551513671875, + "logps/rejected": -313.03997802734375, + "loss": 0.5246, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.4882447123527527, + "rewards/margins": 0.7595881819725037, + "rewards/rejected": -1.247833013534546, + "step": 1760 + }, + { + "epoch": 0.9264590421355666, + "grad_norm": 286.0, + "learning_rate": 1.4624891219531256e-07, + "logits/chosen": 0.3121943771839142, + "logits/rejected": 0.3472541272640228, + "logps/chosen": -350.4696044921875, + "logps/rejected": -318.84222412109375, + "loss": 0.5622, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5181047320365906, + "rewards/margins": 0.6555663347244263, + "rewards/rejected": -1.173671007156372, + "step": 1770 + }, + { + "epoch": 0.9316932740120387, + "grad_norm": 251.0, + "learning_rate": 1.2618758516218187e-07, + "logits/chosen": 0.38791024684906006, + "logits/rejected": 0.38494163751602173, + "logps/chosen": -309.5002746582031, + "logps/rejected": -289.1304016113281, + "loss": 0.5638, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.49522829055786133, + "rewards/margins": 0.622164249420166, + "rewards/rejected": -1.1173925399780273, + "step": 1780 + }, + { + "epoch": 0.9369275058885108, + "grad_norm": 336.0, + "learning_rate": 1.0758804189626492e-07, + "logits/chosen": 0.34429654479026794, + "logits/rejected": 0.43823686242103577, + "logps/chosen": -337.650146484375, + "logps/rejected": -311.02020263671875, + "loss": 0.5783, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5348609685897827, + "rewards/margins": 0.6113255620002747, + "rewards/rejected": -1.1461864709854126, + "step": 1790 + }, + { + "epoch": 0.942161737764983, + "grad_norm": 360.0, + "learning_rate": 9.045586088686497e-08, + "logits/chosen": 0.35523998737335205, + "logits/rejected": 0.29251137375831604, + "logps/chosen": -362.7002868652344, + "logps/rejected": -318.9178161621094, + "loss": 0.5547, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.5227453112602234, + "rewards/margins": 0.6950558423995972, + "rewards/rejected": -1.2178010940551758, + "step": 1800 + }, + { + "epoch": 0.9473959696414551, + "grad_norm": 370.0, + "learning_rate": 7.479618052298132e-08, + "logits/chosen": 0.46711522340774536, + "logits/rejected": 0.34828323125839233, + "logps/chosen": -365.4064636230469, + "logps/rejected": -346.60321044921875, + "loss": 0.5483, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5453445315361023, + "rewards/margins": 0.6945411562919617, + "rewards/rejected": -1.2398855686187744, + "step": 1810 + }, + { + "epoch": 0.9526302015179272, + "grad_norm": 272.0, + "learning_rate": 6.06136975521715e-08, + "logits/chosen": 0.21810802817344666, + "logits/rejected": 0.33456215262413025, + "logps/chosen": -362.4671630859375, + "logps/rejected": -329.0609436035156, + "loss": 0.5389, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.5769203305244446, + "rewards/margins": 0.7337585687637329, + "rewards/rejected": -1.3106788396835327, + "step": 1820 + }, + { + "epoch": 0.9578644333943994, + "grad_norm": 280.0, + "learning_rate": 4.7912665671874246e-08, + "logits/chosen": 0.2832115590572357, + "logits/rejected": 0.31732824444770813, + "logps/chosen": -343.995361328125, + "logps/rejected": -318.9495544433594, + "loss": 0.5587, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5360749959945679, + "rewards/margins": 0.6776861548423767, + "rewards/rejected": -1.2137610912322998, + "step": 1830 + }, + { + "epoch": 0.9630986652708715, + "grad_norm": 334.0, + "learning_rate": 3.669689425361444e-08, + "logits/chosen": 0.32280421257019043, + "logits/rejected": 0.36163073778152466, + "logps/chosen": -317.0058288574219, + "logps/rejected": -308.7008972167969, + "loss": 0.5632, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.4824526906013489, + "rewards/margins": 0.6204373240470886, + "rewards/rejected": -1.102890133857727, + "step": 1840 + }, + { + "epoch": 0.9683328971473436, + "grad_norm": 368.0, + "learning_rate": 2.6969747200472073e-08, + "logits/chosen": 0.378648579120636, + "logits/rejected": 0.5615746378898621, + "logps/chosen": -327.52935791015625, + "logps/rejected": -317.4973449707031, + "loss": 0.588, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.559417188167572, + "rewards/margins": 0.6587087512016296, + "rewards/rejected": -1.2181260585784912, + "step": 1850 + }, + { + "epoch": 0.9735671290238157, + "grad_norm": 340.0, + "learning_rate": 1.873414193816092e-08, + "logits/chosen": 0.3898963928222656, + "logits/rejected": 0.37868732213974, + "logps/chosen": -372.6197814941406, + "logps/rejected": -348.3377380371094, + "loss": 0.5311, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.5276554226875305, + "rewards/margins": 0.7353495359420776, + "rewards/rejected": -1.2630048990249634, + "step": 1860 + }, + { + "epoch": 0.9788013609002879, + "grad_norm": 422.0, + "learning_rate": 1.1992548540016858e-08, + "logits/chosen": 0.31078967452049255, + "logits/rejected": 0.2968718707561493, + "logps/chosen": -372.1559143066406, + "logps/rejected": -341.91424560546875, + "loss": 0.5663, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.5914555788040161, + "rewards/margins": 0.6402640342712402, + "rewards/rejected": -1.2317196130752563, + "step": 1870 + }, + { + "epoch": 0.98403559277676, + "grad_norm": 306.0, + "learning_rate": 6.746988986156e-09, + "logits/chosen": 0.37066927552223206, + "logits/rejected": 0.4628186821937561, + "logps/chosen": -331.226318359375, + "logps/rejected": -304.57989501953125, + "loss": 0.5334, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5163390636444092, + "rewards/margins": 0.7293740510940552, + "rewards/rejected": -1.245713233947754, + "step": 1880 + }, + { + "epoch": 0.9892698246532321, + "grad_norm": 326.0, + "learning_rate": 2.9990365570314874e-09, + "logits/chosen": 0.37458479404449463, + "logits/rejected": 0.33924776315689087, + "logps/chosen": -363.9075012207031, + "logps/rejected": -337.91070556640625, + "loss": 0.5197, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.5259437561035156, + "rewards/margins": 0.7279617190361023, + "rewards/rejected": -1.2539054155349731, + "step": 1890 + }, + { + "epoch": 0.9945040565297043, + "grad_norm": 338.0, + "learning_rate": 7.498153615653758e-10, + "logits/chosen": 0.327511191368103, + "logits/rejected": 0.3362283408641815, + "logps/chosen": -310.9585266113281, + "logps/rejected": -305.3989562988281, + "loss": 0.5745, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5321878790855408, + "rewards/margins": 0.6392725706100464, + "rewards/rejected": -1.171460509300232, + "step": 1900 + }, + { + "epoch": 0.9997382884061764, + "grad_norm": 304.0, + "learning_rate": 0.0, + "logits/chosen": 0.31302136182785034, + "logits/rejected": 0.31303030252456665, + "logps/chosen": -355.60076904296875, + "logps/rejected": -323.382568359375, + "loss": 0.5713, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5852067470550537, + "rewards/margins": 0.5970735549926758, + "rewards/rejected": -1.1822803020477295, + "step": 1910 + } + ], + "logging_steps": 10, + "max_steps": 1910, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}