diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,70503 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.995661605206074, + "eval_steps": 500, + "global_step": 4146, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014461315979754157, + "grad_norm": 17.68067024457695, + "learning_rate": 8e-08, + "logits/chosen": 0.8356107473373413, + "logits/rejected": 0.7603495717048645, + "logps/chosen": -0.8946685791015625, + "logps/rejected": -0.9055352807044983, + "loss": 1.0095, + "odds_ratio_loss": 0.7516021132469177, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0894668698310852, + "rewards/margins": 0.0010866625234484673, + "rewards/rejected": -0.09055352210998535, + "sft_loss": 0.8946685791015625, + "step": 1 + }, + { + "epoch": 0.0028922631959508315, + "grad_norm": 22.2584821655261, + "learning_rate": 1.6e-07, + "logits/chosen": 0.9862427115440369, + "logits/rejected": 0.8068673014640808, + "logps/chosen": -0.906627357006073, + "logps/rejected": -1.0544397830963135, + "loss": 1.0587, + "odds_ratio_loss": 0.66656893491745, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09066274017095566, + "rewards/margins": 0.01478125061839819, + "rewards/rejected": -0.10544399917125702, + "sft_loss": 0.906627357006073, + "step": 2 + }, + { + "epoch": 0.004338394793926247, + "grad_norm": 27.462300830950515, + "learning_rate": 2.4e-07, + "logits/chosen": 0.745313823223114, + "logits/rejected": 0.5240325927734375, + "logps/chosen": -1.166395664215088, + "logps/rejected": -1.1368509531021118, + "loss": 1.0855, + "odds_ratio_loss": 0.8221395015716553, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.11663956940174103, + "rewards/margins": -0.0029544695280492306, + "rewards/rejected": -0.11368509382009506, + "sft_loss": 1.166395664215088, + "step": 3 + }, + { + "epoch": 0.005784526391901663, + "grad_norm": 20.425495140968387, + "learning_rate": 3.2e-07, + "logits/chosen": 0.9010774493217468, + "logits/rejected": 0.7436051368713379, + "logps/chosen": -0.9683417081832886, + "logps/rejected": -1.0990208387374878, + "loss": 1.0477, + "odds_ratio_loss": 0.6798087358474731, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09683416783809662, + "rewards/margins": 0.013067921623587608, + "rewards/rejected": -0.10990208387374878, + "sft_loss": 0.9683417081832886, + "step": 4 + }, + { + "epoch": 0.0072306579898770785, + "grad_norm": 9.072577711772448, + "learning_rate": 4e-07, + "logits/chosen": 1.0971115827560425, + "logits/rejected": 0.9646638631820679, + "logps/chosen": -0.6877315044403076, + "logps/rejected": -1.0325212478637695, + "loss": 0.9536, + "odds_ratio_loss": 0.6098527908325195, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06877315789461136, + "rewards/margins": 0.03447896987199783, + "rewards/rejected": -0.10325212776660919, + "sft_loss": 0.6877315044403076, + "step": 5 + }, + { + "epoch": 0.008676789587852495, + "grad_norm": 31.366473181101718, + "learning_rate": 4.8e-07, + "logits/chosen": 1.1323935985565186, + "logits/rejected": 0.6428372263908386, + "logps/chosen": -0.9170743227005005, + "logps/rejected": -1.1397260427474976, + "loss": 1.0515, + "odds_ratio_loss": 0.6087831258773804, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09170743823051453, + "rewards/margins": 0.022265177220106125, + "rewards/rejected": -0.11397261917591095, + "sft_loss": 0.9170743227005005, + "step": 6 + }, + { + "epoch": 0.01012292118582791, + "grad_norm": 23.362133533438314, + "learning_rate": 5.6e-07, + "logits/chosen": 1.0590741634368896, + "logits/rejected": 0.8795340657234192, + "logps/chosen": -0.849834144115448, + "logps/rejected": -1.0384660959243774, + "loss": 0.9954, + "odds_ratio_loss": 0.6854482889175415, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08498341590166092, + "rewards/margins": 0.01886320300400257, + "rewards/rejected": -0.10384660959243774, + "sft_loss": 0.849834144115448, + "step": 7 + }, + { + "epoch": 0.011569052783803326, + "grad_norm": 13.59399093179875, + "learning_rate": 6.4e-07, + "logits/chosen": 0.9279472827911377, + "logits/rejected": 0.7937031388282776, + "logps/chosen": -0.8301913142204285, + "logps/rejected": -0.9943248629570007, + "loss": 0.9836, + "odds_ratio_loss": 0.6135038137435913, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08301912993192673, + "rewards/margins": 0.016413355246186256, + "rewards/rejected": -0.09943248331546783, + "sft_loss": 0.8301913142204285, + "step": 8 + }, + { + "epoch": 0.013015184381778741, + "grad_norm": 20.87366368622792, + "learning_rate": 7.2e-07, + "logits/chosen": 0.6361145973205566, + "logits/rejected": 0.5400397181510925, + "logps/chosen": -1.2082147598266602, + "logps/rejected": -1.314789056777954, + "loss": 1.0284, + "odds_ratio_loss": 0.7909836173057556, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.12082147598266602, + "rewards/margins": 0.010657444596290588, + "rewards/rejected": -0.1314789056777954, + "sft_loss": 1.2082147598266602, + "step": 9 + }, + { + "epoch": 0.014461315979754157, + "grad_norm": 35.6477296168724, + "learning_rate": 8e-07, + "logits/chosen": 0.7234739065170288, + "logits/rejected": 0.7677637338638306, + "logps/chosen": -0.9336320161819458, + "logps/rejected": -1.1697273254394531, + "loss": 0.9868, + "odds_ratio_loss": 0.6399700045585632, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0933631956577301, + "rewards/margins": 0.02360953576862812, + "rewards/rejected": -0.11697272211313248, + "sft_loss": 0.9336320161819458, + "step": 10 + }, + { + "epoch": 0.015907447577729574, + "grad_norm": 17.36589859929308, + "learning_rate": 8.799999999999999e-07, + "logits/chosen": 0.8094204664230347, + "logits/rejected": 0.6163336038589478, + "logps/chosen": -0.8817548751831055, + "logps/rejected": -0.9194456934928894, + "loss": 1.0479, + "odds_ratio_loss": 0.7149617075920105, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08817549049854279, + "rewards/margins": 0.003769081551581621, + "rewards/rejected": -0.09194456785917282, + "sft_loss": 0.8817548751831055, + "step": 11 + }, + { + "epoch": 0.01735357917570499, + "grad_norm": 7.785721880156769, + "learning_rate": 9.6e-07, + "logits/chosen": 0.6122461557388306, + "logits/rejected": 0.7143501043319702, + "logps/chosen": -0.9489305019378662, + "logps/rejected": -1.1337459087371826, + "loss": 0.951, + "odds_ratio_loss": 0.5687827467918396, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09489305317401886, + "rewards/margins": 0.018481535837054253, + "rewards/rejected": -0.11337459087371826, + "sft_loss": 0.9489305019378662, + "step": 12 + }, + { + "epoch": 0.018799710773680405, + "grad_norm": 11.720887718671964, + "learning_rate": 1.04e-06, + "logits/chosen": 0.7059281468391418, + "logits/rejected": 0.6184044480323792, + "logps/chosen": -0.8445357084274292, + "logps/rejected": -0.9474976062774658, + "loss": 0.8701, + "odds_ratio_loss": 0.7172947525978088, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08445357531309128, + "rewards/margins": 0.010296182706952095, + "rewards/rejected": -0.09474976360797882, + "sft_loss": 0.8445357084274292, + "step": 13 + }, + { + "epoch": 0.02024584237165582, + "grad_norm": 10.973659424574418, + "learning_rate": 1.12e-06, + "logits/chosen": 0.855961799621582, + "logits/rejected": 0.6677908897399902, + "logps/chosen": -1.0772851705551147, + "logps/rejected": -1.1868722438812256, + "loss": 1.004, + "odds_ratio_loss": 0.6617684364318848, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.107728511095047, + "rewards/margins": 0.010958710685372353, + "rewards/rejected": -0.1186872273683548, + "sft_loss": 1.0772851705551147, + "step": 14 + }, + { + "epoch": 0.021691973969631236, + "grad_norm": 10.00136448443639, + "learning_rate": 1.2e-06, + "logits/chosen": 0.8666166663169861, + "logits/rejected": 0.7374793887138367, + "logps/chosen": -0.6898748278617859, + "logps/rejected": -0.9876577854156494, + "loss": 0.954, + "odds_ratio_loss": 0.5177364349365234, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06898748129606247, + "rewards/margins": 0.02977829799056053, + "rewards/rejected": -0.0987657755613327, + "sft_loss": 0.6898748278617859, + "step": 15 + }, + { + "epoch": 0.023138105567606652, + "grad_norm": 7.339064707053332, + "learning_rate": 1.28e-06, + "logits/chosen": 1.0984623432159424, + "logits/rejected": 0.9263596534729004, + "logps/chosen": -0.9031826853752136, + "logps/rejected": -0.9198977947235107, + "loss": 1.0056, + "odds_ratio_loss": 0.7259780168533325, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09031827002763748, + "rewards/margins": 0.001671510748565197, + "rewards/rejected": -0.09198978543281555, + "sft_loss": 0.9031826853752136, + "step": 16 + }, + { + "epoch": 0.024584237165582067, + "grad_norm": 8.349121794398082, + "learning_rate": 1.3600000000000001e-06, + "logits/chosen": 0.958114743232727, + "logits/rejected": 0.7234222888946533, + "logps/chosen": -0.7752377390861511, + "logps/rejected": -1.0817121267318726, + "loss": 0.8369, + "odds_ratio_loss": 0.5174487233161926, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07752377539873123, + "rewards/margins": 0.030647438019514084, + "rewards/rejected": -0.10817121714353561, + "sft_loss": 0.7752377390861511, + "step": 17 + }, + { + "epoch": 0.026030368763557483, + "grad_norm": 16.74840406974954, + "learning_rate": 1.44e-06, + "logits/chosen": 1.036068081855774, + "logits/rejected": 0.7755584716796875, + "logps/chosen": -0.7796313166618347, + "logps/rejected": -0.8467473387718201, + "loss": 0.9468, + "odds_ratio_loss": 0.7774651050567627, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07796312868595123, + "rewards/margins": 0.006711603607982397, + "rewards/rejected": -0.08467473834753036, + "sft_loss": 0.7796313166618347, + "step": 18 + }, + { + "epoch": 0.0274765003615329, + "grad_norm": 19.71080839311607, + "learning_rate": 1.5199999999999998e-06, + "logits/chosen": 0.9443542957305908, + "logits/rejected": 0.7273315191268921, + "logps/chosen": -0.8325945138931274, + "logps/rejected": -0.9272810220718384, + "loss": 0.8364, + "odds_ratio_loss": 0.6661559343338013, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0832594484090805, + "rewards/margins": 0.009468646720051765, + "rewards/rejected": -0.09272809326648712, + "sft_loss": 0.8325945138931274, + "step": 19 + }, + { + "epoch": 0.028922631959508314, + "grad_norm": 17.932913869480345, + "learning_rate": 1.6e-06, + "logits/chosen": 0.9203794598579407, + "logits/rejected": 0.7660186290740967, + "logps/chosen": -0.8127734065055847, + "logps/rejected": -0.9844791889190674, + "loss": 0.8528, + "odds_ratio_loss": 0.6955982446670532, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08127734810113907, + "rewards/margins": 0.017170574516057968, + "rewards/rejected": -0.09844791889190674, + "sft_loss": 0.8127734065055847, + "step": 20 + }, + { + "epoch": 0.03036876355748373, + "grad_norm": 4.676737103695072, + "learning_rate": 1.6799999999999998e-06, + "logits/chosen": 1.0346933603286743, + "logits/rejected": 0.7640005350112915, + "logps/chosen": -0.7656794786453247, + "logps/rejected": -0.9085444211959839, + "loss": 0.8817, + "odds_ratio_loss": 0.5907818675041199, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07656795531511307, + "rewards/margins": 0.014286492951214314, + "rewards/rejected": -0.09085444360971451, + "sft_loss": 0.7656794786453247, + "step": 21 + }, + { + "epoch": 0.03181489515545915, + "grad_norm": 21.219968023645468, + "learning_rate": 1.7599999999999999e-06, + "logits/chosen": 0.7389025688171387, + "logits/rejected": 0.768907904624939, + "logps/chosen": -0.740533709526062, + "logps/rejected": -0.9216527938842773, + "loss": 0.9192, + "odds_ratio_loss": 0.7230217456817627, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07405337691307068, + "rewards/margins": 0.018111903220415115, + "rewards/rejected": -0.0921652764081955, + "sft_loss": 0.740533709526062, + "step": 22 + }, + { + "epoch": 0.033261026753434564, + "grad_norm": 5.1186934556905905, + "learning_rate": 1.84e-06, + "logits/chosen": 1.104678988456726, + "logits/rejected": 0.933571457862854, + "logps/chosen": -0.8342806696891785, + "logps/rejected": -1.0144786834716797, + "loss": 0.9273, + "odds_ratio_loss": 0.5798680186271667, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08342806994915009, + "rewards/margins": 0.018019797280430794, + "rewards/rejected": -0.10144786536693573, + "sft_loss": 0.8342806696891785, + "step": 23 + }, + { + "epoch": 0.03470715835140998, + "grad_norm": 5.996648991214411, + "learning_rate": 1.92e-06, + "logits/chosen": 0.6956485509872437, + "logits/rejected": 0.5334842801094055, + "logps/chosen": -0.8051484823226929, + "logps/rejected": -1.0607928037643433, + "loss": 0.9043, + "odds_ratio_loss": 0.5199005603790283, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08051484823226929, + "rewards/margins": 0.02556443400681019, + "rewards/rejected": -0.10607928782701492, + "sft_loss": 0.8051484823226929, + "step": 24 + }, + { + "epoch": 0.036153289949385395, + "grad_norm": 15.463181593194717, + "learning_rate": 2e-06, + "logits/chosen": 1.0711913108825684, + "logits/rejected": 0.6716903448104858, + "logps/chosen": -0.780259370803833, + "logps/rejected": -0.8929332494735718, + "loss": 0.8245, + "odds_ratio_loss": 0.6280741095542908, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0780259370803833, + "rewards/margins": 0.01126739289611578, + "rewards/rejected": -0.08929332345724106, + "sft_loss": 0.780259370803833, + "step": 25 + }, + { + "epoch": 0.03759942154736081, + "grad_norm": 6.4509394430823725, + "learning_rate": 2.08e-06, + "logits/chosen": 0.9420080184936523, + "logits/rejected": 0.8971505165100098, + "logps/chosen": -0.6603622436523438, + "logps/rejected": -0.9678102135658264, + "loss": 0.8143, + "odds_ratio_loss": 0.5521212816238403, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06603622436523438, + "rewards/margins": 0.030744802206754684, + "rewards/rejected": -0.09678103029727936, + "sft_loss": 0.6603622436523438, + "step": 26 + }, + { + "epoch": 0.039045553145336226, + "grad_norm": 5.904770765139368, + "learning_rate": 2.16e-06, + "logits/chosen": 1.0589203834533691, + "logits/rejected": 1.0087999105453491, + "logps/chosen": -0.9654819965362549, + "logps/rejected": -1.0529497861862183, + "loss": 0.9155, + "odds_ratio_loss": 0.7037606239318848, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09654819965362549, + "rewards/margins": 0.008746780455112457, + "rewards/rejected": -0.10529498755931854, + "sft_loss": 0.9654819965362549, + "step": 27 + }, + { + "epoch": 0.04049168474331164, + "grad_norm": 6.804983128940474, + "learning_rate": 2.24e-06, + "logits/chosen": 1.0854326486587524, + "logits/rejected": 0.7822256684303284, + "logps/chosen": -0.8072190284729004, + "logps/rejected": -1.1046161651611328, + "loss": 0.8532, + "odds_ratio_loss": 0.5510303378105164, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0807219073176384, + "rewards/margins": 0.029739707708358765, + "rewards/rejected": -0.11046161502599716, + "sft_loss": 0.8072190284729004, + "step": 28 + }, + { + "epoch": 0.04193781634128706, + "grad_norm": 11.155540190844036, + "learning_rate": 2.32e-06, + "logits/chosen": 0.9284826517105103, + "logits/rejected": 0.7190419435501099, + "logps/chosen": -0.84559166431427, + "logps/rejected": -0.9231205582618713, + "loss": 0.9285, + "odds_ratio_loss": 0.6862574815750122, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08455916494131088, + "rewards/margins": 0.007752885576337576, + "rewards/rejected": -0.0923120528459549, + "sft_loss": 0.84559166431427, + "step": 29 + }, + { + "epoch": 0.04338394793926247, + "grad_norm": 7.130648176543995, + "learning_rate": 2.4e-06, + "logits/chosen": 0.9986206293106079, + "logits/rejected": 0.7722131013870239, + "logps/chosen": -0.6998481154441833, + "logps/rejected": -1.2671633958816528, + "loss": 0.8881, + "odds_ratio_loss": 0.4308162033557892, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0699848085641861, + "rewards/margins": 0.05673152208328247, + "rewards/rejected": -0.12671633064746857, + "sft_loss": 0.6998481154441833, + "step": 30 + }, + { + "epoch": 0.04483007953723789, + "grad_norm": 7.964095468445476, + "learning_rate": 2.48e-06, + "logits/chosen": 1.0034689903259277, + "logits/rejected": 0.8177412152290344, + "logps/chosen": -0.7045868635177612, + "logps/rejected": -1.0203757286071777, + "loss": 0.8301, + "odds_ratio_loss": 0.4992942214012146, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07045868784189224, + "rewards/margins": 0.031578898429870605, + "rewards/rejected": -0.10203758627176285, + "sft_loss": 0.7045868635177612, + "step": 31 + }, + { + "epoch": 0.046276211135213303, + "grad_norm": 6.206570705290645, + "learning_rate": 2.56e-06, + "logits/chosen": 0.8828554749488831, + "logits/rejected": 0.8245723247528076, + "logps/chosen": -0.7559677362442017, + "logps/rejected": -0.7688103318214417, + "loss": 0.9242, + "odds_ratio_loss": 0.6954567432403564, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07559677958488464, + "rewards/margins": 0.001284262165427208, + "rewards/rejected": -0.0768810361623764, + "sft_loss": 0.7559677362442017, + "step": 32 + }, + { + "epoch": 0.04772234273318872, + "grad_norm": 3.819558444843904, + "learning_rate": 2.64e-06, + "logits/chosen": 0.9170816540718079, + "logits/rejected": 0.6471152305603027, + "logps/chosen": -0.7481220960617065, + "logps/rejected": -1.044447422027588, + "loss": 0.8164, + "odds_ratio_loss": 0.5162819623947144, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07481221854686737, + "rewards/margins": 0.029632527381181717, + "rewards/rejected": -0.10444474220275879, + "sft_loss": 0.7481220960617065, + "step": 33 + }, + { + "epoch": 0.049168474331164135, + "grad_norm": 17.468727974038213, + "learning_rate": 2.7200000000000002e-06, + "logits/chosen": 0.9004616737365723, + "logits/rejected": 0.9549974203109741, + "logps/chosen": -0.7914824485778809, + "logps/rejected": -0.9024128317832947, + "loss": 0.7975, + "odds_ratio_loss": 0.6575660705566406, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07914824038743973, + "rewards/margins": 0.011093037202954292, + "rewards/rejected": -0.09024128317832947, + "sft_loss": 0.7914824485778809, + "step": 34 + }, + { + "epoch": 0.05061460592913955, + "grad_norm": 4.6049278375914415, + "learning_rate": 2.8e-06, + "logits/chosen": 1.0355346202850342, + "logits/rejected": 0.8618839979171753, + "logps/chosen": -0.6818061470985413, + "logps/rejected": -1.0076775550842285, + "loss": 0.8934, + "odds_ratio_loss": 0.48088353872299194, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.068180613219738, + "rewards/margins": 0.03258715197443962, + "rewards/rejected": -0.10076776891946793, + "sft_loss": 0.6818061470985413, + "step": 35 + }, + { + "epoch": 0.052060737527114966, + "grad_norm": 10.126587564107808, + "learning_rate": 2.88e-06, + "logits/chosen": 1.0104275941848755, + "logits/rejected": 0.8126014471054077, + "logps/chosen": -1.0583109855651855, + "logps/rejected": -1.188212513923645, + "loss": 0.9078, + "odds_ratio_loss": 0.6410647630691528, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1058310940861702, + "rewards/margins": 0.012990152463316917, + "rewards/rejected": -0.11882124096155167, + "sft_loss": 1.0583109855651855, + "step": 36 + }, + { + "epoch": 0.05350686912509038, + "grad_norm": 5.195790645822001, + "learning_rate": 2.96e-06, + "logits/chosen": 0.8879537582397461, + "logits/rejected": 0.8749307990074158, + "logps/chosen": -0.9341248273849487, + "logps/rejected": -1.0359203815460205, + "loss": 0.8659, + "odds_ratio_loss": 0.6952499151229858, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09341248869895935, + "rewards/margins": 0.0101795494556427, + "rewards/rejected": -0.10359203815460205, + "sft_loss": 0.9341248273849487, + "step": 37 + }, + { + "epoch": 0.0549530007230658, + "grad_norm": 7.028715936791847, + "learning_rate": 3.0399999999999997e-06, + "logits/chosen": 0.8880399465560913, + "logits/rejected": 0.7450892329216003, + "logps/chosen": -0.7892625331878662, + "logps/rejected": -0.7776023745536804, + "loss": 0.8412, + "odds_ratio_loss": 0.7238245010375977, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07892625778913498, + "rewards/margins": -0.0011660188902169466, + "rewards/rejected": -0.0777602344751358, + "sft_loss": 0.7892625331878662, + "step": 38 + }, + { + "epoch": 0.05639913232104121, + "grad_norm": 5.64693417602307, + "learning_rate": 3.1199999999999998e-06, + "logits/chosen": 0.9538003206253052, + "logits/rejected": 0.8920505046844482, + "logps/chosen": -0.6508951187133789, + "logps/rejected": -1.040766954421997, + "loss": 0.8128, + "odds_ratio_loss": 0.4561525881290436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06508950889110565, + "rewards/margins": 0.03898719325661659, + "rewards/rejected": -0.10407670587301254, + "sft_loss": 0.6508951187133789, + "step": 39 + }, + { + "epoch": 0.05784526391901663, + "grad_norm": 4.834895359786458, + "learning_rate": 3.2e-06, + "logits/chosen": 1.149649977684021, + "logits/rejected": 0.9461228251457214, + "logps/chosen": -0.9168272614479065, + "logps/rejected": -1.1571921110153198, + "loss": 0.8322, + "odds_ratio_loss": 0.6043152809143066, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09168273210525513, + "rewards/margins": 0.02403649315237999, + "rewards/rejected": -0.11571922153234482, + "sft_loss": 0.9168272614479065, + "step": 40 + }, + { + "epoch": 0.05929139551699204, + "grad_norm": 4.061085611051016, + "learning_rate": 3.2799999999999995e-06, + "logits/chosen": 1.1228294372558594, + "logits/rejected": 0.8763277530670166, + "logps/chosen": -0.7228974103927612, + "logps/rejected": -0.8757832050323486, + "loss": 0.8719, + "odds_ratio_loss": 0.6970602869987488, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07228974252939224, + "rewards/margins": 0.015288583002984524, + "rewards/rejected": -0.08757832646369934, + "sft_loss": 0.7228974103927612, + "step": 41 + }, + { + "epoch": 0.06073752711496746, + "grad_norm": 4.580481948452303, + "learning_rate": 3.3599999999999996e-06, + "logits/chosen": 1.060896873474121, + "logits/rejected": 0.8334064483642578, + "logps/chosen": -0.6850025653839111, + "logps/rejected": -1.0060864686965942, + "loss": 0.7318, + "odds_ratio_loss": 0.5280728340148926, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06850025802850723, + "rewards/margins": 0.03210839629173279, + "rewards/rejected": -0.10060865432024002, + "sft_loss": 0.6850025653839111, + "step": 42 + }, + { + "epoch": 0.06218365871294288, + "grad_norm": 5.169320832414527, + "learning_rate": 3.4399999999999997e-06, + "logits/chosen": 1.1155328750610352, + "logits/rejected": 0.7692084908485413, + "logps/chosen": -0.6570387482643127, + "logps/rejected": -0.9917082786560059, + "loss": 0.7906, + "odds_ratio_loss": 0.5790050625801086, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0657038763165474, + "rewards/margins": 0.03346695005893707, + "rewards/rejected": -0.09917082637548447, + "sft_loss": 0.6570387482643127, + "step": 43 + }, + { + "epoch": 0.0636297903109183, + "grad_norm": 3.998113789272701, + "learning_rate": 3.5199999999999998e-06, + "logits/chosen": 0.9994779229164124, + "logits/rejected": 0.7162806391716003, + "logps/chosen": -0.7262680530548096, + "logps/rejected": -0.9695625305175781, + "loss": 0.8926, + "odds_ratio_loss": 0.5879533886909485, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07262679934501648, + "rewards/margins": 0.024329453706741333, + "rewards/rejected": -0.09695626050233841, + "sft_loss": 0.7262680530548096, + "step": 44 + }, + { + "epoch": 0.0650759219088937, + "grad_norm": 3.4506987115134757, + "learning_rate": 3.6e-06, + "logits/chosen": 0.8653795123100281, + "logits/rejected": 0.7805371284484863, + "logps/chosen": -0.7925112247467041, + "logps/rejected": -1.1502509117126465, + "loss": 0.7794, + "odds_ratio_loss": 0.5484931468963623, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07925112545490265, + "rewards/margins": 0.035773973912000656, + "rewards/rejected": -0.11502508819103241, + "sft_loss": 0.7925112247467041, + "step": 45 + }, + { + "epoch": 0.06652205350686913, + "grad_norm": 2.9338860329360443, + "learning_rate": 3.68e-06, + "logits/chosen": 0.9942594766616821, + "logits/rejected": 0.6031086444854736, + "logps/chosen": -0.7195640802383423, + "logps/rejected": -0.9865086674690247, + "loss": 0.8429, + "odds_ratio_loss": 0.5378471612930298, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07195641100406647, + "rewards/margins": 0.026694457978010178, + "rewards/rejected": -0.09865086525678635, + "sft_loss": 0.7195640802383423, + "step": 46 + }, + { + "epoch": 0.06796818510484454, + "grad_norm": 18.444891203967817, + "learning_rate": 3.7599999999999996e-06, + "logits/chosen": 1.0228843688964844, + "logits/rejected": 0.6981637477874756, + "logps/chosen": -0.8705933094024658, + "logps/rejected": -1.2415589094161987, + "loss": 0.9371, + "odds_ratio_loss": 0.8004380464553833, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08705933392047882, + "rewards/margins": 0.037096552550792694, + "rewards/rejected": -0.12415588647127151, + "sft_loss": 0.8705933094024658, + "step": 47 + }, + { + "epoch": 0.06941431670281996, + "grad_norm": 4.737185826037039, + "learning_rate": 3.84e-06, + "logits/chosen": 1.0492089986801147, + "logits/rejected": 0.6930069327354431, + "logps/chosen": -0.8137787580490112, + "logps/rejected": -1.0236539840698242, + "loss": 0.8259, + "odds_ratio_loss": 0.6219763159751892, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08137787878513336, + "rewards/margins": 0.020987525582313538, + "rewards/rejected": -0.1023654043674469, + "sft_loss": 0.8137787580490112, + "step": 48 + }, + { + "epoch": 0.07086044830079537, + "grad_norm": 20.118458343186862, + "learning_rate": 3.92e-06, + "logits/chosen": 0.8911822438240051, + "logits/rejected": 0.6944270730018616, + "logps/chosen": -1.052188515663147, + "logps/rejected": -1.200345754623413, + "loss": 1.0211, + "odds_ratio_loss": 0.6875659823417664, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.10521887242794037, + "rewards/margins": 0.01481570117175579, + "rewards/rejected": -0.12003456056118011, + "sft_loss": 1.052188515663147, + "step": 49 + }, + { + "epoch": 0.07230657989877079, + "grad_norm": 5.7579792293688845, + "learning_rate": 4e-06, + "logits/chosen": 0.7346884608268738, + "logits/rejected": 0.660639226436615, + "logps/chosen": -0.9779757261276245, + "logps/rejected": -1.0913139581680298, + "loss": 0.9103, + "odds_ratio_loss": 0.6801553964614868, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09779756516218185, + "rewards/margins": 0.0113338278606534, + "rewards/rejected": -0.10913139581680298, + "sft_loss": 0.9779757261276245, + "step": 50 + }, + { + "epoch": 0.0737527114967462, + "grad_norm": 3.985226415418088, + "learning_rate": 4.08e-06, + "logits/chosen": 1.0270240306854248, + "logits/rejected": 0.8079010248184204, + "logps/chosen": -0.6934676766395569, + "logps/rejected": -1.0339841842651367, + "loss": 0.8186, + "odds_ratio_loss": 0.4910920560359955, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06934677064418793, + "rewards/margins": 0.034051645547151566, + "rewards/rejected": -0.1033984124660492, + "sft_loss": 0.6934676766395569, + "step": 51 + }, + { + "epoch": 0.07519884309472162, + "grad_norm": 7.042436013353386, + "learning_rate": 4.16e-06, + "logits/chosen": 0.9413070678710938, + "logits/rejected": 0.7744332551956177, + "logps/chosen": -0.7994893193244934, + "logps/rejected": -0.9427841901779175, + "loss": 0.9243, + "odds_ratio_loss": 0.6500593423843384, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07994893193244934, + "rewards/margins": 0.014329486526548862, + "rewards/rejected": -0.09427841752767563, + "sft_loss": 0.7994893193244934, + "step": 52 + }, + { + "epoch": 0.07664497469269703, + "grad_norm": 7.37511851741436, + "learning_rate": 4.24e-06, + "logits/chosen": 1.0415973663330078, + "logits/rejected": 0.8271567821502686, + "logps/chosen": -0.7861368656158447, + "logps/rejected": -0.9846646189689636, + "loss": 0.798, + "odds_ratio_loss": 0.613193690776825, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07861369103193283, + "rewards/margins": 0.01985277608036995, + "rewards/rejected": -0.09846646338701248, + "sft_loss": 0.7861368656158447, + "step": 53 + }, + { + "epoch": 0.07809110629067245, + "grad_norm": 5.530049183138087, + "learning_rate": 4.32e-06, + "logits/chosen": 1.035746455192566, + "logits/rejected": 0.6262372732162476, + "logps/chosen": -0.7784286737442017, + "logps/rejected": -1.2029107809066772, + "loss": 0.8983, + "odds_ratio_loss": 0.49279114603996277, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07784287631511688, + "rewards/margins": 0.042448196560144424, + "rewards/rejected": -0.1202910766005516, + "sft_loss": 0.7784286737442017, + "step": 54 + }, + { + "epoch": 0.07953723788864786, + "grad_norm": 6.675216545716233, + "learning_rate": 4.4e-06, + "logits/chosen": 0.8669562935829163, + "logits/rejected": 0.6869562268257141, + "logps/chosen": -0.9464516639709473, + "logps/rejected": -1.1197850704193115, + "loss": 0.8484, + "odds_ratio_loss": 0.608762800693512, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09464516490697861, + "rewards/margins": 0.017333339899778366, + "rewards/rejected": -0.11197850108146667, + "sft_loss": 0.9464516639709473, + "step": 55 + }, + { + "epoch": 0.08098336948662328, + "grad_norm": 9.900030241867466, + "learning_rate": 4.48e-06, + "logits/chosen": 0.9601394534111023, + "logits/rejected": 0.7478969097137451, + "logps/chosen": -0.6744643449783325, + "logps/rejected": -1.155207872390747, + "loss": 0.8861, + "odds_ratio_loss": 0.47762393951416016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06744643300771713, + "rewards/margins": 0.04807435721158981, + "rewards/rejected": -0.11552079021930695, + "sft_loss": 0.6744643449783325, + "step": 56 + }, + { + "epoch": 0.0824295010845987, + "grad_norm": 5.686458237142553, + "learning_rate": 4.5599999999999995e-06, + "logits/chosen": 1.0790021419525146, + "logits/rejected": 1.103777289390564, + "logps/chosen": -0.7211272716522217, + "logps/rejected": -0.8694078922271729, + "loss": 0.8806, + "odds_ratio_loss": 0.6249120235443115, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07211272418498993, + "rewards/margins": 0.014828063547611237, + "rewards/rejected": -0.08694078773260117, + "sft_loss": 0.7211272716522217, + "step": 57 + }, + { + "epoch": 0.08387563268257411, + "grad_norm": 8.777827739224744, + "learning_rate": 4.64e-06, + "logits/chosen": 1.056907296180725, + "logits/rejected": 0.8694510459899902, + "logps/chosen": -0.7749738097190857, + "logps/rejected": -1.1565377712249756, + "loss": 0.8714, + "odds_ratio_loss": 0.5057281255722046, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07749737799167633, + "rewards/margins": 0.0381564125418663, + "rewards/rejected": -0.11565379798412323, + "sft_loss": 0.7749738097190857, + "step": 58 + }, + { + "epoch": 0.08532176428054954, + "grad_norm": 3.0283569241613524, + "learning_rate": 4.72e-06, + "logits/chosen": 0.9298826456069946, + "logits/rejected": 0.6310389041900635, + "logps/chosen": -0.6535084843635559, + "logps/rejected": -1.1013919115066528, + "loss": 0.7376, + "odds_ratio_loss": 0.42935678362846375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06535085290670395, + "rewards/margins": 0.04478834196925163, + "rewards/rejected": -0.11013919860124588, + "sft_loss": 0.6535084843635559, + "step": 59 + }, + { + "epoch": 0.08676789587852494, + "grad_norm": 4.29846372956217, + "learning_rate": 4.8e-06, + "logits/chosen": 1.125542163848877, + "logits/rejected": 0.9014356732368469, + "logps/chosen": -0.6315572261810303, + "logps/rejected": -1.0562748908996582, + "loss": 0.7112, + "odds_ratio_loss": 0.4755699634552002, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06315572559833527, + "rewards/margins": 0.04247176647186279, + "rewards/rejected": -0.10562749207019806, + "sft_loss": 0.6315572261810303, + "step": 60 + }, + { + "epoch": 0.08821402747650037, + "grad_norm": 3.541070990089084, + "learning_rate": 4.88e-06, + "logits/chosen": 1.202345848083496, + "logits/rejected": 1.018904209136963, + "logps/chosen": -0.5439103841781616, + "logps/rejected": -0.8144343495368958, + "loss": 0.7895, + "odds_ratio_loss": 0.5063339471817017, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.054391033947467804, + "rewards/margins": 0.02705240435898304, + "rewards/rejected": -0.0814434364438057, + "sft_loss": 0.5439103841781616, + "step": 61 + }, + { + "epoch": 0.08966015907447578, + "grad_norm": 8.368866284697733, + "learning_rate": 4.96e-06, + "logits/chosen": 1.0831687450408936, + "logits/rejected": 1.0173500776290894, + "logps/chosen": -0.7749509811401367, + "logps/rejected": -0.9368792176246643, + "loss": 0.9018, + "odds_ratio_loss": 0.6482864618301392, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07749509811401367, + "rewards/margins": 0.01619281992316246, + "rewards/rejected": -0.09368792176246643, + "sft_loss": 0.7749509811401367, + "step": 62 + }, + { + "epoch": 0.0911062906724512, + "grad_norm": 18.13366140105839, + "learning_rate": 5.04e-06, + "logits/chosen": 0.8514754176139832, + "logits/rejected": 0.6560556888580322, + "logps/chosen": -1.131075382232666, + "logps/rejected": -1.4477269649505615, + "loss": 1.0699, + "odds_ratio_loss": 0.6726313233375549, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11310753226280212, + "rewards/margins": 0.03166516125202179, + "rewards/rejected": -0.1447726935148239, + "sft_loss": 1.131075382232666, + "step": 63 + }, + { + "epoch": 0.09255242227042661, + "grad_norm": 5.535853982596553, + "learning_rate": 5.12e-06, + "logits/chosen": 1.1037675142288208, + "logits/rejected": 0.8205037713050842, + "logps/chosen": -0.712691068649292, + "logps/rejected": -0.9677368998527527, + "loss": 0.8002, + "odds_ratio_loss": 0.5520382523536682, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07126910239458084, + "rewards/margins": 0.025504592806100845, + "rewards/rejected": -0.09677369147539139, + "sft_loss": 0.712691068649292, + "step": 64 + }, + { + "epoch": 0.09399855386840203, + "grad_norm": 3.238469408471894, + "learning_rate": 5.2e-06, + "logits/chosen": 0.8979614973068237, + "logits/rejected": 0.8939811587333679, + "logps/chosen": -0.9045255184173584, + "logps/rejected": -1.1835026741027832, + "loss": 0.8535, + "odds_ratio_loss": 0.5290065407752991, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09045255184173584, + "rewards/margins": 0.027897723019123077, + "rewards/rejected": -0.11835027486085892, + "sft_loss": 0.9045255184173584, + "step": 65 + }, + { + "epoch": 0.09544468546637744, + "grad_norm": 3.9235480660049364, + "learning_rate": 5.28e-06, + "logits/chosen": 1.0252755880355835, + "logits/rejected": 0.805016040802002, + "logps/chosen": -0.6900225877761841, + "logps/rejected": -1.1014788150787354, + "loss": 0.8608, + "odds_ratio_loss": 0.4513680636882782, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06900225579738617, + "rewards/margins": 0.041145630180835724, + "rewards/rejected": -0.11014789342880249, + "sft_loss": 0.6900225877761841, + "step": 66 + }, + { + "epoch": 0.09689081706435286, + "grad_norm": 4.096161339643125, + "learning_rate": 5.36e-06, + "logits/chosen": 0.8171386122703552, + "logits/rejected": 0.8907574415206909, + "logps/chosen": -0.7569859027862549, + "logps/rejected": -0.9295750856399536, + "loss": 0.8071, + "odds_ratio_loss": 0.5909979343414307, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07569858431816101, + "rewards/margins": 0.01725892536342144, + "rewards/rejected": -0.0929575189948082, + "sft_loss": 0.7569859027862549, + "step": 67 + }, + { + "epoch": 0.09833694866232827, + "grad_norm": 3.4880666252048034, + "learning_rate": 5.4400000000000004e-06, + "logits/chosen": 1.0990238189697266, + "logits/rejected": 0.8180453777313232, + "logps/chosen": -0.5353269577026367, + "logps/rejected": -1.0225234031677246, + "loss": 0.7102, + "odds_ratio_loss": 0.44182831048965454, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05353269353508949, + "rewards/margins": 0.04871963709592819, + "rewards/rejected": -0.10225233435630798, + "sft_loss": 0.5353269577026367, + "step": 68 + }, + { + "epoch": 0.09978308026030369, + "grad_norm": 4.957151765389429, + "learning_rate": 5.52e-06, + "logits/chosen": 1.0731085538864136, + "logits/rejected": 1.0340485572814941, + "logps/chosen": -0.779766321182251, + "logps/rejected": -0.9066184163093567, + "loss": 0.7684, + "odds_ratio_loss": 0.6572247743606567, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07797663658857346, + "rewards/margins": 0.012685209512710571, + "rewards/rejected": -0.09066184610128403, + "sft_loss": 0.779766321182251, + "step": 69 + }, + { + "epoch": 0.1012292118582791, + "grad_norm": 3.1824426288429173, + "learning_rate": 5.6e-06, + "logits/chosen": 1.1628271341323853, + "logits/rejected": 0.8257616758346558, + "logps/chosen": -0.6837524771690369, + "logps/rejected": -1.061213731765747, + "loss": 0.8337, + "odds_ratio_loss": 0.46075576543807983, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06837525963783264, + "rewards/margins": 0.03774610906839371, + "rewards/rejected": -0.10612136870622635, + "sft_loss": 0.6837524771690369, + "step": 70 + }, + { + "epoch": 0.10267534345625452, + "grad_norm": 2.9877543497021897, + "learning_rate": 5.68e-06, + "logits/chosen": 1.220341682434082, + "logits/rejected": 0.8999754190444946, + "logps/chosen": -0.7730412483215332, + "logps/rejected": -0.9635111093521118, + "loss": 0.811, + "odds_ratio_loss": 0.6032207012176514, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07730412483215332, + "rewards/margins": 0.01904698833823204, + "rewards/rejected": -0.09635111689567566, + "sft_loss": 0.7730412483215332, + "step": 71 + }, + { + "epoch": 0.10412147505422993, + "grad_norm": 3.68575891866247, + "learning_rate": 5.76e-06, + "logits/chosen": 1.1874583959579468, + "logits/rejected": 1.072020411491394, + "logps/chosen": -0.9427322149276733, + "logps/rejected": -1.102614402770996, + "loss": 0.8952, + "odds_ratio_loss": 0.6729242205619812, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09427322447299957, + "rewards/margins": 0.015988217666745186, + "rewards/rejected": -0.11026144027709961, + "sft_loss": 0.9427322149276733, + "step": 72 + }, + { + "epoch": 0.10556760665220535, + "grad_norm": 2.8520049474926448, + "learning_rate": 5.84e-06, + "logits/chosen": 0.9496423006057739, + "logits/rejected": 0.68682461977005, + "logps/chosen": -0.84450763463974, + "logps/rejected": -1.4252830743789673, + "loss": 0.824, + "odds_ratio_loss": 0.4710613489151001, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08445076644420624, + "rewards/margins": 0.058077551424503326, + "rewards/rejected": -0.14252832531929016, + "sft_loss": 0.84450763463974, + "step": 73 + }, + { + "epoch": 0.10701373825018076, + "grad_norm": 4.909446839208547, + "learning_rate": 5.92e-06, + "logits/chosen": 1.0577809810638428, + "logits/rejected": 1.0075559616088867, + "logps/chosen": -0.7429163455963135, + "logps/rejected": -1.2081857919692993, + "loss": 0.8528, + "odds_ratio_loss": 0.47998422384262085, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07429163157939911, + "rewards/margins": 0.046526938676834106, + "rewards/rejected": -0.12081858515739441, + "sft_loss": 0.7429163455963135, + "step": 74 + }, + { + "epoch": 0.10845986984815618, + "grad_norm": 8.719192985116054, + "learning_rate": 6e-06, + "logits/chosen": 1.0399186611175537, + "logits/rejected": 0.7236427068710327, + "logps/chosen": -0.8451824188232422, + "logps/rejected": -1.1076613664627075, + "loss": 0.9515, + "odds_ratio_loss": 0.5856386423110962, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08451823890209198, + "rewards/margins": 0.026247896254062653, + "rewards/rejected": -0.11076614260673523, + "sft_loss": 0.8451824188232422, + "step": 75 + }, + { + "epoch": 0.1099060014461316, + "grad_norm": 3.242774804776767, + "learning_rate": 6.079999999999999e-06, + "logits/chosen": 1.170623540878296, + "logits/rejected": 0.9636479616165161, + "logps/chosen": -0.7744165658950806, + "logps/rejected": -0.9548482298851013, + "loss": 0.8192, + "odds_ratio_loss": 0.6630096435546875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07744166254997253, + "rewards/margins": 0.018043160438537598, + "rewards/rejected": -0.09548482298851013, + "sft_loss": 0.7744165658950806, + "step": 76 + }, + { + "epoch": 0.11135213304410702, + "grad_norm": 5.606115194740814, + "learning_rate": 6.1599999999999995e-06, + "logits/chosen": 1.2055904865264893, + "logits/rejected": 0.8868299126625061, + "logps/chosen": -1.0346038341522217, + "logps/rejected": -0.9845407009124756, + "loss": 0.9325, + "odds_ratio_loss": 0.8492572903633118, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1034604012966156, + "rewards/margins": -0.005006318911910057, + "rewards/rejected": -0.0984540730714798, + "sft_loss": 1.0346038341522217, + "step": 77 + }, + { + "epoch": 0.11279826464208242, + "grad_norm": 11.819359350343193, + "learning_rate": 6.2399999999999995e-06, + "logits/chosen": 1.2517834901809692, + "logits/rejected": 0.7903429269790649, + "logps/chosen": -0.7485107779502869, + "logps/rejected": -1.4334845542907715, + "loss": 0.8361, + "odds_ratio_loss": 0.37815889716148376, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07485108077526093, + "rewards/margins": 0.06849737465381622, + "rewards/rejected": -0.14334845542907715, + "sft_loss": 0.7485107779502869, + "step": 78 + }, + { + "epoch": 0.11424439624005785, + "grad_norm": 3.8517494454673367, + "learning_rate": 6.32e-06, + "logits/chosen": 1.035347580909729, + "logits/rejected": 0.7966654896736145, + "logps/chosen": -0.7934662103652954, + "logps/rejected": -1.0752906799316406, + "loss": 0.784, + "odds_ratio_loss": 0.5487212538719177, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07934662699699402, + "rewards/margins": 0.028182443231344223, + "rewards/rejected": -0.10752906650304794, + "sft_loss": 0.7934662103652954, + "step": 79 + }, + { + "epoch": 0.11569052783803326, + "grad_norm": 5.067148096067779, + "learning_rate": 6.4e-06, + "logits/chosen": 1.0562264919281006, + "logits/rejected": 0.9451851844787598, + "logps/chosen": -0.8409098386764526, + "logps/rejected": -1.0445072650909424, + "loss": 0.8479, + "odds_ratio_loss": 0.5876671671867371, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08409098535776138, + "rewards/margins": 0.020359739661216736, + "rewards/rejected": -0.10445072501897812, + "sft_loss": 0.8409098386764526, + "step": 80 + }, + { + "epoch": 0.11713665943600868, + "grad_norm": 3.52750504575538, + "learning_rate": 6.48e-06, + "logits/chosen": 0.8893054127693176, + "logits/rejected": 0.852543830871582, + "logps/chosen": -0.799626350402832, + "logps/rejected": -1.193359613418579, + "loss": 0.819, + "odds_ratio_loss": 0.514946460723877, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07996264100074768, + "rewards/margins": 0.03937332704663277, + "rewards/rejected": -0.11933596432209015, + "sft_loss": 0.799626350402832, + "step": 81 + }, + { + "epoch": 0.11858279103398409, + "grad_norm": 7.884839298424228, + "learning_rate": 6.559999999999999e-06, + "logits/chosen": 0.9608660936355591, + "logits/rejected": 0.7938075065612793, + "logps/chosen": -0.905261218547821, + "logps/rejected": -1.2442588806152344, + "loss": 0.9495, + "odds_ratio_loss": 0.7865235209465027, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.09052613377571106, + "rewards/margins": 0.03389975428581238, + "rewards/rejected": -0.12442588806152344, + "sft_loss": 0.905261218547821, + "step": 82 + }, + { + "epoch": 0.12002892263195951, + "grad_norm": 3.7300532504299255, + "learning_rate": 6.639999999999999e-06, + "logits/chosen": 0.8842788934707642, + "logits/rejected": 0.8357954025268555, + "logps/chosen": -0.8731819987297058, + "logps/rejected": -0.9422979950904846, + "loss": 0.8644, + "odds_ratio_loss": 0.6957225203514099, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08731820434331894, + "rewards/margins": 0.0069116028025746346, + "rewards/rejected": -0.0942298099398613, + "sft_loss": 0.8731819987297058, + "step": 83 + }, + { + "epoch": 0.12147505422993492, + "grad_norm": 4.523959440898583, + "learning_rate": 6.719999999999999e-06, + "logits/chosen": 0.8383230566978455, + "logits/rejected": 0.9067773222923279, + "logps/chosen": -0.9250085353851318, + "logps/rejected": -1.1533596515655518, + "loss": 0.9015, + "odds_ratio_loss": 0.6070276498794556, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09250085055828094, + "rewards/margins": 0.022835111245512962, + "rewards/rejected": -0.11533597111701965, + "sft_loss": 0.9250085353851318, + "step": 84 + }, + { + "epoch": 0.12292118582791034, + "grad_norm": 4.009564684524501, + "learning_rate": 6.799999999999999e-06, + "logits/chosen": 1.1185507774353027, + "logits/rejected": 0.9861254096031189, + "logps/chosen": -0.7019460201263428, + "logps/rejected": -1.3582534790039062, + "loss": 0.7674, + "odds_ratio_loss": 0.38255906105041504, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07019460946321487, + "rewards/margins": 0.06563074886798859, + "rewards/rejected": -0.13582536578178406, + "sft_loss": 0.7019460201263428, + "step": 85 + }, + { + "epoch": 0.12436731742588576, + "grad_norm": 4.978681123457067, + "learning_rate": 6.879999999999999e-06, + "logits/chosen": 1.1274616718292236, + "logits/rejected": 1.0533956289291382, + "logps/chosen": -0.8610905408859253, + "logps/rejected": -1.079627275466919, + "loss": 1.054, + "odds_ratio_loss": 0.6273439526557922, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08610905706882477, + "rewards/margins": 0.021853668615221977, + "rewards/rejected": -0.1079627275466919, + "sft_loss": 0.8610905408859253, + "step": 86 + }, + { + "epoch": 0.12581344902386118, + "grad_norm": 3.3881381629270027, + "learning_rate": 6.9599999999999994e-06, + "logits/chosen": 0.7648379802703857, + "logits/rejected": 0.7650954723358154, + "logps/chosen": -0.9075650572776794, + "logps/rejected": -0.9574316740036011, + "loss": 0.832, + "odds_ratio_loss": 0.740204930305481, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09075652062892914, + "rewards/margins": 0.0049866605550050735, + "rewards/rejected": -0.09574317187070847, + "sft_loss": 0.9075650572776794, + "step": 87 + }, + { + "epoch": 0.1272595806218366, + "grad_norm": 4.3362701314683685, + "learning_rate": 7.0399999999999995e-06, + "logits/chosen": 1.11726975440979, + "logits/rejected": 0.7689473032951355, + "logps/chosen": -0.806981086730957, + "logps/rejected": -1.3281915187835693, + "loss": 0.7672, + "odds_ratio_loss": 0.45331400632858276, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08069811016321182, + "rewards/margins": 0.05212104693055153, + "rewards/rejected": -0.13281914591789246, + "sft_loss": 0.806981086730957, + "step": 88 + }, + { + "epoch": 0.128705712219812, + "grad_norm": 5.657518637155685, + "learning_rate": 7.12e-06, + "logits/chosen": 0.9810636043548584, + "logits/rejected": 0.6966180205345154, + "logps/chosen": -0.844375491142273, + "logps/rejected": -1.1458146572113037, + "loss": 0.8386, + "odds_ratio_loss": 0.5362038612365723, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08443755656480789, + "rewards/margins": 0.03014390543103218, + "rewards/rejected": -0.11458146572113037, + "sft_loss": 0.844375491142273, + "step": 89 + }, + { + "epoch": 0.1301518438177874, + "grad_norm": 4.112138737639026, + "learning_rate": 7.2e-06, + "logits/chosen": 1.0501034259796143, + "logits/rejected": 0.9047867059707642, + "logps/chosen": -0.7257125973701477, + "logps/rejected": -0.9122753739356995, + "loss": 0.894, + "odds_ratio_loss": 0.5964730381965637, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07257126271724701, + "rewards/margins": 0.018656279891729355, + "rewards/rejected": -0.09122754633426666, + "sft_loss": 0.7257125973701477, + "step": 90 + }, + { + "epoch": 0.13159797541576285, + "grad_norm": 3.329297429235451, + "learning_rate": 7.28e-06, + "logits/chosen": 0.9696775078773499, + "logits/rejected": 0.7746231555938721, + "logps/chosen": -0.8063006401062012, + "logps/rejected": -1.0086421966552734, + "loss": 0.812, + "odds_ratio_loss": 0.5819357633590698, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08063006401062012, + "rewards/margins": 0.02023415081202984, + "rewards/rejected": -0.1008642166852951, + "sft_loss": 0.8063006401062012, + "step": 91 + }, + { + "epoch": 0.13304410701373826, + "grad_norm": 3.335099806630708, + "learning_rate": 7.36e-06, + "logits/chosen": 0.8014536499977112, + "logits/rejected": 0.7237277626991272, + "logps/chosen": -0.9466049671173096, + "logps/rejected": -1.1349971294403076, + "loss": 0.8838, + "odds_ratio_loss": 0.6838469505310059, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09466049075126648, + "rewards/margins": 0.018839217722415924, + "rewards/rejected": -0.113499715924263, + "sft_loss": 0.9466049671173096, + "step": 92 + }, + { + "epoch": 0.13449023861171366, + "grad_norm": 3.6165391263690747, + "learning_rate": 7.44e-06, + "logits/chosen": 0.9842814803123474, + "logits/rejected": 0.7658122181892395, + "logps/chosen": -0.8842167258262634, + "logps/rejected": -1.298095703125, + "loss": 0.9687, + "odds_ratio_loss": 0.47397667169570923, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08842167258262634, + "rewards/margins": 0.0413878932595253, + "rewards/rejected": -0.12980955839157104, + "sft_loss": 0.8842167258262634, + "step": 93 + }, + { + "epoch": 0.13593637020968907, + "grad_norm": 3.258394073264351, + "learning_rate": 7.519999999999999e-06, + "logits/chosen": 1.2068235874176025, + "logits/rejected": 0.7209275960922241, + "logps/chosen": -0.7621604204177856, + "logps/rejected": -1.1626864671707153, + "loss": 0.8619, + "odds_ratio_loss": 0.5013617277145386, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07621604204177856, + "rewards/margins": 0.04005260765552521, + "rewards/rejected": -0.11626865714788437, + "sft_loss": 0.7621604204177856, + "step": 94 + }, + { + "epoch": 0.1373825018076645, + "grad_norm": 2.8180662547268556, + "learning_rate": 7.599999999999999e-06, + "logits/chosen": 0.8703403472900391, + "logits/rejected": 0.8108032941818237, + "logps/chosen": -0.7662641406059265, + "logps/rejected": -1.0566703081130981, + "loss": 0.797, + "odds_ratio_loss": 0.5328947305679321, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07662642002105713, + "rewards/margins": 0.029040617868304253, + "rewards/rejected": -0.10566703975200653, + "sft_loss": 0.7662641406059265, + "step": 95 + }, + { + "epoch": 0.13882863340563992, + "grad_norm": 3.6926906103204753, + "learning_rate": 7.68e-06, + "logits/chosen": 1.23716139793396, + "logits/rejected": 0.7248931527137756, + "logps/chosen": -0.8130195736885071, + "logps/rejected": -1.3673142194747925, + "loss": 0.8516, + "odds_ratio_loss": 0.5090689659118652, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0813019648194313, + "rewards/margins": 0.055429454892873764, + "rewards/rejected": -0.13673141598701477, + "sft_loss": 0.8130195736885071, + "step": 96 + }, + { + "epoch": 0.14027476500361533, + "grad_norm": 3.004623867002305, + "learning_rate": 7.76e-06, + "logits/chosen": 1.098660945892334, + "logits/rejected": 0.9489208459854126, + "logps/chosen": -0.8788610100746155, + "logps/rejected": -1.3326481580734253, + "loss": 0.88, + "odds_ratio_loss": 0.5792851448059082, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08788609504699707, + "rewards/margins": 0.04537871479988098, + "rewards/rejected": -0.13326480984687805, + "sft_loss": 0.8788610100746155, + "step": 97 + }, + { + "epoch": 0.14172089660159073, + "grad_norm": 4.588818222593908, + "learning_rate": 7.84e-06, + "logits/chosen": 0.9910817742347717, + "logits/rejected": 0.7520518898963928, + "logps/chosen": -0.9339778423309326, + "logps/rejected": -1.2882939577102661, + "loss": 0.8916, + "odds_ratio_loss": 0.5939985513687134, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09339778870344162, + "rewards/margins": 0.03543160483241081, + "rewards/rejected": -0.12882938981056213, + "sft_loss": 0.9339778423309326, + "step": 98 + }, + { + "epoch": 0.14316702819956617, + "grad_norm": 4.138791016089956, + "learning_rate": 7.92e-06, + "logits/chosen": 1.126113772392273, + "logits/rejected": 1.0925661325454712, + "logps/chosen": -0.6955535411834717, + "logps/rejected": -1.1417237520217896, + "loss": 0.8094, + "odds_ratio_loss": 0.46277695894241333, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06955534964799881, + "rewards/margins": 0.04461703449487686, + "rewards/rejected": -0.11417238414287567, + "sft_loss": 0.6955535411834717, + "step": 99 + }, + { + "epoch": 0.14461315979754158, + "grad_norm": 5.006479057556612, + "learning_rate": 8e-06, + "logits/chosen": 0.9892964363098145, + "logits/rejected": 0.9845679998397827, + "logps/chosen": -0.7721818089485168, + "logps/rejected": -1.2382351160049438, + "loss": 0.9171, + "odds_ratio_loss": 0.5086143016815186, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0772181823849678, + "rewards/margins": 0.04660532623529434, + "rewards/rejected": -0.12382350862026215, + "sft_loss": 0.7721818089485168, + "step": 100 + }, + { + "epoch": 0.146059291395517, + "grad_norm": 4.221494833102751, + "learning_rate": 7.999998794192551e-06, + "logits/chosen": 1.0973759889602661, + "logits/rejected": 0.825110912322998, + "logps/chosen": -0.7033730745315552, + "logps/rejected": -0.8850679397583008, + "loss": 0.9284, + "odds_ratio_loss": 0.8899041414260864, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07033731043338776, + "rewards/margins": 0.01816948503255844, + "rewards/rejected": -0.0885067954659462, + "sft_loss": 0.7033730745315552, + "step": 101 + }, + { + "epoch": 0.1475054229934924, + "grad_norm": 4.217626175448476, + "learning_rate": 7.999995176770932e-06, + "logits/chosen": 1.0476391315460205, + "logits/rejected": 1.20012366771698, + "logps/chosen": -0.7490733861923218, + "logps/rejected": -1.357642650604248, + "loss": 1.0154, + "odds_ratio_loss": 0.416873574256897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0749073401093483, + "rewards/margins": 0.060856934636831284, + "rewards/rejected": -0.13576427102088928, + "sft_loss": 0.7490733861923218, + "step": 102 + }, + { + "epoch": 0.14895155459146783, + "grad_norm": 3.870844944694995, + "learning_rate": 7.999989147737321e-06, + "logits/chosen": 1.3996965885162354, + "logits/rejected": 1.0624024868011475, + "logps/chosen": -0.7202589511871338, + "logps/rejected": -1.4208965301513672, + "loss": 0.8005, + "odds_ratio_loss": 0.5439056754112244, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07202590256929398, + "rewards/margins": 0.0700637698173523, + "rewards/rejected": -0.14208966493606567, + "sft_loss": 0.7202589511871338, + "step": 103 + }, + { + "epoch": 0.15039768618944324, + "grad_norm": 7.09212049587876, + "learning_rate": 7.999980707095359e-06, + "logits/chosen": 0.975654125213623, + "logits/rejected": 0.6775309443473816, + "logps/chosen": -0.8399592638015747, + "logps/rejected": -1.3394160270690918, + "loss": 0.8881, + "odds_ratio_loss": 0.5169605016708374, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08399593830108643, + "rewards/margins": 0.04994567483663559, + "rewards/rejected": -0.13394160568714142, + "sft_loss": 0.8399592638015747, + "step": 104 + }, + { + "epoch": 0.15184381778741865, + "grad_norm": 3.2017348907968923, + "learning_rate": 7.99996985485013e-06, + "logits/chosen": 1.3523914813995361, + "logits/rejected": 1.1231534481048584, + "logps/chosen": -0.8947268724441528, + "logps/rejected": -0.947187066078186, + "loss": 0.9017, + "odds_ratio_loss": 0.6707051992416382, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0894726812839508, + "rewards/margins": 0.005246022716164589, + "rewards/rejected": -0.09471870958805084, + "sft_loss": 0.8947268724441528, + "step": 105 + }, + { + "epoch": 0.15328994938539406, + "grad_norm": 5.116065229940842, + "learning_rate": 7.999956591008177e-06, + "logits/chosen": 0.9761826395988464, + "logits/rejected": 0.8456287384033203, + "logps/chosen": -0.9132041931152344, + "logps/rejected": -1.2559480667114258, + "loss": 0.8568, + "odds_ratio_loss": 0.5640711784362793, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09132041782140732, + "rewards/margins": 0.0342743881046772, + "rewards/rejected": -0.12559480965137482, + "sft_loss": 0.9132041931152344, + "step": 106 + }, + { + "epoch": 0.1547360809833695, + "grad_norm": 4.402587157653877, + "learning_rate": 7.999940915577498e-06, + "logits/chosen": 1.186305046081543, + "logits/rejected": 0.9557572603225708, + "logps/chosen": -0.8537756204605103, + "logps/rejected": -1.1455714702606201, + "loss": 0.8694, + "odds_ratio_loss": 0.5543307065963745, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08537755906581879, + "rewards/margins": 0.029179591685533524, + "rewards/rejected": -0.11455715447664261, + "sft_loss": 0.8537756204605103, + "step": 107 + }, + { + "epoch": 0.1561822125813449, + "grad_norm": 5.333316678653816, + "learning_rate": 7.999922828567544e-06, + "logits/chosen": 1.1584198474884033, + "logits/rejected": 0.866698145866394, + "logps/chosen": -0.7809406518936157, + "logps/rejected": -1.0734394788742065, + "loss": 1.0352, + "odds_ratio_loss": 0.6391094326972961, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07809406518936157, + "rewards/margins": 0.029249876737594604, + "rewards/rejected": -0.10734394937753677, + "sft_loss": 0.7809406518936157, + "step": 108 + }, + { + "epoch": 0.1576283441793203, + "grad_norm": 4.181964422446741, + "learning_rate": 7.999902329989218e-06, + "logits/chosen": 1.1614594459533691, + "logits/rejected": 0.9762617349624634, + "logps/chosen": -0.751151978969574, + "logps/rejected": -1.218088150024414, + "loss": 1.0105, + "odds_ratio_loss": 0.5704274773597717, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07511519640684128, + "rewards/margins": 0.04669361934065819, + "rewards/rejected": -0.12180881202220917, + "sft_loss": 0.751151978969574, + "step": 109 + }, + { + "epoch": 0.15907447577729572, + "grad_norm": 4.998106576378202, + "learning_rate": 7.999879419854883e-06, + "logits/chosen": 0.8305407762527466, + "logits/rejected": 0.789115309715271, + "logps/chosen": -0.7767509818077087, + "logps/rejected": -1.8016185760498047, + "loss": 0.9043, + "odds_ratio_loss": 0.41118738055229187, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07767509669065475, + "rewards/margins": 0.1024867594242096, + "rewards/rejected": -0.18016186356544495, + "sft_loss": 0.7767509818077087, + "step": 110 + }, + { + "epoch": 0.16052060737527116, + "grad_norm": 2.7549476854728794, + "learning_rate": 7.999854098178346e-06, + "logits/chosen": 1.3881924152374268, + "logits/rejected": 1.1433719396591187, + "logps/chosen": -0.6934785842895508, + "logps/rejected": -1.2178714275360107, + "loss": 0.8135, + "odds_ratio_loss": 0.49856168031692505, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06934786587953568, + "rewards/margins": 0.052439287304878235, + "rewards/rejected": -0.12178714573383331, + "sft_loss": 0.6934785842895508, + "step": 111 + }, + { + "epoch": 0.16196673897324657, + "grad_norm": 3.946293342352624, + "learning_rate": 7.999826364974878e-06, + "logits/chosen": 1.0656780004501343, + "logits/rejected": 0.935299277305603, + "logps/chosen": -0.7571520805358887, + "logps/rejected": -1.3282155990600586, + "loss": 0.982, + "odds_ratio_loss": 0.48986226320266724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07571521401405334, + "rewards/margins": 0.057106345891952515, + "rewards/rejected": -0.13282155990600586, + "sft_loss": 0.7571520805358887, + "step": 112 + }, + { + "epoch": 0.16341287057122197, + "grad_norm": 4.909329819670107, + "learning_rate": 7.999796220261196e-06, + "logits/chosen": 1.2778562307357788, + "logits/rejected": 1.0329546928405762, + "logps/chosen": -0.84651118516922, + "logps/rejected": -1.1883376836776733, + "loss": 0.7856, + "odds_ratio_loss": 0.5662205219268799, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08465111255645752, + "rewards/margins": 0.034182652831077576, + "rewards/rejected": -0.11883377283811569, + "sft_loss": 0.84651118516922, + "step": 113 + }, + { + "epoch": 0.1648590021691974, + "grad_norm": 3.3384814962446705, + "learning_rate": 7.999763664055477e-06, + "logits/chosen": 1.0280776023864746, + "logits/rejected": 0.9973467588424683, + "logps/chosen": -0.8428975939750671, + "logps/rejected": -1.4291057586669922, + "loss": 0.7546, + "odds_ratio_loss": 0.48032891750335693, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08428976684808731, + "rewards/margins": 0.05862081050872803, + "rewards/rejected": -0.14291056990623474, + "sft_loss": 0.8428975939750671, + "step": 114 + }, + { + "epoch": 0.16630513376717282, + "grad_norm": 8.183910290224793, + "learning_rate": 7.999728696377347e-06, + "logits/chosen": 1.1547966003417969, + "logits/rejected": 0.9317610859870911, + "logps/chosen": -0.8324732184410095, + "logps/rejected": -0.9054718017578125, + "loss": 0.8927, + "odds_ratio_loss": 0.7408847808837891, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08324733376502991, + "rewards/margins": 0.007299853954464197, + "rewards/rejected": -0.09054718911647797, + "sft_loss": 0.8324732184410095, + "step": 115 + }, + { + "epoch": 0.16775126536514823, + "grad_norm": 8.690597190815746, + "learning_rate": 7.99969131724789e-06, + "logits/chosen": 1.031779170036316, + "logits/rejected": 0.9053889513015747, + "logps/chosen": -0.647568941116333, + "logps/rejected": -1.9913418292999268, + "loss": 0.8437, + "odds_ratio_loss": 0.4320135712623596, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06475690007209778, + "rewards/margins": 0.13437728583812714, + "rewards/rejected": -0.1991342008113861, + "sft_loss": 0.647568941116333, + "step": 116 + }, + { + "epoch": 0.16919739696312364, + "grad_norm": 3.5044517946016094, + "learning_rate": 7.999651526689642e-06, + "logits/chosen": 0.9917743802070618, + "logits/rejected": 0.8863325119018555, + "logps/chosen": -0.793087899684906, + "logps/rejected": -1.0273563861846924, + "loss": 0.8403, + "odds_ratio_loss": 0.6240065693855286, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07930880039930344, + "rewards/margins": 0.023426856845617294, + "rewards/rejected": -0.10273565351963043, + "sft_loss": 0.793087899684906, + "step": 117 + }, + { + "epoch": 0.17064352856109907, + "grad_norm": 3.342116463726969, + "learning_rate": 7.999609324726592e-06, + "logits/chosen": 1.1911059617996216, + "logits/rejected": 0.9521145820617676, + "logps/chosen": -0.7328106760978699, + "logps/rejected": -1.5902066230773926, + "loss": 0.8601, + "odds_ratio_loss": 0.40612202882766724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07328107208013535, + "rewards/margins": 0.08573959767818451, + "rewards/rejected": -0.15902066230773926, + "sft_loss": 0.7328106760978699, + "step": 118 + }, + { + "epoch": 0.17208966015907448, + "grad_norm": 2.9956931508799247, + "learning_rate": 7.999564711384184e-06, + "logits/chosen": 0.7753307223320007, + "logits/rejected": 0.7809234261512756, + "logps/chosen": -0.8056610822677612, + "logps/rejected": -1.2226719856262207, + "loss": 0.8651, + "odds_ratio_loss": 0.5064537525177002, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08056611567735672, + "rewards/margins": 0.04170108586549759, + "rewards/rejected": -0.12226720154285431, + "sft_loss": 0.8056610822677612, + "step": 119 + }, + { + "epoch": 0.1735357917570499, + "grad_norm": 4.895879965141485, + "learning_rate": 7.999517686689316e-06, + "logits/chosen": 1.0470901727676392, + "logits/rejected": 0.868865966796875, + "logps/chosen": -0.701599657535553, + "logps/rejected": -1.2374787330627441, + "loss": 0.9475, + "odds_ratio_loss": 0.5663205981254578, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07015997171401978, + "rewards/margins": 0.0535879023373127, + "rewards/rejected": -0.12374787032604218, + "sft_loss": 0.701599657535553, + "step": 120 + }, + { + "epoch": 0.1749819233550253, + "grad_norm": 3.877019988352723, + "learning_rate": 7.999468250670339e-06, + "logits/chosen": 0.8767350912094116, + "logits/rejected": 0.8941935300827026, + "logps/chosen": -0.7906222939491272, + "logps/rejected": -1.284137487411499, + "loss": 0.7921, + "odds_ratio_loss": 0.4683469235897064, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07906222343444824, + "rewards/margins": 0.0493515208363533, + "rewards/rejected": -0.12841375172138214, + "sft_loss": 0.7906222939491272, + "step": 121 + }, + { + "epoch": 0.17642805495300073, + "grad_norm": 13.592899631151674, + "learning_rate": 7.999416403357056e-06, + "logits/chosen": 0.7820194959640503, + "logits/rejected": 0.8491408824920654, + "logps/chosen": -0.8906112909317017, + "logps/rejected": -1.2906492948532104, + "loss": 0.862, + "odds_ratio_loss": 0.5522098541259766, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08906114101409912, + "rewards/margins": 0.04000380262732506, + "rewards/rejected": -0.12906494736671448, + "sft_loss": 0.8906112909317017, + "step": 122 + }, + { + "epoch": 0.17787418655097614, + "grad_norm": 2.9079992073602368, + "learning_rate": 7.99936214478073e-06, + "logits/chosen": 1.0372124910354614, + "logits/rejected": 0.8244482278823853, + "logps/chosen": -0.7030653953552246, + "logps/rejected": -1.417474627494812, + "loss": 0.7689, + "odds_ratio_loss": 0.3927401304244995, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07030653953552246, + "rewards/margins": 0.0714409276843071, + "rewards/rejected": -0.14174747467041016, + "sft_loss": 0.7030653953552246, + "step": 123 + }, + { + "epoch": 0.17932031814895155, + "grad_norm": 3.1826994675798077, + "learning_rate": 7.999305474974071e-06, + "logits/chosen": 1.1816697120666504, + "logits/rejected": 0.942751407623291, + "logps/chosen": -0.7592461109161377, + "logps/rejected": -1.1128010749816895, + "loss": 0.8271, + "odds_ratio_loss": 0.5115275382995605, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07592461258172989, + "rewards/margins": 0.03535548970103264, + "rewards/rejected": -0.11128010600805283, + "sft_loss": 0.7592461109161377, + "step": 124 + }, + { + "epoch": 0.18076644974692696, + "grad_norm": 3.0381911136860293, + "learning_rate": 7.999246393971247e-06, + "logits/chosen": 1.0905792713165283, + "logits/rejected": 0.7886046171188354, + "logps/chosen": -0.7659871578216553, + "logps/rejected": -1.5863971710205078, + "loss": 0.8179, + "odds_ratio_loss": 0.36050310730934143, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07659871876239777, + "rewards/margins": 0.08204102516174316, + "rewards/rejected": -0.15863972902297974, + "sft_loss": 0.7659871578216553, + "step": 125 + }, + { + "epoch": 0.1822125813449024, + "grad_norm": 10.206680986022551, + "learning_rate": 7.999184901807875e-06, + "logits/chosen": 0.8782986402511597, + "logits/rejected": 0.9173144698143005, + "logps/chosen": -0.8764703273773193, + "logps/rejected": -1.0771946907043457, + "loss": 0.9931, + "odds_ratio_loss": 0.6936411261558533, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08764703571796417, + "rewards/margins": 0.020072437822818756, + "rewards/rejected": -0.10771947354078293, + "sft_loss": 0.8764703273773193, + "step": 126 + }, + { + "epoch": 0.1836587129428778, + "grad_norm": 3.1106884257684007, + "learning_rate": 7.999120998521033e-06, + "logits/chosen": 1.0604857206344604, + "logits/rejected": 1.0624693632125854, + "logps/chosen": -0.7787372469902039, + "logps/rejected": -1.3081343173980713, + "loss": 0.8462, + "odds_ratio_loss": 0.47298046946525574, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07787372171878815, + "rewards/margins": 0.05293971672654152, + "rewards/rejected": -0.13081344962120056, + "sft_loss": 0.7787372469902039, + "step": 127 + }, + { + "epoch": 0.18510484454085321, + "grad_norm": 5.440440093591614, + "learning_rate": 7.999054684149247e-06, + "logits/chosen": 1.07607102394104, + "logits/rejected": 1.0282245874404907, + "logps/chosen": -0.7584375143051147, + "logps/rejected": -1.117292046546936, + "loss": 0.8178, + "odds_ratio_loss": 0.5175266861915588, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07584375143051147, + "rewards/margins": 0.035885464400053024, + "rewards/rejected": -0.1117292046546936, + "sft_loss": 0.7584375143051147, + "step": 128 + }, + { + "epoch": 0.18655097613882862, + "grad_norm": 9.357508492499358, + "learning_rate": 7.998985958732496e-06, + "logits/chosen": 1.1318176984786987, + "logits/rejected": 0.8266797065734863, + "logps/chosen": -0.8652774095535278, + "logps/rejected": -1.303208351135254, + "loss": 0.9764, + "odds_ratio_loss": 0.5121663212776184, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0865277424454689, + "rewards/margins": 0.04379308968782425, + "rewards/rejected": -0.13032083213329315, + "sft_loss": 0.8652774095535278, + "step": 129 + }, + { + "epoch": 0.18799710773680406, + "grad_norm": 4.527334579057526, + "learning_rate": 7.998914822312218e-06, + "logits/chosen": 0.8850480318069458, + "logits/rejected": 0.8174777626991272, + "logps/chosen": -0.9658189415931702, + "logps/rejected": -1.1547739505767822, + "loss": 0.9466, + "odds_ratio_loss": 0.6745478510856628, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09658190608024597, + "rewards/margins": 0.01889548823237419, + "rewards/rejected": -0.11547739058732986, + "sft_loss": 0.9658189415931702, + "step": 130 + }, + { + "epoch": 0.18944323933477947, + "grad_norm": 4.443262215950921, + "learning_rate": 7.998841274931302e-06, + "logits/chosen": 1.0955193042755127, + "logits/rejected": 0.7597486972808838, + "logps/chosen": -0.7344867587089539, + "logps/rejected": -1.3775670528411865, + "loss": 0.8939, + "odds_ratio_loss": 0.5716267824172974, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07344868034124374, + "rewards/margins": 0.0643080472946167, + "rewards/rejected": -0.13775672018527985, + "sft_loss": 0.7344867587089539, + "step": 131 + }, + { + "epoch": 0.19088937093275488, + "grad_norm": 2.6245053770051414, + "learning_rate": 7.998765316634085e-06, + "logits/chosen": 1.3198339939117432, + "logits/rejected": 0.9343856573104858, + "logps/chosen": -0.7423506379127502, + "logps/rejected": -1.0867445468902588, + "loss": 0.8236, + "odds_ratio_loss": 0.5474746227264404, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07423506677150726, + "rewards/margins": 0.034439388662576675, + "rewards/rejected": -0.10867445170879364, + "sft_loss": 0.7423506379127502, + "step": 132 + }, + { + "epoch": 0.19233550253073028, + "grad_norm": 3.008883127727729, + "learning_rate": 7.998686947466366e-06, + "logits/chosen": 0.9286758899688721, + "logits/rejected": 0.7316522002220154, + "logps/chosen": -0.8677981495857239, + "logps/rejected": -1.2070274353027344, + "loss": 0.858, + "odds_ratio_loss": 0.5611447691917419, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08677982538938522, + "rewards/margins": 0.03392292559146881, + "rewards/rejected": -0.12070275098085403, + "sft_loss": 0.8677981495857239, + "step": 133 + }, + { + "epoch": 0.19378163412870572, + "grad_norm": 3.9031706234078754, + "learning_rate": 7.998606167475395e-06, + "logits/chosen": 0.9033632874488831, + "logits/rejected": 0.7890003323554993, + "logps/chosen": -0.854728102684021, + "logps/rejected": -1.2537704706192017, + "loss": 0.8872, + "odds_ratio_loss": 0.5175142288208008, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08547282218933105, + "rewards/margins": 0.039904236793518066, + "rewards/rejected": -0.12537705898284912, + "sft_loss": 0.854728102684021, + "step": 134 + }, + { + "epoch": 0.19522776572668113, + "grad_norm": 2.9440785803501335, + "learning_rate": 7.998522976709873e-06, + "logits/chosen": 1.0039453506469727, + "logits/rejected": 0.9802931547164917, + "logps/chosen": -0.7600377798080444, + "logps/rejected": -1.3574674129486084, + "loss": 0.8128, + "odds_ratio_loss": 0.610292375087738, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0760037824511528, + "rewards/margins": 0.05974297225475311, + "rewards/rejected": -0.13574674725532532, + "sft_loss": 0.7600377798080444, + "step": 135 + }, + { + "epoch": 0.19667389732465654, + "grad_norm": 2.8581912638451694, + "learning_rate": 7.998437375219955e-06, + "logits/chosen": 1.2175956964492798, + "logits/rejected": 0.8139463663101196, + "logps/chosen": -0.7659615874290466, + "logps/rejected": -1.644038200378418, + "loss": 0.8783, + "odds_ratio_loss": 0.49914222955703735, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07659615576267242, + "rewards/margins": 0.08780766278505325, + "rewards/rejected": -0.16440382599830627, + "sft_loss": 0.7659615874290466, + "step": 136 + }, + { + "epoch": 0.19812002892263195, + "grad_norm": 6.226413264357474, + "learning_rate": 7.998349363057252e-06, + "logits/chosen": 1.0425236225128174, + "logits/rejected": 0.9942991137504578, + "logps/chosen": -0.7867666482925415, + "logps/rejected": -1.3995404243469238, + "loss": 0.8907, + "odds_ratio_loss": 0.5336313843727112, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07867667078971863, + "rewards/margins": 0.06127737835049629, + "rewards/rejected": -0.13995404541492462, + "sft_loss": 0.7867666482925415, + "step": 137 + }, + { + "epoch": 0.19956616052060738, + "grad_norm": 2.869697362833916, + "learning_rate": 7.998258940274828e-06, + "logits/chosen": 0.969634473323822, + "logits/rejected": 0.621441662311554, + "logps/chosen": -0.8326024413108826, + "logps/rejected": -1.6405725479125977, + "loss": 0.8682, + "odds_ratio_loss": 0.5707529783248901, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08326023817062378, + "rewards/margins": 0.08079702407121658, + "rewards/rejected": -0.16405726969242096, + "sft_loss": 0.8326024413108826, + "step": 138 + }, + { + "epoch": 0.2010122921185828, + "grad_norm": 8.40598616315667, + "learning_rate": 7.998166106927197e-06, + "logits/chosen": 1.067244291305542, + "logits/rejected": 0.7582840919494629, + "logps/chosen": -0.5070884227752686, + "logps/rejected": -1.4907238483428955, + "loss": 0.8181, + "odds_ratio_loss": 0.4089462459087372, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0507088378071785, + "rewards/margins": 0.09836354106664658, + "rewards/rejected": -0.14907237887382507, + "sft_loss": 0.5070884227752686, + "step": 139 + }, + { + "epoch": 0.2024584237165582, + "grad_norm": 3.2177545453756986, + "learning_rate": 7.998070863070329e-06, + "logits/chosen": 1.0560888051986694, + "logits/rejected": 0.7042299509048462, + "logps/chosen": -0.9449357390403748, + "logps/rejected": -1.1625012159347534, + "loss": 0.8684, + "odds_ratio_loss": 0.5798974633216858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0944935753941536, + "rewards/margins": 0.021756542846560478, + "rewards/rejected": -0.11625012010335922, + "sft_loss": 0.9449357390403748, + "step": 140 + }, + { + "epoch": 0.2039045553145336, + "grad_norm": 4.626119117506995, + "learning_rate": 7.997973208761647e-06, + "logits/chosen": 0.8979699611663818, + "logits/rejected": 0.7739961743354797, + "logps/chosen": -0.7614917159080505, + "logps/rejected": -1.5159138441085815, + "loss": 0.8879, + "odds_ratio_loss": 0.48651716113090515, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07614917308092117, + "rewards/margins": 0.07544222474098206, + "rewards/rejected": -0.15159140527248383, + "sft_loss": 0.7614917159080505, + "step": 141 + }, + { + "epoch": 0.20535068691250905, + "grad_norm": 6.122175759295342, + "learning_rate": 7.997873144060028e-06, + "logits/chosen": 1.0175689458847046, + "logits/rejected": 0.8373978734016418, + "logps/chosen": -0.7784775495529175, + "logps/rejected": -1.4234530925750732, + "loss": 0.962, + "odds_ratio_loss": 0.6008108854293823, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07784774899482727, + "rewards/margins": 0.06449756026268005, + "rewards/rejected": -0.14234532415866852, + "sft_loss": 0.7784775495529175, + "step": 142 + }, + { + "epoch": 0.20679681851048445, + "grad_norm": 4.8238500026764095, + "learning_rate": 7.9977706690258e-06, + "logits/chosen": 0.9935001134872437, + "logits/rejected": 0.8581937551498413, + "logps/chosen": -0.8578831553459167, + "logps/rejected": -1.0968329906463623, + "loss": 0.8869, + "odds_ratio_loss": 0.6052335500717163, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0857883170247078, + "rewards/margins": 0.023894988000392914, + "rewards/rejected": -0.10968329757452011, + "sft_loss": 0.8578831553459167, + "step": 143 + }, + { + "epoch": 0.20824295010845986, + "grad_norm": 3.1206319457795875, + "learning_rate": 7.997665783720749e-06, + "logits/chosen": 1.1067547798156738, + "logits/rejected": 1.0524673461914062, + "logps/chosen": -0.6326704025268555, + "logps/rejected": -1.2812092304229736, + "loss": 0.8483, + "odds_ratio_loss": 0.4811175763607025, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0632670447230339, + "rewards/margins": 0.06485386937856674, + "rewards/rejected": -0.12812092900276184, + "sft_loss": 0.6326704025268555, + "step": 144 + }, + { + "epoch": 0.2096890817064353, + "grad_norm": 3.7395254574099126, + "learning_rate": 7.997558488208105e-06, + "logits/chosen": 0.8425149917602539, + "logits/rejected": 0.6503562927246094, + "logps/chosen": -0.6809152960777283, + "logps/rejected": -1.6081905364990234, + "loss": 0.8512, + "odds_ratio_loss": 0.4523545801639557, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06809153407812119, + "rewards/margins": 0.09272752702236176, + "rewards/rejected": -0.16081905364990234, + "sft_loss": 0.6809152960777283, + "step": 145 + }, + { + "epoch": 0.2111352133044107, + "grad_norm": 3.482516836820913, + "learning_rate": 7.997448782552561e-06, + "logits/chosen": 1.1837321519851685, + "logits/rejected": 0.89334636926651, + "logps/chosen": -0.8056322932243347, + "logps/rejected": -1.4506659507751465, + "loss": 0.8242, + "odds_ratio_loss": 0.5507670640945435, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08056323230266571, + "rewards/margins": 0.06450335681438446, + "rewards/rejected": -0.14506658911705017, + "sft_loss": 0.8056322932243347, + "step": 146 + }, + { + "epoch": 0.21258134490238612, + "grad_norm": 3.3963256945903915, + "learning_rate": 7.997336666820258e-06, + "logits/chosen": 0.7776620388031006, + "logits/rejected": 0.8272684216499329, + "logps/chosen": -0.7493107318878174, + "logps/rejected": -1.14316987991333, + "loss": 0.8604, + "odds_ratio_loss": 0.5952869057655334, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0749310702085495, + "rewards/margins": 0.039385922253131866, + "rewards/rejected": -0.11431698501110077, + "sft_loss": 0.7493107318878174, + "step": 147 + }, + { + "epoch": 0.21402747650036152, + "grad_norm": 3.3238714858314244, + "learning_rate": 7.997222141078791e-06, + "logits/chosen": 0.8280146718025208, + "logits/rejected": 0.7190472483634949, + "logps/chosen": -0.8145158290863037, + "logps/rejected": -1.596473217010498, + "loss": 0.8569, + "odds_ratio_loss": 0.6677970886230469, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08145157992839813, + "rewards/margins": 0.07819575071334839, + "rewards/rejected": -0.15964734554290771, + "sft_loss": 0.8145158290863037, + "step": 148 + }, + { + "epoch": 0.21547360809833696, + "grad_norm": 3.8446953387308724, + "learning_rate": 7.997105205397208e-06, + "logits/chosen": 0.7918663024902344, + "logits/rejected": 0.7498356103897095, + "logps/chosen": -0.9449871182441711, + "logps/rejected": -1.143221378326416, + "loss": 0.8744, + "odds_ratio_loss": 0.5902383327484131, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09449870884418488, + "rewards/margins": 0.019823430106043816, + "rewards/rejected": -0.11432213336229324, + "sft_loss": 0.9449871182441711, + "step": 149 + }, + { + "epoch": 0.21691973969631237, + "grad_norm": 3.3011722687802316, + "learning_rate": 7.99698585984601e-06, + "logits/chosen": 1.0179299116134644, + "logits/rejected": 0.8888053297996521, + "logps/chosen": -0.9322119951248169, + "logps/rejected": -1.32847261428833, + "loss": 0.8966, + "odds_ratio_loss": 0.7749242186546326, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09322120249271393, + "rewards/margins": 0.039626047015190125, + "rewards/rejected": -0.13284724950790405, + "sft_loss": 0.9322119951248169, + "step": 150 + }, + { + "epoch": 0.21836587129428778, + "grad_norm": 4.782501865845731, + "learning_rate": 7.99686410449715e-06, + "logits/chosen": 1.043616533279419, + "logits/rejected": 0.8453623056411743, + "logps/chosen": -1.0205018520355225, + "logps/rejected": -1.2420902252197266, + "loss": 0.9077, + "odds_ratio_loss": 0.6890058517456055, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10205018520355225, + "rewards/margins": 0.02215883880853653, + "rewards/rejected": -0.12420902401208878, + "sft_loss": 1.0205018520355225, + "step": 151 + }, + { + "epoch": 0.2198120028922632, + "grad_norm": 2.7453397904711867, + "learning_rate": 7.996739939424036e-06, + "logits/chosen": 1.0936996936798096, + "logits/rejected": 0.7009330987930298, + "logps/chosen": -0.9429874420166016, + "logps/rejected": -1.2208938598632812, + "loss": 0.8392, + "odds_ratio_loss": 0.6516534686088562, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09429875016212463, + "rewards/margins": 0.027790643274784088, + "rewards/rejected": -0.12208940088748932, + "sft_loss": 0.9429874420166016, + "step": 152 + }, + { + "epoch": 0.22125813449023862, + "grad_norm": 3.5114275545087272, + "learning_rate": 7.996613364701528e-06, + "logits/chosen": 0.8619660139083862, + "logits/rejected": 0.8545634746551514, + "logps/chosen": -0.8546210527420044, + "logps/rejected": -0.9204950332641602, + "loss": 0.823, + "odds_ratio_loss": 0.7684470415115356, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08546211570501328, + "rewards/margins": 0.006587391719222069, + "rewards/rejected": -0.0920495018362999, + "sft_loss": 0.8546210527420044, + "step": 153 + }, + { + "epoch": 0.22270426608821403, + "grad_norm": 3.1464188184377067, + "learning_rate": 7.996484380405936e-06, + "logits/chosen": 0.8472630977630615, + "logits/rejected": 0.9261313080787659, + "logps/chosen": -0.8409748673439026, + "logps/rejected": -1.471449851989746, + "loss": 0.8471, + "odds_ratio_loss": 0.5609486699104309, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0840974897146225, + "rewards/margins": 0.06304750591516495, + "rewards/rejected": -0.14714500308036804, + "sft_loss": 0.8409748673439026, + "step": 154 + }, + { + "epoch": 0.22415039768618944, + "grad_norm": 3.82484646727686, + "learning_rate": 7.996352986615026e-06, + "logits/chosen": 1.1553988456726074, + "logits/rejected": 0.9536395072937012, + "logps/chosen": -0.8071169257164001, + "logps/rejected": -2.1079392433166504, + "loss": 1.0003, + "odds_ratio_loss": 0.44714686274528503, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08071169257164001, + "rewards/margins": 0.13008223474025726, + "rewards/rejected": -0.21079392731189728, + "sft_loss": 0.8071169257164001, + "step": 155 + }, + { + "epoch": 0.22559652928416485, + "grad_norm": 3.3443773185383248, + "learning_rate": 7.996219183408017e-06, + "logits/chosen": 0.8596165180206299, + "logits/rejected": 0.8327771425247192, + "logps/chosen": -1.0019909143447876, + "logps/rejected": -1.1135194301605225, + "loss": 0.9495, + "odds_ratio_loss": 0.7217904329299927, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.10019908845424652, + "rewards/margins": 0.011152852326631546, + "rewards/rejected": -0.11135194450616837, + "sft_loss": 1.0019909143447876, + "step": 156 + }, + { + "epoch": 0.22704266088214028, + "grad_norm": 3.253728818340114, + "learning_rate": 7.99608297086558e-06, + "logits/chosen": 0.7683792114257812, + "logits/rejected": 0.7193489074707031, + "logps/chosen": -0.7505009174346924, + "logps/rejected": -1.5825207233428955, + "loss": 0.8003, + "odds_ratio_loss": 0.5120865702629089, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07505009323358536, + "rewards/margins": 0.08320198208093643, + "rewards/rejected": -0.1582520753145218, + "sft_loss": 0.7505009174346924, + "step": 157 + }, + { + "epoch": 0.2284887924801157, + "grad_norm": 4.3523818776563035, + "learning_rate": 7.995944349069836e-06, + "logits/chosen": 1.1126295328140259, + "logits/rejected": 0.9180561900138855, + "logps/chosen": -0.870684027671814, + "logps/rejected": -1.4605320692062378, + "loss": 0.8131, + "odds_ratio_loss": 0.5564998388290405, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08706840872764587, + "rewards/margins": 0.05898480489850044, + "rewards/rejected": -0.14605320990085602, + "sft_loss": 0.870684027671814, + "step": 158 + }, + { + "epoch": 0.2299349240780911, + "grad_norm": 4.108594412816073, + "learning_rate": 7.99580331810436e-06, + "logits/chosen": 1.1916407346725464, + "logits/rejected": 0.7693163156509399, + "logps/chosen": -0.7917389869689941, + "logps/rejected": -1.244872808456421, + "loss": 0.877, + "odds_ratio_loss": 0.5736300945281982, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07917389273643494, + "rewards/margins": 0.04531338810920715, + "rewards/rejected": -0.12448728084564209, + "sft_loss": 0.7917389869689941, + "step": 159 + }, + { + "epoch": 0.2313810556760665, + "grad_norm": 3.113607572889762, + "learning_rate": 7.995659878054184e-06, + "logits/chosen": 1.040024995803833, + "logits/rejected": 0.8305615782737732, + "logps/chosen": -0.8537604808807373, + "logps/rejected": -1.1052323579788208, + "loss": 0.8598, + "odds_ratio_loss": 0.5960687398910522, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08537605404853821, + "rewards/margins": 0.025147181004285812, + "rewards/rejected": -0.11052322387695312, + "sft_loss": 0.8537604808807373, + "step": 160 + }, + { + "epoch": 0.23282718727404195, + "grad_norm": 4.094550792057405, + "learning_rate": 7.995514029005786e-06, + "logits/chosen": 0.8100274801254272, + "logits/rejected": 0.7823082208633423, + "logps/chosen": -0.8013307452201843, + "logps/rejected": -1.2279610633850098, + "loss": 0.9395, + "odds_ratio_loss": 0.5334639549255371, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08013307303190231, + "rewards/margins": 0.042663030326366425, + "rewards/rejected": -0.12279611080884933, + "sft_loss": 0.8013307452201843, + "step": 161 + }, + { + "epoch": 0.23427331887201736, + "grad_norm": 4.417141454685987, + "learning_rate": 7.995365771047098e-06, + "logits/chosen": 1.0468742847442627, + "logits/rejected": 0.8377524018287659, + "logps/chosen": -0.9060525298118591, + "logps/rejected": -1.0592972040176392, + "loss": 0.8911, + "odds_ratio_loss": 0.7184407711029053, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09060525894165039, + "rewards/margins": 0.015324457548558712, + "rewards/rejected": -0.10592971742153168, + "sft_loss": 0.9060525298118591, + "step": 162 + }, + { + "epoch": 0.23571945046999276, + "grad_norm": 4.991713895984588, + "learning_rate": 7.995215104267506e-06, + "logits/chosen": 0.8785092830657959, + "logits/rejected": 0.7008171677589417, + "logps/chosen": -0.9880103468894958, + "logps/rejected": -1.4084423780441284, + "loss": 0.954, + "odds_ratio_loss": 0.6232841610908508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09880103915929794, + "rewards/margins": 0.042043209075927734, + "rewards/rejected": -0.14084425568580627, + "sft_loss": 0.9880103468894958, + "step": 163 + }, + { + "epoch": 0.23716558206796817, + "grad_norm": 3.888510837008354, + "learning_rate": 7.995062028757848e-06, + "logits/chosen": 1.2152018547058105, + "logits/rejected": 1.0892407894134521, + "logps/chosen": -0.7016069293022156, + "logps/rejected": -1.010518193244934, + "loss": 0.8521, + "odds_ratio_loss": 0.5655720233917236, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07016069442033768, + "rewards/margins": 0.03089112415909767, + "rewards/rejected": -0.10105182230472565, + "sft_loss": 0.7016069293022156, + "step": 164 + }, + { + "epoch": 0.2386117136659436, + "grad_norm": 3.047500446746817, + "learning_rate": 7.994906544610413e-06, + "logits/chosen": 1.074456810951233, + "logits/rejected": 0.809201180934906, + "logps/chosen": -0.6478066444396973, + "logps/rejected": -1.6955187320709229, + "loss": 0.8543, + "odds_ratio_loss": 0.38769054412841797, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06478067487478256, + "rewards/margins": 0.1047712117433548, + "rewards/rejected": -0.16955187916755676, + "sft_loss": 0.6478066444396973, + "step": 165 + }, + { + "epoch": 0.24005784526391902, + "grad_norm": 2.6536028558426774, + "learning_rate": 7.994748651918946e-06, + "logits/chosen": 1.1055707931518555, + "logits/rejected": 0.8093370795249939, + "logps/chosen": -0.8095314502716064, + "logps/rejected": -1.4664175510406494, + "loss": 0.9225, + "odds_ratio_loss": 0.664374828338623, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08095315098762512, + "rewards/margins": 0.0656886100769043, + "rewards/rejected": -0.14664176106452942, + "sft_loss": 0.8095314502716064, + "step": 166 + }, + { + "epoch": 0.24150397686189443, + "grad_norm": 3.541652037604111, + "learning_rate": 7.994588350778638e-06, + "logits/chosen": 1.0057575702667236, + "logits/rejected": 0.8427650928497314, + "logps/chosen": -0.9639623165130615, + "logps/rejected": -1.1809089183807373, + "loss": 0.9288, + "odds_ratio_loss": 0.6998913288116455, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09639623761177063, + "rewards/margins": 0.02169465646147728, + "rewards/rejected": -0.11809088289737701, + "sft_loss": 0.9639623165130615, + "step": 167 + }, + { + "epoch": 0.24295010845986983, + "grad_norm": 4.277742047003352, + "learning_rate": 7.994425641286135e-06, + "logits/chosen": 1.0967094898223877, + "logits/rejected": 1.0031440258026123, + "logps/chosen": -0.7859099507331848, + "logps/rejected": -1.1452839374542236, + "loss": 0.8337, + "odds_ratio_loss": 0.5716875791549683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0785909965634346, + "rewards/margins": 0.03593740612268448, + "rewards/rejected": -0.11452840268611908, + "sft_loss": 0.7859099507331848, + "step": 168 + }, + { + "epoch": 0.24439624005784527, + "grad_norm": 2.8034050061730107, + "learning_rate": 7.994260523539536e-06, + "logits/chosen": 1.0453124046325684, + "logits/rejected": 0.8825990557670593, + "logps/chosen": -0.7164455652236938, + "logps/rejected": -1.6012907028198242, + "loss": 0.8723, + "odds_ratio_loss": 0.4701683223247528, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07164455950260162, + "rewards/margins": 0.0884845107793808, + "rewards/rejected": -0.16012907028198242, + "sft_loss": 0.7164455652236938, + "step": 169 + }, + { + "epoch": 0.24584237165582068, + "grad_norm": 2.977337634451211, + "learning_rate": 7.994092997638392e-06, + "logits/chosen": 0.9832794666290283, + "logits/rejected": 0.7748570442199707, + "logps/chosen": -0.6102631092071533, + "logps/rejected": -2.1352500915527344, + "loss": 0.8238, + "odds_ratio_loss": 0.2782694697380066, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06102630868554115, + "rewards/margins": 0.15249869227409363, + "rewards/rejected": -0.2135249823331833, + "sft_loss": 0.6102631092071533, + "step": 170 + }, + { + "epoch": 0.2472885032537961, + "grad_norm": 3.1791088490599693, + "learning_rate": 7.993923063683702e-06, + "logits/chosen": 1.1284598112106323, + "logits/rejected": 0.8740060329437256, + "logps/chosen": -0.739490807056427, + "logps/rejected": -1.5526961088180542, + "loss": 0.8523, + "odds_ratio_loss": 0.546734094619751, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07394907623529434, + "rewards/margins": 0.08132053166627884, + "rewards/rejected": -0.15526960790157318, + "sft_loss": 0.739490807056427, + "step": 171 + }, + { + "epoch": 0.24873463485177152, + "grad_norm": 3.104876414975262, + "learning_rate": 7.993750721777924e-06, + "logits/chosen": 0.7685276865959167, + "logits/rejected": 0.6663203239440918, + "logps/chosen": -0.869566798210144, + "logps/rejected": -1.5396361351013184, + "loss": 0.9164, + "odds_ratio_loss": 0.5057757496833801, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0869566798210144, + "rewards/margins": 0.06700694561004639, + "rewards/rejected": -0.1539636254310608, + "sft_loss": 0.869566798210144, + "step": 172 + }, + { + "epoch": 0.2501807664497469, + "grad_norm": 3.153797811030736, + "learning_rate": 7.993575972024962e-06, + "logits/chosen": 0.8820764422416687, + "logits/rejected": 0.8640388250350952, + "logps/chosen": -0.881617546081543, + "logps/rejected": -1.2504996061325073, + "loss": 0.898, + "odds_ratio_loss": 0.6156437397003174, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08816175162792206, + "rewards/margins": 0.036888204514980316, + "rewards/rejected": -0.12504996359348297, + "sft_loss": 0.881617546081543, + "step": 173 + }, + { + "epoch": 0.25162689804772237, + "grad_norm": 3.111404000329643, + "learning_rate": 7.99339881453017e-06, + "logits/chosen": 0.9390738606452942, + "logits/rejected": 0.9202344417572021, + "logps/chosen": -0.7253485918045044, + "logps/rejected": -1.4922535419464111, + "loss": 0.8046, + "odds_ratio_loss": 0.46386268734931946, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07253485172986984, + "rewards/margins": 0.07669049501419067, + "rewards/rejected": -0.1492253541946411, + "sft_loss": 0.7253485918045044, + "step": 174 + }, + { + "epoch": 0.2530730296456978, + "grad_norm": 3.9720902981170942, + "learning_rate": 7.993219249400363e-06, + "logits/chosen": 0.9836024045944214, + "logits/rejected": 0.8066073060035706, + "logps/chosen": -0.8062931299209595, + "logps/rejected": -1.5868737697601318, + "loss": 0.9287, + "odds_ratio_loss": 0.5350762605667114, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08062931895256042, + "rewards/margins": 0.07805806398391724, + "rewards/rejected": -0.15868738293647766, + "sft_loss": 0.8062931299209595, + "step": 175 + }, + { + "epoch": 0.2545191612436732, + "grad_norm": 2.800439660609358, + "learning_rate": 7.993037276743796e-06, + "logits/chosen": 0.7656744122505188, + "logits/rejected": 0.5632432699203491, + "logps/chosen": -0.7139934301376343, + "logps/rejected": -2.1812193393707275, + "loss": 0.8189, + "odds_ratio_loss": 0.3666784465312958, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07139935344457626, + "rewards/margins": 0.14672258496284485, + "rewards/rejected": -0.21812193095684052, + "sft_loss": 0.7139934301376343, + "step": 176 + }, + { + "epoch": 0.2559652928416486, + "grad_norm": 2.8937622627656037, + "learning_rate": 7.992852896670184e-06, + "logits/chosen": 0.9522703886032104, + "logits/rejected": 0.8420579433441162, + "logps/chosen": -1.0373575687408447, + "logps/rejected": -1.0805912017822266, + "loss": 0.9378, + "odds_ratio_loss": 0.8096970319747925, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.10373575985431671, + "rewards/margins": 0.0043233660981059074, + "rewards/rejected": -0.1080591231584549, + "sft_loss": 1.0373575687408447, + "step": 177 + }, + { + "epoch": 0.257411424439624, + "grad_norm": 5.312025248990845, + "learning_rate": 7.99266610929069e-06, + "logits/chosen": 1.116875410079956, + "logits/rejected": 0.8645721673965454, + "logps/chosen": -0.8963755965232849, + "logps/rejected": -1.5484733581542969, + "loss": 0.7993, + "odds_ratio_loss": 0.6069468259811401, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08963755518198013, + "rewards/margins": 0.0652097761631012, + "rewards/rejected": -0.15484732389450073, + "sft_loss": 0.8963755965232849, + "step": 178 + }, + { + "epoch": 0.2588575560375994, + "grad_norm": 4.10362786725861, + "learning_rate": 7.992476914717928e-06, + "logits/chosen": 0.9966639876365662, + "logits/rejected": 0.9122011661529541, + "logps/chosen": -0.7157900333404541, + "logps/rejected": -1.8117486238479614, + "loss": 0.8281, + "odds_ratio_loss": 0.4310546815395355, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07157900184392929, + "rewards/margins": 0.10959585756063461, + "rewards/rejected": -0.1811748594045639, + "sft_loss": 0.7157900333404541, + "step": 179 + }, + { + "epoch": 0.2603036876355748, + "grad_norm": 6.129315995584233, + "learning_rate": 7.992285313065964e-06, + "logits/chosen": 0.8830561637878418, + "logits/rejected": 0.8202065229415894, + "logps/chosen": -0.9284470081329346, + "logps/rejected": -1.648970603942871, + "loss": 0.9707, + "odds_ratio_loss": 0.7073562145233154, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09284469485282898, + "rewards/margins": 0.07205238938331604, + "rewards/rejected": -0.16489706933498383, + "sft_loss": 0.9284470081329346, + "step": 180 + }, + { + "epoch": 0.26174981923355023, + "grad_norm": 3.8183141881814393, + "learning_rate": 7.992091304450316e-06, + "logits/chosen": 1.123767375946045, + "logits/rejected": 0.7026354670524597, + "logps/chosen": -0.8758699893951416, + "logps/rejected": -2.0314223766326904, + "loss": 0.7393, + "odds_ratio_loss": 0.5088123083114624, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08758699893951416, + "rewards/margins": 0.11555524170398712, + "rewards/rejected": -0.20314225554466248, + "sft_loss": 0.8758699893951416, + "step": 181 + }, + { + "epoch": 0.2631959508315257, + "grad_norm": 3.181819086244578, + "learning_rate": 7.991894888987954e-06, + "logits/chosen": 0.8263757824897766, + "logits/rejected": 0.6464723944664001, + "logps/chosen": -0.6118509769439697, + "logps/rejected": -1.7053591012954712, + "loss": 0.7897, + "odds_ratio_loss": 0.40566810965538025, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06118509918451309, + "rewards/margins": 0.10935080051422119, + "rewards/rejected": -0.17053590714931488, + "sft_loss": 0.6118509769439697, + "step": 182 + }, + { + "epoch": 0.2646420824295011, + "grad_norm": 2.7941883907120846, + "learning_rate": 7.991696066797293e-06, + "logits/chosen": 1.130903720855713, + "logits/rejected": 0.7515724301338196, + "logps/chosen": -0.7077884674072266, + "logps/rejected": -1.0460158586502075, + "loss": 0.8384, + "odds_ratio_loss": 0.4854280948638916, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07077884674072266, + "rewards/margins": 0.033822737634181976, + "rewards/rejected": -0.10460158437490463, + "sft_loss": 0.7077884674072266, + "step": 183 + }, + { + "epoch": 0.2660882140274765, + "grad_norm": 3.922413376181467, + "learning_rate": 7.991494837998209e-06, + "logits/chosen": 0.867751955986023, + "logits/rejected": 0.6655453443527222, + "logps/chosen": -1.1125421524047852, + "logps/rejected": -2.218876838684082, + "loss": 0.9668, + "odds_ratio_loss": 0.6620099544525146, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11125421524047852, + "rewards/margins": 0.1106334701180458, + "rewards/rejected": -0.22188769280910492, + "sft_loss": 1.1125421524047852, + "step": 184 + }, + { + "epoch": 0.2675343456254519, + "grad_norm": 4.849556019250261, + "learning_rate": 7.991291202712021e-06, + "logits/chosen": 0.9399557113647461, + "logits/rejected": 0.7142766714096069, + "logps/chosen": -0.7886428833007812, + "logps/rejected": -1.5420012474060059, + "loss": 0.8707, + "odds_ratio_loss": 0.5734947323799133, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07886429131031036, + "rewards/margins": 0.07533583790063858, + "rewards/rejected": -0.15420013666152954, + "sft_loss": 0.7886428833007812, + "step": 185 + }, + { + "epoch": 0.26898047722342733, + "grad_norm": 3.5359765528486298, + "learning_rate": 7.991085161061502e-06, + "logits/chosen": 0.9476644992828369, + "logits/rejected": 0.8192164897918701, + "logps/chosen": -0.6491098403930664, + "logps/rejected": -1.3708826303482056, + "loss": 0.8088, + "odds_ratio_loss": 0.4414351284503937, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06491097807884216, + "rewards/margins": 0.07217729091644287, + "rewards/rejected": -0.13708826899528503, + "sft_loss": 0.6491098403930664, + "step": 186 + }, + { + "epoch": 0.27042660882140274, + "grad_norm": 3.5799876990998767, + "learning_rate": 7.990876713170873e-06, + "logits/chosen": 0.988240659236908, + "logits/rejected": 0.8487038612365723, + "logps/chosen": -0.7226072549819946, + "logps/rejected": -1.360625982284546, + "loss": 0.8501, + "odds_ratio_loss": 0.538947343826294, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07226072996854782, + "rewards/margins": 0.06380186975002289, + "rewards/rejected": -0.1360626071691513, + "sft_loss": 0.7226072549819946, + "step": 187 + }, + { + "epoch": 0.27187274041937814, + "grad_norm": 2.952397694472701, + "learning_rate": 7.990665859165812e-06, + "logits/chosen": 0.9410344362258911, + "logits/rejected": 0.8459163904190063, + "logps/chosen": -0.8709695339202881, + "logps/rejected": -1.0749214887619019, + "loss": 0.8362, + "odds_ratio_loss": 0.6080572009086609, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08709695935249329, + "rewards/margins": 0.020395198836922646, + "rewards/rejected": -0.10749214887619019, + "sft_loss": 0.8709695339202881, + "step": 188 + }, + { + "epoch": 0.27331887201735355, + "grad_norm": 3.0804169157983807, + "learning_rate": 7.990452599173442e-06, + "logits/chosen": 1.2123034000396729, + "logits/rejected": 0.9961423873901367, + "logps/chosen": -0.5125177502632141, + "logps/rejected": -1.496875524520874, + "loss": 0.862, + "odds_ratio_loss": 0.3468582034111023, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05125177651643753, + "rewards/margins": 0.09843578189611435, + "rewards/rejected": -0.14968755841255188, + "sft_loss": 0.5125177502632141, + "step": 189 + }, + { + "epoch": 0.274765003615329, + "grad_norm": 2.6526058029397577, + "learning_rate": 7.990236933322337e-06, + "logits/chosen": 0.8868533372879028, + "logits/rejected": 0.6382319927215576, + "logps/chosen": -0.8466877937316895, + "logps/rejected": -1.705843448638916, + "loss": 0.8327, + "odds_ratio_loss": 0.407950222492218, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08466878533363342, + "rewards/margins": 0.08591558039188385, + "rewards/rejected": -0.17058435082435608, + "sft_loss": 0.8466877937316895, + "step": 190 + }, + { + "epoch": 0.2762111352133044, + "grad_norm": 3.995125160090004, + "learning_rate": 7.990018861742524e-06, + "logits/chosen": 0.7121444940567017, + "logits/rejected": 0.6306707859039307, + "logps/chosen": -0.7519279718399048, + "logps/rejected": -1.428523063659668, + "loss": 0.8425, + "odds_ratio_loss": 0.5476968288421631, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07519279420375824, + "rewards/margins": 0.06765950471162796, + "rewards/rejected": -0.1428523063659668, + "sft_loss": 0.7519279718399048, + "step": 191 + }, + { + "epoch": 0.27765726681127983, + "grad_norm": 3.632543289305855, + "learning_rate": 7.989798384565478e-06, + "logits/chosen": 0.7635389566421509, + "logits/rejected": 0.72230064868927, + "logps/chosen": -0.7457807064056396, + "logps/rejected": -1.7796211242675781, + "loss": 0.9186, + "odds_ratio_loss": 0.4742932915687561, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07457807660102844, + "rewards/margins": 0.10338405519723892, + "rewards/rejected": -0.17796212434768677, + "sft_loss": 0.7457807064056396, + "step": 192 + }, + { + "epoch": 0.27910339840925524, + "grad_norm": 3.0980154406972362, + "learning_rate": 7.989575501924127e-06, + "logits/chosen": 0.8246858716011047, + "logits/rejected": 0.5677071809768677, + "logps/chosen": -0.7799168229103088, + "logps/rejected": -1.816365122795105, + "loss": 0.9227, + "odds_ratio_loss": 0.5203930735588074, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07799168676137924, + "rewards/margins": 0.10364483296871185, + "rewards/rejected": -0.1816365271806717, + "sft_loss": 0.7799168229103088, + "step": 193 + }, + { + "epoch": 0.28054953000723065, + "grad_norm": 3.2989461799711544, + "learning_rate": 7.989350213952848e-06, + "logits/chosen": 0.9356462955474854, + "logits/rejected": 0.629136323928833, + "logps/chosen": -0.9007641673088074, + "logps/rejected": -1.534816026687622, + "loss": 0.899, + "odds_ratio_loss": 0.549186646938324, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09007642418146133, + "rewards/margins": 0.06340517848730087, + "rewards/rejected": -0.1534816026687622, + "sft_loss": 0.9007641673088074, + "step": 194 + }, + { + "epoch": 0.28199566160520606, + "grad_norm": 5.46581584343181, + "learning_rate": 7.989122520787467e-06, + "logits/chosen": 0.8825095891952515, + "logits/rejected": 0.685544490814209, + "logps/chosen": -1.0721142292022705, + "logps/rejected": -1.4573118686676025, + "loss": 0.9202, + "odds_ratio_loss": 0.7042291164398193, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10721142590045929, + "rewards/margins": 0.038519762456417084, + "rewards/rejected": -0.14573121070861816, + "sft_loss": 1.0721142292022705, + "step": 195 + }, + { + "epoch": 0.28344179320318147, + "grad_norm": 4.080158423120696, + "learning_rate": 7.98889242256526e-06, + "logits/chosen": 0.7583059668540955, + "logits/rejected": 0.7805667519569397, + "logps/chosen": -0.9448596239089966, + "logps/rejected": -1.3803226947784424, + "loss": 0.8473, + "odds_ratio_loss": 0.5857207775115967, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09448596090078354, + "rewards/margins": 0.04354630410671234, + "rewards/rejected": -0.13803227245807648, + "sft_loss": 0.9448596239089966, + "step": 196 + }, + { + "epoch": 0.2848879248011569, + "grad_norm": 3.973133965232593, + "learning_rate": 7.988659919424955e-06, + "logits/chosen": 1.090093731880188, + "logits/rejected": 0.8616019487380981, + "logps/chosen": -0.7189903855323792, + "logps/rejected": -1.4010587930679321, + "loss": 0.7704, + "odds_ratio_loss": 0.48269736766815186, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07189904153347015, + "rewards/margins": 0.06820684671401978, + "rewards/rejected": -0.14010588824748993, + "sft_loss": 0.7189903855323792, + "step": 197 + }, + { + "epoch": 0.28633405639913234, + "grad_norm": 5.238268762550638, + "learning_rate": 7.988425011506729e-06, + "logits/chosen": 0.8917097449302673, + "logits/rejected": 0.768082857131958, + "logps/chosen": -0.9902671575546265, + "logps/rejected": -1.6686362028121948, + "loss": 0.9195, + "odds_ratio_loss": 0.5506667494773865, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09902672469615936, + "rewards/margins": 0.06783689558506012, + "rewards/rejected": -0.16686362028121948, + "sft_loss": 0.9902671575546265, + "step": 198 + }, + { + "epoch": 0.28778018799710775, + "grad_norm": 2.912157324642061, + "learning_rate": 7.98818769895221e-06, + "logits/chosen": 0.7989850044250488, + "logits/rejected": 0.804793655872345, + "logps/chosen": -0.7779511213302612, + "logps/rejected": -1.3023908138275146, + "loss": 0.8612, + "odds_ratio_loss": 0.4791719913482666, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0777951180934906, + "rewards/margins": 0.0524439737200737, + "rewards/rejected": -0.1302390843629837, + "sft_loss": 0.7779511213302612, + "step": 199 + }, + { + "epoch": 0.28922631959508316, + "grad_norm": 2.5819984768363367, + "learning_rate": 7.987947981904474e-06, + "logits/chosen": 0.7809892892837524, + "logits/rejected": 0.7020816802978516, + "logps/chosen": -0.9111021161079407, + "logps/rejected": -1.5197502374649048, + "loss": 0.8977, + "odds_ratio_loss": 0.5979732275009155, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0911102145910263, + "rewards/margins": 0.06086481735110283, + "rewards/rejected": -0.15197503566741943, + "sft_loss": 0.9111021161079407, + "step": 200 + }, + { + "epoch": 0.29067245119305857, + "grad_norm": 3.718047797689079, + "learning_rate": 7.987705860508047e-06, + "logits/chosen": 0.7863653898239136, + "logits/rejected": 0.6933090090751648, + "logps/chosen": -1.001847267150879, + "logps/rejected": -1.0187654495239258, + "loss": 0.9285, + "odds_ratio_loss": 0.7681245803833008, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10018472373485565, + "rewards/margins": 0.0016918233595788479, + "rewards/rejected": -0.10187654942274094, + "sft_loss": 1.001847267150879, + "step": 201 + }, + { + "epoch": 0.292118582791034, + "grad_norm": 3.2947050827622753, + "learning_rate": 7.987461334908904e-06, + "logits/chosen": 0.7659215927124023, + "logits/rejected": 0.7039311528205872, + "logps/chosen": -0.7966563105583191, + "logps/rejected": -1.5445137023925781, + "loss": 0.8617, + "odds_ratio_loss": 0.47074514627456665, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0796656385064125, + "rewards/margins": 0.0747857540845871, + "rewards/rejected": -0.154451385140419, + "sft_loss": 0.7966563105583191, + "step": 202 + }, + { + "epoch": 0.2935647143890094, + "grad_norm": 2.750448225020009, + "learning_rate": 7.98721440525447e-06, + "logits/chosen": 0.6041221618652344, + "logits/rejected": 0.571781575679779, + "logps/chosen": -0.8432777523994446, + "logps/rejected": -1.7520842552185059, + "loss": 0.8824, + "odds_ratio_loss": 0.4685782194137573, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08432777971029282, + "rewards/margins": 0.09088063985109329, + "rewards/rejected": -0.1752084195613861, + "sft_loss": 0.8432777523994446, + "step": 203 + }, + { + "epoch": 0.2950108459869848, + "grad_norm": 3.2302840293561395, + "learning_rate": 7.986965071693625e-06, + "logits/chosen": 0.8296738862991333, + "logits/rejected": 0.7145066857337952, + "logps/chosen": -0.9232079982757568, + "logps/rejected": -1.2196871042251587, + "loss": 0.9195, + "odds_ratio_loss": 0.6474923491477966, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09232080727815628, + "rewards/margins": 0.029647907242178917, + "rewards/rejected": -0.12196871638298035, + "sft_loss": 0.9232079982757568, + "step": 204 + }, + { + "epoch": 0.29645697758496026, + "grad_norm": 3.1285063753356144, + "learning_rate": 7.986713334376686e-06, + "logits/chosen": 0.7811083793640137, + "logits/rejected": 0.7227165102958679, + "logps/chosen": -0.8592801094055176, + "logps/rejected": -1.6300902366638184, + "loss": 0.817, + "odds_ratio_loss": 0.4264185428619385, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08592800796031952, + "rewards/margins": 0.07708100974559784, + "rewards/rejected": -0.16300901770591736, + "sft_loss": 0.8592801094055176, + "step": 205 + }, + { + "epoch": 0.29790310918293567, + "grad_norm": 3.558090548670039, + "learning_rate": 7.98645919345543e-06, + "logits/chosen": 1.2186182737350464, + "logits/rejected": 0.79095858335495, + "logps/chosen": -0.8577883243560791, + "logps/rejected": -1.7429773807525635, + "loss": 0.9034, + "odds_ratio_loss": 0.5151891112327576, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08577883243560791, + "rewards/margins": 0.08851891756057739, + "rewards/rejected": -0.1742977499961853, + "sft_loss": 0.8577883243560791, + "step": 206 + }, + { + "epoch": 0.2993492407809111, + "grad_norm": 2.9503574400099932, + "learning_rate": 7.986202649083081e-06, + "logits/chosen": 0.8998777270317078, + "logits/rejected": 0.6546536087989807, + "logps/chosen": -0.7888596057891846, + "logps/rejected": -1.6412047147750854, + "loss": 0.8684, + "odds_ratio_loss": 0.5090416073799133, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07888597249984741, + "rewards/margins": 0.08523451536893845, + "rewards/rejected": -0.16412046551704407, + "sft_loss": 0.7888596057891846, + "step": 207 + }, + { + "epoch": 0.3007953723788865, + "grad_norm": 2.9896940410495003, + "learning_rate": 7.985943701414308e-06, + "logits/chosen": 0.8296674489974976, + "logits/rejected": 0.6985807418823242, + "logps/chosen": -0.7949290871620178, + "logps/rejected": -1.3496679067611694, + "loss": 0.9616, + "odds_ratio_loss": 0.4149304926395416, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07949291169643402, + "rewards/margins": 0.05547389015555382, + "rewards/rejected": -0.13496679067611694, + "sft_loss": 0.7949290871620178, + "step": 208 + }, + { + "epoch": 0.3022415039768619, + "grad_norm": 2.8436164762025062, + "learning_rate": 7.98568235060523e-06, + "logits/chosen": 0.9186201095581055, + "logits/rejected": 0.775266706943512, + "logps/chosen": -0.7632952332496643, + "logps/rejected": -1.2405225038528442, + "loss": 0.8446, + "odds_ratio_loss": 0.6041321158409119, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0763295367360115, + "rewards/margins": 0.047722719609737396, + "rewards/rejected": -0.1240522563457489, + "sft_loss": 0.7632952332496643, + "step": 209 + }, + { + "epoch": 0.3036876355748373, + "grad_norm": 3.2744747177577493, + "learning_rate": 7.98541859681342e-06, + "logits/chosen": 0.8311449289321899, + "logits/rejected": 0.6091936826705933, + "logps/chosen": -0.8429104685783386, + "logps/rejected": -1.8527532815933228, + "loss": 0.885, + "odds_ratio_loss": 0.5193464756011963, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08429104834794998, + "rewards/margins": 0.10098428279161453, + "rewards/rejected": -0.18527531623840332, + "sft_loss": 0.8429104685783386, + "step": 210 + }, + { + "epoch": 0.3051337671728127, + "grad_norm": 3.581434079691926, + "learning_rate": 7.985152440197896e-06, + "logits/chosen": 0.924926221370697, + "logits/rejected": 0.681089460849762, + "logps/chosen": -0.7374269366264343, + "logps/rejected": -1.5732078552246094, + "loss": 0.8093, + "odds_ratio_loss": 0.4177606701850891, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07374269515275955, + "rewards/margins": 0.08357809484004974, + "rewards/rejected": -0.1573207825422287, + "sft_loss": 0.7374269366264343, + "step": 211 + }, + { + "epoch": 0.3065798987707881, + "grad_norm": 4.3947384054323395, + "learning_rate": 7.984883880919123e-06, + "logits/chosen": 0.8097667694091797, + "logits/rejected": 0.7048723697662354, + "logps/chosen": -0.8589484691619873, + "logps/rejected": -1.5178613662719727, + "loss": 0.8306, + "odds_ratio_loss": 0.5315617918968201, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08589484542608261, + "rewards/margins": 0.06589128822088242, + "rewards/rejected": -0.15178614854812622, + "sft_loss": 0.8589484691619873, + "step": 212 + }, + { + "epoch": 0.3080260303687636, + "grad_norm": 3.1907573370753695, + "learning_rate": 7.984612919139015e-06, + "logits/chosen": 0.6455193758010864, + "logits/rejected": 0.5276200771331787, + "logps/chosen": -0.7528071403503418, + "logps/rejected": -1.862764596939087, + "loss": 0.7911, + "odds_ratio_loss": 0.4871300458908081, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07528071105480194, + "rewards/margins": 0.11099573969841003, + "rewards/rejected": -0.18627645075321198, + "sft_loss": 0.7528071403503418, + "step": 213 + }, + { + "epoch": 0.309472161966739, + "grad_norm": 3.0906834831413805, + "learning_rate": 7.98433955502094e-06, + "logits/chosen": 1.0261489152908325, + "logits/rejected": 0.9212863445281982, + "logps/chosen": -0.7404288053512573, + "logps/rejected": -1.2891268730163574, + "loss": 0.9422, + "odds_ratio_loss": 0.729023814201355, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.07404288649559021, + "rewards/margins": 0.054869815707206726, + "rewards/rejected": -0.12891268730163574, + "sft_loss": 0.7404288053512573, + "step": 214 + }, + { + "epoch": 0.3109182935647144, + "grad_norm": 3.465475173558746, + "learning_rate": 7.984063788729707e-06, + "logits/chosen": 0.9017759561538696, + "logits/rejected": 0.6762468814849854, + "logps/chosen": -0.8559285402297974, + "logps/rejected": -1.2730680704116821, + "loss": 0.9338, + "odds_ratio_loss": 0.6160497069358826, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0855928584933281, + "rewards/margins": 0.041713956743478775, + "rewards/rejected": -0.12730681896209717, + "sft_loss": 0.8559285402297974, + "step": 215 + }, + { + "epoch": 0.3123644251626898, + "grad_norm": 4.054453016381701, + "learning_rate": 7.983785620431576e-06, + "logits/chosen": 0.8824461102485657, + "logits/rejected": 0.7006586790084839, + "logps/chosen": -0.9869768619537354, + "logps/rejected": -1.6781079769134521, + "loss": 0.9413, + "odds_ratio_loss": 0.609472393989563, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09869769215583801, + "rewards/margins": 0.0691131055355072, + "rewards/rejected": -0.16781079769134521, + "sft_loss": 0.9869768619537354, + "step": 216 + }, + { + "epoch": 0.3138105567606652, + "grad_norm": 3.3550744195656486, + "learning_rate": 7.98350505029426e-06, + "logits/chosen": 0.831702470779419, + "logits/rejected": 0.6789094805717468, + "logps/chosen": -0.8098732233047485, + "logps/rejected": -1.5542247295379639, + "loss": 0.8176, + "odds_ratio_loss": 0.5694811344146729, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0809873417019844, + "rewards/margins": 0.07443515211343765, + "rewards/rejected": -0.15542249381542206, + "sft_loss": 0.8098732233047485, + "step": 217 + }, + { + "epoch": 0.3152566883586406, + "grad_norm": 4.018954232761074, + "learning_rate": 7.983222078486912e-06, + "logits/chosen": 0.7321873903274536, + "logits/rejected": 0.686294674873352, + "logps/chosen": -0.7930043339729309, + "logps/rejected": -1.2628358602523804, + "loss": 0.8478, + "odds_ratio_loss": 0.5880774259567261, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07930043339729309, + "rewards/margins": 0.046983152627944946, + "rewards/rejected": -0.12628358602523804, + "sft_loss": 0.7930043339729309, + "step": 218 + }, + { + "epoch": 0.31670281995661603, + "grad_norm": 2.788787607391925, + "learning_rate": 7.982936705180138e-06, + "logits/chosen": 0.8820023536682129, + "logits/rejected": 0.6825209259986877, + "logps/chosen": -0.6613825559616089, + "logps/rejected": -1.4193311929702759, + "loss": 0.7987, + "odds_ratio_loss": 0.49998748302459717, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06613826006650925, + "rewards/margins": 0.07579487562179565, + "rewards/rejected": -0.1419331282377243, + "sft_loss": 0.6613825559616089, + "step": 219 + }, + { + "epoch": 0.31814895155459144, + "grad_norm": 2.4168572760464793, + "learning_rate": 7.98264893054599e-06, + "logits/chosen": 0.8746920824050903, + "logits/rejected": 0.6152170896530151, + "logps/chosen": -0.8145027160644531, + "logps/rejected": -1.6139273643493652, + "loss": 0.812, + "odds_ratio_loss": 0.5231243968009949, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08145026862621307, + "rewards/margins": 0.07994245737791061, + "rewards/rejected": -0.16139273345470428, + "sft_loss": 0.8145027160644531, + "step": 220 + }, + { + "epoch": 0.3195950831525669, + "grad_norm": 6.559029582787577, + "learning_rate": 7.98235875475797e-06, + "logits/chosen": 0.687819242477417, + "logits/rejected": 0.5400627851486206, + "logps/chosen": -0.811687171459198, + "logps/rejected": -1.8423746824264526, + "loss": 0.9222, + "odds_ratio_loss": 0.4002645015716553, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08116871118545532, + "rewards/margins": 0.1030687540769577, + "rewards/rejected": -0.18423748016357422, + "sft_loss": 0.811687171459198, + "step": 221 + }, + { + "epoch": 0.3210412147505423, + "grad_norm": 4.26878072464604, + "learning_rate": 7.982066177991022e-06, + "logits/chosen": 0.7749350070953369, + "logits/rejected": 0.6633896231651306, + "logps/chosen": -0.8835655450820923, + "logps/rejected": -1.1134123802185059, + "loss": 0.9247, + "odds_ratio_loss": 0.8716833591461182, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08835654705762863, + "rewards/margins": 0.02298467978835106, + "rewards/rejected": -0.11134123802185059, + "sft_loss": 0.8835655450820923, + "step": 222 + }, + { + "epoch": 0.3224873463485177, + "grad_norm": 3.856415488368599, + "learning_rate": 7.981771200421547e-06, + "logits/chosen": 0.8121925592422485, + "logits/rejected": 0.5447680354118347, + "logps/chosen": -0.7949036359786987, + "logps/rejected": -1.3006750345230103, + "loss": 0.8606, + "odds_ratio_loss": 0.5626305341720581, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07949037104845047, + "rewards/margins": 0.05057713761925697, + "rewards/rejected": -0.13006749749183655, + "sft_loss": 0.7949036359786987, + "step": 223 + }, + { + "epoch": 0.32393347794649313, + "grad_norm": 3.195463927964763, + "learning_rate": 7.981473822227383e-06, + "logits/chosen": 1.0078376531600952, + "logits/rejected": 0.8154879212379456, + "logps/chosen": -0.8676646947860718, + "logps/rejected": -1.233111023902893, + "loss": 0.8813, + "odds_ratio_loss": 0.6121217608451843, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08676647394895554, + "rewards/margins": 0.03654462844133377, + "rewards/rejected": -0.1233111023902893, + "sft_loss": 0.8676646947860718, + "step": 224 + }, + { + "epoch": 0.32537960954446854, + "grad_norm": 5.3146650131117665, + "learning_rate": 7.981174043587826e-06, + "logits/chosen": 0.5916678309440613, + "logits/rejected": 0.46369603276252747, + "logps/chosen": -1.0442885160446167, + "logps/rejected": -1.6847314834594727, + "loss": 0.9145, + "odds_ratio_loss": 0.5118202567100525, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10442885756492615, + "rewards/margins": 0.06404431909322739, + "rewards/rejected": -0.16847318410873413, + "sft_loss": 1.0442885160446167, + "step": 225 + }, + { + "epoch": 0.32682574114244395, + "grad_norm": 3.0340595722346992, + "learning_rate": 7.98087186468361e-06, + "logits/chosen": 0.6914336085319519, + "logits/rejected": 0.5757249593734741, + "logps/chosen": -0.9663025140762329, + "logps/rejected": -0.9450004696846008, + "loss": 0.9079, + "odds_ratio_loss": 0.7738305926322937, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09663024544715881, + "rewards/margins": -0.002130193170160055, + "rewards/rejected": -0.09450005739927292, + "sft_loss": 0.9663025140762329, + "step": 226 + }, + { + "epoch": 0.32827187274041936, + "grad_norm": 2.8726087856467557, + "learning_rate": 7.98056728569692e-06, + "logits/chosen": 0.8992648720741272, + "logits/rejected": 0.8230560421943665, + "logps/chosen": -0.772228479385376, + "logps/rejected": -1.0922561883926392, + "loss": 0.8579, + "odds_ratio_loss": 0.5426236391067505, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07722284644842148, + "rewards/margins": 0.0320027731359005, + "rewards/rejected": -0.10922562330961227, + "sft_loss": 0.772228479385376, + "step": 227 + }, + { + "epoch": 0.3297180043383948, + "grad_norm": 3.7331750453817865, + "learning_rate": 7.980260306811388e-06, + "logits/chosen": 0.820920467376709, + "logits/rejected": 0.6605263352394104, + "logps/chosen": -0.8334155082702637, + "logps/rejected": -1.178484559059143, + "loss": 0.8927, + "odds_ratio_loss": 0.5583294630050659, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0833415538072586, + "rewards/margins": 0.0345069095492363, + "rewards/rejected": -0.11784844845533371, + "sft_loss": 0.8334155082702637, + "step": 228 + }, + { + "epoch": 0.33116413593637023, + "grad_norm": 2.999700585683244, + "learning_rate": 7.979950928212092e-06, + "logits/chosen": 0.7122136354446411, + "logits/rejected": 0.5927017331123352, + "logps/chosen": -0.9525954723358154, + "logps/rejected": -1.6945300102233887, + "loss": 0.8914, + "odds_ratio_loss": 0.6651919484138489, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09525954723358154, + "rewards/margins": 0.07419347018003464, + "rewards/rejected": -0.16945302486419678, + "sft_loss": 0.9525954723358154, + "step": 229 + }, + { + "epoch": 0.33261026753434564, + "grad_norm": 3.173819627640913, + "learning_rate": 7.97963915008556e-06, + "logits/chosen": 0.8333603143692017, + "logits/rejected": 0.542677640914917, + "logps/chosen": -0.7230704426765442, + "logps/rejected": -1.862441062927246, + "loss": 0.8663, + "odds_ratio_loss": 0.5365059971809387, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0723070502281189, + "rewards/margins": 0.11393707245588303, + "rewards/rejected": -0.18624411523342133, + "sft_loss": 0.7230704426765442, + "step": 230 + }, + { + "epoch": 0.33405639913232105, + "grad_norm": 7.7397925754093855, + "learning_rate": 7.979324972619762e-06, + "logits/chosen": 0.690934419631958, + "logits/rejected": 0.5235450863838196, + "logps/chosen": -0.7463875412940979, + "logps/rejected": -1.8125139474868774, + "loss": 0.818, + "odds_ratio_loss": 0.42135879397392273, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07463876157999039, + "rewards/margins": 0.10661264508962631, + "rewards/rejected": -0.1812514215707779, + "sft_loss": 0.7463875412940979, + "step": 231 + }, + { + "epoch": 0.33550253073029646, + "grad_norm": 13.429401940993012, + "learning_rate": 7.979008396004118e-06, + "logits/chosen": 0.7407370805740356, + "logits/rejected": 0.5966250896453857, + "logps/chosen": -0.9655992388725281, + "logps/rejected": -1.7845059633255005, + "loss": 0.8857, + "odds_ratio_loss": 0.49398142099380493, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09655991941690445, + "rewards/margins": 0.08189067989587784, + "rewards/rejected": -0.17845061421394348, + "sft_loss": 0.9655992388725281, + "step": 232 + }, + { + "epoch": 0.33694866232827186, + "grad_norm": 2.784215244397478, + "learning_rate": 7.978689420429491e-06, + "logits/chosen": 0.784191370010376, + "logits/rejected": 0.6294467449188232, + "logps/chosen": -0.8379493951797485, + "logps/rejected": -1.453858494758606, + "loss": 0.7912, + "odds_ratio_loss": 0.5905799865722656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08379494398832321, + "rewards/margins": 0.06159090995788574, + "rewards/rejected": -0.14538586139678955, + "sft_loss": 0.8379493951797485, + "step": 233 + }, + { + "epoch": 0.3383947939262473, + "grad_norm": 3.809142821856424, + "learning_rate": 7.978368046088197e-06, + "logits/chosen": 0.7262183427810669, + "logits/rejected": 0.6582277417182922, + "logps/chosen": -1.0824716091156006, + "logps/rejected": -1.1397984027862549, + "loss": 0.9569, + "odds_ratio_loss": 0.7705241441726685, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10824716836214066, + "rewards/margins": 0.0057326690293848515, + "rewards/rejected": -0.11397983133792877, + "sft_loss": 1.0824716091156006, + "step": 234 + }, + { + "epoch": 0.3398409255242227, + "grad_norm": 3.0346959899246713, + "learning_rate": 7.978044273173988e-06, + "logits/chosen": 0.8044057488441467, + "logits/rejected": 0.6709400415420532, + "logps/chosen": -0.8678100109100342, + "logps/rejected": -1.0254902839660645, + "loss": 0.852, + "odds_ratio_loss": 0.7025970816612244, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08678101003170013, + "rewards/margins": 0.015768028795719147, + "rewards/rejected": -0.10254903137683868, + "sft_loss": 0.8678100109100342, + "step": 235 + }, + { + "epoch": 0.34128705712219815, + "grad_norm": 2.7268429100393785, + "learning_rate": 7.977718101882074e-06, + "logits/chosen": 0.8474991321563721, + "logits/rejected": 0.6978375911712646, + "logps/chosen": -0.6163941025733948, + "logps/rejected": -1.8094571828842163, + "loss": 0.8349, + "odds_ratio_loss": 0.525054931640625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06163940951228142, + "rewards/margins": 0.11930631101131439, + "rewards/rejected": -0.18094570934772491, + "sft_loss": 0.6163941025733948, + "step": 236 + }, + { + "epoch": 0.34273318872017355, + "grad_norm": 3.5101613560433056, + "learning_rate": 7.977389532409099e-06, + "logits/chosen": 0.6671175956726074, + "logits/rejected": 0.49634212255477905, + "logps/chosen": -0.7292395234107971, + "logps/rejected": -1.6196175813674927, + "loss": 0.8755, + "odds_ratio_loss": 0.4010503888130188, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07292395830154419, + "rewards/margins": 0.08903781324625015, + "rewards/rejected": -0.16196177899837494, + "sft_loss": 0.7292395234107971, + "step": 237 + }, + { + "epoch": 0.34417932031814896, + "grad_norm": 2.803002663496337, + "learning_rate": 7.977058564953163e-06, + "logits/chosen": 0.8379513025283813, + "logits/rejected": 0.715684175491333, + "logps/chosen": -0.8583624362945557, + "logps/rejected": -1.6935725212097168, + "loss": 0.8992, + "odds_ratio_loss": 0.5625948905944824, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0858362466096878, + "rewards/margins": 0.08352100849151611, + "rewards/rejected": -0.1693572700023651, + "sft_loss": 0.8583624362945557, + "step": 238 + }, + { + "epoch": 0.34562545191612437, + "grad_norm": 4.167844335180785, + "learning_rate": 7.976725199713806e-06, + "logits/chosen": 0.899992048740387, + "logits/rejected": 0.6711959838867188, + "logps/chosen": -0.7251548767089844, + "logps/rejected": -1.621293306350708, + "loss": 0.9368, + "odds_ratio_loss": 0.5080724954605103, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07251548767089844, + "rewards/margins": 0.08961383998394012, + "rewards/rejected": -0.16212932765483856, + "sft_loss": 0.7251548767089844, + "step": 239 + }, + { + "epoch": 0.3470715835140998, + "grad_norm": 3.35087123326394, + "learning_rate": 7.976389436892015e-06, + "logits/chosen": 0.7564694881439209, + "logits/rejected": 0.6249856948852539, + "logps/chosen": -0.8904150128364563, + "logps/rejected": -1.1712754964828491, + "loss": 0.8692, + "odds_ratio_loss": 0.5690667033195496, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08904150128364563, + "rewards/margins": 0.02808605134487152, + "rewards/rejected": -0.11712755262851715, + "sft_loss": 0.8904150128364563, + "step": 240 + }, + { + "epoch": 0.3485177151120752, + "grad_norm": 2.6406013197038387, + "learning_rate": 7.976051276690223e-06, + "logits/chosen": 0.6425853967666626, + "logits/rejected": 0.5586930513381958, + "logps/chosen": -0.7436127662658691, + "logps/rejected": -2.275128126144409, + "loss": 0.8125, + "odds_ratio_loss": 0.403175950050354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07436127960681915, + "rewards/margins": 0.15315154194831848, + "rewards/rejected": -0.22751282155513763, + "sft_loss": 0.7436127662658691, + "step": 241 + }, + { + "epoch": 0.3499638467100506, + "grad_norm": 4.369279972922537, + "learning_rate": 7.975710719312306e-06, + "logits/chosen": 0.6873583793640137, + "logits/rejected": 0.5604276657104492, + "logps/chosen": -0.7742418646812439, + "logps/rejected": -1.7675644159317017, + "loss": 0.895, + "odds_ratio_loss": 0.5378227829933167, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07742418348789215, + "rewards/margins": 0.09933225810527802, + "rewards/rejected": -0.17675642669200897, + "sft_loss": 0.7742418646812439, + "step": 242 + }, + { + "epoch": 0.351409978308026, + "grad_norm": 3.4169304241180485, + "learning_rate": 7.975367764963591e-06, + "logits/chosen": 0.9993571043014526, + "logits/rejected": 0.8231282234191895, + "logps/chosen": -0.8939650058746338, + "logps/rejected": -1.2862818241119385, + "loss": 0.9304, + "odds_ratio_loss": 0.6970391869544983, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08939650654792786, + "rewards/margins": 0.039231669157743454, + "rewards/rejected": -0.12862816452980042, + "sft_loss": 0.8939650058746338, + "step": 243 + }, + { + "epoch": 0.35285610990600147, + "grad_norm": 3.577479393127468, + "learning_rate": 7.975022413850844e-06, + "logits/chosen": 0.7503065466880798, + "logits/rejected": 0.5771316289901733, + "logps/chosen": -0.7733768224716187, + "logps/rejected": -1.5073132514953613, + "loss": 0.8007, + "odds_ratio_loss": 0.4124954044818878, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07733768224716187, + "rewards/margins": 0.07339364290237427, + "rewards/rejected": -0.15073132514953613, + "sft_loss": 0.7733768224716187, + "step": 244 + }, + { + "epoch": 0.3543022415039769, + "grad_norm": 3.836112525892133, + "learning_rate": 7.974674666182281e-06, + "logits/chosen": 0.8848112225532532, + "logits/rejected": 0.8300377130508423, + "logps/chosen": -0.752465546131134, + "logps/rejected": -2.1321396827697754, + "loss": 0.8716, + "odds_ratio_loss": 0.5637364983558655, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07524655759334564, + "rewards/margins": 0.13796743750572205, + "rewards/rejected": -0.2132139950990677, + "sft_loss": 0.752465546131134, + "step": 245 + }, + { + "epoch": 0.3557483731019523, + "grad_norm": 3.1277493586154126, + "learning_rate": 7.974324522167557e-06, + "logits/chosen": 0.7861306667327881, + "logits/rejected": 0.615057647228241, + "logps/chosen": -0.9151326417922974, + "logps/rejected": -1.5386865139007568, + "loss": 0.8864, + "odds_ratio_loss": 0.5509360432624817, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0915132611989975, + "rewards/margins": 0.062355391681194305, + "rewards/rejected": -0.1538686603307724, + "sft_loss": 0.9151326417922974, + "step": 246 + }, + { + "epoch": 0.3571945046999277, + "grad_norm": 3.506251057509387, + "learning_rate": 7.973971982017775e-06, + "logits/chosen": 0.8024786710739136, + "logits/rejected": 0.5016666054725647, + "logps/chosen": -0.6327417492866516, + "logps/rejected": -1.9561928510665894, + "loss": 0.8688, + "odds_ratio_loss": 0.44218164682388306, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06327417492866516, + "rewards/margins": 0.13234511017799377, + "rewards/rejected": -0.19561928510665894, + "sft_loss": 0.6327417492866516, + "step": 247 + }, + { + "epoch": 0.3586406362979031, + "grad_norm": 2.6104103628429796, + "learning_rate": 7.973617045945487e-06, + "logits/chosen": 0.8094074726104736, + "logits/rejected": 0.5848209857940674, + "logps/chosen": -0.778999388217926, + "logps/rejected": -1.5029215812683105, + "loss": 0.8303, + "odds_ratio_loss": 0.6139599084854126, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.07789994776248932, + "rewards/margins": 0.07239222526550293, + "rewards/rejected": -0.15029215812683105, + "sft_loss": 0.778999388217926, + "step": 248 + }, + { + "epoch": 0.3600867678958785, + "grad_norm": 3.8613735777872993, + "learning_rate": 7.97325971416468e-06, + "logits/chosen": 0.8423810601234436, + "logits/rejected": 0.6365971565246582, + "logps/chosen": -0.768801748752594, + "logps/rejected": -2.2783043384552, + "loss": 0.8586, + "odds_ratio_loss": 0.37025052309036255, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07688017189502716, + "rewards/margins": 0.15095025300979614, + "rewards/rejected": -0.2278304398059845, + "sft_loss": 0.768801748752594, + "step": 249 + }, + { + "epoch": 0.3615328994938539, + "grad_norm": 5.012643427887363, + "learning_rate": 7.972899986890796e-06, + "logits/chosen": 0.7972968220710754, + "logits/rejected": 0.7219531536102295, + "logps/chosen": -1.2234079837799072, + "logps/rejected": -1.1467794179916382, + "loss": 1.0785, + "odds_ratio_loss": 0.8847633600234985, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.12234079837799072, + "rewards/margins": -0.007662854623049498, + "rewards/rejected": -0.11467794328927994, + "sft_loss": 1.2234079837799072, + "step": 250 + }, + { + "epoch": 0.36297903109182933, + "grad_norm": 4.279210626470339, + "learning_rate": 7.972537864340714e-06, + "logits/chosen": 0.9484211206436157, + "logits/rejected": 0.7891027331352234, + "logps/chosen": -0.8660564422607422, + "logps/rejected": -1.464170217514038, + "loss": 0.8468, + "odds_ratio_loss": 0.6933861970901489, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08660565316677094, + "rewards/margins": 0.059811390936374664, + "rewards/rejected": -0.146417036652565, + "sft_loss": 0.8660564422607422, + "step": 251 + }, + { + "epoch": 0.3644251626898048, + "grad_norm": 3.471495045207052, + "learning_rate": 7.972173346732755e-06, + "logits/chosen": 0.7789313793182373, + "logits/rejected": 0.7130517959594727, + "logps/chosen": -0.9272181391716003, + "logps/rejected": -1.905896782875061, + "loss": 0.8858, + "odds_ratio_loss": 0.5296302437782288, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09272181242704391, + "rewards/margins": 0.09786785393953323, + "rewards/rejected": -0.19058966636657715, + "sft_loss": 0.9272181391716003, + "step": 252 + }, + { + "epoch": 0.3658712942877802, + "grad_norm": 10.562055110820904, + "learning_rate": 7.971806434286693e-06, + "logits/chosen": 0.9116954803466797, + "logits/rejected": 0.7316747307777405, + "logps/chosen": -0.6885159015655518, + "logps/rejected": -2.1187455654144287, + "loss": 0.8137, + "odds_ratio_loss": 0.40810999274253845, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06885159015655518, + "rewards/margins": 0.14302296936511993, + "rewards/rejected": -0.21187454462051392, + "sft_loss": 0.6885159015655518, + "step": 253 + }, + { + "epoch": 0.3673174258857556, + "grad_norm": 2.417811083637075, + "learning_rate": 7.97143712722374e-06, + "logits/chosen": 0.7554264664649963, + "logits/rejected": 0.5283613204956055, + "logps/chosen": -0.8525446653366089, + "logps/rejected": -1.62563157081604, + "loss": 0.9782, + "odds_ratio_loss": 0.5037074089050293, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08525446802377701, + "rewards/margins": 0.07730869948863983, + "rewards/rejected": -0.16256316006183624, + "sft_loss": 0.8525446653366089, + "step": 254 + }, + { + "epoch": 0.368763557483731, + "grad_norm": 2.4927832005447828, + "learning_rate": 7.97106542576655e-06, + "logits/chosen": 0.8163594007492065, + "logits/rejected": 0.5682124495506287, + "logps/chosen": -0.6700804829597473, + "logps/rejected": -2.2717583179473877, + "loss": 0.8972, + "odds_ratio_loss": 0.33437198400497437, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06700804829597473, + "rewards/margins": 0.16016778349876404, + "rewards/rejected": -0.22717583179473877, + "sft_loss": 0.6700804829597473, + "step": 255 + }, + { + "epoch": 0.37020968908170643, + "grad_norm": 2.980750378676755, + "learning_rate": 7.970691330139226e-06, + "logits/chosen": 0.7557944655418396, + "logits/rejected": 0.6307120323181152, + "logps/chosen": -0.8848236203193665, + "logps/rejected": -0.9924321174621582, + "loss": 0.871, + "odds_ratio_loss": 0.7175642251968384, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08848236501216888, + "rewards/margins": 0.010760847479104996, + "rewards/rejected": -0.09924320876598358, + "sft_loss": 0.8848236203193665, + "step": 256 + }, + { + "epoch": 0.37165582067968184, + "grad_norm": 4.391248806591652, + "learning_rate": 7.97031484056731e-06, + "logits/chosen": 0.7138476371765137, + "logits/rejected": 0.6476942300796509, + "logps/chosen": -0.7389509677886963, + "logps/rejected": -1.5851773023605347, + "loss": 0.864, + "odds_ratio_loss": 0.41973942518234253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07389508932828903, + "rewards/margins": 0.08462263643741608, + "rewards/rejected": -0.1585177183151245, + "sft_loss": 0.7389509677886963, + "step": 257 + }, + { + "epoch": 0.37310195227765725, + "grad_norm": 3.3143549158128818, + "learning_rate": 7.96993595727779e-06, + "logits/chosen": 0.6371474266052246, + "logits/rejected": 0.5004527568817139, + "logps/chosen": -0.7199169397354126, + "logps/rejected": -2.65303111076355, + "loss": 0.8483, + "odds_ratio_loss": 0.44243472814559937, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0719916895031929, + "rewards/margins": 0.1933114230632782, + "rewards/rejected": -0.2653031349182129, + "sft_loss": 0.7199169397354126, + "step": 258 + }, + { + "epoch": 0.3745480838756327, + "grad_norm": 3.1321180531981687, + "learning_rate": 7.969554680499097e-06, + "logits/chosen": 0.6553201675415039, + "logits/rejected": 0.6746339797973633, + "logps/chosen": -0.8668511509895325, + "logps/rejected": -1.6837364435195923, + "loss": 0.9129, + "odds_ratio_loss": 0.5618002414703369, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08668512105941772, + "rewards/margins": 0.0816885307431221, + "rewards/rejected": -0.16837365925312042, + "sft_loss": 0.8668511509895325, + "step": 259 + }, + { + "epoch": 0.3759942154736081, + "grad_norm": 7.57803343896559, + "learning_rate": 7.969171010461101e-06, + "logits/chosen": 0.6894788146018982, + "logits/rejected": 0.5525285005569458, + "logps/chosen": -0.7234008312225342, + "logps/rejected": -1.6714537143707275, + "loss": 0.7727, + "odds_ratio_loss": 0.38769158720970154, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07234008610248566, + "rewards/margins": 0.09480530768632889, + "rewards/rejected": -0.16714540123939514, + "sft_loss": 0.7234008312225342, + "step": 260 + }, + { + "epoch": 0.3774403470715835, + "grad_norm": 3.0196548162625976, + "learning_rate": 7.968784947395122e-06, + "logits/chosen": 0.7908775210380554, + "logits/rejected": 0.5918086171150208, + "logps/chosen": -1.0163339376449585, + "logps/rejected": -1.382534146308899, + "loss": 0.9061, + "odds_ratio_loss": 0.7340130805969238, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10163339227437973, + "rewards/margins": 0.03662002459168434, + "rewards/rejected": -0.13825342059135437, + "sft_loss": 1.0163339376449585, + "step": 261 + }, + { + "epoch": 0.37888647866955893, + "grad_norm": 4.090776023213169, + "learning_rate": 7.968396491533914e-06, + "logits/chosen": 0.7099351286888123, + "logits/rejected": 0.5062960982322693, + "logps/chosen": -1.0517055988311768, + "logps/rejected": -1.2568538188934326, + "loss": 0.9126, + "odds_ratio_loss": 0.7793716788291931, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10517056286334991, + "rewards/margins": 0.020514825358986855, + "rewards/rejected": -0.12568539381027222, + "sft_loss": 1.0517055988311768, + "step": 262 + }, + { + "epoch": 0.38033261026753434, + "grad_norm": 4.6262771091993224, + "learning_rate": 7.968005643111684e-06, + "logits/chosen": 0.7514022588729858, + "logits/rejected": 0.7052969336509705, + "logps/chosen": -0.9725602865219116, + "logps/rejected": -2.683314323425293, + "loss": 0.898, + "odds_ratio_loss": 0.6760257482528687, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09725602716207504, + "rewards/margins": 0.17107540369033813, + "rewards/rejected": -0.2683314383029938, + "sft_loss": 0.9725602865219116, + "step": 263 + }, + { + "epoch": 0.38177874186550975, + "grad_norm": 3.091384295153508, + "learning_rate": 7.967612402364071e-06, + "logits/chosen": 0.7682317495346069, + "logits/rejected": 0.7247217297554016, + "logps/chosen": -0.8237577676773071, + "logps/rejected": -1.758196234703064, + "loss": 0.8667, + "odds_ratio_loss": 0.7248599529266357, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08237577974796295, + "rewards/margins": 0.0934438556432724, + "rewards/rejected": -0.17581963539123535, + "sft_loss": 0.8237577676773071, + "step": 264 + }, + { + "epoch": 0.38322487346348516, + "grad_norm": 2.6477759807391825, + "learning_rate": 7.967216769528166e-06, + "logits/chosen": 0.6989853382110596, + "logits/rejected": 0.5311872959136963, + "logps/chosen": -0.8110822439193726, + "logps/rejected": -2.2039005756378174, + "loss": 0.9244, + "odds_ratio_loss": 0.5130632519721985, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0811082199215889, + "rewards/margins": 0.13928183913230896, + "rewards/rejected": -0.22039005160331726, + "sft_loss": 0.8110822439193726, + "step": 265 + }, + { + "epoch": 0.38467100506146057, + "grad_norm": 3.867013203125484, + "learning_rate": 7.966818744842494e-06, + "logits/chosen": 0.8495751023292542, + "logits/rejected": 0.74996018409729, + "logps/chosen": -0.7023511528968811, + "logps/rejected": -1.9245699644088745, + "loss": 0.8556, + "odds_ratio_loss": 0.46690189838409424, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07023511826992035, + "rewards/margins": 0.12222187966108322, + "rewards/rejected": -0.19245699048042297, + "sft_loss": 0.7023511528968811, + "step": 266 + }, + { + "epoch": 0.38611713665943603, + "grad_norm": 7.385204554151738, + "learning_rate": 7.966418328547026e-06, + "logits/chosen": 0.821699857711792, + "logits/rejected": 0.6689359545707703, + "logps/chosen": -0.5747587084770203, + "logps/rejected": -1.5719361305236816, + "loss": 0.9209, + "odds_ratio_loss": 0.5139928460121155, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.057475872337818146, + "rewards/margins": 0.09971773624420166, + "rewards/rejected": -0.1571936011314392, + "sft_loss": 0.5747587084770203, + "step": 267 + }, + { + "epoch": 0.38756326825741144, + "grad_norm": 3.0184638626608633, + "learning_rate": 7.966015520883178e-06, + "logits/chosen": 0.7042960524559021, + "logits/rejected": 0.5639068484306335, + "logps/chosen": -0.7901575565338135, + "logps/rejected": -3.015544891357422, + "loss": 0.7691, + "odds_ratio_loss": 0.41159412264823914, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07901576161384583, + "rewards/margins": 0.22253873944282532, + "rewards/rejected": -0.30155447125434875, + "sft_loss": 0.7901575565338135, + "step": 268 + }, + { + "epoch": 0.38900939985538685, + "grad_norm": 4.235634529152362, + "learning_rate": 7.965610322093798e-06, + "logits/chosen": 0.7846585512161255, + "logits/rejected": 0.711887001991272, + "logps/chosen": -0.9673622250556946, + "logps/rejected": -1.4234188795089722, + "loss": 0.8955, + "odds_ratio_loss": 0.6162651777267456, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09673622250556946, + "rewards/margins": 0.04560566693544388, + "rewards/rejected": -0.14234188199043274, + "sft_loss": 0.9673622250556946, + "step": 269 + }, + { + "epoch": 0.39045553145336226, + "grad_norm": 4.376119601183979, + "learning_rate": 7.965202732423186e-06, + "logits/chosen": 0.7393103837966919, + "logits/rejected": 0.5519256591796875, + "logps/chosen": -0.9471575021743774, + "logps/rejected": -1.5052287578582764, + "loss": 0.9612, + "odds_ratio_loss": 0.5077100992202759, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.09471575170755386, + "rewards/margins": 0.055807143449783325, + "rewards/rejected": -0.1505228877067566, + "sft_loss": 0.9471575021743774, + "step": 270 + }, + { + "epoch": 0.39190166305133767, + "grad_norm": 3.621340032032785, + "learning_rate": 7.96479275211708e-06, + "logits/chosen": 0.9538942575454712, + "logits/rejected": 0.7590014934539795, + "logps/chosen": -0.7168223857879639, + "logps/rejected": -1.5027300119400024, + "loss": 0.8363, + "odds_ratio_loss": 0.48245489597320557, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07168224453926086, + "rewards/margins": 0.07859078049659729, + "rewards/rejected": -0.15027301013469696, + "sft_loss": 0.7168223857879639, + "step": 271 + }, + { + "epoch": 0.3933477946493131, + "grad_norm": 3.2832296292942167, + "learning_rate": 7.964380381422656e-06, + "logits/chosen": 0.6526778340339661, + "logits/rejected": 0.3193623423576355, + "logps/chosen": -0.991958737373352, + "logps/rejected": -1.9510480165481567, + "loss": 0.865, + "odds_ratio_loss": 0.51337730884552, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09919588267803192, + "rewards/margins": 0.09590893238782883, + "rewards/rejected": -0.19510480761528015, + "sft_loss": 0.991958737373352, + "step": 272 + }, + { + "epoch": 0.3947939262472885, + "grad_norm": 2.696843063651091, + "learning_rate": 7.963965620588536e-06, + "logits/chosen": 0.7486572861671448, + "logits/rejected": 0.5010773539543152, + "logps/chosen": -0.8629076480865479, + "logps/rejected": -1.6694220304489136, + "loss": 0.7328, + "odds_ratio_loss": 0.5291403532028198, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08629077672958374, + "rewards/margins": 0.08065143972635269, + "rewards/rejected": -0.16694220900535583, + "sft_loss": 0.8629076480865479, + "step": 273 + }, + { + "epoch": 0.3962400578452639, + "grad_norm": 3.1227033798437516, + "learning_rate": 7.96354846986478e-06, + "logits/chosen": 0.7784473299980164, + "logits/rejected": 0.5128236413002014, + "logps/chosen": -0.6188066005706787, + "logps/rejected": -1.8390074968338013, + "loss": 0.8487, + "odds_ratio_loss": 0.3730120360851288, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06188066303730011, + "rewards/margins": 0.12202008068561554, + "rewards/rejected": -0.18390074372291565, + "sft_loss": 0.6188066005706787, + "step": 274 + }, + { + "epoch": 0.39768618944323936, + "grad_norm": 3.6086922714292418, + "learning_rate": 7.963128929502889e-06, + "logits/chosen": 0.8294479250907898, + "logits/rejected": 0.664797842502594, + "logps/chosen": -0.8582887053489685, + "logps/rejected": -1.5387067794799805, + "loss": 0.904, + "odds_ratio_loss": 0.6132429838180542, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08582887053489685, + "rewards/margins": 0.06804181635379791, + "rewards/rejected": -0.15387068688869476, + "sft_loss": 0.8582887053489685, + "step": 275 + }, + { + "epoch": 0.39913232104121477, + "grad_norm": 3.5451162311098243, + "learning_rate": 7.962706999755807e-06, + "logits/chosen": 0.7107446193695068, + "logits/rejected": 0.5381249189376831, + "logps/chosen": -0.8531709909439087, + "logps/rejected": -1.67955482006073, + "loss": 0.9392, + "odds_ratio_loss": 0.4603929817676544, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08531709760427475, + "rewards/margins": 0.08263837546110153, + "rewards/rejected": -0.16795547306537628, + "sft_loss": 0.8531709909439087, + "step": 276 + }, + { + "epoch": 0.4005784526391902, + "grad_norm": 2.388888186464326, + "learning_rate": 7.962282680877915e-06, + "logits/chosen": 0.8900540471076965, + "logits/rejected": 0.5301176905632019, + "logps/chosen": -0.6871480941772461, + "logps/rejected": -2.295354127883911, + "loss": 0.8126, + "odds_ratio_loss": 0.3517339527606964, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06871479749679565, + "rewards/margins": 0.1608206033706665, + "rewards/rejected": -0.22953541576862335, + "sft_loss": 0.6871480941772461, + "step": 277 + }, + { + "epoch": 0.4020245842371656, + "grad_norm": 2.6315560832486775, + "learning_rate": 7.96185597312504e-06, + "logits/chosen": 0.9399300813674927, + "logits/rejected": 0.6871672868728638, + "logps/chosen": -0.6519705653190613, + "logps/rejected": -2.3862929344177246, + "loss": 0.8321, + "odds_ratio_loss": 0.3415951430797577, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06519706547260284, + "rewards/margins": 0.17343223094940186, + "rewards/rejected": -0.2386292815208435, + "sft_loss": 0.6519705653190613, + "step": 278 + }, + { + "epoch": 0.403470715835141, + "grad_norm": 9.473931696319617, + "learning_rate": 7.96142687675444e-06, + "logits/chosen": 0.712382435798645, + "logits/rejected": 0.525117039680481, + "logps/chosen": -0.5526185035705566, + "logps/rejected": -2.451003313064575, + "loss": 0.8902, + "odds_ratio_loss": 0.2894706130027771, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.055261846631765366, + "rewards/margins": 0.1898384690284729, + "rewards/rejected": -0.24510033428668976, + "sft_loss": 0.5526185035705566, + "step": 279 + }, + { + "epoch": 0.4049168474331164, + "grad_norm": 13.10604992325088, + "learning_rate": 7.960995392024826e-06, + "logits/chosen": 0.9458888173103333, + "logits/rejected": 0.6861677169799805, + "logps/chosen": -0.8119727373123169, + "logps/rejected": -1.5805472135543823, + "loss": 0.8809, + "odds_ratio_loss": 0.576356828212738, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08119726926088333, + "rewards/margins": 0.07685744762420654, + "rewards/rejected": -0.15805472433567047, + "sft_loss": 0.8119727373123169, + "step": 280 + }, + { + "epoch": 0.4063629790310918, + "grad_norm": 2.21110470027196, + "learning_rate": 7.960561519196334e-06, + "logits/chosen": 0.7865252494812012, + "logits/rejected": 0.5100930333137512, + "logps/chosen": -0.7717297077178955, + "logps/rejected": -2.293206214904785, + "loss": 0.8973, + "odds_ratio_loss": 0.43415284156799316, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07717297971248627, + "rewards/margins": 0.15214766561985016, + "rewards/rejected": -0.22932063043117523, + "sft_loss": 0.7717297077178955, + "step": 281 + }, + { + "epoch": 0.4078091106290672, + "grad_norm": 3.993304152364151, + "learning_rate": 7.960125258530553e-06, + "logits/chosen": 0.7694143056869507, + "logits/rejected": 0.5560774207115173, + "logps/chosen": -0.8932427167892456, + "logps/rejected": -1.4928797483444214, + "loss": 0.9099, + "odds_ratio_loss": 0.5252251029014587, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08932427316904068, + "rewards/margins": 0.05996370688080788, + "rewards/rejected": -0.14928798377513885, + "sft_loss": 0.8932427167892456, + "step": 282 + }, + { + "epoch": 0.4092552422270427, + "grad_norm": 3.0189154414124775, + "learning_rate": 7.959686610290504e-06, + "logits/chosen": 0.5652546882629395, + "logits/rejected": 0.5164136290550232, + "logps/chosen": -0.7622696161270142, + "logps/rejected": -1.2584534883499146, + "loss": 0.8417, + "odds_ratio_loss": 0.4685708284378052, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07622695714235306, + "rewards/margins": 0.04961838945746422, + "rewards/rejected": -0.12584535777568817, + "sft_loss": 0.7622696161270142, + "step": 283 + }, + { + "epoch": 0.4107013738250181, + "grad_norm": 2.5548285360039285, + "learning_rate": 7.959245574740652e-06, + "logits/chosen": 0.868388831615448, + "logits/rejected": 0.6646077036857605, + "logps/chosen": -0.7296291589736938, + "logps/rejected": -1.3972752094268799, + "loss": 0.7955, + "odds_ratio_loss": 0.4898286461830139, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0729629248380661, + "rewards/margins": 0.06676461547613144, + "rewards/rejected": -0.13972753286361694, + "sft_loss": 0.7296291589736938, + "step": 284 + }, + { + "epoch": 0.4121475054229935, + "grad_norm": 4.044250113535489, + "learning_rate": 7.958802152146895e-06, + "logits/chosen": 0.8534368276596069, + "logits/rejected": 0.5839318633079529, + "logps/chosen": -0.7515661120414734, + "logps/rejected": -1.5278918743133545, + "loss": 0.8219, + "odds_ratio_loss": 0.44184809923171997, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07515661418437958, + "rewards/margins": 0.07763257622718811, + "rewards/rejected": -0.1527891904115677, + "sft_loss": 0.7515661120414734, + "step": 285 + }, + { + "epoch": 0.4135936370209689, + "grad_norm": 3.854922628972753, + "learning_rate": 7.958356342776576e-06, + "logits/chosen": 0.8270955085754395, + "logits/rejected": 0.7576125860214233, + "logps/chosen": -0.7885029315948486, + "logps/rejected": -1.602339744567871, + "loss": 0.8647, + "odds_ratio_loss": 0.6078847050666809, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07885029166936874, + "rewards/margins": 0.08138368278741837, + "rewards/rejected": -0.1602339744567871, + "sft_loss": 0.7885029315948486, + "step": 286 + }, + { + "epoch": 0.4150397686189443, + "grad_norm": 2.97233807277169, + "learning_rate": 7.957908146898477e-06, + "logits/chosen": 0.8544989228248596, + "logits/rejected": 0.6331669092178345, + "logps/chosen": -0.9146354794502258, + "logps/rejected": -1.7395083904266357, + "loss": 0.9325, + "odds_ratio_loss": 0.6201297044754028, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09146355092525482, + "rewards/margins": 0.08248727768659592, + "rewards/rejected": -0.17395083606243134, + "sft_loss": 0.9146354794502258, + "step": 287 + }, + { + "epoch": 0.4164859002169197, + "grad_norm": 3.9764262611407064, + "learning_rate": 7.957457564782816e-06, + "logits/chosen": 0.7527596950531006, + "logits/rejected": 0.619892418384552, + "logps/chosen": -0.7813956141471863, + "logps/rejected": -2.4308414459228516, + "loss": 0.7837, + "odds_ratio_loss": 0.547453761100769, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07813956588506699, + "rewards/margins": 0.164944589138031, + "rewards/rejected": -0.2430841624736786, + "sft_loss": 0.7813956141471863, + "step": 288 + }, + { + "epoch": 0.41793203181489513, + "grad_norm": 7.563020631354844, + "learning_rate": 7.95700459670125e-06, + "logits/chosen": 0.8600162863731384, + "logits/rejected": 0.6054329872131348, + "logps/chosen": -0.8736534714698792, + "logps/rejected": -2.9901254177093506, + "loss": 0.9142, + "odds_ratio_loss": 0.5161182880401611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08736535161733627, + "rewards/margins": 0.211647167801857, + "rewards/rejected": -0.29901254177093506, + "sft_loss": 0.8736534714698792, + "step": 289 + }, + { + "epoch": 0.4193781634128706, + "grad_norm": 2.4611000413126676, + "learning_rate": 7.956549242926872e-06, + "logits/chosen": 0.7825429439544678, + "logits/rejected": 0.5044749975204468, + "logps/chosen": -0.7324574589729309, + "logps/rejected": -1.9676034450531006, + "loss": 0.7825, + "odds_ratio_loss": 0.4541637897491455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07324574887752533, + "rewards/margins": 0.12351461499929428, + "rewards/rejected": -0.196760356426239, + "sft_loss": 0.7324574589729309, + "step": 290 + }, + { + "epoch": 0.420824295010846, + "grad_norm": 3.0288336601213315, + "learning_rate": 7.956091503734223e-06, + "logits/chosen": 0.7554039359092712, + "logits/rejected": 0.5739782452583313, + "logps/chosen": -0.5028554797172546, + "logps/rejected": -3.680788993835449, + "loss": 0.6984, + "odds_ratio_loss": 0.28743448853492737, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.050285547971725464, + "rewards/margins": 0.3177933394908905, + "rewards/rejected": -0.36807888746261597, + "sft_loss": 0.5028554797172546, + "step": 291 + }, + { + "epoch": 0.4222704266088214, + "grad_norm": 3.5405260189111947, + "learning_rate": 7.955631379399271e-06, + "logits/chosen": 0.7954279780387878, + "logits/rejected": 0.5444122552871704, + "logps/chosen": -0.6753735542297363, + "logps/rejected": -2.6563949584960938, + "loss": 0.7875, + "odds_ratio_loss": 0.43333977460861206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06753735989332199, + "rewards/margins": 0.19810214638710022, + "rewards/rejected": -0.2656395137310028, + "sft_loss": 0.6753735542297363, + "step": 292 + }, + { + "epoch": 0.4237165582067968, + "grad_norm": 3.1754832413171004, + "learning_rate": 7.955168870199428e-06, + "logits/chosen": 0.7190636992454529, + "logits/rejected": 0.5010616779327393, + "logps/chosen": -0.7396849393844604, + "logps/rejected": -2.7797493934631348, + "loss": 0.9183, + "odds_ratio_loss": 0.43546822667121887, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07396849989891052, + "rewards/margins": 0.20400643348693848, + "rewards/rejected": -0.277974933385849, + "sft_loss": 0.7396849393844604, + "step": 293 + }, + { + "epoch": 0.42516268980477223, + "grad_norm": 4.057110333121107, + "learning_rate": 7.954703976413544e-06, + "logits/chosen": 0.561728298664093, + "logits/rejected": 0.47190630435943604, + "logps/chosen": -0.9969494342803955, + "logps/rejected": -1.62455153465271, + "loss": 0.9558, + "odds_ratio_loss": 0.6601800322532654, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09969495236873627, + "rewards/margins": 0.06276021897792816, + "rewards/rejected": -0.16245517134666443, + "sft_loss": 0.9969494342803955, + "step": 294 + }, + { + "epoch": 0.42660882140274764, + "grad_norm": 2.864194448801957, + "learning_rate": 7.954236698321901e-06, + "logits/chosen": 0.6871510744094849, + "logits/rejected": 0.465214341878891, + "logps/chosen": -0.6832473278045654, + "logps/rejected": -1.775686502456665, + "loss": 0.7974, + "odds_ratio_loss": 0.4238354563713074, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0683247298002243, + "rewards/margins": 0.10924392938613892, + "rewards/rejected": -0.17756864428520203, + "sft_loss": 0.6832473278045654, + "step": 295 + }, + { + "epoch": 0.42805495300072305, + "grad_norm": 4.022318915961337, + "learning_rate": 7.953767036206228e-06, + "logits/chosen": 0.548240602016449, + "logits/rejected": 0.49791282415390015, + "logps/chosen": -0.8850837349891663, + "logps/rejected": -1.8481075763702393, + "loss": 0.9559, + "odds_ratio_loss": 0.4951096773147583, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08850837498903275, + "rewards/margins": 0.09630238264799118, + "rewards/rejected": -0.18481075763702393, + "sft_loss": 0.8850837349891663, + "step": 296 + }, + { + "epoch": 0.42950108459869846, + "grad_norm": 4.1252342060496066, + "learning_rate": 7.953294990349683e-06, + "logits/chosen": 0.6419621706008911, + "logits/rejected": 0.636412501335144, + "logps/chosen": -0.929691731929779, + "logps/rejected": -2.140536069869995, + "loss": 0.9064, + "odds_ratio_loss": 0.6660383343696594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09296917915344238, + "rewards/margins": 0.12108444422483444, + "rewards/rejected": -0.21405361592769623, + "sft_loss": 0.929691731929779, + "step": 297 + }, + { + "epoch": 0.4309472161966739, + "grad_norm": 6.395196153201281, + "learning_rate": 7.952820561036864e-06, + "logits/chosen": 0.6038861870765686, + "logits/rejected": 0.4589577913284302, + "logps/chosen": -0.7986758351325989, + "logps/rejected": -2.17226505279541, + "loss": 0.9359, + "odds_ratio_loss": 0.5335122346878052, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07986757904291153, + "rewards/margins": 0.13735893368721008, + "rewards/rejected": -0.21722650527954102, + "sft_loss": 0.7986758351325989, + "step": 298 + }, + { + "epoch": 0.43239334779464933, + "grad_norm": 2.7876002435377765, + "learning_rate": 7.952343748553806e-06, + "logits/chosen": 0.6711763739585876, + "logits/rejected": 0.6253769397735596, + "logps/chosen": -0.820818305015564, + "logps/rejected": -1.4959368705749512, + "loss": 0.9121, + "odds_ratio_loss": 0.5811915993690491, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08208183199167252, + "rewards/margins": 0.06751186400651932, + "rewards/rejected": -0.14959371089935303, + "sft_loss": 0.820818305015564, + "step": 299 + }, + { + "epoch": 0.43383947939262474, + "grad_norm": 2.9225860620617152, + "learning_rate": 7.951864553187983e-06, + "logits/chosen": 0.6169509291648865, + "logits/rejected": 0.5235470533370972, + "logps/chosen": -0.6802053451538086, + "logps/rejected": -2.5945489406585693, + "loss": 0.7323, + "odds_ratio_loss": 0.3930429518222809, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0680205300450325, + "rewards/margins": 0.19143438339233398, + "rewards/rejected": -0.2594549059867859, + "sft_loss": 0.6802053451538086, + "step": 300 + }, + { + "epoch": 0.43528561099060015, + "grad_norm": 3.2161174399034347, + "learning_rate": 7.951382975228301e-06, + "logits/chosen": 0.7169187068939209, + "logits/rejected": 0.7688310146331787, + "logps/chosen": -0.8765419721603394, + "logps/rejected": -1.1146844625473022, + "loss": 0.9063, + "odds_ratio_loss": 0.6021533012390137, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08765420317649841, + "rewards/margins": 0.02381424978375435, + "rewards/rejected": -0.11146844923496246, + "sft_loss": 0.8765419721603394, + "step": 301 + }, + { + "epoch": 0.43673174258857556, + "grad_norm": 3.0896141078168187, + "learning_rate": 7.95089901496511e-06, + "logits/chosen": 0.8363983631134033, + "logits/rejected": 0.5107532739639282, + "logps/chosen": -0.8250682353973389, + "logps/rejected": -2.3571953773498535, + "loss": 0.7644, + "odds_ratio_loss": 0.5739777088165283, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08250682055950165, + "rewards/margins": 0.15321271121501923, + "rewards/rejected": -0.23571954667568207, + "sft_loss": 0.8250682353973389, + "step": 302 + }, + { + "epoch": 0.43817787418655096, + "grad_norm": 3.811865270406822, + "learning_rate": 7.950412672690186e-06, + "logits/chosen": 0.6645305752754211, + "logits/rejected": 0.5417653322219849, + "logps/chosen": -0.9253389835357666, + "logps/rejected": -2.211357831954956, + "loss": 0.9014, + "odds_ratio_loss": 0.5837020874023438, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0925339013338089, + "rewards/margins": 0.12860189378261566, + "rewards/rejected": -0.22113579511642456, + "sft_loss": 0.9253389835357666, + "step": 303 + }, + { + "epoch": 0.4396240057845264, + "grad_norm": 2.672569700743552, + "learning_rate": 7.94992394869675e-06, + "logits/chosen": 0.7562464475631714, + "logits/rejected": 0.536525309085846, + "logps/chosen": -0.8378995656967163, + "logps/rejected": -1.695633888244629, + "loss": 0.8967, + "odds_ratio_loss": 0.4167096018791199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08378996700048447, + "rewards/margins": 0.08577344566583633, + "rewards/rejected": -0.1695634126663208, + "sft_loss": 0.8378995656967163, + "step": 304 + }, + { + "epoch": 0.4410701373825018, + "grad_norm": 3.008113742586865, + "learning_rate": 7.949432843279453e-06, + "logits/chosen": 0.7660605311393738, + "logits/rejected": 0.599168598651886, + "logps/chosen": -0.6577078700065613, + "logps/rejected": -2.418144464492798, + "loss": 0.8728, + "odds_ratio_loss": 0.3436301648616791, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06577078998088837, + "rewards/margins": 0.17604365944862366, + "rewards/rejected": -0.24181444942951202, + "sft_loss": 0.6577078700065613, + "step": 305 + }, + { + "epoch": 0.44251626898047725, + "grad_norm": 2.916373711671278, + "learning_rate": 7.948939356734385e-06, + "logits/chosen": 0.7118316292762756, + "logits/rejected": 0.5230697393417358, + "logps/chosen": -0.6964635252952576, + "logps/rejected": -1.8553109169006348, + "loss": 0.9158, + "odds_ratio_loss": 0.4708198308944702, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06964635848999023, + "rewards/margins": 0.11588473618030548, + "rewards/rejected": -0.1855311095714569, + "sft_loss": 0.6964635252952576, + "step": 306 + }, + { + "epoch": 0.44396240057845265, + "grad_norm": 3.1516702895048816, + "learning_rate": 7.948443489359071e-06, + "logits/chosen": 0.8500151634216309, + "logits/rejected": 0.686989426612854, + "logps/chosen": -0.6953170895576477, + "logps/rejected": -1.5989413261413574, + "loss": 0.8902, + "odds_ratio_loss": 0.3981609046459198, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06953170895576477, + "rewards/margins": 0.09036242961883545, + "rewards/rejected": -0.15989412367343903, + "sft_loss": 0.6953170895576477, + "step": 307 + }, + { + "epoch": 0.44540853217642806, + "grad_norm": 3.5710829304772584, + "learning_rate": 7.947945241452475e-06, + "logits/chosen": 0.5635668039321899, + "logits/rejected": 0.46536314487457275, + "logps/chosen": -0.8453367948532104, + "logps/rejected": -1.4774147272109985, + "loss": 0.8867, + "odds_ratio_loss": 0.5234245657920837, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0845336765050888, + "rewards/margins": 0.06320779025554657, + "rewards/rejected": -0.14774146676063538, + "sft_loss": 0.8453367948532104, + "step": 308 + }, + { + "epoch": 0.44685466377440347, + "grad_norm": 3.390233696947453, + "learning_rate": 7.947444613314986e-06, + "logits/chosen": 0.7768136262893677, + "logits/rejected": 0.660037100315094, + "logps/chosen": -0.8382095098495483, + "logps/rejected": -1.4200916290283203, + "loss": 0.797, + "odds_ratio_loss": 0.5195042490959167, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08382095396518707, + "rewards/margins": 0.05818820744752884, + "rewards/rejected": -0.1420091688632965, + "sft_loss": 0.8382095098495483, + "step": 309 + }, + { + "epoch": 0.4483007953723789, + "grad_norm": 4.509100866130907, + "learning_rate": 7.94694160524844e-06, + "logits/chosen": 0.5886682868003845, + "logits/rejected": 0.384355753660202, + "logps/chosen": -0.867601752281189, + "logps/rejected": -1.5529288053512573, + "loss": 0.8558, + "odds_ratio_loss": 0.5688661932945251, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08676017820835114, + "rewards/margins": 0.06853270530700684, + "rewards/rejected": -0.15529288351535797, + "sft_loss": 0.867601752281189, + "step": 310 + }, + { + "epoch": 0.4497469269703543, + "grad_norm": 2.1647577543237664, + "learning_rate": 7.946436217556099e-06, + "logits/chosen": 0.6968039274215698, + "logits/rejected": 0.5218100547790527, + "logps/chosen": -0.693936288356781, + "logps/rejected": -1.393923282623291, + "loss": 0.7933, + "odds_ratio_loss": 0.41789939999580383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06939362734556198, + "rewards/margins": 0.06999869644641876, + "rewards/rejected": -0.13939233124256134, + "sft_loss": 0.693936288356781, + "step": 311 + }, + { + "epoch": 0.4511930585683297, + "grad_norm": 4.30465084520022, + "learning_rate": 7.945928450542664e-06, + "logits/chosen": 0.5718408226966858, + "logits/rejected": 0.6034159064292908, + "logps/chosen": -0.8760853409767151, + "logps/rejected": -1.6363561153411865, + "loss": 0.8284, + "odds_ratio_loss": 0.40968573093414307, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08760854601860046, + "rewards/margins": 0.07602708041667938, + "rewards/rejected": -0.16363561153411865, + "sft_loss": 0.8760853409767151, + "step": 312 + }, + { + "epoch": 0.45263919016630516, + "grad_norm": 3.0022074366655866, + "learning_rate": 7.94541830451427e-06, + "logits/chosen": 0.5793265104293823, + "logits/rejected": 0.42465144395828247, + "logps/chosen": -0.5700391530990601, + "logps/rejected": -2.0401134490966797, + "loss": 0.8435, + "odds_ratio_loss": 0.29582464694976807, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.057003915309906006, + "rewards/margins": 0.14700743556022644, + "rewards/rejected": -0.20401133596897125, + "sft_loss": 0.5700391530990601, + "step": 313 + }, + { + "epoch": 0.45408532176428057, + "grad_norm": 2.481391841589562, + "learning_rate": 7.944905779778487e-06, + "logits/chosen": 0.6300640106201172, + "logits/rejected": 0.5014770030975342, + "logps/chosen": -0.6986908912658691, + "logps/rejected": -1.7633607387542725, + "loss": 0.7659, + "odds_ratio_loss": 0.5082624554634094, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06986908614635468, + "rewards/margins": 0.10646697133779526, + "rewards/rejected": -0.17633606493473053, + "sft_loss": 0.6986908912658691, + "step": 314 + }, + { + "epoch": 0.455531453362256, + "grad_norm": 2.6200652171864083, + "learning_rate": 7.944390876644317e-06, + "logits/chosen": 0.5968663096427917, + "logits/rejected": 0.5737828016281128, + "logps/chosen": -0.7979955077171326, + "logps/rejected": -1.5119889974594116, + "loss": 0.8404, + "odds_ratio_loss": 0.6428624391555786, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07979955524206161, + "rewards/margins": 0.07139935344457626, + "rewards/rejected": -0.15119890868663788, + "sft_loss": 0.7979955077171326, + "step": 315 + }, + { + "epoch": 0.4569775849602314, + "grad_norm": 3.2994965196511035, + "learning_rate": 7.943873595422195e-06, + "logits/chosen": 0.7686248421669006, + "logits/rejected": 0.6836127638816833, + "logps/chosen": -0.8711780309677124, + "logps/rejected": -1.6719789505004883, + "loss": 0.8572, + "odds_ratio_loss": 0.4641047716140747, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08711780607700348, + "rewards/margins": 0.08008009195327759, + "rewards/rejected": -0.16719789803028107, + "sft_loss": 0.8711780309677124, + "step": 316 + }, + { + "epoch": 0.4584237165582068, + "grad_norm": 3.644622791431945, + "learning_rate": 7.943353936423996e-06, + "logits/chosen": 0.8206264972686768, + "logits/rejected": 0.5815707445144653, + "logps/chosen": -0.9250833988189697, + "logps/rejected": -1.7713178396224976, + "loss": 0.8729, + "odds_ratio_loss": 0.5220272541046143, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09250834584236145, + "rewards/margins": 0.08462343364953995, + "rewards/rejected": -0.1771317720413208, + "sft_loss": 0.9250833988189697, + "step": 317 + }, + { + "epoch": 0.4598698481561822, + "grad_norm": 3.631087682077212, + "learning_rate": 7.94283189996302e-06, + "logits/chosen": 0.6323086619377136, + "logits/rejected": 0.5804564952850342, + "logps/chosen": -0.835005521774292, + "logps/rejected": -1.089737892150879, + "loss": 0.7905, + "odds_ratio_loss": 0.609061062335968, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08350054919719696, + "rewards/margins": 0.025473233312368393, + "rewards/rejected": -0.10897378623485565, + "sft_loss": 0.835005521774292, + "step": 318 + }, + { + "epoch": 0.4613159797541576, + "grad_norm": 3.383821151178241, + "learning_rate": 7.942307486354009e-06, + "logits/chosen": 0.5765923857688904, + "logits/rejected": 0.4810538589954376, + "logps/chosen": -0.7016163468360901, + "logps/rejected": -1.5007145404815674, + "loss": 0.8345, + "odds_ratio_loss": 0.41251441836357117, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07016163319349289, + "rewards/margins": 0.07990982383489609, + "rewards/rejected": -0.15007147192955017, + "sft_loss": 0.7016163468360901, + "step": 319 + }, + { + "epoch": 0.462762111352133, + "grad_norm": 6.031998545816025, + "learning_rate": 7.94178069591313e-06, + "logits/chosen": 0.7171785235404968, + "logits/rejected": 0.6892584562301636, + "logps/chosen": -0.8179836273193359, + "logps/rejected": -1.0612622499465942, + "loss": 0.8643, + "odds_ratio_loss": 0.5907955169677734, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08179835230112076, + "rewards/margins": 0.024327874183654785, + "rewards/rejected": -0.10612623393535614, + "sft_loss": 0.8179836273193359, + "step": 320 + }, + { + "epoch": 0.4642082429501085, + "grad_norm": 3.4977656147220975, + "learning_rate": 7.94125152895799e-06, + "logits/chosen": 0.6936742067337036, + "logits/rejected": 0.5295305848121643, + "logps/chosen": -0.7622445821762085, + "logps/rejected": -1.9071930646896362, + "loss": 0.7984, + "odds_ratio_loss": 0.4924374222755432, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07622446119785309, + "rewards/margins": 0.11449483782052994, + "rewards/rejected": -0.19071930646896362, + "sft_loss": 0.7622445821762085, + "step": 321 + }, + { + "epoch": 0.4656543745480839, + "grad_norm": 2.5042671601600475, + "learning_rate": 7.940719985807624e-06, + "logits/chosen": 0.7296179533004761, + "logits/rejected": 0.49951764941215515, + "logps/chosen": -0.6940515041351318, + "logps/rejected": -1.7299174070358276, + "loss": 0.7871, + "odds_ratio_loss": 0.47463756799697876, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06940515339374542, + "rewards/margins": 0.1035865843296051, + "rewards/rejected": -0.17299173772335052, + "sft_loss": 0.6940515041351318, + "step": 322 + }, + { + "epoch": 0.4671005061460593, + "grad_norm": 3.292946892095406, + "learning_rate": 7.9401860667825e-06, + "logits/chosen": 0.7062532305717468, + "logits/rejected": 0.4862893223762512, + "logps/chosen": -0.6891757249832153, + "logps/rejected": -1.7076308727264404, + "loss": 0.7987, + "odds_ratio_loss": 0.35063761472702026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06891757249832153, + "rewards/margins": 0.10184551775455475, + "rewards/rejected": -0.17076310515403748, + "sft_loss": 0.6891757249832153, + "step": 323 + }, + { + "epoch": 0.4685466377440347, + "grad_norm": 3.6446604374799274, + "learning_rate": 7.939649772204524e-06, + "logits/chosen": 0.6098726987838745, + "logits/rejected": 0.44188717007637024, + "logps/chosen": -0.9559364318847656, + "logps/rejected": -1.3839476108551025, + "loss": 0.8978, + "odds_ratio_loss": 0.5117559432983398, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0955936461687088, + "rewards/margins": 0.04280112311244011, + "rewards/rejected": -0.1383947730064392, + "sft_loss": 0.9559364318847656, + "step": 324 + }, + { + "epoch": 0.4699927693420101, + "grad_norm": 3.584034339628172, + "learning_rate": 7.939111102397025e-06, + "logits/chosen": 0.6634198427200317, + "logits/rejected": 0.5811032652854919, + "logps/chosen": -0.6490356922149658, + "logps/rejected": -1.5838490724563599, + "loss": 0.8837, + "odds_ratio_loss": 0.3766994774341583, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06490357220172882, + "rewards/margins": 0.09348133206367493, + "rewards/rejected": -0.15838490426540375, + "sft_loss": 0.6490356922149658, + "step": 325 + }, + { + "epoch": 0.47143890093998553, + "grad_norm": 3.5189379597603314, + "learning_rate": 7.938570057684775e-06, + "logits/chosen": 0.5340790748596191, + "logits/rejected": 0.4121875464916229, + "logps/chosen": -0.8325109481811523, + "logps/rejected": -1.5845043659210205, + "loss": 0.8627, + "odds_ratio_loss": 0.49629390239715576, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08325108885765076, + "rewards/margins": 0.07519934326410294, + "rewards/rejected": -0.1584504395723343, + "sft_loss": 0.8325109481811523, + "step": 326 + }, + { + "epoch": 0.47288503253796094, + "grad_norm": 3.5375364033379837, + "learning_rate": 7.938026638393967e-06, + "logits/chosen": 0.75108802318573, + "logits/rejected": 0.6010204553604126, + "logps/chosen": -0.7769035696983337, + "logps/rejected": -1.7428910732269287, + "loss": 0.8074, + "odds_ratio_loss": 0.4815066456794739, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07769035547971725, + "rewards/margins": 0.09659874439239502, + "rewards/rejected": -0.17428909242153168, + "sft_loss": 0.7769035696983337, + "step": 327 + }, + { + "epoch": 0.47433116413593635, + "grad_norm": 4.074796522105809, + "learning_rate": 7.93748084485223e-06, + "logits/chosen": 0.6220093965530396, + "logits/rejected": 0.4709737300872803, + "logps/chosen": -0.6528467535972595, + "logps/rejected": -1.3154959678649902, + "loss": 0.8344, + "odds_ratio_loss": 0.4504534602165222, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06528467684984207, + "rewards/margins": 0.06626491993665695, + "rewards/rejected": -0.13154959678649902, + "sft_loss": 0.6528467535972595, + "step": 328 + }, + { + "epoch": 0.4757772957339118, + "grad_norm": 2.9413111640887206, + "learning_rate": 7.936932677388629e-06, + "logits/chosen": 0.48603928089141846, + "logits/rejected": 0.4315299391746521, + "logps/chosen": -0.9008134603500366, + "logps/rejected": -1.2313823699951172, + "loss": 0.8851, + "odds_ratio_loss": 0.5827435255050659, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0900813490152359, + "rewards/margins": 0.033056896179914474, + "rewards/rejected": -0.12313823401927948, + "sft_loss": 0.9008134603500366, + "step": 329 + }, + { + "epoch": 0.4772234273318872, + "grad_norm": 4.2494599421762045, + "learning_rate": 7.936382136333653e-06, + "logits/chosen": 0.7024039030075073, + "logits/rejected": 0.5576363801956177, + "logps/chosen": -0.9647277593612671, + "logps/rejected": -2.111638069152832, + "loss": 0.9772, + "odds_ratio_loss": 0.5717940330505371, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09647278487682343, + "rewards/margins": 0.11469104140996933, + "rewards/rejected": -0.21116381883621216, + "sft_loss": 0.9647277593612671, + "step": 330 + }, + { + "epoch": 0.4786695589298626, + "grad_norm": 4.782077019617622, + "learning_rate": 7.935829222019228e-06, + "logits/chosen": 0.6256594657897949, + "logits/rejected": 0.5600475072860718, + "logps/chosen": -0.9119490385055542, + "logps/rejected": -1.3465038537979126, + "loss": 0.8941, + "odds_ratio_loss": 0.6236871480941772, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09119491279125214, + "rewards/margins": 0.04345548897981644, + "rewards/rejected": -0.13465039432048798, + "sft_loss": 0.9119490385055542, + "step": 331 + }, + { + "epoch": 0.48011569052783803, + "grad_norm": 3.528445359856052, + "learning_rate": 7.935273934778704e-06, + "logits/chosen": 0.5765432715415955, + "logits/rejected": 0.43297064304351807, + "logps/chosen": -0.7337332963943481, + "logps/rejected": -1.637449860572815, + "loss": 0.7898, + "odds_ratio_loss": 0.3902113437652588, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07337333261966705, + "rewards/margins": 0.09037166088819504, + "rewards/rejected": -0.1637450009584427, + "sft_loss": 0.7337332963943481, + "step": 332 + }, + { + "epoch": 0.48156182212581344, + "grad_norm": 2.9637343259088826, + "learning_rate": 7.93471627494687e-06, + "logits/chosen": 0.6577078700065613, + "logits/rejected": 0.5420234203338623, + "logps/chosen": -0.9077745676040649, + "logps/rejected": -0.8908224105834961, + "loss": 0.9123, + "odds_ratio_loss": 0.7640987634658813, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09077746421098709, + "rewards/margins": -0.001695222221314907, + "rewards/rejected": -0.08908224105834961, + "sft_loss": 0.9077745676040649, + "step": 333 + }, + { + "epoch": 0.48300795372378885, + "grad_norm": 3.087555470787611, + "learning_rate": 7.934156242859939e-06, + "logits/chosen": 0.5774292945861816, + "logits/rejected": 0.5972946882247925, + "logps/chosen": -0.7331134676933289, + "logps/rejected": -2.0628983974456787, + "loss": 0.8601, + "odds_ratio_loss": 0.30700796842575073, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07331134378910065, + "rewards/margins": 0.13297849893569946, + "rewards/rejected": -0.2062898576259613, + "sft_loss": 0.7331134676933289, + "step": 334 + }, + { + "epoch": 0.48445408532176426, + "grad_norm": 3.454287106669069, + "learning_rate": 7.933593838855558e-06, + "logits/chosen": 0.45803093910217285, + "logits/rejected": 0.3993680775165558, + "logps/chosen": -1.1503312587738037, + "logps/rejected": -1.1803433895111084, + "loss": 1.0082, + "odds_ratio_loss": 0.7515895962715149, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11503313481807709, + "rewards/margins": 0.0030012130737304688, + "rewards/rejected": -0.11803434044122696, + "sft_loss": 1.1503312587738037, + "step": 335 + }, + { + "epoch": 0.48590021691973967, + "grad_norm": 4.234184234884144, + "learning_rate": 7.9330290632728e-06, + "logits/chosen": 0.4678802490234375, + "logits/rejected": 0.39046192169189453, + "logps/chosen": -0.9035397171974182, + "logps/rejected": -1.1667201519012451, + "loss": 0.9164, + "odds_ratio_loss": 0.623311460018158, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09035398066043854, + "rewards/margins": 0.02631804160773754, + "rewards/rejected": -0.11667200922966003, + "sft_loss": 0.9035397171974182, + "step": 336 + }, + { + "epoch": 0.48734634851771513, + "grad_norm": 2.868596882817971, + "learning_rate": 7.93246191645217e-06, + "logits/chosen": 0.5796488523483276, + "logits/rejected": 0.5661954283714294, + "logps/chosen": -0.7045271396636963, + "logps/rejected": -1.4447965621948242, + "loss": 0.8614, + "odds_ratio_loss": 0.4747796356678009, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0704527199268341, + "rewards/margins": 0.07402694970369339, + "rewards/rejected": -0.1444796621799469, + "sft_loss": 0.7045271396636963, + "step": 337 + }, + { + "epoch": 0.48879248011569054, + "grad_norm": 4.342722573935673, + "learning_rate": 7.931892398735607e-06, + "logits/chosen": 0.600724458694458, + "logits/rejected": 0.5414420366287231, + "logps/chosen": -0.7105461955070496, + "logps/rejected": -1.3550055027008057, + "loss": 0.8715, + "odds_ratio_loss": 0.4188329875469208, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0710546150803566, + "rewards/margins": 0.06444593518972397, + "rewards/rejected": -0.13550056517124176, + "sft_loss": 0.7105461955070496, + "step": 338 + }, + { + "epoch": 0.49023861171366595, + "grad_norm": 4.582201948309204, + "learning_rate": 7.931320510466472e-06, + "logits/chosen": 0.5396424531936646, + "logits/rejected": 0.3977088928222656, + "logps/chosen": -0.9911842346191406, + "logps/rejected": -1.341888427734375, + "loss": 0.9325, + "odds_ratio_loss": 0.741445004940033, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0991184264421463, + "rewards/margins": 0.03507041931152344, + "rewards/rejected": -0.13418884575366974, + "sft_loss": 0.9911842346191406, + "step": 339 + }, + { + "epoch": 0.49168474331164136, + "grad_norm": 2.782969535100825, + "learning_rate": 7.930746251989558e-06, + "logits/chosen": 0.5857880711555481, + "logits/rejected": 0.46557682752609253, + "logps/chosen": -0.8816125392913818, + "logps/rejected": -1.6498929262161255, + "loss": 0.8827, + "odds_ratio_loss": 0.5967287421226501, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08816125243902206, + "rewards/margins": 0.07682802528142929, + "rewards/rejected": -0.16498929262161255, + "sft_loss": 0.8816125392913818, + "step": 340 + }, + { + "epoch": 0.49313087490961677, + "grad_norm": 3.50913863564006, + "learning_rate": 7.930169623651092e-06, + "logits/chosen": 0.7082901000976562, + "logits/rejected": 0.5184527039527893, + "logps/chosen": -0.6331555247306824, + "logps/rejected": -2.131300449371338, + "loss": 0.8071, + "odds_ratio_loss": 0.5050643086433411, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06331555545330048, + "rewards/margins": 0.14981448650360107, + "rewards/rejected": -0.21313002705574036, + "sft_loss": 0.6331555247306824, + "step": 341 + }, + { + "epoch": 0.4945770065075922, + "grad_norm": 4.334338505420553, + "learning_rate": 7.92959062579872e-06, + "logits/chosen": 0.667914628982544, + "logits/rejected": 0.5001978874206543, + "logps/chosen": -0.7079198360443115, + "logps/rejected": -1.6759718656539917, + "loss": 0.8597, + "odds_ratio_loss": 0.46850132942199707, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07079198956489563, + "rewards/margins": 0.09680519998073578, + "rewards/rejected": -0.1675971895456314, + "sft_loss": 0.7079198360443115, + "step": 342 + }, + { + "epoch": 0.4960231381055676, + "grad_norm": 10.026558896952501, + "learning_rate": 7.929009258781526e-06, + "logits/chosen": 0.5944724082946777, + "logits/rejected": 0.4023160934448242, + "logps/chosen": -0.758367657661438, + "logps/rejected": -2.4440927505493164, + "loss": 0.864, + "odds_ratio_loss": 0.45333942770957947, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07583675533533096, + "rewards/margins": 0.16857251524925232, + "rewards/rejected": -0.24440926313400269, + "sft_loss": 0.758367657661438, + "step": 343 + }, + { + "epoch": 0.49746926970354305, + "grad_norm": 2.533667984333284, + "learning_rate": 7.928425522950015e-06, + "logits/chosen": 0.6907603144645691, + "logits/rejected": 0.5649253129959106, + "logps/chosen": -0.7110158205032349, + "logps/rejected": -1.8234652280807495, + "loss": 0.7522, + "odds_ratio_loss": 0.40478286147117615, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0711015835404396, + "rewards/margins": 0.11124493926763535, + "rewards/rejected": -0.18234652280807495, + "sft_loss": 0.7110158205032349, + "step": 344 + }, + { + "epoch": 0.49891540130151846, + "grad_norm": 3.575246323886135, + "learning_rate": 7.927839418656126e-06, + "logits/chosen": 0.7409899830818176, + "logits/rejected": 0.6180683970451355, + "logps/chosen": -0.6791451573371887, + "logps/rejected": -1.5107991695404053, + "loss": 0.8689, + "odds_ratio_loss": 0.472514271736145, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06791451573371887, + "rewards/margins": 0.08316539227962494, + "rewards/rejected": -0.1510799080133438, + "sft_loss": 0.6791451573371887, + "step": 345 + }, + { + "epoch": 0.5003615328994938, + "grad_norm": 5.550086385061495, + "learning_rate": 7.927250946253224e-06, + "logits/chosen": 0.7460111379623413, + "logits/rejected": 0.6480680704116821, + "logps/chosen": -0.8033276796340942, + "logps/rejected": -1.2967023849487305, + "loss": 0.8245, + "odds_ratio_loss": 0.5949506163597107, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08033277094364166, + "rewards/margins": 0.0493374727666378, + "rewards/rejected": -0.12967024743556976, + "sft_loss": 0.8033276796340942, + "step": 346 + }, + { + "epoch": 0.5018076644974693, + "grad_norm": 2.8018962595308285, + "learning_rate": 7.926660106096098e-06, + "logits/chosen": 0.6507607698440552, + "logits/rejected": 0.5750927925109863, + "logps/chosen": -0.6767846345901489, + "logps/rejected": -1.9638330936431885, + "loss": 0.8087, + "odds_ratio_loss": 0.6127895712852478, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06767846643924713, + "rewards/margins": 0.12870484590530396, + "rewards/rejected": -0.19638332724571228, + "sft_loss": 0.6767846345901489, + "step": 347 + }, + { + "epoch": 0.5032537960954447, + "grad_norm": 5.415107124571396, + "learning_rate": 7.92606689854097e-06, + "logits/chosen": 0.5848195552825928, + "logits/rejected": 0.52767014503479, + "logps/chosen": -0.9780311584472656, + "logps/rejected": -1.3824843168258667, + "loss": 0.886, + "odds_ratio_loss": 0.5286108255386353, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.09780311584472656, + "rewards/margins": 0.040445320308208466, + "rewards/rejected": -0.13824842870235443, + "sft_loss": 0.9780311584472656, + "step": 348 + }, + { + "epoch": 0.5046999276934201, + "grad_norm": 3.154755409577211, + "learning_rate": 7.925471323945487e-06, + "logits/chosen": 0.6179808378219604, + "logits/rejected": 0.5163712501525879, + "logps/chosen": -0.9327211976051331, + "logps/rejected": -1.4365931749343872, + "loss": 0.9042, + "odds_ratio_loss": 0.6557815074920654, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0932721197605133, + "rewards/margins": 0.050387196242809296, + "rewards/rejected": -0.1436593234539032, + "sft_loss": 0.9327211976051331, + "step": 349 + }, + { + "epoch": 0.5061460592913956, + "grad_norm": 2.6911740599565173, + "learning_rate": 7.924873382668724e-06, + "logits/chosen": 0.6831887364387512, + "logits/rejected": 0.7219184041023254, + "logps/chosen": -0.7544329166412354, + "logps/rejected": -1.539219617843628, + "loss": 0.859, + "odds_ratio_loss": 0.5167781710624695, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07544328272342682, + "rewards/margins": 0.07847868651151657, + "rewards/rejected": -0.153921976685524, + "sft_loss": 0.7544329166412354, + "step": 350 + }, + { + "epoch": 0.5075921908893709, + "grad_norm": 3.156218942574542, + "learning_rate": 7.924273075071177e-06, + "logits/chosen": 0.6295790672302246, + "logits/rejected": 0.4227325916290283, + "logps/chosen": -0.878291666507721, + "logps/rejected": -1.71316659450531, + "loss": 0.8423, + "odds_ratio_loss": 0.48217105865478516, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08782917261123657, + "rewards/margins": 0.08348748087882996, + "rewards/rejected": -0.17131665349006653, + "sft_loss": 0.878291666507721, + "step": 351 + }, + { + "epoch": 0.5090383224873464, + "grad_norm": 4.709022398652423, + "learning_rate": 7.92367040151478e-06, + "logits/chosen": 0.5759553909301758, + "logits/rejected": 0.4466925859451294, + "logps/chosen": -0.7179098129272461, + "logps/rejected": -1.511012315750122, + "loss": 1.0013, + "odds_ratio_loss": 0.5645134449005127, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07179098576307297, + "rewards/margins": 0.07931024581193924, + "rewards/rejected": -0.1511012315750122, + "sft_loss": 0.7179098129272461, + "step": 352 + }, + { + "epoch": 0.5104844540853217, + "grad_norm": 4.067706710450371, + "learning_rate": 7.923065362362885e-06, + "logits/chosen": 0.4464551508426666, + "logits/rejected": 0.48552900552749634, + "logps/chosen": -0.8215190172195435, + "logps/rejected": -2.112861156463623, + "loss": 0.86, + "odds_ratio_loss": 0.49512815475463867, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08215189725160599, + "rewards/margins": 0.12913422286510468, + "rewards/rejected": -0.21128612756729126, + "sft_loss": 0.8215190172195435, + "step": 353 + }, + { + "epoch": 0.5119305856832972, + "grad_norm": 3.7090631226933852, + "learning_rate": 7.922457957980272e-06, + "logits/chosen": 0.5708781480789185, + "logits/rejected": 0.6100287437438965, + "logps/chosen": -0.8479795455932617, + "logps/rejected": -1.83680260181427, + "loss": 0.8505, + "odds_ratio_loss": 0.4224860668182373, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08479795604944229, + "rewards/margins": 0.09888231754302979, + "rewards/rejected": -0.18368026614189148, + "sft_loss": 0.8479795455932617, + "step": 354 + }, + { + "epoch": 0.5133767172812725, + "grad_norm": 2.474186214253982, + "learning_rate": 7.921848188733146e-06, + "logits/chosen": 0.6852070093154907, + "logits/rejected": 0.43174925446510315, + "logps/chosen": -0.6690998077392578, + "logps/rejected": -2.255444288253784, + "loss": 0.8793, + "odds_ratio_loss": 0.4386477470397949, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06690998375415802, + "rewards/margins": 0.15863445401191711, + "rewards/rejected": -0.22554443776607513, + "sft_loss": 0.6690998077392578, + "step": 355 + }, + { + "epoch": 0.514822848879248, + "grad_norm": 2.797892389259021, + "learning_rate": 7.921236054989142e-06, + "logits/chosen": 0.6899568438529968, + "logits/rejected": 0.41177740693092346, + "logps/chosen": -0.713912844657898, + "logps/rejected": -2.1068050861358643, + "loss": 0.8, + "odds_ratio_loss": 0.390377402305603, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07139129191637039, + "rewards/margins": 0.1392892301082611, + "rewards/rejected": -0.2106805145740509, + "sft_loss": 0.713912844657898, + "step": 356 + }, + { + "epoch": 0.5162689804772235, + "grad_norm": 2.9882574854982016, + "learning_rate": 7.920621557117316e-06, + "logits/chosen": 0.6059397459030151, + "logits/rejected": 0.4042537212371826, + "logps/chosen": -0.8969763517379761, + "logps/rejected": -1.3820322751998901, + "loss": 0.8743, + "odds_ratio_loss": 0.5645525455474854, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08969763666391373, + "rewards/margins": 0.04850558564066887, + "rewards/rejected": -0.1382032185792923, + "sft_loss": 0.8969763517379761, + "step": 357 + }, + { + "epoch": 0.5177151120751988, + "grad_norm": 2.9687671798718585, + "learning_rate": 7.92000469548815e-06, + "logits/chosen": 0.6877776384353638, + "logits/rejected": 0.6707451343536377, + "logps/chosen": -0.6282877326011658, + "logps/rejected": -1.3870527744293213, + "loss": 0.8462, + "odds_ratio_loss": 0.42180609703063965, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06282877922058105, + "rewards/margins": 0.07587650418281555, + "rewards/rejected": -0.1387052834033966, + "sft_loss": 0.6282877326011658, + "step": 358 + }, + { + "epoch": 0.5191612436731743, + "grad_norm": 3.1784523709075825, + "learning_rate": 7.919385470473554e-06, + "logits/chosen": 0.6407051682472229, + "logits/rejected": 0.5854879021644592, + "logps/chosen": -0.7749360799789429, + "logps/rejected": -1.426418662071228, + "loss": 0.93, + "odds_ratio_loss": 0.5467308163642883, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07749360799789429, + "rewards/margins": 0.0651482492685318, + "rewards/rejected": -0.1426418572664261, + "sft_loss": 0.7749360799789429, + "step": 359 + }, + { + "epoch": 0.5206073752711496, + "grad_norm": 2.4008458367496455, + "learning_rate": 7.918763882446861e-06, + "logits/chosen": 0.5963618755340576, + "logits/rejected": 0.5182086229324341, + "logps/chosen": -0.8708814382553101, + "logps/rejected": -1.3293944597244263, + "loss": 0.8985, + "odds_ratio_loss": 0.623752236366272, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08708814531564713, + "rewards/margins": 0.04585129767656326, + "rewards/rejected": -0.13293945789337158, + "sft_loss": 0.8708814382553101, + "step": 360 + }, + { + "epoch": 0.5220535068691251, + "grad_norm": 4.443409229829391, + "learning_rate": 7.918139931782827e-06, + "logits/chosen": 0.5416839718818665, + "logits/rejected": 0.5389788746833801, + "logps/chosen": -0.9856581091880798, + "logps/rejected": -1.6269183158874512, + "loss": 1.0153, + "odds_ratio_loss": 0.591167688369751, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09856581687927246, + "rewards/margins": 0.06412601470947266, + "rewards/rejected": -0.16269183158874512, + "sft_loss": 0.9856581091880798, + "step": 361 + }, + { + "epoch": 0.5234996384671005, + "grad_norm": 2.9817466376184916, + "learning_rate": 7.917513618857637e-06, + "logits/chosen": 0.6022881269454956, + "logits/rejected": 0.4377667009830475, + "logps/chosen": -0.9305622577667236, + "logps/rejected": -1.7188301086425781, + "loss": 0.8499, + "odds_ratio_loss": 0.5162808895111084, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09305623173713684, + "rewards/margins": 0.07882677763700485, + "rewards/rejected": -0.1718830019235611, + "sft_loss": 0.9305622577667236, + "step": 362 + }, + { + "epoch": 0.5249457700650759, + "grad_norm": 2.9696420778180666, + "learning_rate": 7.916884944048896e-06, + "logits/chosen": 0.7431566119194031, + "logits/rejected": 0.534424901008606, + "logps/chosen": -0.8032225370407104, + "logps/rejected": -1.4060473442077637, + "loss": 0.8416, + "odds_ratio_loss": 0.554038405418396, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0803222581744194, + "rewards/margins": 0.060282476246356964, + "rewards/rejected": -0.14060473442077637, + "sft_loss": 0.8032225370407104, + "step": 363 + }, + { + "epoch": 0.5263919016630514, + "grad_norm": 3.572425590012433, + "learning_rate": 7.916253907735632e-06, + "logits/chosen": 0.4249388575553894, + "logits/rejected": 0.3504279851913452, + "logps/chosen": -0.7644526958465576, + "logps/rejected": -0.9701670408248901, + "loss": 0.8393, + "odds_ratio_loss": 0.6225709915161133, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07644525915384293, + "rewards/margins": 0.02057143673300743, + "rewards/rejected": -0.09701670706272125, + "sft_loss": 0.7644526958465576, + "step": 364 + }, + { + "epoch": 0.5278380332610267, + "grad_norm": 3.258313489123501, + "learning_rate": 7.915620510298303e-06, + "logits/chosen": 0.4414077401161194, + "logits/rejected": 0.37516123056411743, + "logps/chosen": -0.7575925588607788, + "logps/rejected": -2.3582098484039307, + "loss": 0.8239, + "odds_ratio_loss": 0.36041927337646484, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07575926184654236, + "rewards/margins": 0.16006171703338623, + "rewards/rejected": -0.2358209788799286, + "sft_loss": 0.7575925588607788, + "step": 365 + }, + { + "epoch": 0.5292841648590022, + "grad_norm": 4.082631394266917, + "learning_rate": 7.914984752118785e-06, + "logits/chosen": 0.6929055452346802, + "logits/rejected": 0.5584222078323364, + "logps/chosen": -0.7469485402107239, + "logps/rejected": -2.453676223754883, + "loss": 0.8559, + "odds_ratio_loss": 0.47400549054145813, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07469485700130463, + "rewards/margins": 0.17067277431488037, + "rewards/rejected": -0.2453676164150238, + "sft_loss": 0.7469485402107239, + "step": 366 + }, + { + "epoch": 0.5307302964569776, + "grad_norm": 2.7439308886460094, + "learning_rate": 7.91434663358038e-06, + "logits/chosen": 0.4863385260105133, + "logits/rejected": 0.46878552436828613, + "logps/chosen": -0.6547051668167114, + "logps/rejected": -1.097205400466919, + "loss": 0.8037, + "odds_ratio_loss": 0.4927642345428467, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06547051668167114, + "rewards/margins": 0.04425002634525299, + "rewards/rejected": -0.10972055047750473, + "sft_loss": 0.6547051668167114, + "step": 367 + }, + { + "epoch": 0.532176428054953, + "grad_norm": 3.142695590792442, + "learning_rate": 7.913706155067809e-06, + "logits/chosen": 0.6892445683479309, + "logits/rejected": 0.6153708100318909, + "logps/chosen": -0.7438619136810303, + "logps/rejected": -1.5297504663467407, + "loss": 0.8149, + "odds_ratio_loss": 0.44938647747039795, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07438620179891586, + "rewards/margins": 0.07858884334564209, + "rewards/rejected": -0.15297505259513855, + "sft_loss": 0.7438619136810303, + "step": 368 + }, + { + "epoch": 0.5336225596529284, + "grad_norm": 2.969554883142122, + "learning_rate": 7.913063316967221e-06, + "logits/chosen": 0.6924266815185547, + "logits/rejected": 0.5393475294113159, + "logps/chosen": -0.657360315322876, + "logps/rejected": -1.4955742359161377, + "loss": 0.8055, + "odds_ratio_loss": 0.4052860140800476, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06573603302240372, + "rewards/margins": 0.08382140100002289, + "rewards/rejected": -0.149557426571846, + "sft_loss": 0.657360315322876, + "step": 369 + }, + { + "epoch": 0.5350686912509038, + "grad_norm": 7.132638265040183, + "learning_rate": 7.912418119666187e-06, + "logits/chosen": 0.44325125217437744, + "logits/rejected": 0.4056926965713501, + "logps/chosen": -0.9184268712997437, + "logps/rejected": -1.4103426933288574, + "loss": 0.9277, + "odds_ratio_loss": 0.6221168041229248, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09184268862009048, + "rewards/margins": 0.049191586673259735, + "rewards/rejected": -0.14103427529335022, + "sft_loss": 0.9184268712997437, + "step": 370 + }, + { + "epoch": 0.5365148228488793, + "grad_norm": 2.845008287197183, + "learning_rate": 7.911770563553694e-06, + "logits/chosen": 0.6006841659545898, + "logits/rejected": 0.5208953619003296, + "logps/chosen": -0.6521604061126709, + "logps/rejected": -1.7045867443084717, + "loss": 0.9057, + "odds_ratio_loss": 0.5297222137451172, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06521604210138321, + "rewards/margins": 0.10524262487888336, + "rewards/rejected": -0.17045867443084717, + "sft_loss": 0.6521604061126709, + "step": 371 + }, + { + "epoch": 0.5379609544468547, + "grad_norm": 9.071354400551943, + "learning_rate": 7.911120649020162e-06, + "logits/chosen": 0.4639233350753784, + "logits/rejected": 0.262873113155365, + "logps/chosen": -0.9230935573577881, + "logps/rejected": -1.9133673906326294, + "loss": 0.8553, + "odds_ratio_loss": 0.4418998062610626, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.09230935573577881, + "rewards/margins": 0.09902738034725189, + "rewards/rejected": -0.1913367360830307, + "sft_loss": 0.9230935573577881, + "step": 372 + }, + { + "epoch": 0.5394070860448301, + "grad_norm": 2.9865341827725143, + "learning_rate": 7.910468376457424e-06, + "logits/chosen": 0.7367454767227173, + "logits/rejected": 0.5884082317352295, + "logps/chosen": -0.6778854131698608, + "logps/rejected": -1.1167229413986206, + "loss": 0.7755, + "odds_ratio_loss": 0.4953039288520813, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06778854131698608, + "rewards/margins": 0.043883755803108215, + "rewards/rejected": -0.1116722971200943, + "sft_loss": 0.6778854131698608, + "step": 373 + }, + { + "epoch": 0.5408532176428055, + "grad_norm": 2.718373269169299, + "learning_rate": 7.909813746258738e-06, + "logits/chosen": 0.699302613735199, + "logits/rejected": 0.5751674175262451, + "logps/chosen": -0.8385717868804932, + "logps/rejected": -2.344866991043091, + "loss": 0.8182, + "odds_ratio_loss": 0.4515349268913269, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08385718613862991, + "rewards/margins": 0.15062952041625977, + "rewards/rejected": -0.23448669910430908, + "sft_loss": 0.8385717868804932, + "step": 374 + }, + { + "epoch": 0.5422993492407809, + "grad_norm": 2.7568273854501055, + "learning_rate": 7.909156758818782e-06, + "logits/chosen": 0.6291428208351135, + "logits/rejected": 0.5532090067863464, + "logps/chosen": -0.9137899875640869, + "logps/rejected": -1.4941704273223877, + "loss": 0.7956, + "odds_ratio_loss": 0.6681514978408813, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09137900918722153, + "rewards/margins": 0.058038052171468735, + "rewards/rejected": -0.14941705763339996, + "sft_loss": 0.9137899875640869, + "step": 375 + }, + { + "epoch": 0.5437454808387563, + "grad_norm": 2.910942491569993, + "learning_rate": 7.908497414533658e-06, + "logits/chosen": 0.6253929138183594, + "logits/rejected": 0.4845582842826843, + "logps/chosen": -0.6655830144882202, + "logps/rejected": -1.6559513807296753, + "loss": 0.8689, + "odds_ratio_loss": 0.5183405876159668, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06655830144882202, + "rewards/margins": 0.09903683513402939, + "rewards/rejected": -0.1655951291322708, + "sft_loss": 0.6655830144882202, + "step": 376 + }, + { + "epoch": 0.5451916124367318, + "grad_norm": 2.9429362391795815, + "learning_rate": 7.907835713800883e-06, + "logits/chosen": 0.6014878153800964, + "logits/rejected": 0.5209006071090698, + "logps/chosen": -0.7738833427429199, + "logps/rejected": -1.4064111709594727, + "loss": 0.7655, + "odds_ratio_loss": 0.5940650701522827, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07738833129405975, + "rewards/margins": 0.06325278431177139, + "rewards/rejected": -0.14064112305641174, + "sft_loss": 0.7738833427429199, + "step": 377 + }, + { + "epoch": 0.5466377440347071, + "grad_norm": 2.559432919663948, + "learning_rate": 7.907171657019403e-06, + "logits/chosen": 0.5809941291809082, + "logits/rejected": 0.6483409404754639, + "logps/chosen": -1.0075054168701172, + "logps/rejected": -1.088860273361206, + "loss": 0.8069, + "odds_ratio_loss": 0.6888061761856079, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10075053572654724, + "rewards/margins": 0.008135493844747543, + "rewards/rejected": -0.10888603329658508, + "sft_loss": 1.0075054168701172, + "step": 378 + }, + { + "epoch": 0.5480838756326826, + "grad_norm": 3.50222797522594, + "learning_rate": 7.906505244589581e-06, + "logits/chosen": 0.5703312754631042, + "logits/rejected": 0.46388378739356995, + "logps/chosen": -0.9029073119163513, + "logps/rejected": -1.9283422231674194, + "loss": 0.8891, + "odds_ratio_loss": 0.7023340463638306, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09029072523117065, + "rewards/margins": 0.10254350304603577, + "rewards/rejected": -0.19283424317836761, + "sft_loss": 0.9029073119163513, + "step": 379 + }, + { + "epoch": 0.549530007230658, + "grad_norm": 3.0937027814614506, + "learning_rate": 7.905836476913197e-06, + "logits/chosen": 0.6240139603614807, + "logits/rejected": 0.6435034275054932, + "logps/chosen": -0.7246947884559631, + "logps/rejected": -1.4072096347808838, + "loss": 0.9025, + "odds_ratio_loss": 0.5814554691314697, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07246948033571243, + "rewards/margins": 0.06825148314237595, + "rewards/rejected": -0.14072097837924957, + "sft_loss": 0.7246947884559631, + "step": 380 + }, + { + "epoch": 0.5509761388286334, + "grad_norm": 5.572005139677108, + "learning_rate": 7.905165354393453e-06, + "logits/chosen": 0.5487345457077026, + "logits/rejected": 0.4786183834075928, + "logps/chosen": -1.0790622234344482, + "logps/rejected": -2.643061876296997, + "loss": 0.9764, + "odds_ratio_loss": 0.6008350253105164, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10790622979402542, + "rewards/margins": 0.1563999503850937, + "rewards/rejected": -0.2643061876296997, + "sft_loss": 1.0790622234344482, + "step": 381 + }, + { + "epoch": 0.5524222704266089, + "grad_norm": 2.7876168403419848, + "learning_rate": 7.904491877434973e-06, + "logits/chosen": 0.49730992317199707, + "logits/rejected": 0.5880797505378723, + "logps/chosen": -0.7353624701499939, + "logps/rejected": -1.8596457242965698, + "loss": 0.9641, + "odds_ratio_loss": 0.5217285752296448, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07353624701499939, + "rewards/margins": 0.11242832243442535, + "rewards/rejected": -0.18596458435058594, + "sft_loss": 0.7353624701499939, + "step": 382 + }, + { + "epoch": 0.5538684020245842, + "grad_norm": 3.9163687919179146, + "learning_rate": 7.903816046443798e-06, + "logits/chosen": 0.6902502179145813, + "logits/rejected": 0.5075284242630005, + "logps/chosen": -0.7549622058868408, + "logps/rejected": -1.832808017730713, + "loss": 0.9054, + "odds_ratio_loss": 0.5362709760665894, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07549622654914856, + "rewards/margins": 0.10778456926345825, + "rewards/rejected": -0.1832807958126068, + "sft_loss": 0.7549622058868408, + "step": 383 + }, + { + "epoch": 0.5553145336225597, + "grad_norm": 2.7712943773528322, + "learning_rate": 7.903137861827391e-06, + "logits/chosen": 0.5990560054779053, + "logits/rejected": 0.45573779940605164, + "logps/chosen": -0.8964800238609314, + "logps/rejected": -1.046126127243042, + "loss": 0.9278, + "odds_ratio_loss": 0.6818205118179321, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08964800089597702, + "rewards/margins": 0.014964621514081955, + "rewards/rejected": -0.10461262613534927, + "sft_loss": 0.8964800238609314, + "step": 384 + }, + { + "epoch": 0.556760665220535, + "grad_norm": 4.917169249683361, + "learning_rate": 7.902457323994629e-06, + "logits/chosen": 0.6207394599914551, + "logits/rejected": 0.6121277809143066, + "logps/chosen": -0.8706837892532349, + "logps/rejected": -1.966713547706604, + "loss": 0.9161, + "odds_ratio_loss": 0.6748664379119873, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08706837892532349, + "rewards/margins": 0.10960298031568527, + "rewards/rejected": -0.19667133688926697, + "sft_loss": 0.8706837892532349, + "step": 385 + }, + { + "epoch": 0.5582067968185105, + "grad_norm": 3.0006340407456142, + "learning_rate": 7.901774433355812e-06, + "logits/chosen": 0.7476394176483154, + "logits/rejected": 0.5458194017410278, + "logps/chosen": -0.9039993286132812, + "logps/rejected": -2.477038621902466, + "loss": 0.8978, + "odds_ratio_loss": 0.7108010053634644, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09039993584156036, + "rewards/margins": 0.15730392932891846, + "rewards/rejected": -0.24770388007164001, + "sft_loss": 0.9039993286132812, + "step": 386 + }, + { + "epoch": 0.559652928416486, + "grad_norm": 7.6882135807422705, + "learning_rate": 7.901089190322656e-06, + "logits/chosen": 0.5460501909255981, + "logits/rejected": 0.4279562532901764, + "logps/chosen": -0.9958518147468567, + "logps/rejected": -1.7544076442718506, + "loss": 0.8742, + "odds_ratio_loss": 0.5986472368240356, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09958518296480179, + "rewards/margins": 0.0758555680513382, + "rewards/rejected": -0.17544077336788177, + "sft_loss": 0.9958518147468567, + "step": 387 + }, + { + "epoch": 0.5610990600144613, + "grad_norm": 2.977393496482833, + "learning_rate": 7.900401595308299e-06, + "logits/chosen": 0.5677440166473389, + "logits/rejected": 0.32233792543411255, + "logps/chosen": -0.8183796405792236, + "logps/rejected": -2.062242031097412, + "loss": 0.8109, + "odds_ratio_loss": 0.4541066288948059, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.081837959587574, + "rewards/margins": 0.1243862509727478, + "rewards/rejected": -0.2062242031097412, + "sft_loss": 0.8183796405792236, + "step": 388 + }, + { + "epoch": 0.5625451916124368, + "grad_norm": 2.416176246352007, + "learning_rate": 7.899711648727295e-06, + "logits/chosen": 0.4597778618335724, + "logits/rejected": 0.40555012226104736, + "logps/chosen": -0.8106697797775269, + "logps/rejected": -2.3276708126068115, + "loss": 0.8754, + "odds_ratio_loss": 0.40332669019699097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08106698095798492, + "rewards/margins": 0.15170009434223175, + "rewards/rejected": -0.23276707530021667, + "sft_loss": 0.8106697797775269, + "step": 389 + }, + { + "epoch": 0.5639913232104121, + "grad_norm": 4.587938500883625, + "learning_rate": 7.899019350995612e-06, + "logits/chosen": 0.3854691982269287, + "logits/rejected": 0.32688039541244507, + "logps/chosen": -0.8717947006225586, + "logps/rejected": -2.0538129806518555, + "loss": 0.8744, + "odds_ratio_loss": 0.6730850338935852, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08717946708202362, + "rewards/margins": 0.118201844394207, + "rewards/rejected": -0.20538130402565002, + "sft_loss": 0.8717947006225586, + "step": 390 + }, + { + "epoch": 0.5654374548083876, + "grad_norm": 2.727564894902918, + "learning_rate": 7.89832470253064e-06, + "logits/chosen": 0.44197285175323486, + "logits/rejected": 0.34067657589912415, + "logps/chosen": -0.7974579334259033, + "logps/rejected": -2.4840853214263916, + "loss": 0.8007, + "odds_ratio_loss": 0.5355666875839233, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07974579185247421, + "rewards/margins": 0.16866274178028107, + "rewards/rejected": -0.24840855598449707, + "sft_loss": 0.7974579334259033, + "step": 391 + }, + { + "epoch": 0.5668835864063629, + "grad_norm": 3.9564187454577597, + "learning_rate": 7.89762770375119e-06, + "logits/chosen": 0.5264127850532532, + "logits/rejected": 0.39061862230300903, + "logps/chosen": -0.8429474830627441, + "logps/rejected": -1.5880827903747559, + "loss": 0.8232, + "odds_ratio_loss": 0.725482702255249, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08429475873708725, + "rewards/margins": 0.07451353222131729, + "rewards/rejected": -0.15880829095840454, + "sft_loss": 0.8429474830627441, + "step": 392 + }, + { + "epoch": 0.5683297180043384, + "grad_norm": 3.6040585185208918, + "learning_rate": 7.896928355077477e-06, + "logits/chosen": 0.5754531621932983, + "logits/rejected": 0.374263733625412, + "logps/chosen": -0.7560480833053589, + "logps/rejected": -2.024271011352539, + "loss": 0.8564, + "odds_ratio_loss": 0.48062875866889954, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07560480386018753, + "rewards/margins": 0.12682229280471802, + "rewards/rejected": -0.20242711901664734, + "sft_loss": 0.7560480833053589, + "step": 393 + }, + { + "epoch": 0.5697758496023138, + "grad_norm": 3.2999991845213326, + "learning_rate": 7.896226656931146e-06, + "logits/chosen": 0.548308789730072, + "logits/rejected": 0.45042145252227783, + "logps/chosen": -0.7207891941070557, + "logps/rejected": -2.421163320541382, + "loss": 0.8021, + "odds_ratio_loss": 0.38493600487709045, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07207891345024109, + "rewards/margins": 0.1700374186038971, + "rewards/rejected": -0.24211633205413818, + "sft_loss": 0.7207891941070557, + "step": 394 + }, + { + "epoch": 0.5712219812002892, + "grad_norm": 2.5181280006116515, + "learning_rate": 7.895522609735254e-06, + "logits/chosen": 0.6037847995758057, + "logits/rejected": 0.48602890968322754, + "logps/chosen": -0.7324075102806091, + "logps/rejected": -1.6046706438064575, + "loss": 0.8519, + "odds_ratio_loss": 0.5805519819259644, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07324075698852539, + "rewards/margins": 0.08722630888223648, + "rewards/rejected": -0.16046705842018127, + "sft_loss": 0.7324075102806091, + "step": 395 + }, + { + "epoch": 0.5726681127982647, + "grad_norm": 3.0548062329457064, + "learning_rate": 7.894816213914271e-06, + "logits/chosen": 0.6022318005561829, + "logits/rejected": 0.5420160293579102, + "logps/chosen": -0.8471623659133911, + "logps/rejected": -1.6920993328094482, + "loss": 0.8923, + "odds_ratio_loss": 0.4801836609840393, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08471624553203583, + "rewards/margins": 0.08449369668960571, + "rewards/rejected": -0.16920992732048035, + "sft_loss": 0.8471623659133911, + "step": 396 + }, + { + "epoch": 0.57411424439624, + "grad_norm": 2.4424090387897763, + "learning_rate": 7.894107469894086e-06, + "logits/chosen": 0.5112386345863342, + "logits/rejected": 0.35575413703918457, + "logps/chosen": -0.6903353333473206, + "logps/rejected": -2.252138614654541, + "loss": 0.809, + "odds_ratio_loss": 0.5777711868286133, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06903353333473206, + "rewards/margins": 0.15618032217025757, + "rewards/rejected": -0.22521387040615082, + "sft_loss": 0.6903353333473206, + "step": 397 + }, + { + "epoch": 0.5755603759942155, + "grad_norm": 4.353942299792494, + "learning_rate": 7.893396378102005e-06, + "logits/chosen": 0.4526655972003937, + "logits/rejected": 0.384705126285553, + "logps/chosen": -0.8826476335525513, + "logps/rejected": -2.210721969604492, + "loss": 0.9631, + "odds_ratio_loss": 0.5307878255844116, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08826476335525513, + "rewards/margins": 0.13280744850635529, + "rewards/rejected": -0.22107219696044922, + "sft_loss": 0.8826476335525513, + "step": 398 + }, + { + "epoch": 0.5770065075921909, + "grad_norm": 2.9194418404101197, + "learning_rate": 7.892682938966748e-06, + "logits/chosen": 0.672846257686615, + "logits/rejected": 0.4256134629249573, + "logps/chosen": -0.8224917650222778, + "logps/rejected": -1.2346034049987793, + "loss": 0.885, + "odds_ratio_loss": 0.6036404967308044, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08224917948246002, + "rewards/margins": 0.041211169213056564, + "rewards/rejected": -0.12346035242080688, + "sft_loss": 0.8224917650222778, + "step": 399 + }, + { + "epoch": 0.5784526391901663, + "grad_norm": 3.9247918318506834, + "learning_rate": 7.891967152918447e-06, + "logits/chosen": 0.5552332401275635, + "logits/rejected": 0.4921160042285919, + "logps/chosen": -0.8602915406227112, + "logps/rejected": -2.5308303833007812, + "loss": 0.8544, + "odds_ratio_loss": 0.49064141511917114, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08602915704250336, + "rewards/margins": 0.16705386340618134, + "rewards/rejected": -0.2530830204486847, + "sft_loss": 0.8602915406227112, + "step": 400 + }, + { + "epoch": 0.5798987707881417, + "grad_norm": 2.5180141294825713, + "learning_rate": 7.891249020388656e-06, + "logits/chosen": 0.47350338101387024, + "logits/rejected": 0.4336029291152954, + "logps/chosen": -0.8087297677993774, + "logps/rejected": -1.585395097732544, + "loss": 0.8251, + "odds_ratio_loss": 0.6435711979866028, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08087297528982162, + "rewards/margins": 0.07766654342412949, + "rewards/rejected": -0.15853950381278992, + "sft_loss": 0.8087297677993774, + "step": 401 + }, + { + "epoch": 0.5813449023861171, + "grad_norm": 2.812975306412341, + "learning_rate": 7.890528541810339e-06, + "logits/chosen": 0.4831019937992096, + "logits/rejected": 0.428011029958725, + "logps/chosen": -0.7527783513069153, + "logps/rejected": -1.017602562904358, + "loss": 0.85, + "odds_ratio_loss": 0.5688704252243042, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07527783513069153, + "rewards/margins": 0.02648242563009262, + "rewards/rejected": -0.10176026821136475, + "sft_loss": 0.7527783513069153, + "step": 402 + }, + { + "epoch": 0.5827910339840926, + "grad_norm": 3.6350799321420135, + "learning_rate": 7.889805717617872e-06, + "logits/chosen": 0.5826254487037659, + "logits/rejected": 0.5473474264144897, + "logps/chosen": -0.7726538181304932, + "logps/rejected": -2.026120185852051, + "loss": 0.815, + "odds_ratio_loss": 0.525509238243103, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07726538181304932, + "rewards/margins": 0.12534663081169128, + "rewards/rejected": -0.2026120275259018, + "sft_loss": 0.7726538181304932, + "step": 403 + }, + { + "epoch": 0.584237165582068, + "grad_norm": 2.955665410682994, + "learning_rate": 7.889080548247051e-06, + "logits/chosen": 0.4878218173980713, + "logits/rejected": 0.4596579372882843, + "logps/chosen": -0.6897256374359131, + "logps/rejected": -1.9527850151062012, + "loss": 0.8417, + "odds_ratio_loss": 0.42767712473869324, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06897257268428802, + "rewards/margins": 0.1263059377670288, + "rewards/rejected": -0.19527849555015564, + "sft_loss": 0.6897256374359131, + "step": 404 + }, + { + "epoch": 0.5856832971800434, + "grad_norm": 3.3958916438819085, + "learning_rate": 7.888353034135084e-06, + "logits/chosen": 0.4562031328678131, + "logits/rejected": 0.35089221596717834, + "logps/chosen": -0.8547066450119019, + "logps/rejected": -2.9385735988616943, + "loss": 0.794, + "odds_ratio_loss": 0.45870649814605713, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08547066152095795, + "rewards/margins": 0.20838668942451477, + "rewards/rejected": -0.2938573658466339, + "sft_loss": 0.8547066450119019, + "step": 405 + }, + { + "epoch": 0.5871294287780188, + "grad_norm": 2.851949471321928, + "learning_rate": 7.88762317572059e-06, + "logits/chosen": 0.4243275225162506, + "logits/rejected": 0.452597975730896, + "logps/chosen": -0.7892992496490479, + "logps/rejected": -1.2699556350708008, + "loss": 0.7885, + "odds_ratio_loss": 0.5196892023086548, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07892993092536926, + "rewards/margins": 0.048065636307001114, + "rewards/rejected": -0.12699556350708008, + "sft_loss": 0.7892992496490479, + "step": 406 + }, + { + "epoch": 0.5885755603759942, + "grad_norm": 4.356841434270988, + "learning_rate": 7.886890973443606e-06, + "logits/chosen": 0.4870211184024811, + "logits/rejected": 0.36551669239997864, + "logps/chosen": -0.781580924987793, + "logps/rejected": -2.3665194511413574, + "loss": 0.7901, + "odds_ratio_loss": 0.4650033414363861, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07815809547901154, + "rewards/margins": 0.15849386155605316, + "rewards/rejected": -0.2366519570350647, + "sft_loss": 0.781580924987793, + "step": 407 + }, + { + "epoch": 0.5900216919739696, + "grad_norm": 3.1252881359207123, + "learning_rate": 7.886156427745576e-06, + "logits/chosen": 0.5762686729431152, + "logits/rejected": 0.41852471232414246, + "logps/chosen": -0.7866231203079224, + "logps/rejected": -1.6717575788497925, + "loss": 0.844, + "odds_ratio_loss": 0.5896944999694824, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07866232097148895, + "rewards/margins": 0.08851346373558044, + "rewards/rejected": -0.1671757698059082, + "sft_loss": 0.7866231203079224, + "step": 408 + }, + { + "epoch": 0.591467823571945, + "grad_norm": 3.24125061976931, + "learning_rate": 7.885419539069362e-06, + "logits/chosen": 0.5813404321670532, + "logits/rejected": 0.3938213884830475, + "logps/chosen": -0.7569571137428284, + "logps/rejected": -2.2225003242492676, + "loss": 0.8635, + "odds_ratio_loss": 0.41640591621398926, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0756957158446312, + "rewards/margins": 0.14655432105064392, + "rewards/rejected": -0.2222500443458557, + "sft_loss": 0.7569571137428284, + "step": 409 + }, + { + "epoch": 0.5929139551699205, + "grad_norm": 2.8346587836313746, + "learning_rate": 7.884680307859237e-06, + "logits/chosen": 0.7509294152259827, + "logits/rejected": 0.6562870144844055, + "logps/chosen": -0.6297463178634644, + "logps/rejected": -2.454871416091919, + "loss": 0.7754, + "odds_ratio_loss": 0.375518798828125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06297463178634644, + "rewards/margins": 0.18251252174377441, + "rewards/rejected": -0.24548715353012085, + "sft_loss": 0.6297463178634644, + "step": 410 + }, + { + "epoch": 0.5943600867678959, + "grad_norm": 3.222409965720389, + "learning_rate": 7.883938734560888e-06, + "logits/chosen": 0.5145249366760254, + "logits/rejected": 0.37488415837287903, + "logps/chosen": -0.7391336560249329, + "logps/rejected": -1.8347660303115845, + "loss": 0.861, + "odds_ratio_loss": 0.5218417048454285, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07391335815191269, + "rewards/margins": 0.10956324636936188, + "rewards/rejected": -0.18347659707069397, + "sft_loss": 0.7391336560249329, + "step": 411 + }, + { + "epoch": 0.5958062183658713, + "grad_norm": 3.965379970096697, + "learning_rate": 7.88319481962141e-06, + "logits/chosen": 0.5220549702644348, + "logits/rejected": 0.38239559531211853, + "logps/chosen": -0.7255507111549377, + "logps/rejected": -2.4738118648529053, + "loss": 0.8449, + "odds_ratio_loss": 0.4144716262817383, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0725550651550293, + "rewards/margins": 0.17482611536979675, + "rewards/rejected": -0.24738118052482605, + "sft_loss": 0.7255507111549377, + "step": 412 + }, + { + "epoch": 0.5972523499638467, + "grad_norm": 4.452072359820141, + "learning_rate": 7.882448563489313e-06, + "logits/chosen": 0.4620935916900635, + "logits/rejected": 0.4356992542743683, + "logps/chosen": -0.8362157344818115, + "logps/rejected": -2.1093122959136963, + "loss": 0.8182, + "odds_ratio_loss": 0.5006383061408997, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08362157642841339, + "rewards/margins": 0.127309650182724, + "rewards/rejected": -0.21093124151229858, + "sft_loss": 0.8362157344818115, + "step": 413 + }, + { + "epoch": 0.5986984815618221, + "grad_norm": 4.629463579221466, + "learning_rate": 7.881699966614516e-06, + "logits/chosen": 0.4516201913356781, + "logits/rejected": 0.45159292221069336, + "logps/chosen": -0.6810017824172974, + "logps/rejected": -2.307833671569824, + "loss": 0.8887, + "odds_ratio_loss": 0.4952140748500824, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06810016930103302, + "rewards/margins": 0.16268320381641388, + "rewards/rejected": -0.2307833731174469, + "sft_loss": 0.6810017824172974, + "step": 414 + }, + { + "epoch": 0.6001446131597975, + "grad_norm": 3.2811814411404003, + "learning_rate": 7.880949029448352e-06, + "logits/chosen": 0.47481125593185425, + "logits/rejected": 0.399766206741333, + "logps/chosen": -0.6239845156669617, + "logps/rejected": -2.5231199264526367, + "loss": 0.7472, + "odds_ratio_loss": 0.36092132329940796, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06239845231175423, + "rewards/margins": 0.1899135261774063, + "rewards/rejected": -0.2523120045661926, + "sft_loss": 0.6239845156669617, + "step": 415 + }, + { + "epoch": 0.601590744757773, + "grad_norm": 3.910382755288589, + "learning_rate": 7.880195752443566e-06, + "logits/chosen": 0.40132609009742737, + "logits/rejected": 0.2246161550283432, + "logps/chosen": -0.9353955984115601, + "logps/rejected": -2.6664748191833496, + "loss": 0.8951, + "odds_ratio_loss": 0.38120150566101074, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09353956580162048, + "rewards/margins": 0.17310792207717896, + "rewards/rejected": -0.26664748787879944, + "sft_loss": 0.9353955984115601, + "step": 416 + }, + { + "epoch": 0.6030368763557483, + "grad_norm": 3.83453742589943, + "learning_rate": 7.879440136054307e-06, + "logits/chosen": 0.516146719455719, + "logits/rejected": 0.4591212272644043, + "logps/chosen": -0.7768975496292114, + "logps/rejected": -3.070139169692993, + "loss": 0.8451, + "odds_ratio_loss": 0.453329861164093, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0776897519826889, + "rewards/margins": 0.22932417690753937, + "rewards/rejected": -0.30701392889022827, + "sft_loss": 0.7768975496292114, + "step": 417 + }, + { + "epoch": 0.6044830079537238, + "grad_norm": 3.8217120040188464, + "learning_rate": 7.878682180736142e-06, + "logits/chosen": 0.739425778388977, + "logits/rejected": 0.5814308524131775, + "logps/chosen": -0.8185400366783142, + "logps/rejected": -1.45823073387146, + "loss": 0.7712, + "odds_ratio_loss": 0.6225258111953735, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08185401558876038, + "rewards/margins": 0.06396905332803726, + "rewards/rejected": -0.14582306146621704, + "sft_loss": 0.8185400366783142, + "step": 418 + }, + { + "epoch": 0.6059291395516992, + "grad_norm": 3.184411206872811, + "learning_rate": 7.877921886946046e-06, + "logits/chosen": 0.41738444566726685, + "logits/rejected": 0.22434985637664795, + "logps/chosen": -0.9352853298187256, + "logps/rejected": -2.829294443130493, + "loss": 0.889, + "odds_ratio_loss": 0.4922487139701843, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09352853894233704, + "rewards/margins": 0.18940091133117676, + "rewards/rejected": -0.2829294502735138, + "sft_loss": 0.9352853298187256, + "step": 419 + }, + { + "epoch": 0.6073752711496746, + "grad_norm": 2.658817199141513, + "learning_rate": 7.8771592551424e-06, + "logits/chosen": 0.5982671976089478, + "logits/rejected": 0.4165036082267761, + "logps/chosen": -0.7658669948577881, + "logps/rejected": -1.7320560216903687, + "loss": 0.8374, + "odds_ratio_loss": 0.4825802743434906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07658669352531433, + "rewards/margins": 0.09661891311407089, + "rewards/rejected": -0.17320561408996582, + "sft_loss": 0.7658669948577881, + "step": 420 + }, + { + "epoch": 0.6088214027476501, + "grad_norm": 3.345391828041502, + "learning_rate": 7.876394285785e-06, + "logits/chosen": 0.5503162145614624, + "logits/rejected": 0.34399694204330444, + "logps/chosen": -0.7347284555435181, + "logps/rejected": -2.7232933044433594, + "loss": 0.7884, + "odds_ratio_loss": 0.4280637502670288, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07347285747528076, + "rewards/margins": 0.19885647296905518, + "rewards/rejected": -0.27232933044433594, + "sft_loss": 0.7347284555435181, + "step": 421 + }, + { + "epoch": 0.6102675343456254, + "grad_norm": 3.386750433238105, + "learning_rate": 7.875626979335047e-06, + "logits/chosen": 0.4964909255504608, + "logits/rejected": 0.4483870267868042, + "logps/chosen": -0.8750501275062561, + "logps/rejected": -1.7626843452453613, + "loss": 0.7978, + "odds_ratio_loss": 0.5853959918022156, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08750501275062561, + "rewards/margins": 0.08876340091228485, + "rewards/rejected": -0.17626842856407166, + "sft_loss": 0.8750501275062561, + "step": 422 + }, + { + "epoch": 0.6117136659436009, + "grad_norm": 2.4670916895507005, + "learning_rate": 7.874857336255153e-06, + "logits/chosen": 0.5551592707633972, + "logits/rejected": 0.2982563376426697, + "logps/chosen": -0.6951935291290283, + "logps/rejected": -4.001288890838623, + "loss": 0.7655, + "odds_ratio_loss": 0.5873510837554932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06951935589313507, + "rewards/margins": 0.330609530210495, + "rewards/rejected": -0.40012890100479126, + "sft_loss": 0.6951935291290283, + "step": 423 + }, + { + "epoch": 0.6131597975415762, + "grad_norm": 2.8129321951372703, + "learning_rate": 7.874085357009341e-06, + "logits/chosen": 0.6084282994270325, + "logits/rejected": 0.42167001962661743, + "logps/chosen": -0.6516855955123901, + "logps/rejected": -1.563461184501648, + "loss": 0.8, + "odds_ratio_loss": 0.40489715337753296, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06516855955123901, + "rewards/margins": 0.0911775529384613, + "rewards/rejected": -0.15634611248970032, + "sft_loss": 0.6516855955123901, + "step": 424 + }, + { + "epoch": 0.6146059291395517, + "grad_norm": 3.283032567196892, + "learning_rate": 7.873311042063038e-06, + "logits/chosen": 0.554405927658081, + "logits/rejected": 0.3872930705547333, + "logps/chosen": -0.6961603164672852, + "logps/rejected": -2.4004769325256348, + "loss": 0.8816, + "odds_ratio_loss": 0.4127447307109833, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06961603462696075, + "rewards/margins": 0.17043164372444153, + "rewards/rejected": -0.24004767835140228, + "sft_loss": 0.6961603164672852, + "step": 425 + }, + { + "epoch": 0.6160520607375272, + "grad_norm": 2.820835695498693, + "learning_rate": 7.872534391883082e-06, + "logits/chosen": 0.6225115656852722, + "logits/rejected": 0.5951453447341919, + "logps/chosen": -0.780193567276001, + "logps/rejected": -1.2512143850326538, + "loss": 0.9153, + "odds_ratio_loss": 0.5581642389297485, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07801935821771622, + "rewards/margins": 0.04710208252072334, + "rewards/rejected": -0.12512142956256866, + "sft_loss": 0.780193567276001, + "step": 426 + }, + { + "epoch": 0.6174981923355025, + "grad_norm": 2.9845279410385177, + "learning_rate": 7.87175540693772e-06, + "logits/chosen": 0.5121057629585266, + "logits/rejected": 0.4706208109855652, + "logps/chosen": -0.8175680637359619, + "logps/rejected": -1.6420925855636597, + "loss": 0.86, + "odds_ratio_loss": 0.5938406586647034, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08175680041313171, + "rewards/margins": 0.08245246112346649, + "rewards/rejected": -0.1642092615365982, + "sft_loss": 0.8175680637359619, + "step": 427 + }, + { + "epoch": 0.618944323933478, + "grad_norm": 2.756053493228294, + "learning_rate": 7.870974087696601e-06, + "logits/chosen": 0.5588035583496094, + "logits/rejected": 0.4831167459487915, + "logps/chosen": -0.7480887770652771, + "logps/rejected": -1.4949398040771484, + "loss": 0.8318, + "odds_ratio_loss": 0.6169903874397278, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07480888068675995, + "rewards/margins": 0.07468511164188385, + "rewards/rejected": -0.1494939923286438, + "sft_loss": 0.7480887770652771, + "step": 428 + }, + { + "epoch": 0.6203904555314533, + "grad_norm": 6.440433342716773, + "learning_rate": 7.870190434630788e-06, + "logits/chosen": 0.5738497972488403, + "logits/rejected": 0.48213326930999756, + "logps/chosen": -0.9667487740516663, + "logps/rejected": -1.965078353881836, + "loss": 0.9166, + "odds_ratio_loss": 0.6497290134429932, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09667487442493439, + "rewards/margins": 0.0998329371213913, + "rewards/rejected": -0.19650781154632568, + "sft_loss": 0.9667487740516663, + "step": 429 + }, + { + "epoch": 0.6218365871294288, + "grad_norm": 5.637365479715586, + "learning_rate": 7.869404448212748e-06, + "logits/chosen": 0.5685498118400574, + "logits/rejected": 0.39901280403137207, + "logps/chosen": -0.9792392253875732, + "logps/rejected": -1.1231156587600708, + "loss": 0.873, + "odds_ratio_loss": 0.6865856647491455, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09792391955852509, + "rewards/margins": 0.014387642964720726, + "rewards/rejected": -0.11231156438589096, + "sft_loss": 0.9792392253875732, + "step": 430 + }, + { + "epoch": 0.6232827187274042, + "grad_norm": 3.2594522263553385, + "learning_rate": 7.868616128916355e-06, + "logits/chosen": 0.4767334461212158, + "logits/rejected": 0.5127293467521667, + "logps/chosen": -0.6846072673797607, + "logps/rejected": -1.3331453800201416, + "loss": 0.7829, + "odds_ratio_loss": 0.6704195737838745, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06846071779727936, + "rewards/margins": 0.06485381722450256, + "rewards/rejected": -0.13331454992294312, + "sft_loss": 0.6846072673797607, + "step": 431 + }, + { + "epoch": 0.6247288503253796, + "grad_norm": 12.94226676394391, + "learning_rate": 7.86782547721689e-06, + "logits/chosen": 0.3910233974456787, + "logits/rejected": 0.2703205645084381, + "logps/chosen": -1.022456169128418, + "logps/rejected": -2.576681137084961, + "loss": 1.0595, + "odds_ratio_loss": 0.5771183967590332, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10224561393260956, + "rewards/margins": 0.15542250871658325, + "rewards/rejected": -0.257668137550354, + "sft_loss": 1.022456169128418, + "step": 432 + }, + { + "epoch": 0.6261749819233551, + "grad_norm": 4.258235258049596, + "learning_rate": 7.867032493591039e-06, + "logits/chosen": 0.6012907028198242, + "logits/rejected": 0.4136401116847992, + "logps/chosen": -0.7054803967475891, + "logps/rejected": -2.6366329193115234, + "loss": 0.8079, + "odds_ratio_loss": 0.5335126519203186, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07054804265499115, + "rewards/margins": 0.1931152641773224, + "rewards/rejected": -0.26366329193115234, + "sft_loss": 0.7054803967475891, + "step": 433 + }, + { + "epoch": 0.6276211135213304, + "grad_norm": 3.2881085900599967, + "learning_rate": 7.866237178516895e-06, + "logits/chosen": 0.45769360661506653, + "logits/rejected": 0.4603123664855957, + "logps/chosen": -0.828456461429596, + "logps/rejected": -2.0454816818237305, + "loss": 0.863, + "odds_ratio_loss": 0.564410924911499, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08284565061330795, + "rewards/margins": 0.12170251458883286, + "rewards/rejected": -0.2045481652021408, + "sft_loss": 0.828456461429596, + "step": 434 + }, + { + "epoch": 0.6290672451193059, + "grad_norm": 2.9543081301550123, + "learning_rate": 7.865439532473956e-06, + "logits/chosen": 0.40643981099128723, + "logits/rejected": 0.35865318775177, + "logps/chosen": -0.9577921032905579, + "logps/rejected": -2.695612668991089, + "loss": 0.8432, + "odds_ratio_loss": 0.48533615469932556, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09577921777963638, + "rewards/margins": 0.173782080411911, + "rewards/rejected": -0.2695612907409668, + "sft_loss": 0.9577921032905579, + "step": 435 + }, + { + "epoch": 0.6305133767172812, + "grad_norm": 3.2712972136551715, + "learning_rate": 7.864639555943128e-06, + "logits/chosen": 0.6304845213890076, + "logits/rejected": 0.3566408157348633, + "logps/chosen": -0.8317915201187134, + "logps/rejected": -3.221254348754883, + "loss": 0.7553, + "odds_ratio_loss": 0.5319594740867615, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08317915350198746, + "rewards/margins": 0.23894628882408142, + "rewards/rejected": -0.3221254348754883, + "sft_loss": 0.8317915201187134, + "step": 436 + }, + { + "epoch": 0.6319595083152567, + "grad_norm": 6.796669067502768, + "learning_rate": 7.863837249406717e-06, + "logits/chosen": 0.4694080352783203, + "logits/rejected": 0.4095763862133026, + "logps/chosen": -0.8717098832130432, + "logps/rejected": -1.5735933780670166, + "loss": 0.8834, + "odds_ratio_loss": 0.6717262268066406, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08717098832130432, + "rewards/margins": 0.07018835842609406, + "rewards/rejected": -0.15735936164855957, + "sft_loss": 0.8717098832130432, + "step": 437 + }, + { + "epoch": 0.6334056399132321, + "grad_norm": 2.795098582170022, + "learning_rate": 7.86303261334844e-06, + "logits/chosen": 0.4883936941623688, + "logits/rejected": 0.4044135510921478, + "logps/chosen": -0.7085312604904175, + "logps/rejected": -3.2201850414276123, + "loss": 0.8154, + "odds_ratio_loss": 0.5115565061569214, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07085312902927399, + "rewards/margins": 0.25116536021232605, + "rewards/rejected": -0.32201850414276123, + "sft_loss": 0.7085312604904175, + "step": 438 + }, + { + "epoch": 0.6348517715112075, + "grad_norm": 3.1284776560367185, + "learning_rate": 7.86222564825341e-06, + "logits/chosen": 0.47537481784820557, + "logits/rejected": 0.49767816066741943, + "logps/chosen": -0.9836025834083557, + "logps/rejected": -1.391028642654419, + "loss": 0.906, + "odds_ratio_loss": 0.7164785265922546, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09836025536060333, + "rewards/margins": 0.04074261337518692, + "rewards/rejected": -0.13910287618637085, + "sft_loss": 0.9836025834083557, + "step": 439 + }, + { + "epoch": 0.6362979031091829, + "grad_norm": 2.8437761782591036, + "learning_rate": 7.861416354608154e-06, + "logits/chosen": 0.4136395752429962, + "logits/rejected": 0.320941299200058, + "logps/chosen": -0.8693005442619324, + "logps/rejected": -1.1584815979003906, + "loss": 0.8628, + "odds_ratio_loss": 0.5910212993621826, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.086930051445961, + "rewards/margins": 0.02891811728477478, + "rewards/rejected": -0.11584816873073578, + "sft_loss": 0.8693005442619324, + "step": 440 + }, + { + "epoch": 0.6377440347071583, + "grad_norm": 4.251716982329854, + "learning_rate": 7.860604732900595e-06, + "logits/chosen": 0.761461615562439, + "logits/rejected": 0.5725336074829102, + "logps/chosen": -0.6326714754104614, + "logps/rejected": -2.821256637573242, + "loss": 0.8185, + "odds_ratio_loss": 0.3858832120895386, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06326714903116226, + "rewards/margins": 0.2188585251569748, + "rewards/rejected": -0.28212568163871765, + "sft_loss": 0.6326714754104614, + "step": 441 + }, + { + "epoch": 0.6391901663051338, + "grad_norm": 2.983907575956472, + "learning_rate": 7.859790783620066e-06, + "logits/chosen": 0.4991647005081177, + "logits/rejected": 0.37928247451782227, + "logps/chosen": -0.757746160030365, + "logps/rejected": -2.254284620285034, + "loss": 0.8131, + "odds_ratio_loss": 0.43578675389289856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07577462494373322, + "rewards/margins": 0.1496538519859314, + "rewards/rejected": -0.2254284769296646, + "sft_loss": 0.757746160030365, + "step": 442 + }, + { + "epoch": 0.6406362979031092, + "grad_norm": 3.510678850486274, + "learning_rate": 7.858974507257298e-06, + "logits/chosen": 0.45033857226371765, + "logits/rejected": 0.356212854385376, + "logps/chosen": -0.8245335817337036, + "logps/rejected": -2.268136978149414, + "loss": 0.852, + "odds_ratio_loss": 0.5200464129447937, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08245337009429932, + "rewards/margins": 0.14436031877994537, + "rewards/rejected": -0.2268136888742447, + "sft_loss": 0.8245335817337036, + "step": 443 + }, + { + "epoch": 0.6420824295010846, + "grad_norm": 2.7353859290225033, + "learning_rate": 7.858155904304427e-06, + "logits/chosen": 0.4843840003013611, + "logits/rejected": 0.4002265930175781, + "logps/chosen": -0.9183147549629211, + "logps/rejected": -1.4296321868896484, + "loss": 0.8456, + "odds_ratio_loss": 0.7742018103599548, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09183148294687271, + "rewards/margins": 0.05113174393773079, + "rewards/rejected": -0.1429632306098938, + "sft_loss": 0.9183147549629211, + "step": 444 + }, + { + "epoch": 0.64352856109906, + "grad_norm": 3.0770167286409973, + "learning_rate": 7.85733497525499e-06, + "logits/chosen": 0.4947357177734375, + "logits/rejected": 0.4650039076805115, + "logps/chosen": -0.8962004780769348, + "logps/rejected": -1.226360559463501, + "loss": 0.8767, + "odds_ratio_loss": 0.5764116048812866, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08962005376815796, + "rewards/margins": 0.03301601484417915, + "rewards/rejected": -0.12263606488704681, + "sft_loss": 0.8962004780769348, + "step": 445 + }, + { + "epoch": 0.6449746926970354, + "grad_norm": 2.87017794293484, + "learning_rate": 7.856511720603932e-06, + "logits/chosen": 0.390936017036438, + "logits/rejected": 0.3944934606552124, + "logps/chosen": -0.8416546583175659, + "logps/rejected": -1.379352331161499, + "loss": 0.8475, + "odds_ratio_loss": 0.5354892611503601, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08416546881198883, + "rewards/margins": 0.053769778460264206, + "rewards/rejected": -0.13793525099754333, + "sft_loss": 0.8416546583175659, + "step": 446 + }, + { + "epoch": 0.6464208242950108, + "grad_norm": 4.173759451782166, + "learning_rate": 7.855686140847595e-06, + "logits/chosen": 0.7082953453063965, + "logits/rejected": 0.3387436270713806, + "logps/chosen": -0.8564531207084656, + "logps/rejected": -2.3805789947509766, + "loss": 0.7679, + "odds_ratio_loss": 0.4507424533367157, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08564531058073044, + "rewards/margins": 0.15241259336471558, + "rewards/rejected": -0.2380579113960266, + "sft_loss": 0.8564531207084656, + "step": 447 + }, + { + "epoch": 0.6478669558929863, + "grad_norm": 2.8517830713037955, + "learning_rate": 7.854858236483722e-06, + "logits/chosen": 0.593734085559845, + "logits/rejected": 0.31093043088912964, + "logps/chosen": -0.7490995526313782, + "logps/rejected": -2.389633893966675, + "loss": 0.8676, + "odds_ratio_loss": 0.5279134511947632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07490995526313782, + "rewards/margins": 0.16405344009399414, + "rewards/rejected": -0.23896339535713196, + "sft_loss": 0.7490995526313782, + "step": 448 + }, + { + "epoch": 0.6493130874909617, + "grad_norm": 3.821681729479148, + "learning_rate": 7.854028008011463e-06, + "logits/chosen": 0.7026189565658569, + "logits/rejected": 0.4407723546028137, + "logps/chosen": -0.7633951902389526, + "logps/rejected": -2.1326398849487305, + "loss": 0.7899, + "odds_ratio_loss": 0.4903205335140228, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07633952051401138, + "rewards/margins": 0.13692447543144226, + "rewards/rejected": -0.21326400339603424, + "sft_loss": 0.7633951902389526, + "step": 449 + }, + { + "epoch": 0.6507592190889371, + "grad_norm": 5.133699682302873, + "learning_rate": 7.853195455931362e-06, + "logits/chosen": 0.642242431640625, + "logits/rejected": 0.5476385951042175, + "logps/chosen": -0.7822733521461487, + "logps/rejected": -1.8922646045684814, + "loss": 0.8471, + "odds_ratio_loss": 0.4665636718273163, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07822733372449875, + "rewards/margins": 0.11099912226200104, + "rewards/rejected": -0.18922646343708038, + "sft_loss": 0.7822733521461487, + "step": 450 + }, + { + "epoch": 0.6522053506869125, + "grad_norm": 2.4945263473256696, + "learning_rate": 7.85236058074537e-06, + "logits/chosen": 0.43350091576576233, + "logits/rejected": 0.348577082157135, + "logps/chosen": -0.7209492921829224, + "logps/rejected": -2.9272561073303223, + "loss": 0.7655, + "odds_ratio_loss": 0.4050930142402649, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07209491729736328, + "rewards/margins": 0.2206306755542755, + "rewards/rejected": -0.2927256226539612, + "sft_loss": 0.7209492921829224, + "step": 451 + }, + { + "epoch": 0.6536514822848879, + "grad_norm": 3.1603284424366227, + "learning_rate": 7.851523382956839e-06, + "logits/chosen": 0.685008704662323, + "logits/rejected": 0.41446003317832947, + "logps/chosen": -0.6122561693191528, + "logps/rejected": -2.7564573287963867, + "loss": 0.8216, + "odds_ratio_loss": 0.4269762635231018, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.061225611716508865, + "rewards/margins": 0.2144201248884201, + "rewards/rejected": -0.27564573287963867, + "sft_loss": 0.6122561693191528, + "step": 452 + }, + { + "epoch": 0.6550976138828634, + "grad_norm": 2.5123215280417264, + "learning_rate": 7.850683863070513e-06, + "logits/chosen": 0.5416068434715271, + "logits/rejected": 0.39375370740890503, + "logps/chosen": -0.8951384425163269, + "logps/rejected": -2.091689109802246, + "loss": 0.7807, + "odds_ratio_loss": 0.598831832408905, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08951384574174881, + "rewards/margins": 0.1196550726890564, + "rewards/rejected": -0.2091689109802246, + "sft_loss": 0.8951384425163269, + "step": 453 + }, + { + "epoch": 0.6565437454808387, + "grad_norm": 2.737876764066823, + "learning_rate": 7.849842021592546e-06, + "logits/chosen": 0.3288041353225708, + "logits/rejected": 0.3482604920864105, + "logps/chosen": -0.9502256512641907, + "logps/rejected": -1.9984798431396484, + "loss": 0.9195, + "odds_ratio_loss": 0.6456742882728577, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09502257406711578, + "rewards/margins": 0.10482542961835861, + "rewards/rejected": -0.1998479962348938, + "sft_loss": 0.9502256512641907, + "step": 454 + }, + { + "epoch": 0.6579898770788142, + "grad_norm": 3.869771792844011, + "learning_rate": 7.848997859030484e-06, + "logits/chosen": 0.4661051034927368, + "logits/rejected": 0.3330320119857788, + "logps/chosen": -0.8484359979629517, + "logps/rejected": -2.905519485473633, + "loss": 0.9405, + "odds_ratio_loss": 0.4373171925544739, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08484360575675964, + "rewards/margins": 0.2057083249092102, + "rewards/rejected": -0.29055193066596985, + "sft_loss": 0.8484359979629517, + "step": 455 + }, + { + "epoch": 0.6594360086767896, + "grad_norm": 3.8838637117754558, + "learning_rate": 7.84815137589328e-06, + "logits/chosen": 0.4900085926055908, + "logits/rejected": 0.449086457490921, + "logps/chosen": -0.915755033493042, + "logps/rejected": -1.9004579782485962, + "loss": 0.9095, + "odds_ratio_loss": 0.5265486836433411, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0915755107998848, + "rewards/margins": 0.0984703004360199, + "rewards/rejected": -0.1900458037853241, + "sft_loss": 0.915755033493042, + "step": 456 + }, + { + "epoch": 0.660882140274765, + "grad_norm": 2.8946868825853342, + "learning_rate": 7.847302572691277e-06, + "logits/chosen": 0.5905077457427979, + "logits/rejected": 0.3706495761871338, + "logps/chosen": -0.7773427963256836, + "logps/rejected": -2.9122698307037354, + "loss": 0.7367, + "odds_ratio_loss": 0.49745047092437744, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07773428410291672, + "rewards/margins": 0.21349269151687622, + "rewards/rejected": -0.29122695326805115, + "sft_loss": 0.7773427963256836, + "step": 457 + }, + { + "epoch": 0.6623282718727405, + "grad_norm": 2.6139847805230563, + "learning_rate": 7.846451449936224e-06, + "logits/chosen": 0.4387049078941345, + "logits/rejected": 0.3426019549369812, + "logps/chosen": -0.7675788402557373, + "logps/rejected": -1.365877389907837, + "loss": 0.7461, + "odds_ratio_loss": 0.5597048997879028, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07675788551568985, + "rewards/margins": 0.05982984974980354, + "rewards/rejected": -0.1365877389907837, + "sft_loss": 0.7675788402557373, + "step": 458 + }, + { + "epoch": 0.6637744034707158, + "grad_norm": 2.6779398241996044, + "learning_rate": 7.845598008141267e-06, + "logits/chosen": 0.4579883813858032, + "logits/rejected": 0.3316115438938141, + "logps/chosen": -0.6582392454147339, + "logps/rejected": -3.040428638458252, + "loss": 0.7372, + "odds_ratio_loss": 0.4125193953514099, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06582392752170563, + "rewards/margins": 0.2382189780473709, + "rewards/rejected": -0.30404287576675415, + "sft_loss": 0.6582392454147339, + "step": 459 + }, + { + "epoch": 0.6652205350686913, + "grad_norm": 2.482458627766373, + "learning_rate": 7.844742247820949e-06, + "logits/chosen": 0.5820556282997131, + "logits/rejected": 0.38141798973083496, + "logps/chosen": -0.6435836553573608, + "logps/rejected": -3.4378960132598877, + "loss": 0.7705, + "odds_ratio_loss": 0.36722418665885925, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06435836851596832, + "rewards/margins": 0.2794312536716461, + "rewards/rejected": -0.34378957748413086, + "sft_loss": 0.6435836553573608, + "step": 460 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 3.849254654154496, + "learning_rate": 7.843884169491209e-06, + "logits/chosen": 0.5479345917701721, + "logits/rejected": 0.38080063462257385, + "logps/chosen": -0.8304556608200073, + "logps/rejected": -1.7188962697982788, + "loss": 0.88, + "odds_ratio_loss": 0.5580796599388123, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08304556459188461, + "rewards/margins": 0.08884406089782715, + "rewards/rejected": -0.17188964784145355, + "sft_loss": 0.8304556608200073, + "step": 461 + }, + { + "epoch": 0.6681127982646421, + "grad_norm": 3.2623131963394396, + "learning_rate": 7.843023773669388e-06, + "logits/chosen": 0.5372626185417175, + "logits/rejected": 0.38888368010520935, + "logps/chosen": -0.732601523399353, + "logps/rejected": -3.131098747253418, + "loss": 0.7745, + "odds_ratio_loss": 0.5246361494064331, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07326015084981918, + "rewards/margins": 0.2398497313261032, + "rewards/rejected": -0.3131098747253418, + "sft_loss": 0.732601523399353, + "step": 462 + }, + { + "epoch": 0.6695589298626174, + "grad_norm": 6.67370859391976, + "learning_rate": 7.842161060874221e-06, + "logits/chosen": 0.6117855906486511, + "logits/rejected": 0.46079525351524353, + "logps/chosen": -0.6942752003669739, + "logps/rejected": -3.08976674079895, + "loss": 0.8945, + "odds_ratio_loss": 0.34388789534568787, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06942752003669739, + "rewards/margins": 0.2395491600036621, + "rewards/rejected": -0.3089766800403595, + "sft_loss": 0.6942752003669739, + "step": 463 + }, + { + "epoch": 0.6710050614605929, + "grad_norm": 3.4901642146880527, + "learning_rate": 7.841296031625842e-06, + "logits/chosen": 0.5016286969184875, + "logits/rejected": 0.30921778082847595, + "logps/chosen": -0.8868895769119263, + "logps/rejected": -1.7468254566192627, + "loss": 0.9116, + "odds_ratio_loss": 0.5753090381622314, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08868895471096039, + "rewards/margins": 0.08599359542131424, + "rewards/rejected": -0.17468255758285522, + "sft_loss": 0.8868895769119263, + "step": 464 + }, + { + "epoch": 0.6724511930585684, + "grad_norm": 2.705934552586253, + "learning_rate": 7.840428686445777e-06, + "logits/chosen": 0.3969650864601135, + "logits/rejected": 0.37639883160591125, + "logps/chosen": -0.7846730947494507, + "logps/rejected": -3.3052570819854736, + "loss": 0.899, + "odds_ratio_loss": 0.578758180141449, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07846730202436447, + "rewards/margins": 0.2520584166049957, + "rewards/rejected": -0.3305257260799408, + "sft_loss": 0.7846730947494507, + "step": 465 + }, + { + "epoch": 0.6738973246565437, + "grad_norm": 3.8524541345226453, + "learning_rate": 7.839559025856954e-06, + "logits/chosen": 0.6053705811500549, + "logits/rejected": 0.5268568992614746, + "logps/chosen": -0.948915958404541, + "logps/rejected": -2.0502045154571533, + "loss": 0.9358, + "odds_ratio_loss": 0.6905159950256348, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09489160031080246, + "rewards/margins": 0.11012885719537735, + "rewards/rejected": -0.2050204575061798, + "sft_loss": 0.948915958404541, + "step": 466 + }, + { + "epoch": 0.6753434562545192, + "grad_norm": 2.5938676997175505, + "learning_rate": 7.838687050383694e-06, + "logits/chosen": 0.5891566276550293, + "logits/rejected": 0.4464053809642792, + "logps/chosen": -0.857430636882782, + "logps/rejected": -1.6854695081710815, + "loss": 0.853, + "odds_ratio_loss": 0.5570505857467651, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08574306219816208, + "rewards/margins": 0.0828038826584816, + "rewards/rejected": -0.16854694485664368, + "sft_loss": 0.857430636882782, + "step": 467 + }, + { + "epoch": 0.6767895878524945, + "grad_norm": 3.2940178997552967, + "learning_rate": 7.837812760551714e-06, + "logits/chosen": 0.43278759717941284, + "logits/rejected": 0.26053839921951294, + "logps/chosen": -0.7142987251281738, + "logps/rejected": -2.4090588092803955, + "loss": 0.7569, + "odds_ratio_loss": 0.4101409614086151, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07142987847328186, + "rewards/margins": 0.16947603225708008, + "rewards/rejected": -0.24090591073036194, + "sft_loss": 0.7142987251281738, + "step": 468 + }, + { + "epoch": 0.67823571945047, + "grad_norm": 3.7919377185149608, + "learning_rate": 7.83693615688813e-06, + "logits/chosen": 0.35429370403289795, + "logits/rejected": 0.39123567938804626, + "logps/chosen": -0.9550790190696716, + "logps/rejected": -1.3350166082382202, + "loss": 0.8351, + "odds_ratio_loss": 0.614581286907196, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0955079048871994, + "rewards/margins": 0.037993766367435455, + "rewards/rejected": -0.13350167870521545, + "sft_loss": 0.9550790190696716, + "step": 469 + }, + { + "epoch": 0.6796818510484454, + "grad_norm": 2.91281834178183, + "learning_rate": 7.836057239921444e-06, + "logits/chosen": 0.5054612755775452, + "logits/rejected": 0.43213528394699097, + "logps/chosen": -0.9110418558120728, + "logps/rejected": -1.7556121349334717, + "loss": 0.8131, + "odds_ratio_loss": 0.6161195039749146, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0911041870713234, + "rewards/margins": 0.08445702493190765, + "rewards/rejected": -0.17556121945381165, + "sft_loss": 0.9110418558120728, + "step": 470 + }, + { + "epoch": 0.6811279826464208, + "grad_norm": 5.212947929820761, + "learning_rate": 7.835176010181563e-06, + "logits/chosen": 0.5794857740402222, + "logits/rejected": 0.452696293592453, + "logps/chosen": -0.9335224628448486, + "logps/rejected": -2.0056893825531006, + "loss": 0.9011, + "odds_ratio_loss": 0.5872938632965088, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09335225075483322, + "rewards/margins": 0.10721667855978012, + "rewards/rejected": -0.20056892931461334, + "sft_loss": 0.9335224628448486, + "step": 471 + }, + { + "epoch": 0.6825741142443963, + "grad_norm": 4.005336153240001, + "learning_rate": 7.834292468199781e-06, + "logits/chosen": 0.40509578585624695, + "logits/rejected": 0.4229750335216522, + "logps/chosen": -0.8150092959403992, + "logps/rejected": -2.13399600982666, + "loss": 0.9764, + "odds_ratio_loss": 0.48038750886917114, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08150092512369156, + "rewards/margins": 0.1318986564874649, + "rewards/rejected": -0.21339958906173706, + "sft_loss": 0.8150092959403992, + "step": 472 + }, + { + "epoch": 0.6840202458423716, + "grad_norm": 4.70726946640677, + "learning_rate": 7.833406614508788e-06, + "logits/chosen": 0.6632890105247498, + "logits/rejected": 0.6132000684738159, + "logps/chosen": -0.5284931063652039, + "logps/rejected": -2.332587480545044, + "loss": 0.7618, + "odds_ratio_loss": 0.4303593635559082, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05284930765628815, + "rewards/margins": 0.18040944635868073, + "rewards/rejected": -0.23325875401496887, + "sft_loss": 0.5284931063652039, + "step": 473 + }, + { + "epoch": 0.6854663774403471, + "grad_norm": 3.324398929863063, + "learning_rate": 7.832518449642672e-06, + "logits/chosen": 0.422392874956131, + "logits/rejected": 0.4441375434398651, + "logps/chosen": -0.9545919895172119, + "logps/rejected": -2.223881483078003, + "loss": 0.9125, + "odds_ratio_loss": 0.6443504095077515, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09545920789241791, + "rewards/margins": 0.12692894041538239, + "rewards/rejected": -0.2223881483078003, + "sft_loss": 0.9545919895172119, + "step": 474 + }, + { + "epoch": 0.6869125090383225, + "grad_norm": 4.571799028540172, + "learning_rate": 7.831627974136907e-06, + "logits/chosen": 0.4669177830219269, + "logits/rejected": 0.43409720063209534, + "logps/chosen": -0.9336066246032715, + "logps/rejected": -2.756803512573242, + "loss": 0.9133, + "odds_ratio_loss": 0.5451022386550903, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09336065500974655, + "rewards/margins": 0.18231970071792603, + "rewards/rejected": -0.2756803631782532, + "sft_loss": 0.9336066246032715, + "step": 475 + }, + { + "epoch": 0.6883586406362979, + "grad_norm": 8.263383358535402, + "learning_rate": 7.830735188528369e-06, + "logits/chosen": 0.5980125069618225, + "logits/rejected": 0.349201500415802, + "logps/chosen": -0.5357162356376648, + "logps/rejected": -4.337940216064453, + "loss": 0.9422, + "odds_ratio_loss": 0.31241413950920105, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05357161909341812, + "rewards/margins": 0.3802223801612854, + "rewards/rejected": -0.4337940216064453, + "sft_loss": 0.5357162356376648, + "step": 476 + }, + { + "epoch": 0.6898047722342733, + "grad_norm": 2.6729720908747536, + "learning_rate": 7.829840093355315e-06, + "logits/chosen": 0.41900211572647095, + "logits/rejected": 0.4156907796859741, + "logps/chosen": -0.8371658325195312, + "logps/rejected": -1.4804648160934448, + "loss": 0.8874, + "odds_ratio_loss": 0.5720455050468445, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08371657878160477, + "rewards/margins": 0.06432989984750748, + "rewards/rejected": -0.14804649353027344, + "sft_loss": 0.8371658325195312, + "step": 477 + }, + { + "epoch": 0.6912509038322487, + "grad_norm": 2.996041581883456, + "learning_rate": 7.828942689157407e-06, + "logits/chosen": 0.4665514826774597, + "logits/rejected": 0.43654048442840576, + "logps/chosen": -0.8104965090751648, + "logps/rejected": -2.2539286613464355, + "loss": 0.8045, + "odds_ratio_loss": 0.5409241318702698, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08104965090751648, + "rewards/margins": 0.14434322714805603, + "rewards/rejected": -0.2253929078578949, + "sft_loss": 0.8104965090751648, + "step": 478 + }, + { + "epoch": 0.6926970354302241, + "grad_norm": 2.886010343753202, + "learning_rate": 7.82804297647569e-06, + "logits/chosen": 0.39548736810684204, + "logits/rejected": 0.25323015451431274, + "logps/chosen": -0.8145580887794495, + "logps/rejected": -1.8616585731506348, + "loss": 0.7833, + "odds_ratio_loss": 0.581149160861969, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08145581185817719, + "rewards/margins": 0.10471004992723465, + "rewards/rejected": -0.18616585433483124, + "sft_loss": 0.8145580887794495, + "step": 479 + }, + { + "epoch": 0.6941431670281996, + "grad_norm": 3.721831170544819, + "learning_rate": 7.827140955852606e-06, + "logits/chosen": 0.5488985776901245, + "logits/rejected": 0.43202951550483704, + "logps/chosen": -0.8996453285217285, + "logps/rejected": -1.8889070749282837, + "loss": 0.9119, + "odds_ratio_loss": 0.5485165119171143, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08996453881263733, + "rewards/margins": 0.09892617166042328, + "rewards/rejected": -0.18889069557189941, + "sft_loss": 0.8996453285217285, + "step": 480 + }, + { + "epoch": 0.695589298626175, + "grad_norm": 3.8244041653006247, + "learning_rate": 7.826236627831986e-06, + "logits/chosen": 0.49204105138778687, + "logits/rejected": 0.3208334147930145, + "logps/chosen": -0.7775691151618958, + "logps/rejected": -2.3301753997802734, + "loss": 0.875, + "odds_ratio_loss": 0.42941606044769287, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07775691151618958, + "rewards/margins": 0.15526065230369568, + "rewards/rejected": -0.23301756381988525, + "sft_loss": 0.7775691151618958, + "step": 481 + }, + { + "epoch": 0.6970354302241504, + "grad_norm": 2.607178353982076, + "learning_rate": 7.825329992959054e-06, + "logits/chosen": 0.4929500222206116, + "logits/rejected": 0.40421661734580994, + "logps/chosen": -0.8761095404624939, + "logps/rejected": -1.5417531728744507, + "loss": 0.8476, + "odds_ratio_loss": 0.584408700466156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08761096000671387, + "rewards/margins": 0.06656436622142792, + "rewards/rejected": -0.15417534112930298, + "sft_loss": 0.8761095404624939, + "step": 482 + }, + { + "epoch": 0.6984815618221258, + "grad_norm": 2.6174084912028563, + "learning_rate": 7.82442105178042e-06, + "logits/chosen": 0.6119585633277893, + "logits/rejected": 0.5422635078430176, + "logps/chosen": -0.7514777779579163, + "logps/rejected": -2.8054733276367188, + "loss": 0.7823, + "odds_ratio_loss": 0.3922504782676697, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07514777779579163, + "rewards/margins": 0.20539957284927368, + "rewards/rejected": -0.2805473506450653, + "sft_loss": 0.7514777779579163, + "step": 483 + }, + { + "epoch": 0.6999276934201012, + "grad_norm": 2.3537756879016785, + "learning_rate": 7.823509804844091e-06, + "logits/chosen": 0.5284101963043213, + "logits/rejected": 0.3043856620788574, + "logps/chosen": -0.8965635299682617, + "logps/rejected": -1.57173490524292, + "loss": 0.865, + "odds_ratio_loss": 0.5315253734588623, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08965635299682617, + "rewards/margins": 0.06751712411642075, + "rewards/rejected": -0.1571734994649887, + "sft_loss": 0.8965635299682617, + "step": 484 + }, + { + "epoch": 0.7013738250180767, + "grad_norm": 4.370410960026388, + "learning_rate": 7.82259625269946e-06, + "logits/chosen": 0.4739269018173218, + "logits/rejected": 0.28087443113327026, + "logps/chosen": -0.7558042407035828, + "logps/rejected": -2.000213623046875, + "loss": 0.8116, + "odds_ratio_loss": 0.4949937164783478, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0755804255604744, + "rewards/margins": 0.12444092333316803, + "rewards/rejected": -0.20002135634422302, + "sft_loss": 0.7558042407035828, + "step": 485 + }, + { + "epoch": 0.702819956616052, + "grad_norm": 2.757296734529158, + "learning_rate": 7.821680395897311e-06, + "logits/chosen": 0.5641602873802185, + "logits/rejected": 0.4281392991542816, + "logps/chosen": -0.9164305925369263, + "logps/rejected": -1.9298672676086426, + "loss": 0.8482, + "odds_ratio_loss": 0.5546359419822693, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0916430652141571, + "rewards/margins": 0.10134366154670715, + "rewards/rejected": -0.19298672676086426, + "sft_loss": 0.9164305925369263, + "step": 486 + }, + { + "epoch": 0.7042660882140275, + "grad_norm": 2.7370944482804203, + "learning_rate": 7.820762234989819e-06, + "logits/chosen": 0.5384199619293213, + "logits/rejected": 0.41531190276145935, + "logps/chosen": -0.8857967257499695, + "logps/rejected": -1.7009695768356323, + "loss": 0.9314, + "odds_ratio_loss": 0.6683101654052734, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08857966959476471, + "rewards/margins": 0.0815172791481018, + "rewards/rejected": -0.1700969636440277, + "sft_loss": 0.8857967257499695, + "step": 487 + }, + { + "epoch": 0.7057122198120029, + "grad_norm": 3.6320189601532973, + "learning_rate": 7.819841770530546e-06, + "logits/chosen": 0.5773338675498962, + "logits/rejected": 0.33606287837028503, + "logps/chosen": -0.5831294059753418, + "logps/rejected": -2.437774181365967, + "loss": 0.8034, + "odds_ratio_loss": 0.40286755561828613, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05831294506788254, + "rewards/margins": 0.18546447157859802, + "rewards/rejected": -0.24377740919589996, + "sft_loss": 0.5831294059753418, + "step": 488 + }, + { + "epoch": 0.7071583514099783, + "grad_norm": 2.8677538750318003, + "learning_rate": 7.818919003074443e-06, + "logits/chosen": 0.42975157499313354, + "logits/rejected": 0.3642490804195404, + "logps/chosen": -0.8703562021255493, + "logps/rejected": -1.5688755512237549, + "loss": 0.8759, + "odds_ratio_loss": 0.5489083528518677, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08703562617301941, + "rewards/margins": 0.06985193490982056, + "rewards/rejected": -0.15688756108283997, + "sft_loss": 0.8703562021255493, + "step": 489 + }, + { + "epoch": 0.7086044830079538, + "grad_norm": 3.346433651687049, + "learning_rate": 7.817993933177848e-06, + "logits/chosen": 0.4698405861854553, + "logits/rejected": 0.3148632347583771, + "logps/chosen": -0.8803499937057495, + "logps/rejected": -2.480961561203003, + "loss": 0.9126, + "odds_ratio_loss": 0.6439063549041748, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08803500235080719, + "rewards/margins": 0.16006116569042206, + "rewards/rejected": -0.24809619784355164, + "sft_loss": 0.8803499937057495, + "step": 490 + }, + { + "epoch": 0.7100506146059291, + "grad_norm": 2.4161250021000606, + "learning_rate": 7.817066561398493e-06, + "logits/chosen": 0.6872197985649109, + "logits/rejected": 0.5721977353096008, + "logps/chosen": -0.6891111731529236, + "logps/rejected": -2.3174729347229004, + "loss": 0.8469, + "odds_ratio_loss": 0.5914407968521118, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0689111202955246, + "rewards/margins": 0.16283617913722992, + "rewards/rejected": -0.23174728453159332, + "sft_loss": 0.6891111731529236, + "step": 491 + }, + { + "epoch": 0.7114967462039046, + "grad_norm": 2.375376214224217, + "learning_rate": 7.81613688829549e-06, + "logits/chosen": 0.5590274930000305, + "logits/rejected": 0.4893609285354614, + "logps/chosen": -0.8814407587051392, + "logps/rejected": -1.0228970050811768, + "loss": 1.008, + "odds_ratio_loss": 0.6809512972831726, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08814407885074615, + "rewards/margins": 0.014145624823868275, + "rewards/rejected": -0.10228970646858215, + "sft_loss": 0.8814407587051392, + "step": 492 + }, + { + "epoch": 0.7129428778018799, + "grad_norm": 2.915036842751362, + "learning_rate": 7.815204914429343e-06, + "logits/chosen": 0.45566874742507935, + "logits/rejected": 0.29462623596191406, + "logps/chosen": -0.8483090400695801, + "logps/rejected": -3.8663082122802734, + "loss": 0.8399, + "odds_ratio_loss": 0.5736794471740723, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08483090251684189, + "rewards/margins": 0.3017999529838562, + "rewards/rejected": -0.3866308331489563, + "sft_loss": 0.8483090400695801, + "step": 493 + }, + { + "epoch": 0.7143890093998554, + "grad_norm": 2.823588392434594, + "learning_rate": 7.814270640361947e-06, + "logits/chosen": 0.4194034934043884, + "logits/rejected": 0.3192807734012604, + "logps/chosen": -0.7508400678634644, + "logps/rejected": -2.6305389404296875, + "loss": 0.9102, + "odds_ratio_loss": 0.37007543444633484, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07508400082588196, + "rewards/margins": 0.1879698932170868, + "rewards/rejected": -0.26305389404296875, + "sft_loss": 0.7508400678634644, + "step": 494 + }, + { + "epoch": 0.7158351409978309, + "grad_norm": 8.703869182596133, + "learning_rate": 7.813334066656575e-06, + "logits/chosen": 0.5819438099861145, + "logits/rejected": 0.35457098484039307, + "logps/chosen": -0.6189179420471191, + "logps/rejected": -4.7622880935668945, + "loss": 0.7467, + "odds_ratio_loss": 0.27410566806793213, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.061891794204711914, + "rewards/margins": 0.41433700919151306, + "rewards/rejected": -0.476228803396225, + "sft_loss": 0.6189179420471191, + "step": 495 + }, + { + "epoch": 0.7172812725958062, + "grad_norm": 2.5095698313273447, + "learning_rate": 7.812395193877891e-06, + "logits/chosen": 0.6065265536308289, + "logits/rejected": 0.36203110218048096, + "logps/chosen": -0.8203286528587341, + "logps/rejected": -1.367362141609192, + "loss": 0.8163, + "odds_ratio_loss": 0.5582252740859985, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08203285932540894, + "rewards/margins": 0.054703351110219955, + "rewards/rejected": -0.1367362141609192, + "sft_loss": 0.8203286528587341, + "step": 496 + }, + { + "epoch": 0.7187274041937817, + "grad_norm": 2.450038513585519, + "learning_rate": 7.811454022591946e-06, + "logits/chosen": 0.410911500453949, + "logits/rejected": 0.3190882205963135, + "logps/chosen": -0.840955913066864, + "logps/rejected": -2.12227725982666, + "loss": 0.861, + "odds_ratio_loss": 0.4995306730270386, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08409559726715088, + "rewards/margins": 0.12813213467597961, + "rewards/rejected": -0.2122277319431305, + "sft_loss": 0.840955913066864, + "step": 497 + }, + { + "epoch": 0.720173535791757, + "grad_norm": 3.1149539308383627, + "learning_rate": 7.810510553366177e-06, + "logits/chosen": 0.6082262396812439, + "logits/rejected": 0.48194149136543274, + "logps/chosen": -0.8298695087432861, + "logps/rejected": -1.5063502788543701, + "loss": 0.8049, + "odds_ratio_loss": 0.5610617399215698, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08298695832490921, + "rewards/margins": 0.06764806807041168, + "rewards/rejected": -0.1506350338459015, + "sft_loss": 0.8298695087432861, + "step": 498 + }, + { + "epoch": 0.7216196673897325, + "grad_norm": 4.575070600880403, + "learning_rate": 7.809564786769403e-06, + "logits/chosen": 0.42572611570358276, + "logits/rejected": 0.5204564332962036, + "logps/chosen": -0.7067459225654602, + "logps/rejected": -1.7655752897262573, + "loss": 0.8539, + "odds_ratio_loss": 0.49053311347961426, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0706745982170105, + "rewards/margins": 0.10588293522596359, + "rewards/rejected": -0.1765575408935547, + "sft_loss": 0.7067459225654602, + "step": 499 + }, + { + "epoch": 0.7230657989877078, + "grad_norm": 4.362979902093869, + "learning_rate": 7.808616723371828e-06, + "logits/chosen": 0.5878921151161194, + "logits/rejected": 0.39825379848480225, + "logps/chosen": -0.6144382953643799, + "logps/rejected": -1.6051030158996582, + "loss": 0.7595, + "odds_ratio_loss": 0.3771412968635559, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06144382804632187, + "rewards/margins": 0.09906647354364395, + "rewards/rejected": -0.16051030158996582, + "sft_loss": 0.6144382953643799, + "step": 500 + }, + { + "epoch": 0.7245119305856833, + "grad_norm": 2.838948408412676, + "learning_rate": 7.807666363745048e-06, + "logits/chosen": 0.4524368941783905, + "logits/rejected": 0.24179279804229736, + "logps/chosen": -0.6637477874755859, + "logps/rejected": -1.8917555809020996, + "loss": 0.9149, + "odds_ratio_loss": 0.4682096838951111, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0663747787475586, + "rewards/margins": 0.1228007823228836, + "rewards/rejected": -0.1891755610704422, + "sft_loss": 0.6637477874755859, + "step": 501 + }, + { + "epoch": 0.7259580621836587, + "grad_norm": 3.244484568190041, + "learning_rate": 7.806713708462036e-06, + "logits/chosen": 0.31584107875823975, + "logits/rejected": 0.3299151062965393, + "logps/chosen": -0.8359942436218262, + "logps/rejected": -1.7388925552368164, + "loss": 0.9218, + "odds_ratio_loss": 0.34620821475982666, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08359942585229874, + "rewards/margins": 0.09028984606266022, + "rewards/rejected": -0.17388926446437836, + "sft_loss": 0.8359942436218262, + "step": 502 + }, + { + "epoch": 0.7274041937816341, + "grad_norm": 2.9413391620870204, + "learning_rate": 7.805758758097152e-06, + "logits/chosen": 0.39965227246284485, + "logits/rejected": 0.2767530083656311, + "logps/chosen": -0.682356059551239, + "logps/rejected": -1.8037464618682861, + "loss": 0.8816, + "odds_ratio_loss": 0.42978233098983765, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0682356059551239, + "rewards/margins": 0.11213904619216919, + "rewards/rejected": -0.18037466704845428, + "sft_loss": 0.682356059551239, + "step": 503 + }, + { + "epoch": 0.7288503253796096, + "grad_norm": 7.08235782639055, + "learning_rate": 7.804801513226138e-06, + "logits/chosen": 0.6158980131149292, + "logits/rejected": 0.4643901288509369, + "logps/chosen": -0.6825918555259705, + "logps/rejected": -1.3164550065994263, + "loss": 0.9577, + "odds_ratio_loss": 0.5534126162528992, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06825917959213257, + "rewards/margins": 0.06338632106781006, + "rewards/rejected": -0.13164550065994263, + "sft_loss": 0.6825918555259705, + "step": 504 + }, + { + "epoch": 0.7302964569775849, + "grad_norm": 2.430837485754504, + "learning_rate": 7.80384197442612e-06, + "logits/chosen": 0.6757205724716187, + "logits/rejected": 0.3712140619754791, + "logps/chosen": -0.7965311408042908, + "logps/rejected": -1.7287770509719849, + "loss": 0.8777, + "odds_ratio_loss": 0.5180415511131287, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07965311408042908, + "rewards/margins": 0.09322459250688553, + "rewards/rejected": -0.172877699136734, + "sft_loss": 0.7965311408042908, + "step": 505 + }, + { + "epoch": 0.7317425885755604, + "grad_norm": 2.461862933908863, + "learning_rate": 7.802880142275609e-06, + "logits/chosen": 0.5290156006813049, + "logits/rejected": 0.36588141322135925, + "logps/chosen": -0.6499413251876831, + "logps/rejected": -2.212672472000122, + "loss": 0.7377, + "odds_ratio_loss": 0.4530958831310272, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06499413400888443, + "rewards/margins": 0.15627311170101166, + "rewards/rejected": -0.2212672382593155, + "sft_loss": 0.6499413251876831, + "step": 506 + }, + { + "epoch": 0.7331887201735358, + "grad_norm": 4.607958134692978, + "learning_rate": 7.801916017354498e-06, + "logits/chosen": 0.4226146936416626, + "logits/rejected": 0.34249216318130493, + "logps/chosen": -0.8650091290473938, + "logps/rejected": -1.9593756198883057, + "loss": 0.8793, + "odds_ratio_loss": 0.5410417318344116, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08650091290473938, + "rewards/margins": 0.10943662375211716, + "rewards/rejected": -0.19593754410743713, + "sft_loss": 0.8650091290473938, + "step": 507 + }, + { + "epoch": 0.7346348517715112, + "grad_norm": 2.889267730930254, + "learning_rate": 7.80094960024406e-06, + "logits/chosen": 0.382773756980896, + "logits/rejected": 0.24849705398082733, + "logps/chosen": -0.9779189825057983, + "logps/rejected": -2.239924430847168, + "loss": 0.7804, + "odds_ratio_loss": 0.5563200116157532, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0977918952703476, + "rewards/margins": 0.12620052695274353, + "rewards/rejected": -0.2239924520254135, + "sft_loss": 0.9779189825057983, + "step": 508 + }, + { + "epoch": 0.7360809833694866, + "grad_norm": 3.7010451738157886, + "learning_rate": 7.799980891526951e-06, + "logits/chosen": 0.48358017206192017, + "logits/rejected": 0.28892093896865845, + "logps/chosen": -0.7672395706176758, + "logps/rejected": -2.660430431365967, + "loss": 0.9052, + "odds_ratio_loss": 0.34770241379737854, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07672396302223206, + "rewards/margins": 0.18931908905506134, + "rewards/rejected": -0.2660430669784546, + "sft_loss": 0.7672395706176758, + "step": 509 + }, + { + "epoch": 0.737527114967462, + "grad_norm": 2.965109666641044, + "learning_rate": 7.799009891787211e-06, + "logits/chosen": 0.4797486960887909, + "logits/rejected": 0.34847331047058105, + "logps/chosen": -0.7682068347930908, + "logps/rejected": -1.6704812049865723, + "loss": 0.7642, + "odds_ratio_loss": 0.5312727093696594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07682067900896072, + "rewards/margins": 0.09022743999958038, + "rewards/rejected": -0.1670481115579605, + "sft_loss": 0.7682068347930908, + "step": 510 + }, + { + "epoch": 0.7389732465654375, + "grad_norm": 4.029197098624356, + "learning_rate": 7.798036601610256e-06, + "logits/chosen": 0.3794160783290863, + "logits/rejected": 0.309948205947876, + "logps/chosen": -0.7439759969711304, + "logps/rejected": -2.09332013130188, + "loss": 0.8295, + "odds_ratio_loss": 0.5097244381904602, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07439759373664856, + "rewards/margins": 0.13493439555168152, + "rewards/rejected": -0.20933200418949127, + "sft_loss": 0.7439759969711304, + "step": 511 + }, + { + "epoch": 0.7404193781634129, + "grad_norm": 2.667176937833829, + "learning_rate": 7.79706102158289e-06, + "logits/chosen": 0.5048097968101501, + "logits/rejected": 0.3359774053096771, + "logps/chosen": -0.9156619310379028, + "logps/rejected": -1.9869725704193115, + "loss": 0.9098, + "odds_ratio_loss": 0.5073124766349792, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09156619757413864, + "rewards/margins": 0.10713106393814087, + "rewards/rejected": -0.1986972540616989, + "sft_loss": 0.9156619310379028, + "step": 512 + }, + { + "epoch": 0.7418655097613883, + "grad_norm": 2.7993967833945868, + "learning_rate": 7.796083152293293e-06, + "logits/chosen": 0.5514634251594543, + "logits/rejected": 0.49493858218193054, + "logps/chosen": -0.719922661781311, + "logps/rejected": -1.1581745147705078, + "loss": 0.8478, + "odds_ratio_loss": 0.5648511052131653, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07199226319789886, + "rewards/margins": 0.0438251867890358, + "rewards/rejected": -0.11581744998693466, + "sft_loss": 0.719922661781311, + "step": 513 + }, + { + "epoch": 0.7433116413593637, + "grad_norm": 3.1335701177478303, + "learning_rate": 7.795102994331024e-06, + "logits/chosen": 0.41231465339660645, + "logits/rejected": 0.26936453580856323, + "logps/chosen": -0.9713563323020935, + "logps/rejected": -1.754350185394287, + "loss": 0.8876, + "odds_ratio_loss": 0.46331292390823364, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09713563323020935, + "rewards/margins": 0.078299380838871, + "rewards/rejected": -0.17543500661849976, + "sft_loss": 0.9713563323020935, + "step": 514 + }, + { + "epoch": 0.7447577729573391, + "grad_norm": 3.4077557927604154, + "learning_rate": 7.794120548287026e-06, + "logits/chosen": 0.5619039535522461, + "logits/rejected": 0.44908225536346436, + "logps/chosen": -0.8658353090286255, + "logps/rejected": -1.4560352563858032, + "loss": 0.8073, + "odds_ratio_loss": 0.6640084981918335, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08658353984355927, + "rewards/margins": 0.05901998281478882, + "rewards/rejected": -0.14560352265834808, + "sft_loss": 0.8658353090286255, + "step": 515 + }, + { + "epoch": 0.7462039045553145, + "grad_norm": 2.278784725218634, + "learning_rate": 7.793135814753618e-06, + "logits/chosen": 0.5649205446243286, + "logits/rejected": 0.4413209855556488, + "logps/chosen": -0.7314971685409546, + "logps/rejected": -1.7357701063156128, + "loss": 0.8308, + "odds_ratio_loss": 0.6118183135986328, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07314971834421158, + "rewards/margins": 0.1004272997379303, + "rewards/rejected": -0.17357701063156128, + "sft_loss": 0.7314971685409546, + "step": 516 + }, + { + "epoch": 0.74765003615329, + "grad_norm": 2.527228379509471, + "learning_rate": 7.7921487943245e-06, + "logits/chosen": 0.5564696192741394, + "logits/rejected": 0.26475444436073303, + "logps/chosen": -0.6380961537361145, + "logps/rejected": -3.072185516357422, + "loss": 0.7328, + "odds_ratio_loss": 0.37601861357688904, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06380962580442429, + "rewards/margins": 0.2434089481830597, + "rewards/rejected": -0.3072185814380646, + "sft_loss": 0.6380961537361145, + "step": 517 + }, + { + "epoch": 0.7490961677512654, + "grad_norm": 4.417874578202541, + "learning_rate": 7.791159487594752e-06, + "logits/chosen": 0.37396425008773804, + "logits/rejected": 0.2895720601081848, + "logps/chosen": -0.9451796412467957, + "logps/rejected": -1.155394434928894, + "loss": 0.854, + "odds_ratio_loss": 0.7048704028129578, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09451796114444733, + "rewards/margins": 0.021021487191319466, + "rewards/rejected": -0.11553944647312164, + "sft_loss": 0.9451796412467957, + "step": 518 + }, + { + "epoch": 0.7505422993492408, + "grad_norm": 4.6204210581385325, + "learning_rate": 7.790167895160827e-06, + "logits/chosen": 0.49021750688552856, + "logits/rejected": 0.38616707921028137, + "logps/chosen": -0.6952154636383057, + "logps/rejected": -1.7352021932601929, + "loss": 0.8745, + "odds_ratio_loss": 0.521263062953949, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06952154636383057, + "rewards/margins": 0.10399869829416275, + "rewards/rejected": -0.17352023720741272, + "sft_loss": 0.6952154636383057, + "step": 519 + }, + { + "epoch": 0.7519884309472162, + "grad_norm": 7.408022711503893, + "learning_rate": 7.789174017620563e-06, + "logits/chosen": 0.5427068471908569, + "logits/rejected": 0.5353628396987915, + "logps/chosen": -0.7601156234741211, + "logps/rejected": -1.1861733198165894, + "loss": 0.8739, + "odds_ratio_loss": 0.5829976797103882, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07601156085729599, + "rewards/margins": 0.04260575771331787, + "rewards/rejected": -0.11861732602119446, + "sft_loss": 0.7601156234741211, + "step": 520 + }, + { + "epoch": 0.7534345625451916, + "grad_norm": 3.002218429840675, + "learning_rate": 7.788177855573172e-06, + "logits/chosen": 0.4132567048072815, + "logits/rejected": 0.40480029582977295, + "logps/chosen": -0.5906501412391663, + "logps/rejected": -1.6327810287475586, + "loss": 0.8547, + "odds_ratio_loss": 0.5081474781036377, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.059065017849206924, + "rewards/margins": 0.10421308130025864, + "rewards/rejected": -0.16327810287475586, + "sft_loss": 0.5906501412391663, + "step": 521 + }, + { + "epoch": 0.754880694143167, + "grad_norm": 4.557354676364852, + "learning_rate": 7.787179409619243e-06, + "logits/chosen": 0.39193394780158997, + "logits/rejected": 0.36725348234176636, + "logps/chosen": -0.7613071203231812, + "logps/rejected": -1.2227325439453125, + "loss": 0.8271, + "odds_ratio_loss": 0.5340095162391663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07613071799278259, + "rewards/margins": 0.046142540872097015, + "rewards/rejected": -0.12227325141429901, + "sft_loss": 0.7613071203231812, + "step": 522 + }, + { + "epoch": 0.7563268257411424, + "grad_norm": 3.041867939127662, + "learning_rate": 7.786178680360743e-06, + "logits/chosen": 0.4620596468448639, + "logits/rejected": 0.37321341037750244, + "logps/chosen": -0.9166240692138672, + "logps/rejected": -1.4555108547210693, + "loss": 0.9253, + "odds_ratio_loss": 0.6333938837051392, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09166240692138672, + "rewards/margins": 0.053888678550720215, + "rewards/rejected": -0.14555108547210693, + "sft_loss": 0.9166240692138672, + "step": 523 + }, + { + "epoch": 0.7577729573391179, + "grad_norm": 2.850718909419124, + "learning_rate": 7.785175668401015e-06, + "logits/chosen": 0.35010722279548645, + "logits/rejected": 0.2687487006187439, + "logps/chosen": -0.7346320748329163, + "logps/rejected": -2.8913722038269043, + "loss": 0.8036, + "odds_ratio_loss": 0.31207412481307983, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07346320897340775, + "rewards/margins": 0.21567398309707642, + "rewards/rejected": -0.28913718461990356, + "sft_loss": 0.7346320748329163, + "step": 524 + }, + { + "epoch": 0.7592190889370932, + "grad_norm": 2.8661637700070517, + "learning_rate": 7.784170374344778e-06, + "logits/chosen": 0.3620089292526245, + "logits/rejected": 0.28093022108078003, + "logps/chosen": -0.947007417678833, + "logps/rejected": -2.0669689178466797, + "loss": 0.9131, + "odds_ratio_loss": 0.604080855846405, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09470075368881226, + "rewards/margins": 0.11199614405632019, + "rewards/rejected": -0.20669689774513245, + "sft_loss": 0.947007417678833, + "step": 525 + }, + { + "epoch": 0.7606652205350687, + "grad_norm": 3.1260980643123295, + "learning_rate": 7.78316279879813e-06, + "logits/chosen": 0.4290822148323059, + "logits/rejected": 0.3005208373069763, + "logps/chosen": -0.9603489637374878, + "logps/rejected": -1.726524829864502, + "loss": 0.9003, + "odds_ratio_loss": 0.7454808950424194, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09603489935398102, + "rewards/margins": 0.07661759853363037, + "rewards/rejected": -0.1726524978876114, + "sft_loss": 0.9603489637374878, + "step": 526 + }, + { + "epoch": 0.7621113521330442, + "grad_norm": 3.225760777962045, + "learning_rate": 7.78215294236854e-06, + "logits/chosen": 0.41072916984558105, + "logits/rejected": 0.38332122564315796, + "logps/chosen": -0.7351124286651611, + "logps/rejected": -1.3829286098480225, + "loss": 0.889, + "odds_ratio_loss": 0.4489685893058777, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07351124286651611, + "rewards/margins": 0.06478161364793777, + "rewards/rejected": -0.13829286396503448, + "sft_loss": 0.7351124286651611, + "step": 527 + }, + { + "epoch": 0.7635574837310195, + "grad_norm": 3.557799675646004, + "learning_rate": 7.781140805664854e-06, + "logits/chosen": 0.5594673752784729, + "logits/rejected": 0.4795070290565491, + "logps/chosen": -0.6471788883209229, + "logps/rejected": -1.9761933088302612, + "loss": 0.7664, + "odds_ratio_loss": 0.5941170454025269, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06471788883209229, + "rewards/margins": 0.13290144503116608, + "rewards/rejected": -0.19761933386325836, + "sft_loss": 0.6471788883209229, + "step": 528 + }, + { + "epoch": 0.765003615328995, + "grad_norm": 2.6986867787865294, + "learning_rate": 7.780126389297296e-06, + "logits/chosen": 0.2951076626777649, + "logits/rejected": 0.20684948563575745, + "logps/chosen": -0.7162260413169861, + "logps/rejected": -1.82748544216156, + "loss": 0.8949, + "odds_ratio_loss": 0.44935142993927, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07162261009216309, + "rewards/margins": 0.11112594604492188, + "rewards/rejected": -0.18274855613708496, + "sft_loss": 0.7162260413169861, + "step": 529 + }, + { + "epoch": 0.7664497469269703, + "grad_norm": 2.607178013962108, + "learning_rate": 7.779109693877458e-06, + "logits/chosen": 0.5780777931213379, + "logits/rejected": 0.3079754710197449, + "logps/chosen": -0.6577814221382141, + "logps/rejected": -1.864539384841919, + "loss": 0.802, + "odds_ratio_loss": 0.4034327268600464, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06577815115451813, + "rewards/margins": 0.12067579478025436, + "rewards/rejected": -0.1864539533853531, + "sft_loss": 0.6577814221382141, + "step": 530 + }, + { + "epoch": 0.7678958785249458, + "grad_norm": 3.7648940723064124, + "learning_rate": 7.77809072001831e-06, + "logits/chosen": 0.4569055140018463, + "logits/rejected": 0.35255199670791626, + "logps/chosen": -0.8331925868988037, + "logps/rejected": -1.2831447124481201, + "loss": 0.7794, + "odds_ratio_loss": 0.624811053276062, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08331926167011261, + "rewards/margins": 0.04499521851539612, + "rewards/rejected": -0.12831448018550873, + "sft_loss": 0.8331925868988037, + "step": 531 + }, + { + "epoch": 0.7693420101229211, + "grad_norm": 3.154231228709698, + "learning_rate": 7.777069468334197e-06, + "logits/chosen": 0.3883509635925293, + "logits/rejected": 0.4145001173019409, + "logps/chosen": -0.6996007561683655, + "logps/rejected": -1.6742178201675415, + "loss": 0.8262, + "odds_ratio_loss": 0.6708589196205139, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06996007263660431, + "rewards/margins": 0.09746171534061432, + "rewards/rejected": -0.16742177307605743, + "sft_loss": 0.6996007561683655, + "step": 532 + }, + { + "epoch": 0.7707881417208966, + "grad_norm": 2.81732861155204, + "learning_rate": 7.776045939440835e-06, + "logits/chosen": 0.2952539920806885, + "logits/rejected": 0.20435559749603271, + "logps/chosen": -1.0842608213424683, + "logps/rejected": -1.9723320007324219, + "loss": 0.9858, + "odds_ratio_loss": 0.6315648555755615, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10842607915401459, + "rewards/margins": 0.088807113468647, + "rewards/rejected": -0.197233185172081, + "sft_loss": 1.0842608213424683, + "step": 533 + }, + { + "epoch": 0.7722342733188721, + "grad_norm": 3.248807458733231, + "learning_rate": 7.77502013395531e-06, + "logits/chosen": 0.3924180865287781, + "logits/rejected": 0.3698059320449829, + "logps/chosen": -0.8218859434127808, + "logps/rejected": -1.5072885751724243, + "loss": 0.7944, + "odds_ratio_loss": 0.3994704484939575, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08218859881162643, + "rewards/margins": 0.06854026019573212, + "rewards/rejected": -0.15072886645793915, + "sft_loss": 0.8218859434127808, + "step": 534 + }, + { + "epoch": 0.7736804049168474, + "grad_norm": 2.471150436216554, + "learning_rate": 7.773992052496087e-06, + "logits/chosen": 0.413666307926178, + "logits/rejected": 0.315445214509964, + "logps/chosen": -0.8587773442268372, + "logps/rejected": -2.272141456604004, + "loss": 0.9716, + "odds_ratio_loss": 0.5251275897026062, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08587773889303207, + "rewards/margins": 0.14133641123771667, + "rewards/rejected": -0.22721417248249054, + "sft_loss": 0.8587773442268372, + "step": 535 + }, + { + "epoch": 0.7751265365148229, + "grad_norm": 6.0303720876003615, + "learning_rate": 7.772961695683001e-06, + "logits/chosen": 0.5290374159812927, + "logits/rejected": 0.35298866033554077, + "logps/chosen": -0.8901825547218323, + "logps/rejected": -2.4757721424102783, + "loss": 0.8743, + "odds_ratio_loss": 0.5522928237915039, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08901825547218323, + "rewards/margins": 0.1585589349269867, + "rewards/rejected": -0.24757720530033112, + "sft_loss": 0.8901825547218323, + "step": 536 + }, + { + "epoch": 0.7765726681127982, + "grad_norm": 3.199778027441658, + "learning_rate": 7.771929064137255e-06, + "logits/chosen": 0.301146537065506, + "logits/rejected": 0.21612121164798737, + "logps/chosen": -0.7302257418632507, + "logps/rejected": -3.005721092224121, + "loss": 0.8393, + "odds_ratio_loss": 0.3484118580818176, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07302258163690567, + "rewards/margins": 0.22754952311515808, + "rewards/rejected": -0.30057209730148315, + "sft_loss": 0.7302257418632507, + "step": 537 + }, + { + "epoch": 0.7780187997107737, + "grad_norm": 2.8558577962690075, + "learning_rate": 7.77089415848143e-06, + "logits/chosen": 0.3406679034233093, + "logits/rejected": 0.22741743922233582, + "logps/chosen": -0.7484648823738098, + "logps/rejected": -2.0377368927001953, + "loss": 0.8792, + "odds_ratio_loss": 0.5459549427032471, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07484649121761322, + "rewards/margins": 0.12892721593379974, + "rewards/rejected": -0.20377370715141296, + "sft_loss": 0.7484648823738098, + "step": 538 + }, + { + "epoch": 0.779464931308749, + "grad_norm": 3.044240366400535, + "learning_rate": 7.769856979339473e-06, + "logits/chosen": 0.39080581068992615, + "logits/rejected": 0.27354422211647034, + "logps/chosen": -0.7717220783233643, + "logps/rejected": -2.013183355331421, + "loss": 0.9099, + "odds_ratio_loss": 0.45660167932510376, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07717221230268478, + "rewards/margins": 0.12414614111185074, + "rewards/rejected": -0.20131835341453552, + "sft_loss": 0.7717220783233643, + "step": 539 + }, + { + "epoch": 0.7809110629067245, + "grad_norm": 2.5132725680789516, + "learning_rate": 7.768817527336701e-06, + "logits/chosen": 0.4074794352054596, + "logits/rejected": 0.2864871323108673, + "logps/chosen": -0.9486532211303711, + "logps/rejected": -1.185182809829712, + "loss": 0.9103, + "odds_ratio_loss": 0.7375662326812744, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09486532211303711, + "rewards/margins": 0.023652950301766396, + "rewards/rejected": -0.11851827800273895, + "sft_loss": 0.9486532211303711, + "step": 540 + }, + { + "epoch": 0.7823571945047, + "grad_norm": 4.473324712516512, + "learning_rate": 7.767775803099805e-06, + "logits/chosen": 0.30603158473968506, + "logits/rejected": 0.27318325638771057, + "logps/chosen": -1.0894935131072998, + "logps/rejected": -1.8977537155151367, + "loss": 0.9864, + "odds_ratio_loss": 0.675370454788208, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10894934833049774, + "rewards/margins": 0.080826036632061, + "rewards/rejected": -0.18977537751197815, + "sft_loss": 1.0894935131072998, + "step": 541 + }, + { + "epoch": 0.7838033261026753, + "grad_norm": 2.366314632530353, + "learning_rate": 7.766731807256845e-06, + "logits/chosen": 0.5116511583328247, + "logits/rejected": 0.262421578168869, + "logps/chosen": -0.7834107279777527, + "logps/rejected": -3.154900550842285, + "loss": 0.8295, + "odds_ratio_loss": 0.4272821545600891, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07834108173847198, + "rewards/margins": 0.2371489703655243, + "rewards/rejected": -0.31549006700515747, + "sft_loss": 0.7834107279777527, + "step": 542 + }, + { + "epoch": 0.7852494577006508, + "grad_norm": 2.752280571379248, + "learning_rate": 7.76568554043725e-06, + "logits/chosen": 0.3837011754512787, + "logits/rejected": 0.3628317415714264, + "logps/chosen": -0.9774251580238342, + "logps/rejected": -1.3284695148468018, + "loss": 0.8892, + "odds_ratio_loss": 0.6802763938903809, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09774252027273178, + "rewards/margins": 0.03510444238781929, + "rewards/rejected": -0.13284695148468018, + "sft_loss": 0.9774251580238342, + "step": 543 + }, + { + "epoch": 0.7866955892986262, + "grad_norm": 3.694066400970215, + "learning_rate": 7.764637003271819e-06, + "logits/chosen": 0.42650485038757324, + "logits/rejected": 0.2961081564426422, + "logps/chosen": -0.9184818863868713, + "logps/rejected": -2.8604061603546143, + "loss": 0.8954, + "odds_ratio_loss": 0.5541467666625977, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09184818714857101, + "rewards/margins": 0.19419240951538086, + "rewards/rejected": -0.28604060411453247, + "sft_loss": 0.9184818863868713, + "step": 544 + }, + { + "epoch": 0.7881417208966016, + "grad_norm": 3.5112933617172732, + "learning_rate": 7.763586196392715e-06, + "logits/chosen": 0.33292311429977417, + "logits/rejected": 0.3208504915237427, + "logps/chosen": -0.9027593731880188, + "logps/rejected": -2.139615774154663, + "loss": 0.8603, + "odds_ratio_loss": 0.4521198570728302, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.09027593582868576, + "rewards/margins": 0.12368564307689667, + "rewards/rejected": -0.21396157145500183, + "sft_loss": 0.9027593731880188, + "step": 545 + }, + { + "epoch": 0.789587852494577, + "grad_norm": 2.9201291120405575, + "learning_rate": 7.762533120433478e-06, + "logits/chosen": 0.4251149594783783, + "logits/rejected": 0.2440737783908844, + "logps/chosen": -0.7995153665542603, + "logps/rejected": -2.243067979812622, + "loss": 0.8136, + "odds_ratio_loss": 0.564355731010437, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07995154708623886, + "rewards/margins": 0.14435526728630066, + "rewards/rejected": -0.22430679202079773, + "sft_loss": 0.7995153665542603, + "step": 546 + }, + { + "epoch": 0.7910339840925524, + "grad_norm": 2.6748457009798443, + "learning_rate": 7.761477776029008e-06, + "logits/chosen": 0.4318455457687378, + "logits/rejected": 0.2599557042121887, + "logps/chosen": -0.7891461253166199, + "logps/rejected": -2.2876057624816895, + "loss": 0.8256, + "odds_ratio_loss": 0.4860532283782959, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07891461253166199, + "rewards/margins": 0.14984598755836487, + "rewards/rejected": -0.22876060009002686, + "sft_loss": 0.7891461253166199, + "step": 547 + }, + { + "epoch": 0.7924801156905278, + "grad_norm": 5.4769117124092235, + "learning_rate": 7.76042016381558e-06, + "logits/chosen": 0.5130550265312195, + "logits/rejected": 0.34844133257865906, + "logps/chosen": -0.6933074593544006, + "logps/rejected": -2.7397730350494385, + "loss": 0.8176, + "odds_ratio_loss": 0.48113542795181274, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06933074444532394, + "rewards/margins": 0.20464655756950378, + "rewards/rejected": -0.2739773094654083, + "sft_loss": 0.6933074593544006, + "step": 548 + }, + { + "epoch": 0.7939262472885033, + "grad_norm": 3.0348990843968546, + "learning_rate": 7.759360284430827e-06, + "logits/chosen": 0.4405141770839691, + "logits/rejected": 0.265964537858963, + "logps/chosen": -0.8159855008125305, + "logps/rejected": -1.70670485496521, + "loss": 0.8676, + "odds_ratio_loss": 0.5362525582313538, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08159855753183365, + "rewards/margins": 0.08907192945480347, + "rewards/rejected": -0.1706704944372177, + "sft_loss": 0.8159855008125305, + "step": 549 + }, + { + "epoch": 0.7953723788864787, + "grad_norm": 4.29621985900849, + "learning_rate": 7.75829813851376e-06, + "logits/chosen": 0.352047860622406, + "logits/rejected": 0.25868427753448486, + "logps/chosen": -0.719826877117157, + "logps/rejected": -2.092642307281494, + "loss": 0.7661, + "odds_ratio_loss": 0.5927515625953674, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07198269665241241, + "rewards/margins": 0.13728153705596924, + "rewards/rejected": -0.20926421880722046, + "sft_loss": 0.719826877117157, + "step": 550 + }, + { + "epoch": 0.7968185104844541, + "grad_norm": 3.4005018103671176, + "learning_rate": 7.757233726704747e-06, + "logits/chosen": 0.24000917375087738, + "logits/rejected": 0.2559305429458618, + "logps/chosen": -0.9485635161399841, + "logps/rejected": -2.154191493988037, + "loss": 0.9052, + "odds_ratio_loss": 0.5316140055656433, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09485635161399841, + "rewards/margins": 0.12056280672550201, + "rewards/rejected": -0.21541914343833923, + "sft_loss": 0.9485635161399841, + "step": 551 + }, + { + "epoch": 0.7982646420824295, + "grad_norm": 3.0623594326374404, + "learning_rate": 7.756167049645526e-06, + "logits/chosen": 0.4697762131690979, + "logits/rejected": 0.27899396419525146, + "logps/chosen": -0.8783195614814758, + "logps/rejected": -2.365593910217285, + "loss": 0.8236, + "odds_ratio_loss": 0.6793208122253418, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08783195912837982, + "rewards/margins": 0.1487274318933487, + "rewards/rejected": -0.2365594208240509, + "sft_loss": 0.8783195614814758, + "step": 552 + }, + { + "epoch": 0.7997107736804049, + "grad_norm": 2.9804375568080834, + "learning_rate": 7.755098107979202e-06, + "logits/chosen": 0.4210900664329529, + "logits/rejected": 0.3507734537124634, + "logps/chosen": -0.7799772620201111, + "logps/rejected": -2.2229514122009277, + "loss": 0.8528, + "odds_ratio_loss": 0.5192586779594421, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07799772918224335, + "rewards/margins": 0.14429740607738495, + "rewards/rejected": -0.2222951352596283, + "sft_loss": 0.7799772620201111, + "step": 553 + }, + { + "epoch": 0.8011569052783803, + "grad_norm": 4.236419244741945, + "learning_rate": 7.754026902350242e-06, + "logits/chosen": 0.3866078853607178, + "logits/rejected": 0.3358733654022217, + "logps/chosen": -0.9486645460128784, + "logps/rejected": -1.8577195405960083, + "loss": 0.9705, + "odds_ratio_loss": 0.6812944412231445, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09486645460128784, + "rewards/margins": 0.09090550243854523, + "rewards/rejected": -0.18577195703983307, + "sft_loss": 0.9486645460128784, + "step": 554 + }, + { + "epoch": 0.8026030368763557, + "grad_norm": 5.48208070685878, + "learning_rate": 7.752953433404482e-06, + "logits/chosen": 0.749567449092865, + "logits/rejected": 0.7038176655769348, + "logps/chosen": -0.8758996725082397, + "logps/rejected": -1.5970778465270996, + "loss": 0.7799, + "odds_ratio_loss": 0.5583348274230957, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08758997172117233, + "rewards/margins": 0.07211781293153763, + "rewards/rejected": -0.15970778465270996, + "sft_loss": 0.8758996725082397, + "step": 555 + }, + { + "epoch": 0.8040491684743312, + "grad_norm": 3.2105285366510823, + "learning_rate": 7.75187770178912e-06, + "logits/chosen": 0.5437260866165161, + "logits/rejected": 0.4299187660217285, + "logps/chosen": -0.784862756729126, + "logps/rejected": -1.8833541870117188, + "loss": 0.8746, + "odds_ratio_loss": 0.4752628207206726, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07848627865314484, + "rewards/margins": 0.10984915494918823, + "rewards/rejected": -0.18833541870117188, + "sft_loss": 0.784862756729126, + "step": 556 + }, + { + "epoch": 0.8054953000723066, + "grad_norm": 3.2621428288627565, + "learning_rate": 7.750799708152716e-06, + "logits/chosen": 0.4966755211353302, + "logits/rejected": 0.35313740372657776, + "logps/chosen": -0.8801267147064209, + "logps/rejected": -2.198486566543579, + "loss": 0.9417, + "odds_ratio_loss": 0.601882815361023, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08801267296075821, + "rewards/margins": 0.13183598220348358, + "rewards/rejected": -0.2198486477136612, + "sft_loss": 0.8801267147064209, + "step": 557 + }, + { + "epoch": 0.806941431670282, + "grad_norm": 2.756186174552345, + "learning_rate": 7.749719453145202e-06, + "logits/chosen": 0.2815437316894531, + "logits/rejected": 0.2515462636947632, + "logps/chosen": -0.8514240980148315, + "logps/rejected": -2.015674352645874, + "loss": 0.8876, + "odds_ratio_loss": 0.6282274723052979, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08514241874217987, + "rewards/margins": 0.11642500758171082, + "rewards/rejected": -0.2015674114227295, + "sft_loss": 0.8514240980148315, + "step": 558 + }, + { + "epoch": 0.8083875632682574, + "grad_norm": 3.421431652763658, + "learning_rate": 7.748636937417862e-06, + "logits/chosen": 0.4633673429489136, + "logits/rejected": 0.3824692964553833, + "logps/chosen": -0.6419711112976074, + "logps/rejected": -2.9679622650146484, + "loss": 0.7585, + "odds_ratio_loss": 0.4057493805885315, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0641971081495285, + "rewards/margins": 0.23259912431240082, + "rewards/rejected": -0.2967962324619293, + "sft_loss": 0.6419711112976074, + "step": 559 + }, + { + "epoch": 0.8098336948662328, + "grad_norm": 53.9455390293421, + "learning_rate": 7.747552161623352e-06, + "logits/chosen": 0.4784644544124603, + "logits/rejected": 0.3171621263027191, + "logps/chosen": -1.4930461645126343, + "logps/rejected": -3.1993000507354736, + "loss": 1.1239, + "odds_ratio_loss": 0.9343154430389404, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.14930462837219238, + "rewards/margins": 0.17062537372112274, + "rewards/rejected": -0.3199300169944763, + "sft_loss": 1.4930461645126343, + "step": 560 + }, + { + "epoch": 0.8112798264642083, + "grad_norm": 5.943313559869521, + "learning_rate": 7.746465126415685e-06, + "logits/chosen": 0.43953937292099, + "logits/rejected": 0.32243290543556213, + "logps/chosen": -0.667884111404419, + "logps/rejected": -2.532602071762085, + "loss": 0.8931, + "odds_ratio_loss": 0.32886582612991333, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06678842008113861, + "rewards/margins": 0.18647179007530212, + "rewards/rejected": -0.25326019525527954, + "sft_loss": 0.667884111404419, + "step": 561 + }, + { + "epoch": 0.8127259580621836, + "grad_norm": 3.2132336268726296, + "learning_rate": 7.74537583245024e-06, + "logits/chosen": 0.5227065086364746, + "logits/rejected": 0.35094642639160156, + "logps/chosen": -0.7416081428527832, + "logps/rejected": -2.211590528488159, + "loss": 0.9376, + "odds_ratio_loss": 0.43778154253959656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07416081428527832, + "rewards/margins": 0.14699825644493103, + "rewards/rejected": -0.22115907073020935, + "sft_loss": 0.7416081428527832, + "step": 562 + }, + { + "epoch": 0.8141720896601591, + "grad_norm": 3.359980109883608, + "learning_rate": 7.744284280383758e-06, + "logits/chosen": 0.5760366916656494, + "logits/rejected": 0.35373595356941223, + "logps/chosen": -0.7241435050964355, + "logps/rejected": -2.0765225887298584, + "loss": 0.9294, + "odds_ratio_loss": 0.28304675221443176, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0724143534898758, + "rewards/margins": 0.1352379024028778, + "rewards/rejected": -0.2076522558927536, + "sft_loss": 0.7241435050964355, + "step": 563 + }, + { + "epoch": 0.8156182212581344, + "grad_norm": 3.028393325761886, + "learning_rate": 7.743190470874336e-06, + "logits/chosen": 0.5186077356338501, + "logits/rejected": 0.44535890221595764, + "logps/chosen": -0.9656530618667603, + "logps/rejected": -1.3157947063446045, + "loss": 0.8103, + "odds_ratio_loss": 0.697954535484314, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09656532108783722, + "rewards/margins": 0.035014159977436066, + "rewards/rejected": -0.1315794736146927, + "sft_loss": 0.9656530618667603, + "step": 564 + }, + { + "epoch": 0.8170643528561099, + "grad_norm": 2.5015055682103386, + "learning_rate": 7.74209440458144e-06, + "logits/chosen": 0.46131306886672974, + "logits/rejected": 0.43117648363113403, + "logps/chosen": -0.8131046891212463, + "logps/rejected": -0.8619694709777832, + "loss": 0.8913, + "odds_ratio_loss": 0.6825778484344482, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0813104659318924, + "rewards/margins": 0.004886474460363388, + "rewards/rejected": -0.08619694411754608, + "sft_loss": 0.8131046891212463, + "step": 565 + }, + { + "epoch": 0.8185104844540854, + "grad_norm": 2.552811543812281, + "learning_rate": 7.740996082165889e-06, + "logits/chosen": 0.39947623014450073, + "logits/rejected": 0.3178212642669678, + "logps/chosen": -0.8110004663467407, + "logps/rejected": -1.336201548576355, + "loss": 0.9081, + "odds_ratio_loss": 0.7006158232688904, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08110004663467407, + "rewards/margins": 0.052520107477903366, + "rewards/rejected": -0.13362015783786774, + "sft_loss": 0.8110004663467407, + "step": 566 + }, + { + "epoch": 0.8199566160520607, + "grad_norm": 2.337977459115273, + "learning_rate": 7.739895504289867e-06, + "logits/chosen": 0.3927333354949951, + "logits/rejected": 0.4031215310096741, + "logps/chosen": -0.6630347371101379, + "logps/rejected": -1.4980498552322388, + "loss": 0.8835, + "odds_ratio_loss": 0.5098182559013367, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06630347669124603, + "rewards/margins": 0.08350151777267456, + "rewards/rejected": -0.1498049944639206, + "sft_loss": 0.6630347371101379, + "step": 567 + }, + { + "epoch": 0.8214027476500362, + "grad_norm": 3.035229863767425, + "learning_rate": 7.738792671616918e-06, + "logits/chosen": 0.33209335803985596, + "logits/rejected": 0.2999739944934845, + "logps/chosen": -0.860519528388977, + "logps/rejected": -1.6074646711349487, + "loss": 0.8805, + "odds_ratio_loss": 0.6031328439712524, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08605195581912994, + "rewards/margins": 0.07469449192285538, + "rewards/rejected": -0.16074645519256592, + "sft_loss": 0.860519528388977, + "step": 568 + }, + { + "epoch": 0.8228488792480115, + "grad_norm": 6.249110734978357, + "learning_rate": 7.737687584811942e-06, + "logits/chosen": 0.3918280601501465, + "logits/rejected": 0.48867300152778625, + "logps/chosen": -0.9784340262413025, + "logps/rejected": -1.133239984512329, + "loss": 0.8658, + "odds_ratio_loss": 0.8345239758491516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09784340858459473, + "rewards/margins": 0.015480585396289825, + "rewards/rejected": -0.11332399398088455, + "sft_loss": 0.9784340262413025, + "step": 569 + }, + { + "epoch": 0.824295010845987, + "grad_norm": 2.959486238121644, + "learning_rate": 7.7365802445412e-06, + "logits/chosen": 0.4703490734100342, + "logits/rejected": 0.3433605134487152, + "logps/chosen": -0.683617115020752, + "logps/rejected": -1.978018879890442, + "loss": 0.7636, + "odds_ratio_loss": 0.4209952652454376, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06836170703172684, + "rewards/margins": 0.12944017350673676, + "rewards/rejected": -0.1978018879890442, + "sft_loss": 0.683617115020752, + "step": 570 + }, + { + "epoch": 0.8257411424439624, + "grad_norm": 4.554090104991005, + "learning_rate": 7.735470651472312e-06, + "logits/chosen": 0.5703073740005493, + "logits/rejected": 0.4268415570259094, + "logps/chosen": -0.6800241470336914, + "logps/rejected": -1.6585583686828613, + "loss": 0.7895, + "odds_ratio_loss": 0.45704489946365356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06800241768360138, + "rewards/margins": 0.09785342216491699, + "rewards/rejected": -0.16585583984851837, + "sft_loss": 0.6800241470336914, + "step": 571 + }, + { + "epoch": 0.8271872740419378, + "grad_norm": 2.822960883716033, + "learning_rate": 7.734358806274256e-06, + "logits/chosen": 0.4933173656463623, + "logits/rejected": 0.3999761939048767, + "logps/chosen": -0.7931399345397949, + "logps/rejected": -1.2513625621795654, + "loss": 0.7925, + "odds_ratio_loss": 0.5663318634033203, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07931399345397949, + "rewards/margins": 0.04582225903868675, + "rewards/rejected": -0.12513625621795654, + "sft_loss": 0.7931399345397949, + "step": 572 + }, + { + "epoch": 0.8286334056399133, + "grad_norm": 2.648985570783827, + "learning_rate": 7.733244709617369e-06, + "logits/chosen": 0.46373143792152405, + "logits/rejected": 0.29053449630737305, + "logps/chosen": -0.7167465090751648, + "logps/rejected": -1.855074405670166, + "loss": 0.8, + "odds_ratio_loss": 0.3622341454029083, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0716746523976326, + "rewards/margins": 0.11383280158042908, + "rewards/rejected": -0.18550744652748108, + "sft_loss": 0.7167465090751648, + "step": 573 + }, + { + "epoch": 0.8300795372378886, + "grad_norm": 3.7137584494805145, + "learning_rate": 7.73212836217334e-06, + "logits/chosen": 0.4620693624019623, + "logits/rejected": 0.4092079997062683, + "logps/chosen": -1.0050597190856934, + "logps/rejected": -1.536287784576416, + "loss": 0.8899, + "odds_ratio_loss": 0.47553932666778564, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.10050597786903381, + "rewards/margins": 0.05312279611825943, + "rewards/rejected": -0.15362878143787384, + "sft_loss": 1.0050597190856934, + "step": 574 + }, + { + "epoch": 0.8315256688358641, + "grad_norm": 2.8877404863277096, + "learning_rate": 7.731009764615223e-06, + "logits/chosen": 0.4316241443157196, + "logits/rejected": 0.4264254570007324, + "logps/chosen": -0.805022120475769, + "logps/rejected": -1.3257907629013062, + "loss": 0.8333, + "odds_ratio_loss": 0.5716394186019897, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0805022120475769, + "rewards/margins": 0.052076857537031174, + "rewards/rejected": -0.13257908821105957, + "sft_loss": 0.805022120475769, + "step": 575 + }, + { + "epoch": 0.8329718004338394, + "grad_norm": 6.1727650674380765, + "learning_rate": 7.729888917617423e-06, + "logits/chosen": 0.3446384370326996, + "logits/rejected": 0.3820667564868927, + "logps/chosen": -0.7798358201980591, + "logps/rejected": -1.7499778270721436, + "loss": 0.808, + "odds_ratio_loss": 0.4026681184768677, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07798358052968979, + "rewards/margins": 0.09701421111822128, + "rewards/rejected": -0.17499780654907227, + "sft_loss": 0.7798358201980591, + "step": 576 + }, + { + "epoch": 0.8344179320318149, + "grad_norm": 2.625297803365516, + "learning_rate": 7.728765821855703e-06, + "logits/chosen": 0.34016144275665283, + "logits/rejected": 0.2931269407272339, + "logps/chosen": -1.023898959159851, + "logps/rejected": -2.1351914405822754, + "loss": 0.9166, + "odds_ratio_loss": 0.6438050270080566, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10238990187644958, + "rewards/margins": 0.11112925410270691, + "rewards/rejected": -0.2135191559791565, + "sft_loss": 1.023898959159851, + "step": 577 + }, + { + "epoch": 0.8358640636297903, + "grad_norm": 2.5444970560001487, + "learning_rate": 7.72764047800718e-06, + "logits/chosen": 0.49933484196662903, + "logits/rejected": 0.42723822593688965, + "logps/chosen": -0.6545868515968323, + "logps/rejected": -1.498138666152954, + "loss": 0.8401, + "odds_ratio_loss": 0.4437343180179596, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06545868515968323, + "rewards/margins": 0.0843551903963089, + "rewards/rejected": -0.14981386065483093, + "sft_loss": 0.6545868515968323, + "step": 578 + }, + { + "epoch": 0.8373101952277657, + "grad_norm": 2.3585819967991197, + "learning_rate": 7.726512886750331e-06, + "logits/chosen": 0.34519678354263306, + "logits/rejected": 0.19996540248394012, + "logps/chosen": -0.9542055726051331, + "logps/rejected": -1.4532517194747925, + "loss": 0.8652, + "odds_ratio_loss": 0.5589162111282349, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09542055428028107, + "rewards/margins": 0.049904607236385345, + "rewards/rejected": -0.14532515406608582, + "sft_loss": 0.9542055726051331, + "step": 579 + }, + { + "epoch": 0.8387563268257412, + "grad_norm": 5.125090865717826, + "learning_rate": 7.725383048764985e-06, + "logits/chosen": 0.42127755284309387, + "logits/rejected": 0.2979139983654022, + "logps/chosen": -0.8492597341537476, + "logps/rejected": -1.7331501245498657, + "loss": 0.9187, + "odds_ratio_loss": 0.45528745651245117, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08492596447467804, + "rewards/margins": 0.08838904649019241, + "rewards/rejected": -0.17331500351428986, + "sft_loss": 0.8492597341537476, + "step": 580 + }, + { + "epoch": 0.8402024584237165, + "grad_norm": 3.7775925775826757, + "learning_rate": 7.724250964732322e-06, + "logits/chosen": 0.45021694898605347, + "logits/rejected": 0.275540828704834, + "logps/chosen": -0.9041640162467957, + "logps/rejected": -1.424367904663086, + "loss": 0.9197, + "odds_ratio_loss": 0.5092793107032776, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09041640907526016, + "rewards/margins": 0.05202038586139679, + "rewards/rejected": -0.14243678748607635, + "sft_loss": 0.9041640162467957, + "step": 581 + }, + { + "epoch": 0.841648590021692, + "grad_norm": 2.3382318025798066, + "learning_rate": 7.723116635334883e-06, + "logits/chosen": 0.414359986782074, + "logits/rejected": 0.33379101753234863, + "logps/chosen": -0.781288743019104, + "logps/rejected": -1.8517032861709595, + "loss": 0.7569, + "odds_ratio_loss": 0.47253549098968506, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0781288743019104, + "rewards/margins": 0.10704146325588226, + "rewards/rejected": -0.18517033755779266, + "sft_loss": 0.781288743019104, + "step": 582 + }, + { + "epoch": 0.8430947216196674, + "grad_norm": 3.016134749169285, + "learning_rate": 7.721980061256557e-06, + "logits/chosen": 0.5401621460914612, + "logits/rejected": 0.3053189218044281, + "logps/chosen": -0.893081545829773, + "logps/rejected": -1.5628845691680908, + "loss": 0.8887, + "odds_ratio_loss": 0.5231793522834778, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08930815756320953, + "rewards/margins": 0.06698029488325119, + "rewards/rejected": -0.15628844499588013, + "sft_loss": 0.893081545829773, + "step": 583 + }, + { + "epoch": 0.8445408532176428, + "grad_norm": 3.4336842069827886, + "learning_rate": 7.72084124318259e-06, + "logits/chosen": 0.487199068069458, + "logits/rejected": 0.4739711284637451, + "logps/chosen": -0.9182305932044983, + "logps/rejected": -1.005215048789978, + "loss": 0.8489, + "odds_ratio_loss": 0.7025805711746216, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09182305634021759, + "rewards/margins": 0.008698442950844765, + "rewards/rejected": -0.1005215048789978, + "sft_loss": 0.9182305932044983, + "step": 584 + }, + { + "epoch": 0.8459869848156182, + "grad_norm": 3.199872433035413, + "learning_rate": 7.719700181799581e-06, + "logits/chosen": 0.4485635757446289, + "logits/rejected": 0.4251922369003296, + "logps/chosen": -0.7368327379226685, + "logps/rejected": -1.369962453842163, + "loss": 0.808, + "odds_ratio_loss": 0.582382082939148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07368327677249908, + "rewards/margins": 0.06331297755241394, + "rewards/rejected": -0.13699625432491302, + "sft_loss": 0.7368327379226685, + "step": 585 + }, + { + "epoch": 0.8474331164135936, + "grad_norm": 2.7751053686439886, + "learning_rate": 7.718556877795479e-06, + "logits/chosen": 0.45472198724746704, + "logits/rejected": 0.47626519203186035, + "logps/chosen": -0.5254935622215271, + "logps/rejected": -1.7815715074539185, + "loss": 0.7916, + "odds_ratio_loss": 0.420468270778656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05254935473203659, + "rewards/margins": 0.12560781836509705, + "rewards/rejected": -0.17815715074539185, + "sft_loss": 0.5254935622215271, + "step": 586 + }, + { + "epoch": 0.848879248011569, + "grad_norm": 3.23820103346826, + "learning_rate": 7.717411331859584e-06, + "logits/chosen": 0.3615628182888031, + "logits/rejected": 0.36034783720970154, + "logps/chosen": -0.8735466003417969, + "logps/rejected": -1.287621021270752, + "loss": 0.9102, + "odds_ratio_loss": 0.5822567939758301, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08735466003417969, + "rewards/margins": 0.04140744358301163, + "rewards/rejected": -0.12876209616661072, + "sft_loss": 0.8735466003417969, + "step": 587 + }, + { + "epoch": 0.8503253796095445, + "grad_norm": 5.647839940161303, + "learning_rate": 7.716263544682553e-06, + "logits/chosen": 0.5941022634506226, + "logits/rejected": 0.5508843660354614, + "logps/chosen": -0.7583089470863342, + "logps/rejected": -1.2879139184951782, + "loss": 0.8767, + "odds_ratio_loss": 0.5073609352111816, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07583089917898178, + "rewards/margins": 0.05296049639582634, + "rewards/rejected": -0.12879139184951782, + "sft_loss": 0.7583089470863342, + "step": 588 + }, + { + "epoch": 0.8517715112075199, + "grad_norm": 3.480425009507815, + "learning_rate": 7.715113516956389e-06, + "logits/chosen": 0.36370134353637695, + "logits/rejected": 0.40123385190963745, + "logps/chosen": -0.7449005842208862, + "logps/rejected": -1.24686861038208, + "loss": 0.8231, + "odds_ratio_loss": 0.6178784966468811, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07449005544185638, + "rewards/margins": 0.050196800380945206, + "rewards/rejected": -0.12468685954809189, + "sft_loss": 0.7449005842208862, + "step": 589 + }, + { + "epoch": 0.8532176428054953, + "grad_norm": 2.6798844944883706, + "learning_rate": 7.71396124937445e-06, + "logits/chosen": 0.2954362630844116, + "logits/rejected": 0.346477210521698, + "logps/chosen": -0.6796218752861023, + "logps/rejected": -1.1093833446502686, + "loss": 0.8734, + "odds_ratio_loss": 0.5511021018028259, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06796219199895859, + "rewards/margins": 0.04297615587711334, + "rewards/rejected": -0.11093834042549133, + "sft_loss": 0.6796218752861023, + "step": 590 + }, + { + "epoch": 0.8546637744034707, + "grad_norm": 3.2479796489504165, + "learning_rate": 7.71280674263144e-06, + "logits/chosen": 0.4030838906764984, + "logits/rejected": 0.36736756563186646, + "logps/chosen": -0.7332617044448853, + "logps/rejected": -1.6842825412750244, + "loss": 0.8104, + "odds_ratio_loss": 0.40521979331970215, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07332617044448853, + "rewards/margins": 0.09510207921266556, + "rewards/rejected": -0.16842825710773468, + "sft_loss": 0.7332617044448853, + "step": 591 + }, + { + "epoch": 0.8561099060014461, + "grad_norm": 2.6431172294443743, + "learning_rate": 7.71164999742342e-06, + "logits/chosen": 0.5364017486572266, + "logits/rejected": 0.36750340461730957, + "logps/chosen": -0.9896173477172852, + "logps/rejected": -1.9290273189544678, + "loss": 0.8408, + "odds_ratio_loss": 0.6732795238494873, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0989617332816124, + "rewards/margins": 0.09394100308418274, + "rewards/rejected": -0.19290274381637573, + "sft_loss": 0.9896173477172852, + "step": 592 + }, + { + "epoch": 0.8575560375994216, + "grad_norm": 2.5251085232268564, + "learning_rate": 7.71049101444779e-06, + "logits/chosen": 0.4465160071849823, + "logits/rejected": 0.4120599627494812, + "logps/chosen": -0.5574182868003845, + "logps/rejected": -1.7155920267105103, + "loss": 0.7582, + "odds_ratio_loss": 0.405154824256897, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05574183166027069, + "rewards/margins": 0.1158173680305481, + "rewards/rejected": -0.17155921459197998, + "sft_loss": 0.5574182868003845, + "step": 593 + }, + { + "epoch": 0.8590021691973969, + "grad_norm": 3.346546938807114, + "learning_rate": 7.70932979440331e-06, + "logits/chosen": 0.33852913975715637, + "logits/rejected": 0.27749061584472656, + "logps/chosen": -0.8209311962127686, + "logps/rejected": -1.596746802330017, + "loss": 0.887, + "odds_ratio_loss": 0.4618699848651886, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08209311217069626, + "rewards/margins": 0.07758156955242157, + "rewards/rejected": -0.15967468917369843, + "sft_loss": 0.8209311962127686, + "step": 594 + }, + { + "epoch": 0.8604483007953724, + "grad_norm": 5.284660640530955, + "learning_rate": 7.708166337990082e-06, + "logits/chosen": 0.5029542446136475, + "logits/rejected": 0.47931772470474243, + "logps/chosen": -0.8861660957336426, + "logps/rejected": -1.6336941719055176, + "loss": 0.8685, + "odds_ratio_loss": 0.6479578018188477, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08861660957336426, + "rewards/margins": 0.0747528076171875, + "rewards/rejected": -0.16336941719055176, + "sft_loss": 0.8861660957336426, + "step": 595 + }, + { + "epoch": 0.8618944323933478, + "grad_norm": 2.6787514133901102, + "learning_rate": 7.707000645909557e-06, + "logits/chosen": 0.38561227917671204, + "logits/rejected": 0.34946468472480774, + "logps/chosen": -0.8105971813201904, + "logps/rejected": -1.6445955038070679, + "loss": 0.7593, + "odds_ratio_loss": 0.625914454460144, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08105972409248352, + "rewards/margins": 0.08339983969926834, + "rewards/rejected": -0.16445955634117126, + "sft_loss": 0.8105971813201904, + "step": 596 + }, + { + "epoch": 0.8633405639913232, + "grad_norm": 2.562166564261387, + "learning_rate": 7.705832718864537e-06, + "logits/chosen": 0.4447406232357025, + "logits/rejected": 0.40148672461509705, + "logps/chosen": -0.7819239497184753, + "logps/rejected": -1.901719093322754, + "loss": 0.9083, + "odds_ratio_loss": 0.48765087127685547, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07819239795207977, + "rewards/margins": 0.11197951436042786, + "rewards/rejected": -0.19017191231250763, + "sft_loss": 0.7819239497184753, + "step": 597 + }, + { + "epoch": 0.8647866955892987, + "grad_norm": 3.207797469050618, + "learning_rate": 7.704662557559167e-06, + "logits/chosen": 0.38521963357925415, + "logits/rejected": 0.4216020107269287, + "logps/chosen": -0.921389102935791, + "logps/rejected": -1.601888656616211, + "loss": 0.9603, + "odds_ratio_loss": 0.6697303056716919, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09213890880346298, + "rewards/margins": 0.06804996728897095, + "rewards/rejected": -0.16018888354301453, + "sft_loss": 0.921389102935791, + "step": 598 + }, + { + "epoch": 0.866232827187274, + "grad_norm": 4.035040836751041, + "learning_rate": 7.703490162698945e-06, + "logits/chosen": 0.45725634694099426, + "logits/rejected": 0.3862098455429077, + "logps/chosen": -0.7345454692840576, + "logps/rejected": -1.4923635721206665, + "loss": 0.7959, + "odds_ratio_loss": 0.40088510513305664, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07345455139875412, + "rewards/margins": 0.07578181475400925, + "rewards/rejected": -0.14923638105392456, + "sft_loss": 0.7345454692840576, + "step": 599 + }, + { + "epoch": 0.8676789587852495, + "grad_norm": 2.5500836204304873, + "learning_rate": 7.70231553499071e-06, + "logits/chosen": 0.5151770710945129, + "logits/rejected": 0.34262362122535706, + "logps/chosen": -0.6712307929992676, + "logps/rejected": -1.7133383750915527, + "loss": 0.816, + "odds_ratio_loss": 0.43570661544799805, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06712308526039124, + "rewards/margins": 0.104210764169693, + "rewards/rejected": -0.17133383452892303, + "sft_loss": 0.6712307929992676, + "step": 600 + }, + { + "epoch": 0.8691250903832248, + "grad_norm": 2.6121252911880544, + "learning_rate": 7.701138675142651e-06, + "logits/chosen": 0.4404212534427643, + "logits/rejected": 0.3398100435733795, + "logps/chosen": -0.7245041131973267, + "logps/rejected": -2.0399234294891357, + "loss": 0.8187, + "odds_ratio_loss": 0.4228803515434265, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0724504142999649, + "rewards/margins": 0.13154193758964539, + "rewards/rejected": -0.2039923518896103, + "sft_loss": 0.7245041131973267, + "step": 601 + }, + { + "epoch": 0.8705712219812003, + "grad_norm": 3.895120218207068, + "learning_rate": 7.6999595838643e-06, + "logits/chosen": 0.3892238438129425, + "logits/rejected": 0.3399989902973175, + "logps/chosen": -0.8683582544326782, + "logps/rejected": -1.7081042528152466, + "loss": 0.8405, + "odds_ratio_loss": 0.5292615294456482, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0868358165025711, + "rewards/margins": 0.08397459983825684, + "rewards/rejected": -0.17081041634082794, + "sft_loss": 0.8683582544326782, + "step": 602 + }, + { + "epoch": 0.8720173535791758, + "grad_norm": 4.916824856051119, + "learning_rate": 7.698778261866536e-06, + "logits/chosen": 0.4902600347995758, + "logits/rejected": 0.43887144327163696, + "logps/chosen": -0.7983700037002563, + "logps/rejected": -1.311253547668457, + "loss": 0.8184, + "odds_ratio_loss": 0.4534938633441925, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07983700931072235, + "rewards/margins": 0.05128835141658783, + "rewards/rejected": -0.13112536072731018, + "sft_loss": 0.7983700037002563, + "step": 603 + }, + { + "epoch": 0.8734634851771511, + "grad_norm": 2.5621387956986723, + "learning_rate": 7.697594709861582e-06, + "logits/chosen": 0.5107913017272949, + "logits/rejected": 0.45272332429885864, + "logps/chosen": -0.8786731958389282, + "logps/rejected": -1.683459758758545, + "loss": 0.8494, + "odds_ratio_loss": 0.521609902381897, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08786732703447342, + "rewards/margins": 0.08047864586114883, + "rewards/rejected": -0.16834597289562225, + "sft_loss": 0.8786731958389282, + "step": 604 + }, + { + "epoch": 0.8749096167751266, + "grad_norm": 4.078257021315965, + "learning_rate": 7.696408928563004e-06, + "logits/chosen": 0.43462055921554565, + "logits/rejected": 0.38017183542251587, + "logps/chosen": -0.9040985107421875, + "logps/rejected": -1.7478702068328857, + "loss": 0.8425, + "odds_ratio_loss": 0.543043315410614, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09040984511375427, + "rewards/margins": 0.08437717705965042, + "rewards/rejected": -0.1747870296239853, + "sft_loss": 0.9040985107421875, + "step": 605 + }, + { + "epoch": 0.8763557483731019, + "grad_norm": 3.110086029010687, + "learning_rate": 7.695220918685718e-06, + "logits/chosen": 0.40548551082611084, + "logits/rejected": 0.33251649141311646, + "logps/chosen": -0.7896912097930908, + "logps/rejected": -1.9131922721862793, + "loss": 0.8279, + "odds_ratio_loss": 0.4588484764099121, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07896912097930908, + "rewards/margins": 0.11235012114048004, + "rewards/rejected": -0.19131924211978912, + "sft_loss": 0.7896912097930908, + "step": 606 + }, + { + "epoch": 0.8778018799710774, + "grad_norm": 3.45857776190995, + "learning_rate": 7.694030680945978e-06, + "logits/chosen": 0.5536810159683228, + "logits/rejected": 0.4505231976509094, + "logps/chosen": -0.640487790107727, + "logps/rejected": -2.052018165588379, + "loss": 0.847, + "odds_ratio_loss": 0.43984222412109375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06404877454042435, + "rewards/margins": 0.14115305244922638, + "rewards/rejected": -0.20520181953907013, + "sft_loss": 0.640487790107727, + "step": 607 + }, + { + "epoch": 0.8792480115690527, + "grad_norm": 3.5262340037551354, + "learning_rate": 7.692838216061382e-06, + "logits/chosen": 0.3820614218711853, + "logits/rejected": 0.3396124839782715, + "logps/chosen": -0.8231310248374939, + "logps/rejected": -1.9909696578979492, + "loss": 0.9157, + "odds_ratio_loss": 0.4377727806568146, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08231310546398163, + "rewards/margins": 0.11678387224674225, + "rewards/rejected": -0.19909697771072388, + "sft_loss": 0.8231310248374939, + "step": 608 + }, + { + "epoch": 0.8806941431670282, + "grad_norm": 2.4054024668451857, + "learning_rate": 7.691643524750872e-06, + "logits/chosen": 0.6013118028640747, + "logits/rejected": 0.4211696982383728, + "logps/chosen": -0.6587211489677429, + "logps/rejected": -2.157447338104248, + "loss": 0.9399, + "odds_ratio_loss": 0.398333877325058, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06587211787700653, + "rewards/margins": 0.14987263083457947, + "rewards/rejected": -0.2157447338104248, + "sft_loss": 0.6587211489677429, + "step": 609 + }, + { + "epoch": 0.8821402747650036, + "grad_norm": 5.024109256266422, + "learning_rate": 7.690446607734731e-06, + "logits/chosen": 0.4865860939025879, + "logits/rejected": 0.2488732635974884, + "logps/chosen": -0.9007077813148499, + "logps/rejected": -2.9339916706085205, + "loss": 0.9398, + "odds_ratio_loss": 0.4676297605037689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09007078409194946, + "rewards/margins": 0.20332840085029602, + "rewards/rejected": -0.2933991849422455, + "sft_loss": 0.9007077813148499, + "step": 610 + }, + { + "epoch": 0.883586406362979, + "grad_norm": 2.4390306772314942, + "learning_rate": 7.689247465734587e-06, + "logits/chosen": 0.4354531168937683, + "logits/rejected": 0.34002625942230225, + "logps/chosen": -0.7873460054397583, + "logps/rejected": -2.1244518756866455, + "loss": 0.8282, + "odds_ratio_loss": 0.440873384475708, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07873459905385971, + "rewards/margins": 0.1337105929851532, + "rewards/rejected": -0.2124451845884323, + "sft_loss": 0.7873460054397583, + "step": 611 + }, + { + "epoch": 0.8850325379609545, + "grad_norm": 3.041808355646371, + "learning_rate": 7.688046099473404e-06, + "logits/chosen": 0.36165985465049744, + "logits/rejected": 0.34865519404411316, + "logps/chosen": -0.7287914752960205, + "logps/rejected": -2.1389732360839844, + "loss": 0.7459, + "odds_ratio_loss": 0.3866339921951294, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07287915050983429, + "rewards/margins": 0.14101818203926086, + "rewards/rejected": -0.21389730274677277, + "sft_loss": 0.7287914752960205, + "step": 612 + }, + { + "epoch": 0.8864786695589298, + "grad_norm": 2.8562172272392146, + "learning_rate": 7.686842509675493e-06, + "logits/chosen": 0.44806107878685, + "logits/rejected": 0.28580302000045776, + "logps/chosen": -0.6983122825622559, + "logps/rejected": -1.8353773355484009, + "loss": 0.7468, + "odds_ratio_loss": 0.3887363076210022, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0698312297463417, + "rewards/margins": 0.11370651423931122, + "rewards/rejected": -0.18353773653507233, + "sft_loss": 0.6983122825622559, + "step": 613 + }, + { + "epoch": 0.8879248011569053, + "grad_norm": 3.892747952798006, + "learning_rate": 7.6856366970665e-06, + "logits/chosen": 0.32401058077812195, + "logits/rejected": 0.2524191439151764, + "logps/chosen": -0.6660705804824829, + "logps/rejected": -2.403459072113037, + "loss": 0.7928, + "odds_ratio_loss": 0.375204473733902, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06660705804824829, + "rewards/margins": 0.17373886704444885, + "rewards/rejected": -0.24034592509269714, + "sft_loss": 0.6660705804824829, + "step": 614 + }, + { + "epoch": 0.8893709327548807, + "grad_norm": 3.7158835902808973, + "learning_rate": 7.68442866237342e-06, + "logits/chosen": 0.3761048913002014, + "logits/rejected": 0.2979498505592346, + "logps/chosen": -0.8295606374740601, + "logps/rejected": -1.7888013124465942, + "loss": 0.7842, + "odds_ratio_loss": 0.4067229628562927, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08295606076717377, + "rewards/margins": 0.09592406451702118, + "rewards/rejected": -0.17888012528419495, + "sft_loss": 0.8295606374740601, + "step": 615 + }, + { + "epoch": 0.8908170643528561, + "grad_norm": 2.4550407569809396, + "learning_rate": 7.683218406324572e-06, + "logits/chosen": 0.6669546365737915, + "logits/rejected": 0.4045952558517456, + "logps/chosen": -0.5585116744041443, + "logps/rejected": -2.035814046859741, + "loss": 0.7754, + "odds_ratio_loss": 0.39348843693733215, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05585116893053055, + "rewards/margins": 0.1477302461862564, + "rewards/rejected": -0.20358142256736755, + "sft_loss": 0.5585116744041443, + "step": 616 + }, + { + "epoch": 0.8922631959508315, + "grad_norm": 3.407732902695217, + "learning_rate": 7.682005929649631e-06, + "logits/chosen": 0.39748919010162354, + "logits/rejected": 0.37334704399108887, + "logps/chosen": -0.7694322466850281, + "logps/rejected": -1.4826719760894775, + "loss": 0.8091, + "odds_ratio_loss": 0.4687405228614807, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07694321870803833, + "rewards/margins": 0.07132397592067719, + "rewards/rejected": -0.1482672095298767, + "sft_loss": 0.7694322466850281, + "step": 617 + }, + { + "epoch": 0.8937093275488069, + "grad_norm": 2.5017227518592233, + "learning_rate": 7.680791233079603e-06, + "logits/chosen": 0.3456028997898102, + "logits/rejected": 0.3027806282043457, + "logps/chosen": -0.7319939136505127, + "logps/rejected": -1.6122633218765259, + "loss": 0.777, + "odds_ratio_loss": 0.4658835530281067, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07319939136505127, + "rewards/margins": 0.08802695572376251, + "rewards/rejected": -0.16122636198997498, + "sft_loss": 0.7319939136505127, + "step": 618 + }, + { + "epoch": 0.8951554591467824, + "grad_norm": 2.3166376522393146, + "learning_rate": 7.67957431734683e-06, + "logits/chosen": 0.3627810478210449, + "logits/rejected": 0.2669435441493988, + "logps/chosen": -0.9080690145492554, + "logps/rejected": -1.8668326139450073, + "loss": 0.8564, + "odds_ratio_loss": 0.5684492588043213, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09080690145492554, + "rewards/margins": 0.09587635844945908, + "rewards/rejected": -0.18668325245380402, + "sft_loss": 0.9080690145492554, + "step": 619 + }, + { + "epoch": 0.8966015907447578, + "grad_norm": 3.201147938968964, + "learning_rate": 7.678355183184998e-06, + "logits/chosen": 0.47244179248809814, + "logits/rejected": 0.3506423830986023, + "logps/chosen": -0.6035176515579224, + "logps/rejected": -2.130152702331543, + "loss": 0.8337, + "odds_ratio_loss": 0.399160772562027, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.060351770371198654, + "rewards/margins": 0.15266351401805878, + "rewards/rejected": -0.21301528811454773, + "sft_loss": 0.6035176515579224, + "step": 620 + }, + { + "epoch": 0.8980477223427332, + "grad_norm": 3.717093420401316, + "learning_rate": 7.677133831329126e-06, + "logits/chosen": 0.4847029447555542, + "logits/rejected": 0.40981170535087585, + "logps/chosen": -0.7285267114639282, + "logps/rejected": -2.408964157104492, + "loss": 0.7495, + "odds_ratio_loss": 0.38725215196609497, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07285267114639282, + "rewards/margins": 0.16804377734661102, + "rewards/rejected": -0.24089643359184265, + "sft_loss": 0.7285267114639282, + "step": 621 + }, + { + "epoch": 0.8994938539407086, + "grad_norm": 13.31403718261638, + "learning_rate": 7.675910262515571e-06, + "logits/chosen": 0.3689773678779602, + "logits/rejected": 0.31208884716033936, + "logps/chosen": -0.7826531529426575, + "logps/rejected": -1.995023488998413, + "loss": 0.8353, + "odds_ratio_loss": 0.44215285778045654, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07826531678438187, + "rewards/margins": 0.12123703211545944, + "rewards/rejected": -0.1995023488998413, + "sft_loss": 0.7826531529426575, + "step": 622 + }, + { + "epoch": 0.900939985538684, + "grad_norm": 2.5398626802834277, + "learning_rate": 7.67468447748203e-06, + "logits/chosen": 0.42934471368789673, + "logits/rejected": 0.21536873281002045, + "logps/chosen": -0.7805943489074707, + "logps/rejected": -2.9530436992645264, + "loss": 0.7891, + "odds_ratio_loss": 0.5255002379417419, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07805943489074707, + "rewards/margins": 0.217244952917099, + "rewards/rejected": -0.29530438780784607, + "sft_loss": 0.7805943489074707, + "step": 623 + }, + { + "epoch": 0.9023861171366594, + "grad_norm": 2.70020042469918, + "learning_rate": 7.67345647696753e-06, + "logits/chosen": 0.5212641954421997, + "logits/rejected": 0.42973682284355164, + "logps/chosen": -0.6765854954719543, + "logps/rejected": -1.6150991916656494, + "loss": 0.9148, + "odds_ratio_loss": 0.48807549476623535, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06765855103731155, + "rewards/margins": 0.09385137259960175, + "rewards/rejected": -0.1615099161863327, + "sft_loss": 0.6765854954719543, + "step": 624 + }, + { + "epoch": 0.9038322487346349, + "grad_norm": 4.310558816009849, + "learning_rate": 7.67222626171244e-06, + "logits/chosen": 0.33585911989212036, + "logits/rejected": 0.20145192742347717, + "logps/chosen": -0.8163321018218994, + "logps/rejected": -3.1550087928771973, + "loss": 0.9581, + "odds_ratio_loss": 0.38997989892959595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08163321018218994, + "rewards/margins": 0.23386766016483307, + "rewards/rejected": -0.3155008852481842, + "sft_loss": 0.8163321018218994, + "step": 625 + }, + { + "epoch": 0.9052783803326103, + "grad_norm": 3.448422343794376, + "learning_rate": 7.670993832458459e-06, + "logits/chosen": 0.42779773473739624, + "logits/rejected": 0.36584076285362244, + "logps/chosen": -0.6566274762153625, + "logps/rejected": -1.3442649841308594, + "loss": 0.833, + "odds_ratio_loss": 0.4265965521335602, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06566274911165237, + "rewards/margins": 0.06876374781131744, + "rewards/rejected": -0.13442650437355042, + "sft_loss": 0.6566274762153625, + "step": 626 + }, + { + "epoch": 0.9067245119305857, + "grad_norm": 2.9169308040479223, + "learning_rate": 7.669759189948624e-06, + "logits/chosen": 0.35135525465011597, + "logits/rejected": 0.2400461733341217, + "logps/chosen": -0.6346577405929565, + "logps/rejected": -1.875277042388916, + "loss": 0.7533, + "odds_ratio_loss": 0.3021318018436432, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06346577405929565, + "rewards/margins": 0.1240619346499443, + "rewards/rejected": -0.18752773106098175, + "sft_loss": 0.6346577405929565, + "step": 627 + }, + { + "epoch": 0.9081706435285611, + "grad_norm": 3.0982427667659076, + "learning_rate": 7.668522334927307e-06, + "logits/chosen": 0.30746403336524963, + "logits/rejected": 0.20801600813865662, + "logps/chosen": -0.8239056468009949, + "logps/rejected": -1.7219469547271729, + "loss": 0.8766, + "odds_ratio_loss": 0.6280619502067566, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08239056169986725, + "rewards/margins": 0.08980412036180496, + "rewards/rejected": -0.1721946895122528, + "sft_loss": 0.8239056468009949, + "step": 628 + }, + { + "epoch": 0.9096167751265365, + "grad_norm": 3.907298148969337, + "learning_rate": 7.667283268140211e-06, + "logits/chosen": 0.4386427402496338, + "logits/rejected": 0.31877654790878296, + "logps/chosen": -0.8378483653068542, + "logps/rejected": -1.461822509765625, + "loss": 0.9106, + "odds_ratio_loss": 0.6432319283485413, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08378484100103378, + "rewards/margins": 0.06239742785692215, + "rewards/rejected": -0.14618226885795593, + "sft_loss": 0.8378483653068542, + "step": 629 + }, + { + "epoch": 0.911062906724512, + "grad_norm": 2.7529705801934763, + "learning_rate": 7.666041990334374e-06, + "logits/chosen": 0.40260255336761475, + "logits/rejected": 0.30121004581451416, + "logps/chosen": -0.8997647166252136, + "logps/rejected": -1.8492169380187988, + "loss": 0.8114, + "odds_ratio_loss": 0.4565718173980713, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0899764746427536, + "rewards/margins": 0.09494520723819733, + "rewards/rejected": -0.18492168188095093, + "sft_loss": 0.8997647166252136, + "step": 630 + }, + { + "epoch": 0.9125090383224873, + "grad_norm": 3.2383772151478882, + "learning_rate": 7.664798502258167e-06, + "logits/chosen": 0.43885284662246704, + "logits/rejected": 0.4151266813278198, + "logps/chosen": -0.717272162437439, + "logps/rejected": -1.9099479913711548, + "loss": 0.8819, + "odds_ratio_loss": 0.4223041236400604, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0717272162437439, + "rewards/margins": 0.11926757544279099, + "rewards/rejected": -0.19099479913711548, + "sft_loss": 0.717272162437439, + "step": 631 + }, + { + "epoch": 0.9139551699204628, + "grad_norm": 2.8313172738379153, + "learning_rate": 7.663552804661292e-06, + "logits/chosen": 0.43072545528411865, + "logits/rejected": 0.2662442922592163, + "logps/chosen": -0.8863623738288879, + "logps/rejected": -2.1999828815460205, + "loss": 0.7894, + "odds_ratio_loss": 0.40025269985198975, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08863624185323715, + "rewards/margins": 0.13136205077171326, + "rewards/rejected": -0.21999827027320862, + "sft_loss": 0.8863623738288879, + "step": 632 + }, + { + "epoch": 0.9154013015184381, + "grad_norm": 3.2231547445658433, + "learning_rate": 7.662304898294789e-06, + "logits/chosen": 0.34558531641960144, + "logits/rejected": 0.25568896532058716, + "logps/chosen": -0.9652397632598877, + "logps/rejected": -2.154374599456787, + "loss": 0.8856, + "odds_ratio_loss": 0.6041095852851868, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09652397781610489, + "rewards/margins": 0.11891345679759979, + "rewards/rejected": -0.21543745696544647, + "sft_loss": 0.9652397632598877, + "step": 633 + }, + { + "epoch": 0.9168474331164136, + "grad_norm": 2.929910989196169, + "learning_rate": 7.661054783911023e-06, + "logits/chosen": 0.30516791343688965, + "logits/rejected": 0.3138343095779419, + "logps/chosen": -0.9684697389602661, + "logps/rejected": -2.2495481967926025, + "loss": 1.0268, + "odds_ratio_loss": 0.6787380576133728, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09684698283672333, + "rewards/margins": 0.12810784578323364, + "rewards/rejected": -0.22495481371879578, + "sft_loss": 0.9684697389602661, + "step": 634 + }, + { + "epoch": 0.918293564714389, + "grad_norm": 3.5685552202873656, + "learning_rate": 7.65980246226369e-06, + "logits/chosen": 0.5167609453201294, + "logits/rejected": 0.534476101398468, + "logps/chosen": -0.8154505491256714, + "logps/rejected": -1.6477975845336914, + "loss": 0.8668, + "odds_ratio_loss": 0.6014808416366577, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08154505491256714, + "rewards/margins": 0.08323470503091812, + "rewards/rejected": -0.16477975249290466, + "sft_loss": 0.8154505491256714, + "step": 635 + }, + { + "epoch": 0.9197396963123644, + "grad_norm": 3.791415677312705, + "learning_rate": 7.658547934107826e-06, + "logits/chosen": 0.38681352138519287, + "logits/rejected": 0.3783404529094696, + "logps/chosen": -0.852817714214325, + "logps/rejected": -1.8300766944885254, + "loss": 0.8929, + "odds_ratio_loss": 0.5779595375061035, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08528177440166473, + "rewards/margins": 0.09772589802742004, + "rewards/rejected": -0.18300765752792358, + "sft_loss": 0.852817714214325, + "step": 636 + }, + { + "epoch": 0.9211858279103399, + "grad_norm": 3.3465232296765994, + "learning_rate": 7.657291200199784e-06, + "logits/chosen": 0.290289044380188, + "logits/rejected": 0.3608376979827881, + "logps/chosen": -0.7669796943664551, + "logps/rejected": -2.2326064109802246, + "loss": 0.8354, + "odds_ratio_loss": 0.5988900661468506, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07669797539710999, + "rewards/margins": 0.14656266570091248, + "rewards/rejected": -0.22326062619686127, + "sft_loss": 0.7669796943664551, + "step": 637 + }, + { + "epoch": 0.9226319595083152, + "grad_norm": 4.414553916663672, + "learning_rate": 7.656032261297255e-06, + "logits/chosen": 0.26446521282196045, + "logits/rejected": 0.16627632081508636, + "logps/chosen": -0.7560668587684631, + "logps/rejected": -2.14742374420166, + "loss": 0.8415, + "odds_ratio_loss": 0.4151613414287567, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07560668885707855, + "rewards/margins": 0.1391356885433197, + "rewards/rejected": -0.21474237740039825, + "sft_loss": 0.7560668587684631, + "step": 638 + }, + { + "epoch": 0.9240780911062907, + "grad_norm": 2.585907860504243, + "learning_rate": 7.654771118159262e-06, + "logits/chosen": 0.3260245621204376, + "logits/rejected": 0.29659798741340637, + "logps/chosen": -0.7077478766441345, + "logps/rejected": -1.617417573928833, + "loss": 0.7967, + "odds_ratio_loss": 0.41377967596054077, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07077478617429733, + "rewards/margins": 0.09096695482730865, + "rewards/rejected": -0.16174176335334778, + "sft_loss": 0.7077478766441345, + "step": 639 + }, + { + "epoch": 0.925524222704266, + "grad_norm": 2.6907621885171924, + "learning_rate": 7.653507771546148e-06, + "logits/chosen": 0.4016415476799011, + "logits/rejected": 0.19014433026313782, + "logps/chosen": -0.6699740886688232, + "logps/rejected": -2.6581130027770996, + "loss": 0.8692, + "odds_ratio_loss": 0.3492831885814667, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06699740886688232, + "rewards/margins": 0.19881388545036316, + "rewards/rejected": -0.2658112943172455, + "sft_loss": 0.6699740886688232, + "step": 640 + }, + { + "epoch": 0.9269703543022415, + "grad_norm": 4.452046547002235, + "learning_rate": 7.652242222219593e-06, + "logits/chosen": 0.297119677066803, + "logits/rejected": 0.26825442910194397, + "logps/chosen": -0.8774327039718628, + "logps/rejected": -2.2502782344818115, + "loss": 0.9753, + "odds_ratio_loss": 0.49745067954063416, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08774326741695404, + "rewards/margins": 0.1372845470905304, + "rewards/rejected": -0.22502781450748444, + "sft_loss": 0.8774327039718628, + "step": 641 + }, + { + "epoch": 0.928416485900217, + "grad_norm": 3.1251697468215127, + "learning_rate": 7.650974470942598e-06, + "logits/chosen": 0.4491688013076782, + "logits/rejected": 0.379191517829895, + "logps/chosen": -0.9273581504821777, + "logps/rejected": -1.4209566116333008, + "loss": 0.8902, + "odds_ratio_loss": 0.6889926195144653, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09273581951856613, + "rewards/margins": 0.049359846860170364, + "rewards/rejected": -0.1420956701040268, + "sft_loss": 0.9273581504821777, + "step": 642 + }, + { + "epoch": 0.9298626174981923, + "grad_norm": 3.00570853026626, + "learning_rate": 7.649704518479497e-06, + "logits/chosen": 0.3809622526168823, + "logits/rejected": 0.3525671660900116, + "logps/chosen": -0.8433955907821655, + "logps/rejected": -1.3788678646087646, + "loss": 0.8706, + "odds_ratio_loss": 0.634644627571106, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08433955907821655, + "rewards/margins": 0.05354723334312439, + "rewards/rejected": -0.13788677752017975, + "sft_loss": 0.8433955907821655, + "step": 643 + }, + { + "epoch": 0.9313087490961678, + "grad_norm": 2.7897702867657324, + "learning_rate": 7.648432365595951e-06, + "logits/chosen": 0.3524807393550873, + "logits/rejected": 0.34861454367637634, + "logps/chosen": -0.9019001722335815, + "logps/rejected": -1.3493397235870361, + "loss": 0.7982, + "odds_ratio_loss": 0.5859642624855042, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09019001573324203, + "rewards/margins": 0.044743962585926056, + "rewards/rejected": -0.1349339783191681, + "sft_loss": 0.9019001722335815, + "step": 644 + }, + { + "epoch": 0.9327548806941431, + "grad_norm": 3.4031829380173746, + "learning_rate": 7.647158013058943e-06, + "logits/chosen": 0.4466613531112671, + "logits/rejected": 0.3243465721607208, + "logps/chosen": -0.8417915105819702, + "logps/rejected": -2.339245319366455, + "loss": 0.8632, + "odds_ratio_loss": 0.5358899235725403, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08417915552854538, + "rewards/margins": 0.1497454047203064, + "rewards/rejected": -0.23392455279827118, + "sft_loss": 0.8417915105819702, + "step": 645 + }, + { + "epoch": 0.9342010122921186, + "grad_norm": 3.1004064500515955, + "learning_rate": 7.645881461636784e-06, + "logits/chosen": 0.44667521119117737, + "logits/rejected": 0.2823071777820587, + "logps/chosen": -0.6927672028541565, + "logps/rejected": -2.4781577587127686, + "loss": 0.7595, + "odds_ratio_loss": 0.380462646484375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06927672028541565, + "rewards/margins": 0.17853903770446777, + "rewards/rejected": -0.24781575798988342, + "sft_loss": 0.6927672028541565, + "step": 646 + }, + { + "epoch": 0.935647143890094, + "grad_norm": 2.6234045812685682, + "learning_rate": 7.644602712099113e-06, + "logits/chosen": 0.5283761024475098, + "logits/rejected": 0.3219844102859497, + "logps/chosen": -0.6988430023193359, + "logps/rejected": -1.651008129119873, + "loss": 0.7322, + "odds_ratio_loss": 0.4112403988838196, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0698843002319336, + "rewards/margins": 0.09521650522947311, + "rewards/rejected": -0.1651008129119873, + "sft_loss": 0.6988430023193359, + "step": 647 + }, + { + "epoch": 0.9370932754880694, + "grad_norm": 5.239045092918419, + "learning_rate": 7.643321765216894e-06, + "logits/chosen": 0.32309383153915405, + "logits/rejected": 0.30599987506866455, + "logps/chosen": -0.9356340765953064, + "logps/rejected": -1.8596378564834595, + "loss": 0.9289, + "odds_ratio_loss": 0.6126976013183594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09356341511011124, + "rewards/margins": 0.09240037202835083, + "rewards/rejected": -0.18596379458904266, + "sft_loss": 0.9356340765953064, + "step": 648 + }, + { + "epoch": 0.9385394070860448, + "grad_norm": 2.4529826709658495, + "learning_rate": 7.642038621762414e-06, + "logits/chosen": 0.5031927227973938, + "logits/rejected": 0.3548673093318939, + "logps/chosen": -0.6429674625396729, + "logps/rejected": -1.8498015403747559, + "loss": 0.8475, + "odds_ratio_loss": 0.46681496500968933, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06429674476385117, + "rewards/margins": 0.12068342417478561, + "rewards/rejected": -0.18498015403747559, + "sft_loss": 0.6429674625396729, + "step": 649 + }, + { + "epoch": 0.9399855386840202, + "grad_norm": 4.412976604978248, + "learning_rate": 7.640753282509284e-06, + "logits/chosen": 0.2757370173931122, + "logits/rejected": 0.2888236343860626, + "logps/chosen": -0.78496915102005, + "logps/rejected": -1.9232274293899536, + "loss": 0.859, + "odds_ratio_loss": 0.4362175464630127, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07849692553281784, + "rewards/margins": 0.11382582783699036, + "rewards/rejected": -0.1923227608203888, + "sft_loss": 0.78496915102005, + "step": 650 + }, + { + "epoch": 0.9414316702819957, + "grad_norm": 3.1829134737847937, + "learning_rate": 7.639465748232439e-06, + "logits/chosen": 0.3406945466995239, + "logits/rejected": 0.34385547041893005, + "logps/chosen": -0.8253358602523804, + "logps/rejected": -1.9728519916534424, + "loss": 0.8718, + "odds_ratio_loss": 0.5355536937713623, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0825335830450058, + "rewards/margins": 0.11475162208080292, + "rewards/rejected": -0.1972852200269699, + "sft_loss": 0.8253358602523804, + "step": 651 + }, + { + "epoch": 0.9428778018799711, + "grad_norm": 4.11945466113238, + "learning_rate": 7.638176019708141e-06, + "logits/chosen": 0.45153433084487915, + "logits/rejected": 0.3097097873687744, + "logps/chosen": -0.7182214260101318, + "logps/rejected": -2.8085572719573975, + "loss": 0.8844, + "odds_ratio_loss": 0.3755723834037781, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0718221515417099, + "rewards/margins": 0.20903359353542328, + "rewards/rejected": -0.2808557152748108, + "sft_loss": 0.7182214260101318, + "step": 652 + }, + { + "epoch": 0.9443239334779465, + "grad_norm": 4.169907813750463, + "learning_rate": 7.63688409771397e-06, + "logits/chosen": 0.3641658425331116, + "logits/rejected": 0.16773995757102966, + "logps/chosen": -0.7689434885978699, + "logps/rejected": -2.3531601428985596, + "loss": 0.9164, + "odds_ratio_loss": 0.4541126489639282, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0768943503499031, + "rewards/margins": 0.15842165052890778, + "rewards/rejected": -0.23531600832939148, + "sft_loss": 0.7689434885978699, + "step": 653 + }, + { + "epoch": 0.9457700650759219, + "grad_norm": 2.8738626095421647, + "learning_rate": 7.635589983028832e-06, + "logits/chosen": 0.37339627742767334, + "logits/rejected": 0.29968613386154175, + "logps/chosen": -0.7189725637435913, + "logps/rejected": -2.247807025909424, + "loss": 0.8222, + "odds_ratio_loss": 0.42104122042655945, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07189725339412689, + "rewards/margins": 0.15288342535495758, + "rewards/rejected": -0.22478069365024567, + "sft_loss": 0.7189725637435913, + "step": 654 + }, + { + "epoch": 0.9472161966738973, + "grad_norm": 2.3715953476245004, + "learning_rate": 7.634293676432953e-06, + "logits/chosen": 0.44171804189682007, + "logits/rejected": 0.37442782521247864, + "logps/chosen": -0.6790784597396851, + "logps/rejected": -1.6948163509368896, + "loss": 0.8714, + "odds_ratio_loss": 0.40542173385620117, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06790784746408463, + "rewards/margins": 0.10157379508018494, + "rewards/rejected": -0.16948163509368896, + "sft_loss": 0.6790784597396851, + "step": 655 + }, + { + "epoch": 0.9486623282718727, + "grad_norm": 2.8909595515998308, + "learning_rate": 7.63299517870788e-06, + "logits/chosen": 0.27957409620285034, + "logits/rejected": 0.2359994798898697, + "logps/chosen": -0.8018577098846436, + "logps/rejected": -1.7241246700286865, + "loss": 0.8156, + "odds_ratio_loss": 0.5142108201980591, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08018577098846436, + "rewards/margins": 0.09222669899463654, + "rewards/rejected": -0.1724124699831009, + "sft_loss": 0.8018577098846436, + "step": 656 + }, + { + "epoch": 0.9501084598698482, + "grad_norm": 3.057708923025605, + "learning_rate": 7.631694490636483e-06, + "logits/chosen": 0.5239850282669067, + "logits/rejected": 0.3646376132965088, + "logps/chosen": -0.6075431108474731, + "logps/rejected": -3.623016357421875, + "loss": 0.839, + "odds_ratio_loss": 0.41288918256759644, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06075431406497955, + "rewards/margins": 0.3015473484992981, + "rewards/rejected": -0.36230167746543884, + "sft_loss": 0.6075431108474731, + "step": 657 + }, + { + "epoch": 0.9515545914678236, + "grad_norm": 3.5053796147588305, + "learning_rate": 7.630391613002953e-06, + "logits/chosen": 0.4126778244972229, + "logits/rejected": 0.3100075125694275, + "logps/chosen": -0.7381374835968018, + "logps/rejected": -3.2679390907287598, + "loss": 0.8151, + "odds_ratio_loss": 0.44963547587394714, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07381375133991241, + "rewards/margins": 0.25298017263412476, + "rewards/rejected": -0.326793909072876, + "sft_loss": 0.7381374835968018, + "step": 658 + }, + { + "epoch": 0.953000723065799, + "grad_norm": 3.229921172064812, + "learning_rate": 7.629086546592797e-06, + "logits/chosen": 0.39081621170043945, + "logits/rejected": 0.26037153601646423, + "logps/chosen": -0.8727468252182007, + "logps/rejected": -1.5484097003936768, + "loss": 0.7941, + "odds_ratio_loss": 0.5154801607131958, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08727468550205231, + "rewards/margins": 0.06756629049777985, + "rewards/rejected": -0.15484097599983215, + "sft_loss": 0.8727468252182007, + "step": 659 + }, + { + "epoch": 0.9544468546637744, + "grad_norm": 2.933170422294456, + "learning_rate": 7.6277792921928464e-06, + "logits/chosen": 0.2233104109764099, + "logits/rejected": 0.35420551896095276, + "logps/chosen": -0.8203558921813965, + "logps/rejected": -1.4535131454467773, + "loss": 0.8687, + "odds_ratio_loss": 0.6188152432441711, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08203558623790741, + "rewards/margins": 0.0633157268166542, + "rewards/rejected": -0.1453513205051422, + "sft_loss": 0.8203558921813965, + "step": 660 + }, + { + "epoch": 0.9558929862617498, + "grad_norm": 2.5813611799218976, + "learning_rate": 7.6264698505912504e-06, + "logits/chosen": 0.24971553683280945, + "logits/rejected": 0.24186164140701294, + "logps/chosen": -0.7713397741317749, + "logps/rejected": -1.9475972652435303, + "loss": 0.7881, + "odds_ratio_loss": 0.46839070320129395, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07713396847248077, + "rewards/margins": 0.11762575060129166, + "rewards/rejected": -0.19475972652435303, + "sft_loss": 0.7713397741317749, + "step": 661 + }, + { + "epoch": 0.9573391178597253, + "grad_norm": 3.2723557761645616, + "learning_rate": 7.625158222577474e-06, + "logits/chosen": 0.3482891023159027, + "logits/rejected": 0.2935275435447693, + "logps/chosen": -0.7917287349700928, + "logps/rejected": -1.6271872520446777, + "loss": 0.8783, + "odds_ratio_loss": 0.6316386461257935, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07917286455631256, + "rewards/margins": 0.08354586362838745, + "rewards/rejected": -0.1627187281847, + "sft_loss": 0.7917287349700928, + "step": 662 + }, + { + "epoch": 0.9587852494577006, + "grad_norm": 3.1102507204547845, + "learning_rate": 7.623844408942304e-06, + "logits/chosen": 0.377737820148468, + "logits/rejected": 0.34229159355163574, + "logps/chosen": -0.7180280685424805, + "logps/rejected": -1.8476381301879883, + "loss": 0.8331, + "odds_ratio_loss": 0.4935680627822876, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07180280983448029, + "rewards/margins": 0.11296100914478302, + "rewards/rejected": -0.1847638040781021, + "sft_loss": 0.7180280685424805, + "step": 663 + }, + { + "epoch": 0.9602313810556761, + "grad_norm": 3.151008524024375, + "learning_rate": 7.622528410477842e-06, + "logits/chosen": 0.3907800018787384, + "logits/rejected": 0.3244548738002777, + "logps/chosen": -0.7783466577529907, + "logps/rejected": -1.9935373067855835, + "loss": 0.8665, + "odds_ratio_loss": 0.5557862520217896, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07783466577529907, + "rewards/margins": 0.1215190589427948, + "rewards/rejected": -0.19935372471809387, + "sft_loss": 0.7783466577529907, + "step": 664 + }, + { + "epoch": 0.9616775126536515, + "grad_norm": 4.387593299955748, + "learning_rate": 7.6212102279775115e-06, + "logits/chosen": 0.3593447208404541, + "logits/rejected": 0.3258175849914551, + "logps/chosen": -0.7193015813827515, + "logps/rejected": -1.7327250242233276, + "loss": 0.8301, + "odds_ratio_loss": 0.46600061655044556, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0719301626086235, + "rewards/margins": 0.1013423502445221, + "rewards/rejected": -0.1732725203037262, + "sft_loss": 0.7193015813827515, + "step": 665 + }, + { + "epoch": 0.9631236442516269, + "grad_norm": 2.390706539517393, + "learning_rate": 7.6198898622360464e-06, + "logits/chosen": 0.34965071082115173, + "logits/rejected": 0.31699496507644653, + "logps/chosen": -0.9958378672599792, + "logps/rejected": -1.3755648136138916, + "loss": 0.9733, + "odds_ratio_loss": 0.7043642997741699, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09958378970623016, + "rewards/margins": 0.03797269985079765, + "rewards/rejected": -0.13755649328231812, + "sft_loss": 0.9958378672599792, + "step": 666 + }, + { + "epoch": 0.9645697758496024, + "grad_norm": 3.905469796567993, + "learning_rate": 7.6185673140495015e-06, + "logits/chosen": 0.3465041518211365, + "logits/rejected": 0.31788861751556396, + "logps/chosen": -0.8372557163238525, + "logps/rejected": -1.2869638204574585, + "loss": 0.8411, + "odds_ratio_loss": 0.5828957557678223, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08372557163238525, + "rewards/margins": 0.04497080296278, + "rewards/rejected": -0.12869638204574585, + "sft_loss": 0.8372557163238525, + "step": 667 + }, + { + "epoch": 0.9660159074475777, + "grad_norm": 2.875529991953273, + "learning_rate": 7.617242584215246e-06, + "logits/chosen": 0.4149461090564728, + "logits/rejected": 0.2595219612121582, + "logps/chosen": -0.6817153692245483, + "logps/rejected": -2.7561120986938477, + "loss": 0.7481, + "odds_ratio_loss": 0.37974974513053894, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06817153841257095, + "rewards/margins": 0.20743967592716217, + "rewards/rejected": -0.2756112217903137, + "sft_loss": 0.6817153692245483, + "step": 668 + }, + { + "epoch": 0.9674620390455532, + "grad_norm": 2.3328224837698883, + "learning_rate": 7.615915673531965e-06, + "logits/chosen": 0.3929983973503113, + "logits/rejected": 0.300082802772522, + "logps/chosen": -0.7355288863182068, + "logps/rejected": -2.5262303352355957, + "loss": 0.9321, + "odds_ratio_loss": 0.49977803230285645, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07355289161205292, + "rewards/margins": 0.1790701150894165, + "rewards/rejected": -0.2526229918003082, + "sft_loss": 0.7355288863182068, + "step": 669 + }, + { + "epoch": 0.9689081706435285, + "grad_norm": 2.540065505875259, + "learning_rate": 7.614586582799658e-06, + "logits/chosen": 0.4453916847705841, + "logits/rejected": 0.40558886528015137, + "logps/chosen": -0.7393182516098022, + "logps/rejected": -1.651641607284546, + "loss": 0.8151, + "odds_ratio_loss": 0.47889044880867004, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07393182814121246, + "rewards/margins": 0.09123234450817108, + "rewards/rejected": -0.16516415774822235, + "sft_loss": 0.7393182516098022, + "step": 670 + }, + { + "epoch": 0.970354302241504, + "grad_norm": 2.219181410621246, + "learning_rate": 7.6132553128196375e-06, + "logits/chosen": 0.4777889847755432, + "logits/rejected": 0.3341582417488098, + "logps/chosen": -0.6518281698226929, + "logps/rejected": -1.7046725749969482, + "loss": 0.7995, + "odds_ratio_loss": 0.5171917676925659, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06518281996250153, + "rewards/margins": 0.1052844300866127, + "rewards/rejected": -0.17046725749969482, + "sft_loss": 0.6518281698226929, + "step": 671 + }, + { + "epoch": 0.9718004338394793, + "grad_norm": 2.6826612009314426, + "learning_rate": 7.6119218643945315e-06, + "logits/chosen": 0.3291289806365967, + "logits/rejected": 0.32001611590385437, + "logps/chosen": -0.9242229461669922, + "logps/rejected": -1.5439229011535645, + "loss": 0.8439, + "odds_ratio_loss": 0.7239330410957336, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09242229908704758, + "rewards/margins": 0.06196998804807663, + "rewards/rejected": -0.1543922871351242, + "sft_loss": 0.9242229461669922, + "step": 672 + }, + { + "epoch": 0.9732465654374548, + "grad_norm": 2.554322117110017, + "learning_rate": 7.610586238328281e-06, + "logits/chosen": 0.4449082314968109, + "logits/rejected": 0.3627157211303711, + "logps/chosen": -0.769290030002594, + "logps/rejected": -1.377912998199463, + "loss": 0.8809, + "odds_ratio_loss": 0.5437467098236084, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0769289955496788, + "rewards/margins": 0.06086229905486107, + "rewards/rejected": -0.13779129087924957, + "sft_loss": 0.769290030002594, + "step": 673 + }, + { + "epoch": 0.9746926970354303, + "grad_norm": 4.82523082470742, + "learning_rate": 7.60924843542614e-06, + "logits/chosen": 0.3904574513435364, + "logits/rejected": 0.2845170497894287, + "logps/chosen": -0.8830059766769409, + "logps/rejected": -3.303161144256592, + "loss": 0.9391, + "odds_ratio_loss": 0.39237675070762634, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08830060064792633, + "rewards/margins": 0.24201549589633942, + "rewards/rejected": -0.33031609654426575, + "sft_loss": 0.8830059766769409, + "step": 674 + }, + { + "epoch": 0.9761388286334056, + "grad_norm": 2.6405629501899375, + "learning_rate": 7.607908456494675e-06, + "logits/chosen": 0.5365076661109924, + "logits/rejected": 0.4125050902366638, + "logps/chosen": -0.6532673835754395, + "logps/rejected": -1.4482712745666504, + "loss": 0.8422, + "odds_ratio_loss": 0.4444807171821594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0653267428278923, + "rewards/margins": 0.07950039207935333, + "rewards/rejected": -0.14482714235782623, + "sft_loss": 0.6532673835754395, + "step": 675 + }, + { + "epoch": 0.9775849602313811, + "grad_norm": 3.3513421127086636, + "learning_rate": 7.606566302341764e-06, + "logits/chosen": 0.46091920137405396, + "logits/rejected": 0.25358325242996216, + "logps/chosen": -0.8063492178916931, + "logps/rejected": -2.6329450607299805, + "loss": 0.8453, + "odds_ratio_loss": 0.4977782964706421, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08063492923974991, + "rewards/margins": 0.1826595962047577, + "rewards/rejected": -0.263294517993927, + "sft_loss": 0.8063492178916931, + "step": 676 + }, + { + "epoch": 0.9790310918293564, + "grad_norm": 2.5294773650031717, + "learning_rate": 7.6052219737765975e-06, + "logits/chosen": 0.478007435798645, + "logits/rejected": 0.38246941566467285, + "logps/chosen": -0.8104468584060669, + "logps/rejected": -1.7242964506149292, + "loss": 0.8154, + "odds_ratio_loss": 0.520146369934082, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08104468137025833, + "rewards/margins": 0.09138496220111847, + "rewards/rejected": -0.1724296510219574, + "sft_loss": 0.8104468584060669, + "step": 677 + }, + { + "epoch": 0.9804772234273319, + "grad_norm": 3.661969667095535, + "learning_rate": 7.6038754716096755e-06, + "logits/chosen": 0.30862492322921753, + "logits/rejected": 0.27289360761642456, + "logps/chosen": -0.7575228810310364, + "logps/rejected": -1.1974655389785767, + "loss": 0.8244, + "odds_ratio_loss": 0.545995831489563, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07575228810310364, + "rewards/margins": 0.04399425908923149, + "rewards/rejected": -0.11974655091762543, + "sft_loss": 0.7575228810310364, + "step": 678 + }, + { + "epoch": 0.9819233550253073, + "grad_norm": 2.836339221066449, + "learning_rate": 7.60252679665281e-06, + "logits/chosen": 0.35597071051597595, + "logits/rejected": 0.3139832615852356, + "logps/chosen": -0.7728620171546936, + "logps/rejected": -1.354665994644165, + "loss": 0.7193, + "odds_ratio_loss": 0.5544801950454712, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07728619873523712, + "rewards/margins": 0.05818040296435356, + "rewards/rejected": -0.13546660542488098, + "sft_loss": 0.7728620171546936, + "step": 679 + }, + { + "epoch": 0.9833694866232827, + "grad_norm": 2.2847928594793943, + "learning_rate": 7.601175949719122e-06, + "logits/chosen": 0.3785718083381653, + "logits/rejected": 0.25374460220336914, + "logps/chosen": -0.6391298174858093, + "logps/rejected": -2.5889670848846436, + "loss": 0.7757, + "odds_ratio_loss": 0.39246100187301636, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06391298770904541, + "rewards/margins": 0.19498372077941895, + "rewards/rejected": -0.25889670848846436, + "sft_loss": 0.6391298174858093, + "step": 680 + }, + { + "epoch": 0.9848156182212582, + "grad_norm": 3.2734066693082116, + "learning_rate": 7.599822931623041e-06, + "logits/chosen": 0.4401516020298004, + "logits/rejected": 0.31636372208595276, + "logps/chosen": -0.8324559330940247, + "logps/rejected": -2.279355764389038, + "loss": 0.8542, + "odds_ratio_loss": 0.5873942375183105, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08324559032917023, + "rewards/margins": 0.14468997716903687, + "rewards/rejected": -0.2279355525970459, + "sft_loss": 0.8324559330940247, + "step": 681 + }, + { + "epoch": 0.9862617498192335, + "grad_norm": 3.4818714247714158, + "learning_rate": 7.598467743180308e-06, + "logits/chosen": 0.3637101352214813, + "logits/rejected": 0.3162555694580078, + "logps/chosen": -0.6616679430007935, + "logps/rejected": -2.204662799835205, + "loss": 0.8601, + "odds_ratio_loss": 0.47493523359298706, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06616680324077606, + "rewards/margins": 0.15429948270320892, + "rewards/rejected": -0.22046628594398499, + "sft_loss": 0.6616679430007935, + "step": 682 + }, + { + "epoch": 0.987707881417209, + "grad_norm": 2.3892984192577122, + "learning_rate": 7.597110385207969e-06, + "logits/chosen": 0.31790465116500854, + "logits/rejected": 0.26166340708732605, + "logps/chosen": -0.9303499460220337, + "logps/rejected": -1.437572956085205, + "loss": 0.9003, + "odds_ratio_loss": 0.6231876611709595, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09303499758243561, + "rewards/margins": 0.05072229355573654, + "rewards/rejected": -0.14375729858875275, + "sft_loss": 0.9303499460220337, + "step": 683 + }, + { + "epoch": 0.9891540130151844, + "grad_norm": 2.2284432203154685, + "learning_rate": 7.5957508585243824e-06, + "logits/chosen": 0.40829747915267944, + "logits/rejected": 0.27027884125709534, + "logps/chosen": -0.6149638891220093, + "logps/rejected": -2.2985355854034424, + "loss": 0.6659, + "odds_ratio_loss": 0.340939998626709, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06149638444185257, + "rewards/margins": 0.16835719347000122, + "rewards/rejected": -0.2298535704612732, + "sft_loss": 0.6149638891220093, + "step": 684 + }, + { + "epoch": 0.9906001446131598, + "grad_norm": 2.95393549994262, + "learning_rate": 7.594389163949211e-06, + "logits/chosen": 0.3751344084739685, + "logits/rejected": 0.24515630304813385, + "logps/chosen": -0.6303630471229553, + "logps/rejected": -2.502126693725586, + "loss": 0.7415, + "odds_ratio_loss": 0.3910840153694153, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06303630769252777, + "rewards/margins": 0.18717637658119202, + "rewards/rejected": -0.250212699174881, + "sft_loss": 0.6303630471229553, + "step": 685 + }, + { + "epoch": 0.9920462762111352, + "grad_norm": 3.0543818604744923, + "learning_rate": 7.593025302303426e-06, + "logits/chosen": 0.22160741686820984, + "logits/rejected": 0.19796130061149597, + "logps/chosen": -0.818253755569458, + "logps/rejected": -1.9009416103363037, + "loss": 0.877, + "odds_ratio_loss": 0.6066489219665527, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0818253830075264, + "rewards/margins": 0.10826879739761353, + "rewards/rejected": -0.19009417295455933, + "sft_loss": 0.818253755569458, + "step": 686 + }, + { + "epoch": 0.9934924078091106, + "grad_norm": 4.998255178745226, + "learning_rate": 7.591659274409305e-06, + "logits/chosen": 0.4371909201145172, + "logits/rejected": 0.3654007613658905, + "logps/chosen": -0.8172946572303772, + "logps/rejected": -1.5242695808410645, + "loss": 0.9026, + "odds_ratio_loss": 0.6884384155273438, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0817294716835022, + "rewards/margins": 0.07069748640060425, + "rewards/rejected": -0.15242695808410645, + "sft_loss": 0.8172946572303772, + "step": 687 + }, + { + "epoch": 0.9949385394070861, + "grad_norm": 4.778633155537618, + "learning_rate": 7.590291081090429e-06, + "logits/chosen": 0.33778876066207886, + "logits/rejected": 0.18438909947872162, + "logps/chosen": -0.8597713708877563, + "logps/rejected": -1.5066003799438477, + "loss": 0.8564, + "odds_ratio_loss": 0.5719389319419861, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08597713708877563, + "rewards/margins": 0.06468289345502853, + "rewards/rejected": -0.15066005289554596, + "sft_loss": 0.8597713708877563, + "step": 688 + }, + { + "epoch": 0.9963846710050615, + "grad_norm": 6.0189507993908276, + "learning_rate": 7.588920723171691e-06, + "logits/chosen": 0.33838024735450745, + "logits/rejected": 0.2940466105937958, + "logps/chosen": -0.8718283176422119, + "logps/rejected": -1.8520734310150146, + "loss": 0.9698, + "odds_ratio_loss": 0.5115089416503906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08718283474445343, + "rewards/margins": 0.09802451729774475, + "rewards/rejected": -0.18520736694335938, + "sft_loss": 0.8718283176422119, + "step": 689 + }, + { + "epoch": 0.9978308026030369, + "grad_norm": 3.9502319107623367, + "learning_rate": 7.5875482014792805e-06, + "logits/chosen": 0.3570603132247925, + "logits/rejected": 0.27360644936561584, + "logps/chosen": -1.0185794830322266, + "logps/rejected": -1.4535801410675049, + "loss": 0.9067, + "odds_ratio_loss": 0.65177321434021, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10185794532299042, + "rewards/margins": 0.04350007325410843, + "rewards/rejected": -0.14535802602767944, + "sft_loss": 1.0185794830322266, + "step": 690 + }, + { + "epoch": 0.9992769342010123, + "grad_norm": 7.674669788400658, + "learning_rate": 7.586173516840698e-06, + "logits/chosen": 0.1779128909111023, + "logits/rejected": 0.17757825553417206, + "logps/chosen": -0.9851481914520264, + "logps/rejected": -1.3905221223831177, + "loss": 0.8799, + "odds_ratio_loss": 0.7719675302505493, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09851481765508652, + "rewards/margins": 0.04053738713264465, + "rewards/rejected": -0.13905221223831177, + "sft_loss": 0.9851481914520264, + "step": 691 + }, + { + "epoch": 1.0007230657989876, + "grad_norm": 3.173077480414479, + "learning_rate": 7.584796670084747e-06, + "logits/chosen": 0.3374330997467041, + "logits/rejected": 0.25602987408638, + "logps/chosen": -0.7603847980499268, + "logps/rejected": -2.37874436378479, + "loss": 0.6604, + "odds_ratio_loss": 0.4636649191379547, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07603848725557327, + "rewards/margins": 0.1618359386920929, + "rewards/rejected": -0.23787443339824677, + "sft_loss": 0.7603847980499268, + "step": 692 + }, + { + "epoch": 1.002169197396963, + "grad_norm": 2.9704017840618158, + "learning_rate": 7.583417662041532e-06, + "logits/chosen": 0.473114013671875, + "logits/rejected": 0.2985457181930542, + "logps/chosen": -0.4166085124015808, + "logps/rejected": -2.621722459793091, + "loss": 0.5093, + "odds_ratio_loss": 0.25622743368148804, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0416608564555645, + "rewards/margins": 0.22051140666007996, + "rewards/rejected": -0.26217225193977356, + "sft_loss": 0.4166085124015808, + "step": 693 + }, + { + "epoch": 1.0036153289949385, + "grad_norm": 3.679055594373421, + "learning_rate": 7.5820364935424625e-06, + "logits/chosen": 0.36234307289123535, + "logits/rejected": 0.2413337230682373, + "logps/chosen": -0.49339836835861206, + "logps/rejected": -2.096301555633545, + "loss": 0.5615, + "odds_ratio_loss": 0.36454418301582336, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.049339838325977325, + "rewards/margins": 0.16029031574726105, + "rewards/rejected": -0.20963016152381897, + "sft_loss": 0.49339836835861206, + "step": 694 + }, + { + "epoch": 1.005061460592914, + "grad_norm": 2.4644440720589715, + "learning_rate": 7.58065316542025e-06, + "logits/chosen": 0.2052929401397705, + "logits/rejected": 0.17279671132564545, + "logps/chosen": -0.5167049169540405, + "logps/rejected": -3.038837432861328, + "loss": 0.5475, + "odds_ratio_loss": 0.30443111062049866, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.051670484244823456, + "rewards/margins": 0.2522132694721222, + "rewards/rejected": -0.30388376116752625, + "sft_loss": 0.5167049169540405, + "step": 695 + }, + { + "epoch": 1.0065075921908895, + "grad_norm": 2.2735633936891726, + "learning_rate": 7.579267678508907e-06, + "logits/chosen": 0.03438059240579605, + "logits/rejected": 0.08168449997901917, + "logps/chosen": -0.7009294033050537, + "logps/rejected": -1.5276418924331665, + "loss": 0.5529, + "odds_ratio_loss": 0.3713332414627075, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07009293884038925, + "rewards/margins": 0.08267124742269516, + "rewards/rejected": -0.15276417136192322, + "sft_loss": 0.7009294033050537, + "step": 696 + }, + { + "epoch": 1.0079537237888647, + "grad_norm": 2.6197848918777447, + "learning_rate": 7.577880033643751e-06, + "logits/chosen": 0.01991932839155197, + "logits/rejected": 0.004507867619395256, + "logps/chosen": -0.6916311979293823, + "logps/rejected": -2.359644651412964, + "loss": 0.6266, + "odds_ratio_loss": 0.3500153720378876, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06916312128305435, + "rewards/margins": 0.1668013483285904, + "rewards/rejected": -0.23596449196338654, + "sft_loss": 0.6916311979293823, + "step": 697 + }, + { + "epoch": 1.0093998553868402, + "grad_norm": 3.2406420094854553, + "learning_rate": 7.576490231661397e-06, + "logits/chosen": -0.03848964348435402, + "logits/rejected": 0.02764774300158024, + "logps/chosen": -0.5944284200668335, + "logps/rejected": -1.3864145278930664, + "loss": 0.6854, + "odds_ratio_loss": 0.5450542569160461, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05944284424185753, + "rewards/margins": 0.07919859886169434, + "rewards/rejected": -0.13864144682884216, + "sft_loss": 0.5944284200668335, + "step": 698 + }, + { + "epoch": 1.0108459869848156, + "grad_norm": 2.6286685009567035, + "learning_rate": 7.575098273399764e-06, + "logits/chosen": -0.05605170875787735, + "logits/rejected": -0.044075943529605865, + "logps/chosen": -0.6372457146644592, + "logps/rejected": -1.772578477859497, + "loss": 0.5905, + "odds_ratio_loss": 0.3773233890533447, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0637245699763298, + "rewards/margins": 0.11353328824043274, + "rewards/rejected": -0.17725783586502075, + "sft_loss": 0.6372457146644592, + "step": 699 + }, + { + "epoch": 1.0122921185827911, + "grad_norm": 2.617536762472655, + "learning_rate": 7.573704159698065e-06, + "logits/chosen": -0.05474071949720383, + "logits/rejected": -0.07595973461866379, + "logps/chosen": -0.6069769859313965, + "logps/rejected": -2.064570903778076, + "loss": 0.6547, + "odds_ratio_loss": 0.35097628831863403, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06069770082831383, + "rewards/margins": 0.14575940370559692, + "rewards/rejected": -0.20645710825920105, + "sft_loss": 0.6069769859313965, + "step": 700 + }, + { + "epoch": 1.0137382501807664, + "grad_norm": 2.689800745017883, + "learning_rate": 7.572307891396817e-06, + "logits/chosen": -0.05100724846124649, + "logits/rejected": 0.07264310121536255, + "logps/chosen": -0.717523455619812, + "logps/rejected": -2.394158363342285, + "loss": 0.5874, + "odds_ratio_loss": 0.36042433977127075, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07175233960151672, + "rewards/margins": 0.16766351461410522, + "rewards/rejected": -0.23941585421562195, + "sft_loss": 0.717523455619812, + "step": 701 + }, + { + "epoch": 1.0151843817787418, + "grad_norm": 2.535012066884644, + "learning_rate": 7.570909469337838e-06, + "logits/chosen": 0.06960493326187134, + "logits/rejected": 0.07583092153072357, + "logps/chosen": -0.49426883459091187, + "logps/rejected": -2.2409791946411133, + "loss": 0.6178, + "odds_ratio_loss": 0.3533773720264435, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.049426883459091187, + "rewards/margins": 0.17467105388641357, + "rewards/rejected": -0.22409793734550476, + "sft_loss": 0.49426883459091187, + "step": 702 + }, + { + "epoch": 1.0166305133767173, + "grad_norm": 2.655301235799629, + "learning_rate": 7.5695088943642415e-06, + "logits/chosen": 0.01799941062927246, + "logits/rejected": 0.00933268666267395, + "logps/chosen": -0.3941487669944763, + "logps/rejected": -2.9899024963378906, + "loss": 0.5979, + "odds_ratio_loss": 0.26885470747947693, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03941487893462181, + "rewards/margins": 0.25957539677619934, + "rewards/rejected": -0.29899024963378906, + "sft_loss": 0.3941487669944763, + "step": 703 + }, + { + "epoch": 1.0180766449746927, + "grad_norm": 2.5535543893804906, + "learning_rate": 7.568106167320437e-06, + "logits/chosen": 0.041138745844364166, + "logits/rejected": 0.11178290843963623, + "logps/chosen": -0.49738141894340515, + "logps/rejected": -2.7035470008850098, + "loss": 0.5773, + "odds_ratio_loss": 0.2610205411911011, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.049738142639398575, + "rewards/margins": 0.22061654925346375, + "rewards/rejected": -0.270354688167572, + "sft_loss": 0.49738141894340515, + "step": 704 + }, + { + "epoch": 1.0195227765726682, + "grad_norm": 2.5907209857345923, + "learning_rate": 7.566701289052136e-06, + "logits/chosen": 0.0588398277759552, + "logits/rejected": 0.020695263519883156, + "logps/chosen": -0.6042892932891846, + "logps/rejected": -3.8982067108154297, + "loss": 0.5564, + "odds_ratio_loss": 0.28044500946998596, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0604289248585701, + "rewards/margins": 0.3293917775154114, + "rewards/rejected": -0.3898206949234009, + "sft_loss": 0.6042892932891846, + "step": 705 + }, + { + "epoch": 1.0209689081706435, + "grad_norm": 6.346699372024468, + "learning_rate": 7.565294260406343e-06, + "logits/chosen": -0.01402386836707592, + "logits/rejected": 0.020731184631586075, + "logps/chosen": -0.517099916934967, + "logps/rejected": -1.578346848487854, + "loss": 0.5113, + "odds_ratio_loss": 0.2911356985569, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05170999467372894, + "rewards/margins": 0.10612468421459198, + "rewards/rejected": -0.15783467888832092, + "sft_loss": 0.517099916934967, + "step": 706 + }, + { + "epoch": 1.022415039768619, + "grad_norm": 2.737750718755634, + "learning_rate": 7.563885082231363e-06, + "logits/chosen": 0.09974583983421326, + "logits/rejected": 0.11834936589002609, + "logps/chosen": -0.4971277415752411, + "logps/rejected": -2.3825626373291016, + "loss": 0.5577, + "odds_ratio_loss": 0.30548495054244995, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04971277341246605, + "rewards/margins": 0.18854348361492157, + "rewards/rejected": -0.23825626075267792, + "sft_loss": 0.4971277415752411, + "step": 707 + }, + { + "epoch": 1.0238611713665944, + "grad_norm": 3.5057829756261785, + "learning_rate": 7.562473755376792e-06, + "logits/chosen": -0.1551908254623413, + "logits/rejected": -0.17449763417243958, + "logps/chosen": -0.6683604121208191, + "logps/rejected": -3.447512149810791, + "loss": 0.6055, + "odds_ratio_loss": 0.3249449133872986, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06683603674173355, + "rewards/margins": 0.27791517972946167, + "rewards/rejected": -0.344751238822937, + "sft_loss": 0.6683604121208191, + "step": 708 + }, + { + "epoch": 1.0253073029645698, + "grad_norm": 2.578140669633971, + "learning_rate": 7.561060280693528e-06, + "logits/chosen": -0.23344504833221436, + "logits/rejected": -0.031244784593582153, + "logps/chosen": -0.4613468050956726, + "logps/rejected": -1.805433988571167, + "loss": 0.57, + "odds_ratio_loss": 0.21366247534751892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04613468050956726, + "rewards/margins": 0.13440871238708496, + "rewards/rejected": -0.18054340779781342, + "sft_loss": 0.4613468050956726, + "step": 709 + }, + { + "epoch": 1.026753434562545, + "grad_norm": 2.46611135338615, + "learning_rate": 7.559644659033757e-06, + "logits/chosen": -0.10052379965782166, + "logits/rejected": -0.03007357195019722, + "logps/chosen": -0.4927516579627991, + "logps/rejected": -2.790343999862671, + "loss": 0.571, + "odds_ratio_loss": 0.2678479254245758, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04927516356110573, + "rewards/margins": 0.22975923120975494, + "rewards/rejected": -0.27903443574905396, + "sft_loss": 0.4927516579627991, + "step": 710 + }, + { + "epoch": 1.0281995661605206, + "grad_norm": 2.666752714440422, + "learning_rate": 7.558226891250963e-06, + "logits/chosen": -0.03222482651472092, + "logits/rejected": 0.01601647585630417, + "logps/chosen": -0.6681675314903259, + "logps/rejected": -1.4737236499786377, + "loss": 0.5683, + "odds_ratio_loss": 0.353636771440506, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06681674718856812, + "rewards/margins": 0.08055561035871506, + "rewards/rejected": -0.14737236499786377, + "sft_loss": 0.6681675314903259, + "step": 711 + }, + { + "epoch": 1.029645697758496, + "grad_norm": 2.618017036869127, + "learning_rate": 7.556806978199924e-06, + "logits/chosen": 0.0519598163664341, + "logits/rejected": 0.022058458998799324, + "logps/chosen": -0.5769098997116089, + "logps/rejected": -2.5604069232940674, + "loss": 0.6182, + "odds_ratio_loss": 0.3295339345932007, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05769098550081253, + "rewards/margins": 0.1983497142791748, + "rewards/rejected": -0.25604069232940674, + "sft_loss": 0.5769098997116089, + "step": 712 + }, + { + "epoch": 1.0310918293564715, + "grad_norm": 7.761185954898741, + "learning_rate": 7.555384920736711e-06, + "logits/chosen": 0.015996111556887627, + "logits/rejected": 0.08699595928192139, + "logps/chosen": -0.7711721658706665, + "logps/rejected": -1.6923863887786865, + "loss": 0.6644, + "odds_ratio_loss": 0.4671747088432312, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0771172046661377, + "rewards/margins": 0.0921214297413826, + "rewards/rejected": -0.16923865675926208, + "sft_loss": 0.7711721658706665, + "step": 713 + }, + { + "epoch": 1.032537960954447, + "grad_norm": 2.6755880173033013, + "learning_rate": 7.5539607197186875e-06, + "logits/chosen": 0.07660658657550812, + "logits/rejected": 0.01353158988058567, + "logps/chosen": -0.6658707857131958, + "logps/rejected": -3.14312481880188, + "loss": 0.6126, + "odds_ratio_loss": 0.2412908524274826, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06658707559108734, + "rewards/margins": 0.24772541224956512, + "rewards/rejected": -0.31431248784065247, + "sft_loss": 0.6658707857131958, + "step": 714 + }, + { + "epoch": 1.0339840925524222, + "grad_norm": 2.439023237908597, + "learning_rate": 7.552534376004511e-06, + "logits/chosen": -0.02263808436691761, + "logits/rejected": 0.05558781325817108, + "logps/chosen": -0.3617287576198578, + "logps/rejected": -3.1298577785491943, + "loss": 0.5288, + "odds_ratio_loss": 0.16875040531158447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03617287427186966, + "rewards/margins": 0.27681291103363037, + "rewards/rejected": -0.31298577785491943, + "sft_loss": 0.3617287576198578, + "step": 715 + }, + { + "epoch": 1.0354302241503976, + "grad_norm": 2.6945317555257775, + "learning_rate": 7.551105890454128e-06, + "logits/chosen": -0.13312295079231262, + "logits/rejected": -0.014301072806119919, + "logps/chosen": -0.5835843682289124, + "logps/rejected": -1.665818214416504, + "loss": 0.6249, + "odds_ratio_loss": 0.23966217041015625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.058358438313007355, + "rewards/margins": 0.10822339355945587, + "rewards/rejected": -0.16658182442188263, + "sft_loss": 0.5835843682289124, + "step": 716 + }, + { + "epoch": 1.0368763557483731, + "grad_norm": 4.069629175458383, + "learning_rate": 7.549675263928776e-06, + "logits/chosen": 0.05003291368484497, + "logits/rejected": 0.09077559411525726, + "logps/chosen": -0.5346865057945251, + "logps/rejected": -1.7945709228515625, + "loss": 0.5653, + "odds_ratio_loss": 0.30477625131607056, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.053468652069568634, + "rewards/margins": 0.1259884536266327, + "rewards/rejected": -0.17945709824562073, + "sft_loss": 0.5346865057945251, + "step": 717 + }, + { + "epoch": 1.0383224873463486, + "grad_norm": 2.5024788446043438, + "learning_rate": 7.548242497290988e-06, + "logits/chosen": -0.1343534290790558, + "logits/rejected": -0.016086647287011147, + "logps/chosen": -0.49459975957870483, + "logps/rejected": -2.4642348289489746, + "loss": 0.5703, + "odds_ratio_loss": 0.23821872472763062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.049459975212812424, + "rewards/margins": 0.19696351885795593, + "rewards/rejected": -0.24642349779605865, + "sft_loss": 0.49459975957870483, + "step": 718 + }, + { + "epoch": 1.0397686189443238, + "grad_norm": 2.604294962061697, + "learning_rate": 7.546807591404584e-06, + "logits/chosen": -0.10261072218418121, + "logits/rejected": -0.09555017948150635, + "logps/chosen": -0.654711127281189, + "logps/rejected": -2.6856062412261963, + "loss": 0.639, + "odds_ratio_loss": 0.36931928992271423, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0654711127281189, + "rewards/margins": 0.20308950543403625, + "rewards/rejected": -0.26856061816215515, + "sft_loss": 0.654711127281189, + "step": 719 + }, + { + "epoch": 1.0412147505422993, + "grad_norm": 2.5336512821889947, + "learning_rate": 7.545370547134672e-06, + "logits/chosen": -0.0048141926527023315, + "logits/rejected": 0.013522947207093239, + "logps/chosen": -0.48836880922317505, + "logps/rejected": -1.8508555889129639, + "loss": 0.5304, + "odds_ratio_loss": 0.2670961618423462, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.048836879432201385, + "rewards/margins": 0.1362486630678177, + "rewards/rejected": -0.18508554995059967, + "sft_loss": 0.48836880922317505, + "step": 720 + }, + { + "epoch": 1.0426608821402747, + "grad_norm": 5.073824546241314, + "learning_rate": 7.5439313653476546e-06, + "logits/chosen": 0.010667698457837105, + "logits/rejected": -0.007655080407857895, + "logps/chosen": -0.6140876412391663, + "logps/rejected": -2.0642848014831543, + "loss": 0.6017, + "odds_ratio_loss": 0.33399447798728943, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06140875816345215, + "rewards/margins": 0.14501972496509552, + "rewards/rejected": -0.20642849802970886, + "sft_loss": 0.6140876412391663, + "step": 721 + }, + { + "epoch": 1.0441070137382502, + "grad_norm": 2.4676977774963214, + "learning_rate": 7.542490046911217e-06, + "logits/chosen": 0.046404771506786346, + "logits/rejected": 0.04437633231282234, + "logps/chosen": -0.5226276516914368, + "logps/rejected": -1.6676445007324219, + "loss": 0.581, + "odds_ratio_loss": 0.33058759570121765, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.052262768149375916, + "rewards/margins": 0.11450167745351791, + "rewards/rejected": -0.16676445305347443, + "sft_loss": 0.5226276516914368, + "step": 722 + }, + { + "epoch": 1.0455531453362257, + "grad_norm": 2.393012743770882, + "learning_rate": 7.541046592694336e-06, + "logits/chosen": -0.11884389072656631, + "logits/rejected": -0.012990422546863556, + "logps/chosen": -0.638570249080658, + "logps/rejected": -1.4692680835723877, + "loss": 0.6703, + "odds_ratio_loss": 0.34661900997161865, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06385702639818192, + "rewards/margins": 0.08306978642940521, + "rewards/rejected": -0.14692680537700653, + "sft_loss": 0.638570249080658, + "step": 723 + }, + { + "epoch": 1.046999276934201, + "grad_norm": 2.2437060230333428, + "learning_rate": 7.539601003567277e-06, + "logits/chosen": -0.043492406606674194, + "logits/rejected": 0.05224483087658882, + "logps/chosen": -0.7726094722747803, + "logps/rejected": -1.4802862405776978, + "loss": 0.6724, + "odds_ratio_loss": 0.4070410132408142, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07726094871759415, + "rewards/margins": 0.07076768577098846, + "rewards/rejected": -0.1480286419391632, + "sft_loss": 0.7726094722747803, + "step": 724 + }, + { + "epoch": 1.0484454085321764, + "grad_norm": 2.202580998080793, + "learning_rate": 7.538153280401589e-06, + "logits/chosen": -0.04949672147631645, + "logits/rejected": -0.10443629324436188, + "logps/chosen": -0.668379008769989, + "logps/rejected": -1.9327638149261475, + "loss": 0.5646, + "odds_ratio_loss": 0.39114710688591003, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06683789938688278, + "rewards/margins": 0.12643848359584808, + "rewards/rejected": -0.19327637553215027, + "sft_loss": 0.668379008769989, + "step": 725 + }, + { + "epoch": 1.0498915401301518, + "grad_norm": 2.601181457233383, + "learning_rate": 7.536703424070111e-06, + "logits/chosen": 0.06261063367128372, + "logits/rejected": -0.016470249742269516, + "logps/chosen": -0.48023730516433716, + "logps/rejected": -2.6002039909362793, + "loss": 0.4852, + "odds_ratio_loss": 0.25949835777282715, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.048023734241724014, + "rewards/margins": 0.2119966745376587, + "rewards/rejected": -0.2600204050540924, + "sft_loss": 0.48023730516433716, + "step": 726 + }, + { + "epoch": 1.0513376717281273, + "grad_norm": 2.923410627265685, + "learning_rate": 7.535251435446967e-06, + "logits/chosen": 0.07905685156583786, + "logits/rejected": 0.07296408712863922, + "logps/chosen": -0.41822901368141174, + "logps/rejected": -2.0072243213653564, + "loss": 0.5235, + "odds_ratio_loss": 0.25963878631591797, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.041822899132966995, + "rewards/margins": 0.15889953076839447, + "rewards/rejected": -0.20072242617607117, + "sft_loss": 0.41822901368141174, + "step": 727 + }, + { + "epoch": 1.0527838033261028, + "grad_norm": 4.27971490526211, + "learning_rate": 7.533797315407566e-06, + "logits/chosen": 0.053780484944581985, + "logits/rejected": 0.010446615517139435, + "logps/chosen": -0.5086668729782104, + "logps/rejected": -2.1758975982666016, + "loss": 0.6473, + "odds_ratio_loss": 0.3123031556606293, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.050866689532995224, + "rewards/margins": 0.16672305762767792, + "rewards/rejected": -0.21758975088596344, + "sft_loss": 0.5086668729782104, + "step": 728 + }, + { + "epoch": 1.054229934924078, + "grad_norm": 3.2608327123149223, + "learning_rate": 7.532341064828602e-06, + "logits/chosen": -0.3278353214263916, + "logits/rejected": -0.129352867603302, + "logps/chosen": -0.6491460800170898, + "logps/rejected": -1.670910120010376, + "loss": 0.5573, + "odds_ratio_loss": 0.30113694071769714, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06491461396217346, + "rewards/margins": 0.10217640548944473, + "rewards/rejected": -0.1670910269021988, + "sft_loss": 0.6491460800170898, + "step": 729 + }, + { + "epoch": 1.0556760665220535, + "grad_norm": 3.004012904325533, + "learning_rate": 7.530882684588055e-06, + "logits/chosen": -0.07247396558523178, + "logits/rejected": 0.033849820494651794, + "logps/chosen": -0.6534922122955322, + "logps/rejected": -2.295997381210327, + "loss": 0.6167, + "odds_ratio_loss": 0.3587084114551544, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06534922122955322, + "rewards/margins": 0.16425049304962158, + "rewards/rejected": -0.2295997142791748, + "sft_loss": 0.6534922122955322, + "step": 730 + }, + { + "epoch": 1.057122198120029, + "grad_norm": 3.5688118743402115, + "learning_rate": 7.529422175565185e-06, + "logits/chosen": 0.019199436530470848, + "logits/rejected": 0.0648551732301712, + "logps/chosen": -0.44681620597839355, + "logps/rejected": -1.9060118198394775, + "loss": 0.5512, + "odds_ratio_loss": 0.30147257447242737, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.044681623578071594, + "rewards/margins": 0.1459195613861084, + "rewards/rejected": -0.19060118496418, + "sft_loss": 0.44681620597839355, + "step": 731 + }, + { + "epoch": 1.0585683297180044, + "grad_norm": 2.4630310600048557, + "learning_rate": 7.5279595386405426e-06, + "logits/chosen": 0.041221290826797485, + "logits/rejected": 0.041878592222929, + "logps/chosen": -0.7346194982528687, + "logps/rejected": -1.4957523345947266, + "loss": 0.6413, + "odds_ratio_loss": 0.49654725193977356, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07346194982528687, + "rewards/margins": 0.07611328363418579, + "rewards/rejected": -0.14957523345947266, + "sft_loss": 0.7346194982528687, + "step": 732 + }, + { + "epoch": 1.0600144613159797, + "grad_norm": 2.1602876897916063, + "learning_rate": 7.526494774695953e-06, + "logits/chosen": 0.05351543426513672, + "logits/rejected": 0.06491318345069885, + "logps/chosen": -0.6178892850875854, + "logps/rejected": -1.1977730989456177, + "loss": 0.6236, + "odds_ratio_loss": 0.4457571804523468, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.061788931488990784, + "rewards/margins": 0.057988375425338745, + "rewards/rejected": -0.11977731436491013, + "sft_loss": 0.6178892850875854, + "step": 733 + }, + { + "epoch": 1.0614605929139551, + "grad_norm": 2.8924484227212877, + "learning_rate": 7.525027884614532e-06, + "logits/chosen": 0.014550477266311646, + "logits/rejected": 0.09415071457624435, + "logps/chosen": -0.5875763893127441, + "logps/rejected": -1.687889575958252, + "loss": 0.5808, + "odds_ratio_loss": 0.3503105342388153, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05875764787197113, + "rewards/margins": 0.11003130674362183, + "rewards/rejected": -0.16878895461559296, + "sft_loss": 0.5875763893127441, + "step": 734 + }, + { + "epoch": 1.0629067245119306, + "grad_norm": 2.834196823758551, + "learning_rate": 7.523558869280668e-06, + "logits/chosen": -0.0009785722941160202, + "logits/rejected": 0.036724962294101715, + "logps/chosen": -0.4648720920085907, + "logps/rejected": -1.547023892402649, + "loss": 0.5905, + "odds_ratio_loss": 0.2784779965877533, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04648720845580101, + "rewards/margins": 0.10821517556905746, + "rewards/rejected": -0.15470239520072937, + "sft_loss": 0.4648720920085907, + "step": 735 + }, + { + "epoch": 1.064352856109906, + "grad_norm": 2.285507014423719, + "learning_rate": 7.52208772958004e-06, + "logits/chosen": 0.021531209349632263, + "logits/rejected": 0.05180025473237038, + "logps/chosen": -0.7658559083938599, + "logps/rejected": -1.4070403575897217, + "loss": 0.6991, + "odds_ratio_loss": 0.47670185565948486, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07658559828996658, + "rewards/margins": 0.06411843001842499, + "rewards/rejected": -0.14070403575897217, + "sft_loss": 0.7658559083938599, + "step": 736 + }, + { + "epoch": 1.0657989877078815, + "grad_norm": 2.1699866577767235, + "learning_rate": 7.520614466399602e-06, + "logits/chosen": 0.06271585822105408, + "logits/rejected": 0.10178729146718979, + "logps/chosen": -0.6396518349647522, + "logps/rejected": -1.8695476055145264, + "loss": 0.4857, + "odds_ratio_loss": 0.29615530371665955, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06396518647670746, + "rewards/margins": 0.12298958003520966, + "rewards/rejected": -0.18695476651191711, + "sft_loss": 0.6396518349647522, + "step": 737 + }, + { + "epoch": 1.0672451193058567, + "grad_norm": 2.5201426723175215, + "learning_rate": 7.5191390806275905e-06, + "logits/chosen": 0.06206812709569931, + "logits/rejected": 0.05045921355485916, + "logps/chosen": -0.5444098114967346, + "logps/rejected": -1.3108839988708496, + "loss": 0.5319, + "odds_ratio_loss": 0.4091678857803345, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05444097891449928, + "rewards/margins": 0.07664742320775986, + "rewards/rejected": -0.13108840584754944, + "sft_loss": 0.5444098114967346, + "step": 738 + }, + { + "epoch": 1.0686912509038322, + "grad_norm": 2.661287000165096, + "learning_rate": 7.51766157315352e-06, + "logits/chosen": 0.017679838463664055, + "logits/rejected": 0.16645830869674683, + "logps/chosen": -0.5121591091156006, + "logps/rejected": -2.511974334716797, + "loss": 0.5386, + "odds_ratio_loss": 0.25777196884155273, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05121590942144394, + "rewards/margins": 0.19998154044151306, + "rewards/rejected": -0.2511974573135376, + "sft_loss": 0.5121591091156006, + "step": 739 + }, + { + "epoch": 1.0701373825018077, + "grad_norm": 2.6654097422186958, + "learning_rate": 7.516181944868187e-06, + "logits/chosen": 0.06915304809808731, + "logits/rejected": 0.07672876119613647, + "logps/chosen": -0.7108054757118225, + "logps/rejected": -2.896374225616455, + "loss": 0.614, + "odds_ratio_loss": 0.3021408021450043, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07108054310083389, + "rewards/margins": 0.21855685114860535, + "rewards/rejected": -0.28963741660118103, + "sft_loss": 0.7108054757118225, + "step": 740 + }, + { + "epoch": 1.0715835140997831, + "grad_norm": 2.5983221718366853, + "learning_rate": 7.514700196663663e-06, + "logits/chosen": 0.04233922064304352, + "logits/rejected": 0.10317760705947876, + "logps/chosen": -0.5160056948661804, + "logps/rejected": -1.6412593126296997, + "loss": 0.5438, + "odds_ratio_loss": 0.31687411665916443, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05160056799650192, + "rewards/margins": 0.11252538114786148, + "rewards/rejected": -0.1641259491443634, + "sft_loss": 0.5160056948661804, + "step": 741 + }, + { + "epoch": 1.0730296456977584, + "grad_norm": 2.5631716723122224, + "learning_rate": 7.5132163294332995e-06, + "logits/chosen": 0.0766778364777565, + "logits/rejected": 0.09308762103319168, + "logps/chosen": -0.5432895421981812, + "logps/rejected": -2.4959678649902344, + "loss": 0.566, + "odds_ratio_loss": 0.273905485868454, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.054328951984643936, + "rewards/margins": 0.19526784121990204, + "rewards/rejected": -0.24959678947925568, + "sft_loss": 0.5432895421981812, + "step": 742 + }, + { + "epoch": 1.0744757772957338, + "grad_norm": 2.7943114290109494, + "learning_rate": 7.511730344071727e-06, + "logits/chosen": 0.29873916506767273, + "logits/rejected": 0.2477722316980362, + "logps/chosen": -0.286681592464447, + "logps/rejected": -3.2388148307800293, + "loss": 0.5915, + "odds_ratio_loss": 0.17670558393001556, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.02866816148161888, + "rewards/margins": 0.29521334171295166, + "rewards/rejected": -0.32388150691986084, + "sft_loss": 0.286681592464447, + "step": 743 + }, + { + "epoch": 1.0759219088937093, + "grad_norm": 2.4260752328507977, + "learning_rate": 7.51024224147485e-06, + "logits/chosen": 0.12139531970024109, + "logits/rejected": 0.1723383665084839, + "logps/chosen": -0.549209475517273, + "logps/rejected": -1.6002026796340942, + "loss": 0.6727, + "odds_ratio_loss": 0.2690895199775696, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05492095276713371, + "rewards/margins": 0.10509932041168213, + "rewards/rejected": -0.16002027690410614, + "sft_loss": 0.549209475517273, + "step": 744 + }, + { + "epoch": 1.0773680404916848, + "grad_norm": 2.8830174008099245, + "learning_rate": 7.508752022539854e-06, + "logits/chosen": 0.0439901128411293, + "logits/rejected": 0.009160804562270641, + "logps/chosen": -0.6056515574455261, + "logps/rejected": -2.096651792526245, + "loss": 0.6268, + "odds_ratio_loss": 0.4086237847805023, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06056516245007515, + "rewards/margins": 0.14910002052783966, + "rewards/rejected": -0.2096651792526245, + "sft_loss": 0.6056515574455261, + "step": 745 + }, + { + "epoch": 1.0788141720896602, + "grad_norm": 7.200080989258661, + "learning_rate": 7.507259688165195e-06, + "logits/chosen": 0.06999029964208603, + "logits/rejected": 0.10017455369234085, + "logps/chosen": -0.5238281488418579, + "logps/rejected": -2.3946316242218018, + "loss": 0.5687, + "odds_ratio_loss": 0.28676459193229675, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05238281562924385, + "rewards/margins": 0.18708035349845886, + "rewards/rejected": -0.2394631803035736, + "sft_loss": 0.5238281488418579, + "step": 746 + }, + { + "epoch": 1.0802603036876355, + "grad_norm": 2.443321073791897, + "learning_rate": 7.5057652392506066e-06, + "logits/chosen": 0.2201310396194458, + "logits/rejected": 0.10155685991048813, + "logps/chosen": -0.5948346853256226, + "logps/rejected": -1.850401759147644, + "loss": 0.6237, + "odds_ratio_loss": 0.38340234756469727, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.059483468532562256, + "rewards/margins": 0.12555670738220215, + "rewards/rejected": -0.1850401759147644, + "sft_loss": 0.5948346853256226, + "step": 747 + }, + { + "epoch": 1.081706435285611, + "grad_norm": 2.7951657976898896, + "learning_rate": 7.504268676697099e-06, + "logits/chosen": 0.15670828521251678, + "logits/rejected": 0.20218618214130402, + "logps/chosen": -0.4286814332008362, + "logps/rejected": -2.867215156555176, + "loss": 0.5752, + "odds_ratio_loss": 0.1948789358139038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04286814481019974, + "rewards/margins": 0.2438533902168274, + "rewards/rejected": -0.28672152757644653, + "sft_loss": 0.4286814332008362, + "step": 748 + }, + { + "epoch": 1.0831525668835864, + "grad_norm": 3.036970670942331, + "learning_rate": 7.502770001406956e-06, + "logits/chosen": -0.09760545194149017, + "logits/rejected": 0.01785401999950409, + "logps/chosen": -0.5453016757965088, + "logps/rejected": -2.2281641960144043, + "loss": 0.6201, + "odds_ratio_loss": 0.2581316828727722, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05453016608953476, + "rewards/margins": 0.16828623414039612, + "rewards/rejected": -0.22281640768051147, + "sft_loss": 0.5453016757965088, + "step": 749 + }, + { + "epoch": 1.0845986984815619, + "grad_norm": 2.6823863807236212, + "learning_rate": 7.501269214283732e-06, + "logits/chosen": 0.018087085336446762, + "logits/rejected": 0.0732310563325882, + "logps/chosen": -0.8942903280258179, + "logps/rejected": -1.6640185117721558, + "loss": 0.6898, + "odds_ratio_loss": 0.4530031085014343, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08942904323339462, + "rewards/margins": 0.07697281241416931, + "rewards/rejected": -0.16640184819698334, + "sft_loss": 0.8942903280258179, + "step": 750 + }, + { + "epoch": 1.0860448300795373, + "grad_norm": 2.3463543940654468, + "learning_rate": 7.499766316232259e-06, + "logits/chosen": 0.04176183417439461, + "logits/rejected": 0.04234351962804794, + "logps/chosen": -0.5866740942001343, + "logps/rejected": -1.2051106691360474, + "loss": 0.6237, + "odds_ratio_loss": 0.3826124668121338, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.058667413890361786, + "rewards/margins": 0.06184365600347519, + "rewards/rejected": -0.12051106244325638, + "sft_loss": 0.5866740942001343, + "step": 751 + }, + { + "epoch": 1.0874909616775126, + "grad_norm": 2.638412242735269, + "learning_rate": 7.49826130815864e-06, + "logits/chosen": 0.11069029569625854, + "logits/rejected": 0.10534046590328217, + "logps/chosen": -0.4467243254184723, + "logps/rejected": -2.7915725708007812, + "loss": 0.6219, + "odds_ratio_loss": 0.1533384621143341, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04467243328690529, + "rewards/margins": 0.23448482155799866, + "rewards/rejected": -0.27915725111961365, + "sft_loss": 0.4467243254184723, + "step": 752 + }, + { + "epoch": 1.088937093275488, + "grad_norm": 2.062633106407356, + "learning_rate": 7.496754190970249e-06, + "logits/chosen": 0.2177714705467224, + "logits/rejected": 0.1637977957725525, + "logps/chosen": -0.5180121064186096, + "logps/rejected": -2.5437209606170654, + "loss": 0.6282, + "odds_ratio_loss": 0.25341200828552246, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05180121213197708, + "rewards/margins": 0.20257088541984558, + "rewards/rejected": -0.25437209010124207, + "sft_loss": 0.5180121064186096, + "step": 753 + }, + { + "epoch": 1.0903832248734635, + "grad_norm": 2.5677209683454394, + "learning_rate": 7.495244965575734e-06, + "logits/chosen": 0.11416684091091156, + "logits/rejected": 0.13222752511501312, + "logps/chosen": -0.4176730215549469, + "logps/rejected": -2.855869770050049, + "loss": 0.499, + "odds_ratio_loss": 0.3281542658805847, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04176730290055275, + "rewards/margins": 0.2438196837902069, + "rewards/rejected": -0.28558698296546936, + "sft_loss": 0.4176730215549469, + "step": 754 + }, + { + "epoch": 1.091829356471439, + "grad_norm": 2.6069186629139196, + "learning_rate": 7.49373363288501e-06, + "logits/chosen": -0.022992167621850967, + "logits/rejected": 0.004823219031095505, + "logps/chosen": -0.7831302881240845, + "logps/rejected": -1.6426312923431396, + "loss": 0.5694, + "odds_ratio_loss": 0.41426563262939453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07831303775310516, + "rewards/margins": 0.0859500989317894, + "rewards/rejected": -0.16426314413547516, + "sft_loss": 0.7831302881240845, + "step": 755 + }, + { + "epoch": 1.0932754880694142, + "grad_norm": 3.6538860464685903, + "learning_rate": 7.492220193809267e-06, + "logits/chosen": 0.08986733853816986, + "logits/rejected": 0.05370461195707321, + "logps/chosen": -0.5365122556686401, + "logps/rejected": -3.8356852531433105, + "loss": 0.6559, + "odds_ratio_loss": 0.2820737659931183, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05365122854709625, + "rewards/margins": 0.329917311668396, + "rewards/rejected": -0.38356852531433105, + "sft_loss": 0.5365122556686401, + "step": 756 + }, + { + "epoch": 1.0947216196673897, + "grad_norm": 2.3375387449236524, + "learning_rate": 7.490704649260963e-06, + "logits/chosen": 0.13479389250278473, + "logits/rejected": 0.2163029909133911, + "logps/chosen": -0.3963888883590698, + "logps/rejected": -2.7100229263305664, + "loss": 0.5486, + "odds_ratio_loss": 0.16003967821598053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03963889181613922, + "rewards/margins": 0.23136338591575623, + "rewards/rejected": -0.27100229263305664, + "sft_loss": 0.3963888883590698, + "step": 757 + }, + { + "epoch": 1.0961677512653651, + "grad_norm": 2.7826733677141187, + "learning_rate": 7.489187000153825e-06, + "logits/chosen": 0.09706145524978638, + "logits/rejected": 0.16182109713554382, + "logps/chosen": -0.4988633096218109, + "logps/rejected": -2.425071954727173, + "loss": 0.6046, + "odds_ratio_loss": 0.2742428183555603, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04988633841276169, + "rewards/margins": 0.1926208883523941, + "rewards/rejected": -0.2425072193145752, + "sft_loss": 0.4988633096218109, + "step": 758 + }, + { + "epoch": 1.0976138828633406, + "grad_norm": 2.7639210117774953, + "learning_rate": 7.48766724740285e-06, + "logits/chosen": 0.12796220183372498, + "logits/rejected": 0.06461986899375916, + "logps/chosen": -0.5522404909133911, + "logps/rejected": -2.7422850131988525, + "loss": 0.5448, + "odds_ratio_loss": 0.33710646629333496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05522404983639717, + "rewards/margins": 0.21900448203086853, + "rewards/rejected": -0.2742285132408142, + "sft_loss": 0.5522404909133911, + "step": 759 + }, + { + "epoch": 1.099060014461316, + "grad_norm": 3.8969453348992515, + "learning_rate": 7.486145391924301e-06, + "logits/chosen": 0.05818912759423256, + "logits/rejected": 0.05282029137015343, + "logps/chosen": -0.6544702649116516, + "logps/rejected": -2.393183708190918, + "loss": 0.6318, + "odds_ratio_loss": 0.40100622177124023, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06544703245162964, + "rewards/margins": 0.17387133836746216, + "rewards/rejected": -0.2393183708190918, + "sft_loss": 0.6544702649116516, + "step": 760 + }, + { + "epoch": 1.1005061460592913, + "grad_norm": 2.7628775670375063, + "learning_rate": 7.4846214346357125e-06, + "logits/chosen": 0.05507563427090645, + "logits/rejected": 0.0078296959400177, + "logps/chosen": -0.527977466583252, + "logps/rejected": -2.508857488632202, + "loss": 0.5549, + "odds_ratio_loss": 0.32088613510131836, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.052797745913267136, + "rewards/margins": 0.19808803498744965, + "rewards/rejected": -0.2508857548236847, + "sft_loss": 0.527977466583252, + "step": 761 + }, + { + "epoch": 1.1019522776572668, + "grad_norm": 2.542282535738457, + "learning_rate": 7.483095376455884e-06, + "logits/chosen": 0.29207634925842285, + "logits/rejected": 0.18721196055412292, + "logps/chosen": -0.42134177684783936, + "logps/rejected": -3.063565492630005, + "loss": 0.5765, + "odds_ratio_loss": 0.17249009013175964, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.042134176939725876, + "rewards/margins": 0.2642224133014679, + "rewards/rejected": -0.3063565790653229, + "sft_loss": 0.42134177684783936, + "step": 762 + }, + { + "epoch": 1.1033984092552422, + "grad_norm": 3.9252015141252015, + "learning_rate": 7.481567218304878e-06, + "logits/chosen": 0.16148284077644348, + "logits/rejected": 0.16037528216838837, + "logps/chosen": -0.37327027320861816, + "logps/rejected": -2.7958102226257324, + "loss": 0.5319, + "odds_ratio_loss": 0.21088604629039764, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03732702508568764, + "rewards/margins": 0.24225401878356934, + "rewards/rejected": -0.2795810401439667, + "sft_loss": 0.37327027320861816, + "step": 763 + }, + { + "epoch": 1.1048445408532177, + "grad_norm": 4.315528891824625, + "learning_rate": 7.480036961104031e-06, + "logits/chosen": -0.07114061713218689, + "logits/rejected": 0.013804474845528603, + "logps/chosen": -0.6288735866546631, + "logps/rejected": -2.1085119247436523, + "loss": 0.6512, + "odds_ratio_loss": 0.39338162541389465, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06288735568523407, + "rewards/margins": 0.14796383678913116, + "rewards/rejected": -0.21085119247436523, + "sft_loss": 0.6288735866546631, + "step": 764 + }, + { + "epoch": 1.106290672451193, + "grad_norm": 3.0374677766883047, + "learning_rate": 7.478504605775938e-06, + "logits/chosen": 0.09024432301521301, + "logits/rejected": 0.11920122802257538, + "logps/chosen": -0.5411593914031982, + "logps/rejected": -1.7794485092163086, + "loss": 0.5823, + "odds_ratio_loss": 0.328127384185791, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.054115939885377884, + "rewards/margins": 0.12382891029119492, + "rewards/rejected": -0.1779448539018631, + "sft_loss": 0.5411593914031982, + "step": 765 + }, + { + "epoch": 1.1077368040491684, + "grad_norm": 2.8560879970949675, + "learning_rate": 7.476970153244463e-06, + "logits/chosen": 0.13054589927196503, + "logits/rejected": 0.06883732229471207, + "logps/chosen": -0.6385205388069153, + "logps/rejected": -1.957822561264038, + "loss": 0.5768, + "odds_ratio_loss": 0.41158097982406616, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06385205686092377, + "rewards/margins": 0.13193020224571228, + "rewards/rejected": -0.19578225910663605, + "sft_loss": 0.6385205388069153, + "step": 766 + }, + { + "epoch": 1.1091829356471439, + "grad_norm": 3.8575247023602643, + "learning_rate": 7.475433604434734e-06, + "logits/chosen": 0.09822223335504532, + "logits/rejected": -0.0020382339134812355, + "logps/chosen": -0.553131103515625, + "logps/rejected": -2.6227946281433105, + "loss": 0.6335, + "odds_ratio_loss": 0.3468872308731079, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0553131066262722, + "rewards/margins": 0.2069663405418396, + "rewards/rejected": -0.2622794508934021, + "sft_loss": 0.553131103515625, + "step": 767 + }, + { + "epoch": 1.1106290672451193, + "grad_norm": 2.4229550650167684, + "learning_rate": 7.47389496027314e-06, + "logits/chosen": 0.013593245297670364, + "logits/rejected": 0.09762951731681824, + "logps/chosen": -0.38503605127334595, + "logps/rejected": -3.5363712310791016, + "loss": 0.5369, + "odds_ratio_loss": 0.1493338942527771, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038503602147102356, + "rewards/margins": 0.3151334822177887, + "rewards/rejected": -0.35363709926605225, + "sft_loss": 0.38503605127334595, + "step": 768 + }, + { + "epoch": 1.1120751988430948, + "grad_norm": 2.434271111123222, + "learning_rate": 7.472354221687337e-06, + "logits/chosen": 0.11509215831756592, + "logits/rejected": -0.01175488531589508, + "logps/chosen": -0.5181520581245422, + "logps/rejected": -2.47695255279541, + "loss": 0.5855, + "odds_ratio_loss": 0.24836497008800507, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.051815204322338104, + "rewards/margins": 0.19588005542755127, + "rewards/rejected": -0.24769528210163116, + "sft_loss": 0.5181520581245422, + "step": 769 + }, + { + "epoch": 1.11352133044107, + "grad_norm": 3.6024883785529798, + "learning_rate": 7.470811389606241e-06, + "logits/chosen": 0.1907249093055725, + "logits/rejected": 0.28744474053382874, + "logps/chosen": -0.3892439901828766, + "logps/rejected": -2.2846317291259766, + "loss": 0.5206, + "odds_ratio_loss": 0.24296937882900238, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03892439603805542, + "rewards/margins": 0.18953877687454224, + "rewards/rejected": -0.22846317291259766, + "sft_loss": 0.3892439901828766, + "step": 770 + }, + { + "epoch": 1.1149674620390455, + "grad_norm": 2.9734534681452938, + "learning_rate": 7.469266464960032e-06, + "logits/chosen": -0.009543132036924362, + "logits/rejected": 0.025686249136924744, + "logps/chosen": -0.5033233761787415, + "logps/rejected": -3.2725722789764404, + "loss": 0.5662, + "odds_ratio_loss": 0.23238584399223328, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05033233389258385, + "rewards/margins": 0.27692487835884094, + "rewards/rejected": -0.3272572159767151, + "sft_loss": 0.5033233761787415, + "step": 771 + }, + { + "epoch": 1.116413593637021, + "grad_norm": 2.6868039101284205, + "learning_rate": 7.4677194486801504e-06, + "logits/chosen": 0.0987258031964302, + "logits/rejected": 0.18745985627174377, + "logps/chosen": -0.5120230913162231, + "logps/rejected": -1.8366214036941528, + "loss": 0.5464, + "odds_ratio_loss": 0.3096643090248108, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.051202308386564255, + "rewards/margins": 0.1324598342180252, + "rewards/rejected": -0.18366214632987976, + "sft_loss": 0.5120230913162231, + "step": 772 + }, + { + "epoch": 1.1178597252349964, + "grad_norm": 3.2648658058180646, + "learning_rate": 7.466170341699298e-06, + "logits/chosen": 0.12391200661659241, + "logits/rejected": 0.07651515305042267, + "logps/chosen": -0.5479176640510559, + "logps/rejected": -2.7681431770324707, + "loss": 0.6005, + "odds_ratio_loss": 0.3141166567802429, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05479177087545395, + "rewards/margins": 0.22202253341674805, + "rewards/rejected": -0.2768143117427826, + "sft_loss": 0.5479176640510559, + "step": 773 + }, + { + "epoch": 1.119305856832972, + "grad_norm": 2.1969045008343535, + "learning_rate": 7.464619144951436e-06, + "logits/chosen": 0.1988317370414734, + "logits/rejected": 0.11384803801774979, + "logps/chosen": -0.6396727561950684, + "logps/rejected": -2.1379432678222656, + "loss": 0.5548, + "odds_ratio_loss": 0.4122881293296814, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0639672800898552, + "rewards/margins": 0.14982706308364868, + "rewards/rejected": -0.21379435062408447, + "sft_loss": 0.6396727561950684, + "step": 774 + }, + { + "epoch": 1.1207519884309471, + "grad_norm": 2.828734292348446, + "learning_rate": 7.463065859371789e-06, + "logits/chosen": 0.024105386808514595, + "logits/rejected": -0.05560684576630592, + "logps/chosen": -0.58314049243927, + "logps/rejected": -2.818758487701416, + "loss": 0.5745, + "odds_ratio_loss": 0.41460278630256653, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05831405147910118, + "rewards/margins": 0.22356180846691132, + "rewards/rejected": -0.2818758487701416, + "sft_loss": 0.58314049243927, + "step": 775 + }, + { + "epoch": 1.1221981200289226, + "grad_norm": 2.5439099335428206, + "learning_rate": 7.461510485896838e-06, + "logits/chosen": 0.2544099688529968, + "logits/rejected": 0.20363222062587738, + "logps/chosen": -0.4311085343360901, + "logps/rejected": -2.491203546524048, + "loss": 0.6513, + "odds_ratio_loss": 0.28176209330558777, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04311085492372513, + "rewards/margins": 0.20600950717926025, + "rewards/rejected": -0.24912036955356598, + "sft_loss": 0.4311085343360901, + "step": 776 + }, + { + "epoch": 1.123644251626898, + "grad_norm": 3.5321805261897374, + "learning_rate": 7.4599530254643205e-06, + "logits/chosen": 0.054844632744789124, + "logits/rejected": 0.0795421302318573, + "logps/chosen": -0.49303340911865234, + "logps/rejected": -2.2153918743133545, + "loss": 0.6372, + "odds_ratio_loss": 0.3127673864364624, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.049303337931632996, + "rewards/margins": 0.1722358614206314, + "rewards/rejected": -0.22153916954994202, + "sft_loss": 0.49303340911865234, + "step": 777 + }, + { + "epoch": 1.1250903832248735, + "grad_norm": 9.818583319726136, + "learning_rate": 7.45839347901324e-06, + "logits/chosen": 0.10133037716150284, + "logits/rejected": 0.08598774671554565, + "logps/chosen": -0.41913947463035583, + "logps/rejected": -3.2657482624053955, + "loss": 0.596, + "odds_ratio_loss": 0.2532481849193573, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0419139489531517, + "rewards/margins": 0.28466087579727173, + "rewards/rejected": -0.3265748620033264, + "sft_loss": 0.41913947463035583, + "step": 778 + }, + { + "epoch": 1.126536514822849, + "grad_norm": 2.5273673577209186, + "learning_rate": 7.45683184748385e-06, + "logits/chosen": 0.10568895936012268, + "logits/rejected": 0.07520011067390442, + "logps/chosen": -0.4639187753200531, + "logps/rejected": -4.35772180557251, + "loss": 0.5471, + "odds_ratio_loss": 0.18750569224357605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04639187827706337, + "rewards/margins": 0.3893802762031555, + "rewards/rejected": -0.4357721507549286, + "sft_loss": 0.4639187753200531, + "step": 779 + }, + { + "epoch": 1.1279826464208242, + "grad_norm": 3.3847158927408816, + "learning_rate": 7.455268131817664e-06, + "logits/chosen": 0.11961061507463455, + "logits/rejected": 0.11430468410253525, + "logps/chosen": -0.4800012707710266, + "logps/rejected": -1.999869704246521, + "loss": 0.491, + "odds_ratio_loss": 0.2718590497970581, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04800013080239296, + "rewards/margins": 0.15198683738708496, + "rewards/rejected": -0.19998696446418762, + "sft_loss": 0.4800012707710266, + "step": 780 + }, + { + "epoch": 1.1294287780187997, + "grad_norm": 2.26973956764143, + "learning_rate": 7.453702332957454e-06, + "logits/chosen": 0.0754300132393837, + "logits/rejected": 0.048578303307294846, + "logps/chosen": -0.6013798713684082, + "logps/rejected": -1.9893176555633545, + "loss": 0.5393, + "odds_ratio_loss": 0.4172818660736084, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06013799458742142, + "rewards/margins": 0.13879379630088806, + "rewards/rejected": -0.19893178343772888, + "sft_loss": 0.6013798713684082, + "step": 781 + }, + { + "epoch": 1.1308749096167752, + "grad_norm": 3.0760734312283624, + "learning_rate": 7.452134451847243e-06, + "logits/chosen": 0.1234007477760315, + "logits/rejected": 0.10730813443660736, + "logps/chosen": -0.5656208395957947, + "logps/rejected": -2.4852113723754883, + "loss": 0.5875, + "odds_ratio_loss": 0.31027752161026, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.056562088429927826, + "rewards/margins": 0.19195906817913055, + "rewards/rejected": -0.24852114915847778, + "sft_loss": 0.5656208395957947, + "step": 782 + }, + { + "epoch": 1.1323210412147506, + "grad_norm": 3.3526925530268326, + "learning_rate": 7.450564489432315e-06, + "logits/chosen": 0.04157966375350952, + "logits/rejected": 0.0403023287653923, + "logps/chosen": -0.6040517091751099, + "logps/rejected": -1.2740135192871094, + "loss": 0.6748, + "odds_ratio_loss": 0.41726475954055786, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.060405176132917404, + "rewards/margins": 0.06699618697166443, + "rewards/rejected": -0.12740135192871094, + "sft_loss": 0.6040517091751099, + "step": 783 + }, + { + "epoch": 1.1337671728127259, + "grad_norm": 3.0802366326947204, + "learning_rate": 7.448992446659204e-06, + "logits/chosen": 0.14116020500659943, + "logits/rejected": 0.15150752663612366, + "logps/chosen": -0.6432009339332581, + "logps/rejected": -3.107823133468628, + "loss": 0.624, + "odds_ratio_loss": 0.2854066789150238, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06432008743286133, + "rewards/margins": 0.24646221101284027, + "rewards/rejected": -0.3107823133468628, + "sft_loss": 0.6432009339332581, + "step": 784 + }, + { + "epoch": 1.1352133044107013, + "grad_norm": 2.9008034505623685, + "learning_rate": 7.447418324475702e-06, + "logits/chosen": -0.07773000001907349, + "logits/rejected": 0.016000304371118546, + "logps/chosen": -0.4436779022216797, + "logps/rejected": -3.0289247035980225, + "loss": 0.5249, + "odds_ratio_loss": 0.2261512130498886, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04436779022216797, + "rewards/margins": 0.25852468609809875, + "rewards/rejected": -0.3028924763202667, + "sft_loss": 0.4436779022216797, + "step": 785 + }, + { + "epoch": 1.1366594360086768, + "grad_norm": 2.5909707064380596, + "learning_rate": 7.445842123830853e-06, + "logits/chosen": -0.016650903970003128, + "logits/rejected": -0.010309025645256042, + "logps/chosen": -0.5804932117462158, + "logps/rejected": -1.5151318311691284, + "loss": 0.5625, + "odds_ratio_loss": 0.3440878689289093, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05804932489991188, + "rewards/margins": 0.09346386790275574, + "rewards/rejected": -0.15151318907737732, + "sft_loss": 0.5804932117462158, + "step": 786 + }, + { + "epoch": 1.1381055676066523, + "grad_norm": 2.2178481077560717, + "learning_rate": 7.444263845674953e-06, + "logits/chosen": 0.03409305587410927, + "logits/rejected": 0.06542815268039703, + "logps/chosen": -0.7013732194900513, + "logps/rejected": -2.4514522552490234, + "loss": 0.5767, + "odds_ratio_loss": 0.32921674847602844, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07013732194900513, + "rewards/margins": 0.1750079095363617, + "rewards/rejected": -0.24514521658420563, + "sft_loss": 0.7013732194900513, + "step": 787 + }, + { + "epoch": 1.1395516992046275, + "grad_norm": 3.168676722580113, + "learning_rate": 7.442683490959554e-06, + "logits/chosen": 0.009714031592011452, + "logits/rejected": 0.07314697653055191, + "logps/chosen": -0.7520695924758911, + "logps/rejected": -1.4773638248443604, + "loss": 0.6163, + "odds_ratio_loss": 0.3795466721057892, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07520696520805359, + "rewards/margins": 0.07252941280603409, + "rewards/rejected": -0.14773637056350708, + "sft_loss": 0.7520695924758911, + "step": 788 + }, + { + "epoch": 1.140997830802603, + "grad_norm": 2.231023146887912, + "learning_rate": 7.441101060637456e-06, + "logits/chosen": 0.011076090857386589, + "logits/rejected": 0.0065444353967905045, + "logps/chosen": -0.4639368951320648, + "logps/rejected": -2.8974857330322266, + "loss": 0.5276, + "odds_ratio_loss": 0.358600378036499, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04639369249343872, + "rewards/margins": 0.2433549016714096, + "rewards/rejected": -0.2897486090660095, + "sft_loss": 0.4639368951320648, + "step": 789 + }, + { + "epoch": 1.1424439624005784, + "grad_norm": 5.751344294959958, + "learning_rate": 7.4395165556627115e-06, + "logits/chosen": 0.044419676065444946, + "logits/rejected": 0.0467277392745018, + "logps/chosen": -0.5675772428512573, + "logps/rejected": -3.3737549781799316, + "loss": 0.6823, + "odds_ratio_loss": 0.3221891224384308, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05675772577524185, + "rewards/margins": 0.28061774373054504, + "rewards/rejected": -0.3373754918575287, + "sft_loss": 0.5675772428512573, + "step": 790 + }, + { + "epoch": 1.143890093998554, + "grad_norm": 3.157815374086038, + "learning_rate": 7.437929976990625e-06, + "logits/chosen": 0.07242487370967865, + "logits/rejected": 0.06932821869850159, + "logps/chosen": -0.6816798448562622, + "logps/rejected": -2.0356240272521973, + "loss": 0.6112, + "odds_ratio_loss": 0.31875520944595337, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06816798448562622, + "rewards/margins": 0.1353944092988968, + "rewards/rejected": -0.203562393784523, + "sft_loss": 0.6816798448562622, + "step": 791 + }, + { + "epoch": 1.1453362255965294, + "grad_norm": 2.399288961917676, + "learning_rate": 7.436341325577753e-06, + "logits/chosen": 0.006493567489087582, + "logits/rejected": 0.05797387287020683, + "logps/chosen": -0.5998603701591492, + "logps/rejected": -1.7945928573608398, + "loss": 0.5849, + "odds_ratio_loss": 0.2764272689819336, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05998603627085686, + "rewards/margins": 0.11947324872016907, + "rewards/rejected": -0.17945930361747742, + "sft_loss": 0.5998603701591492, + "step": 792 + }, + { + "epoch": 1.1467823571945046, + "grad_norm": 4.92991125737335, + "learning_rate": 7.434750602381896e-06, + "logits/chosen": -0.006598275154829025, + "logits/rejected": 0.03047710284590721, + "logps/chosen": -0.6850670576095581, + "logps/rejected": -2.417117118835449, + "loss": 0.6757, + "odds_ratio_loss": 0.5608515739440918, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06850671023130417, + "rewards/margins": 0.17320501804351807, + "rewards/rejected": -0.24171173572540283, + "sft_loss": 0.6850670576095581, + "step": 793 + }, + { + "epoch": 1.14822848879248, + "grad_norm": 3.028327047199182, + "learning_rate": 7.433157808362109e-06, + "logits/chosen": 0.031274694949388504, + "logits/rejected": 0.07910322397947311, + "logps/chosen": -0.4861868619918823, + "logps/rejected": -2.6531078815460205, + "loss": 0.563, + "odds_ratio_loss": 0.21675142645835876, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04861868917942047, + "rewards/margins": 0.21669211983680725, + "rewards/rejected": -0.26531079411506653, + "sft_loss": 0.4861868619918823, + "step": 794 + }, + { + "epoch": 1.1496746203904555, + "grad_norm": 2.7428039098034787, + "learning_rate": 7.4315629444786934e-06, + "logits/chosen": 0.13038958609104156, + "logits/rejected": 0.16042017936706543, + "logps/chosen": -0.6496018171310425, + "logps/rejected": -2.7725348472595215, + "loss": 0.6591, + "odds_ratio_loss": 0.3653820753097534, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06496018916368484, + "rewards/margins": 0.21229329705238342, + "rewards/rejected": -0.27725350856781006, + "sft_loss": 0.6496018171310425, + "step": 795 + }, + { + "epoch": 1.151120751988431, + "grad_norm": 2.3190293075287585, + "learning_rate": 7.429966011693198e-06, + "logits/chosen": 0.09569834172725677, + "logits/rejected": 0.13140498101711273, + "logps/chosen": -0.5810202360153198, + "logps/rejected": -1.6585171222686768, + "loss": 0.6426, + "odds_ratio_loss": 0.25168225169181824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05810202658176422, + "rewards/margins": 0.10774968564510345, + "rewards/rejected": -0.16585171222686768, + "sft_loss": 0.5810202360153198, + "step": 796 + }, + { + "epoch": 1.1525668835864065, + "grad_norm": 2.3257384032215023, + "learning_rate": 7.428367010968418e-06, + "logits/chosen": 0.17940768599510193, + "logits/rejected": 0.1755165457725525, + "logps/chosen": -0.4974741041660309, + "logps/rejected": -2.3814783096313477, + "loss": 0.575, + "odds_ratio_loss": 0.2930542230606079, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04974740743637085, + "rewards/margins": 0.18840041756629944, + "rewards/rejected": -0.2381478101015091, + "sft_loss": 0.4974741041660309, + "step": 797 + }, + { + "epoch": 1.1540130151843817, + "grad_norm": 2.33840083888642, + "learning_rate": 7.4267659432684e-06, + "logits/chosen": 0.06194993481040001, + "logits/rejected": 0.02340729907155037, + "logps/chosen": -0.5812244415283203, + "logps/rejected": -3.1519532203674316, + "loss": 0.6231, + "odds_ratio_loss": 0.27667036652565, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05812244117259979, + "rewards/margins": 0.2570728659629822, + "rewards/rejected": -0.31519532203674316, + "sft_loss": 0.5812244415283203, + "step": 798 + }, + { + "epoch": 1.1554591467823572, + "grad_norm": 3.148046088506335, + "learning_rate": 7.4251628095584325e-06, + "logits/chosen": -0.011250527575612068, + "logits/rejected": 0.017998933792114258, + "logps/chosen": -0.693738579750061, + "logps/rejected": -1.7688831090927124, + "loss": 0.6544, + "odds_ratio_loss": 0.3419437110424042, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06937386095523834, + "rewards/margins": 0.10751446336507797, + "rewards/rejected": -0.17688831686973572, + "sft_loss": 0.693738579750061, + "step": 799 + }, + { + "epoch": 1.1569052783803326, + "grad_norm": 2.437296250008247, + "learning_rate": 7.4235576108050495e-06, + "logits/chosen": 0.1309782713651657, + "logits/rejected": 0.13392174243927002, + "logps/chosen": -0.4447363615036011, + "logps/rejected": -3.2417688369750977, + "loss": 0.5679, + "odds_ratio_loss": 0.36193907260894775, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.044473640620708466, + "rewards/margins": 0.2797032594680786, + "rewards/rejected": -0.3241769075393677, + "sft_loss": 0.4447363615036011, + "step": 800 + }, + { + "epoch": 1.158351409978308, + "grad_norm": 2.7438204360634346, + "learning_rate": 7.4219503479760325e-06, + "logits/chosen": 0.15036681294441223, + "logits/rejected": 0.11659523844718933, + "logps/chosen": -0.4682498574256897, + "logps/rejected": -2.816537380218506, + "loss": 0.5587, + "odds_ratio_loss": 0.28212669491767883, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04682498425245285, + "rewards/margins": 0.23482874035835266, + "rewards/rejected": -0.2816537320613861, + "sft_loss": 0.4682498574256897, + "step": 801 + }, + { + "epoch": 1.1597975415762836, + "grad_norm": 2.2929808725485543, + "learning_rate": 7.420341022040405e-06, + "logits/chosen": 0.018691712990403175, + "logits/rejected": 0.004425849765539169, + "logps/chosen": -0.5483542084693909, + "logps/rejected": -3.6711413860321045, + "loss": 0.5833, + "odds_ratio_loss": 0.2860555350780487, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05483543127775192, + "rewards/margins": 0.31227871775627136, + "rewards/rejected": -0.3671141266822815, + "sft_loss": 0.5483542084693909, + "step": 802 + }, + { + "epoch": 1.1612436731742588, + "grad_norm": 3.1911949847406413, + "learning_rate": 7.418729633968439e-06, + "logits/chosen": 0.16952145099639893, + "logits/rejected": 0.15931960940361023, + "logps/chosen": -0.4818491041660309, + "logps/rejected": -2.875197410583496, + "loss": 0.6413, + "odds_ratio_loss": 0.2592041790485382, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04818491265177727, + "rewards/margins": 0.239334836602211, + "rewards/rejected": -0.28751975297927856, + "sft_loss": 0.4818491041660309, + "step": 803 + }, + { + "epoch": 1.1626898047722343, + "grad_norm": 2.459591917048973, + "learning_rate": 7.4171161847316424e-06, + "logits/chosen": 0.12029114365577698, + "logits/rejected": 0.16648927330970764, + "logps/chosen": -0.6787177920341492, + "logps/rejected": -2.847053050994873, + "loss": 0.6632, + "odds_ratio_loss": 0.30066680908203125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06787177175283432, + "rewards/margins": 0.21683353185653687, + "rewards/rejected": -0.2847052812576294, + "sft_loss": 0.6787177920341492, + "step": 804 + }, + { + "epoch": 1.1641359363702097, + "grad_norm": 2.819839180987813, + "learning_rate": 7.4155006753027715e-06, + "logits/chosen": 0.21553045511245728, + "logits/rejected": 0.16459545493125916, + "logps/chosen": -0.5797298550605774, + "logps/rejected": -2.6545183658599854, + "loss": 0.6086, + "odds_ratio_loss": 0.349483847618103, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0579729899764061, + "rewards/margins": 0.2074788510799408, + "rewards/rejected": -0.2654518485069275, + "sft_loss": 0.5797298550605774, + "step": 805 + }, + { + "epoch": 1.1655820679681852, + "grad_norm": 3.649699933535626, + "learning_rate": 7.413883106655823e-06, + "logits/chosen": 0.1408543586730957, + "logits/rejected": 0.151872456073761, + "logps/chosen": -0.5444905757904053, + "logps/rejected": -1.8509986400604248, + "loss": 0.5559, + "odds_ratio_loss": 0.28263750672340393, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05444905534386635, + "rewards/margins": 0.1306508183479309, + "rewards/rejected": -0.18509986996650696, + "sft_loss": 0.5444905757904053, + "step": 806 + }, + { + "epoch": 1.1670281995661604, + "grad_norm": 2.8521602435109106, + "learning_rate": 7.412263479766034e-06, + "logits/chosen": 0.0004207249730825424, + "logits/rejected": 0.07325251400470734, + "logps/chosen": -0.6310700178146362, + "logps/rejected": -2.819793939590454, + "loss": 0.5759, + "odds_ratio_loss": 0.371545672416687, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06310699880123138, + "rewards/margins": 0.21887239813804626, + "rewards/rejected": -0.28197938203811646, + "sft_loss": 0.6310700178146362, + "step": 807 + }, + { + "epoch": 1.168474331164136, + "grad_norm": 2.9617169421526275, + "learning_rate": 7.410641795609885e-06, + "logits/chosen": 0.32813215255737305, + "logits/rejected": 0.11179277300834656, + "logps/chosen": -0.6126865148544312, + "logps/rejected": -2.45556378364563, + "loss": 0.5838, + "odds_ratio_loss": 0.2806455194950104, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.061268653720617294, + "rewards/margins": 0.18428772687911987, + "rewards/rejected": -0.24555638432502747, + "sft_loss": 0.6126865148544312, + "step": 808 + }, + { + "epoch": 1.1699204627621114, + "grad_norm": 3.51193121019222, + "learning_rate": 7.409018055165095e-06, + "logits/chosen": 0.24320703744888306, + "logits/rejected": 0.14701107144355774, + "logps/chosen": -0.5555135011672974, + "logps/rejected": -3.6711976528167725, + "loss": 0.6075, + "odds_ratio_loss": 0.30618131160736084, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.055551350116729736, + "rewards/margins": 0.3115684390068054, + "rewards/rejected": -0.36711978912353516, + "sft_loss": 0.5555135011672974, + "step": 809 + }, + { + "epoch": 1.1713665943600868, + "grad_norm": 2.854322284446154, + "learning_rate": 7.407392259410623e-06, + "logits/chosen": 0.11030896008014679, + "logits/rejected": 0.07109300792217255, + "logps/chosen": -0.535825252532959, + "logps/rejected": -3.356602907180786, + "loss": 0.6089, + "odds_ratio_loss": 0.37696099281311035, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05358252674341202, + "rewards/margins": 0.28207775950431824, + "rewards/rejected": -0.33566030859947205, + "sft_loss": 0.535825252532959, + "step": 810 + }, + { + "epoch": 1.172812725958062, + "grad_norm": 2.6810454261200536, + "learning_rate": 7.405764409326668e-06, + "logits/chosen": 0.024707181379199028, + "logits/rejected": 0.15931369364261627, + "logps/chosen": -0.5429731607437134, + "logps/rejected": -2.546696186065674, + "loss": 0.6071, + "odds_ratio_loss": 0.2758614718914032, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.054297320544719696, + "rewards/margins": 0.20037232339382172, + "rewards/rejected": -0.2546696364879608, + "sft_loss": 0.5429731607437134, + "step": 811 + }, + { + "epoch": 1.1742588575560375, + "grad_norm": 2.8967656734297815, + "learning_rate": 7.404134505894665e-06, + "logits/chosen": 0.18780213594436646, + "logits/rejected": 0.11726965010166168, + "logps/chosen": -0.44569364190101624, + "logps/rejected": -2.6580047607421875, + "loss": 0.5756, + "odds_ratio_loss": 0.2095184624195099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04456936568021774, + "rewards/margins": 0.2212311178445816, + "rewards/rejected": -0.26580050587654114, + "sft_loss": 0.44569364190101624, + "step": 812 + }, + { + "epoch": 1.175704989154013, + "grad_norm": 2.7702135625243844, + "learning_rate": 7.40250255009729e-06, + "logits/chosen": 0.09605167806148529, + "logits/rejected": 0.2212832123041153, + "logps/chosen": -0.4400288462638855, + "logps/rejected": -2.868157148361206, + "loss": 0.591, + "odds_ratio_loss": 0.21612593531608582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04400289058685303, + "rewards/margins": 0.2428128570318222, + "rewards/rejected": -0.28681573271751404, + "sft_loss": 0.4400288462638855, + "step": 813 + }, + { + "epoch": 1.1771511207519885, + "grad_norm": 2.589780825174727, + "learning_rate": 7.400868542918457e-06, + "logits/chosen": 0.23280946910381317, + "logits/rejected": 0.2035413384437561, + "logps/chosen": -0.6458877325057983, + "logps/rejected": -2.4575438499450684, + "loss": 0.5889, + "odds_ratio_loss": 0.3527429699897766, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0645887702703476, + "rewards/margins": 0.18116562068462372, + "rewards/rejected": -0.2457543909549713, + "sft_loss": 0.6458877325057983, + "step": 814 + }, + { + "epoch": 1.178597252349964, + "grad_norm": 2.5623900104319626, + "learning_rate": 7.399232485343311e-06, + "logits/chosen": 0.10259507596492767, + "logits/rejected": 0.1167682632803917, + "logps/chosen": -0.8443832397460938, + "logps/rejected": -1.3338778018951416, + "loss": 0.7028, + "odds_ratio_loss": 0.6011531352996826, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08443832397460938, + "rewards/margins": 0.0489494651556015, + "rewards/rejected": -0.13338778913021088, + "sft_loss": 0.8443832397460938, + "step": 815 + }, + { + "epoch": 1.1800433839479392, + "grad_norm": 2.5950621753937493, + "learning_rate": 7.397594378358241e-06, + "logits/chosen": 0.25614750385284424, + "logits/rejected": 0.22538936138153076, + "logps/chosen": -0.4889935255050659, + "logps/rejected": -3.658846139907837, + "loss": 0.5713, + "odds_ratio_loss": 0.1725495159626007, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04889935255050659, + "rewards/margins": 0.31698527932167053, + "rewards/rejected": -0.36588460206985474, + "sft_loss": 0.4889935255050659, + "step": 816 + }, + { + "epoch": 1.1814895155459146, + "grad_norm": 2.562573118975313, + "learning_rate": 7.395954222950866e-06, + "logits/chosen": 0.12951630353927612, + "logits/rejected": 0.21345645189285278, + "logps/chosen": -0.39483872056007385, + "logps/rejected": -2.5382630825042725, + "loss": 0.5061, + "odds_ratio_loss": 0.25803595781326294, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.03948386758565903, + "rewards/margins": 0.21434244513511658, + "rewards/rejected": -0.2538263201713562, + "sft_loss": 0.39483872056007385, + "step": 817 + }, + { + "epoch": 1.18293564714389, + "grad_norm": 2.3614280189896744, + "learning_rate": 7.394312020110042e-06, + "logits/chosen": 0.17008808255195618, + "logits/rejected": 0.10077522695064545, + "logps/chosen": -0.5804603099822998, + "logps/rejected": -1.993128776550293, + "loss": 0.555, + "odds_ratio_loss": 0.31265169382095337, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05804603174328804, + "rewards/margins": 0.1412668526172638, + "rewards/rejected": -0.19931289553642273, + "sft_loss": 0.5804603099822998, + "step": 818 + }, + { + "epoch": 1.1843817787418656, + "grad_norm": 4.26288185974698, + "learning_rate": 7.392667770825859e-06, + "logits/chosen": 0.12806616723537445, + "logits/rejected": 0.12684689462184906, + "logps/chosen": -0.6775280237197876, + "logps/rejected": -2.6817450523376465, + "loss": 0.6758, + "odds_ratio_loss": 0.36490458250045776, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06775280088186264, + "rewards/margins": 0.20042170584201813, + "rewards/rejected": -0.2681744694709778, + "sft_loss": 0.6775280237197876, + "step": 819 + }, + { + "epoch": 1.185827910339841, + "grad_norm": 6.992906851248464, + "learning_rate": 7.391021476089641e-06, + "logits/chosen": 0.15885204076766968, + "logits/rejected": 0.08956670761108398, + "logps/chosen": -0.48606452345848083, + "logps/rejected": -1.5799554586410522, + "loss": 0.605, + "odds_ratio_loss": 0.2400798499584198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04860645532608032, + "rewards/margins": 0.10938909649848938, + "rewards/rejected": -0.1579955518245697, + "sft_loss": 0.48606452345848083, + "step": 820 + }, + { + "epoch": 1.1872740419378163, + "grad_norm": 2.8411437399780244, + "learning_rate": 7.389373136893947e-06, + "logits/chosen": 0.056294236332178116, + "logits/rejected": 0.06409749388694763, + "logps/chosen": -0.6919089555740356, + "logps/rejected": -1.5181670188903809, + "loss": 0.6048, + "odds_ratio_loss": 0.4656546115875244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06919088959693909, + "rewards/margins": 0.08262581378221512, + "rewards/rejected": -0.1518167108297348, + "sft_loss": 0.6919089555740356, + "step": 821 + }, + { + "epoch": 1.1887201735357917, + "grad_norm": 4.461930888679296, + "learning_rate": 7.3877227542325645e-06, + "logits/chosen": 0.15600307285785675, + "logits/rejected": 0.11605434864759445, + "logps/chosen": -0.47259920835494995, + "logps/rejected": -2.2307705879211426, + "loss": 0.569, + "odds_ratio_loss": 0.34623655676841736, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04725992679595947, + "rewards/margins": 0.17581716179847717, + "rewards/rejected": -0.22307708859443665, + "sft_loss": 0.47259920835494995, + "step": 822 + }, + { + "epoch": 1.1901663051337672, + "grad_norm": 2.3378993173232394, + "learning_rate": 7.3860703291005154e-06, + "logits/chosen": -0.04555685818195343, + "logits/rejected": -0.017071541398763657, + "logps/chosen": -0.59052574634552, + "logps/rejected": -3.355835199356079, + "loss": 0.7088, + "odds_ratio_loss": 0.3438900113105774, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05905257910490036, + "rewards/margins": 0.276530921459198, + "rewards/rejected": -0.33558350801467896, + "sft_loss": 0.59052574634552, + "step": 823 + }, + { + "epoch": 1.1916124367317427, + "grad_norm": 5.750195138604537, + "learning_rate": 7.384415862494055e-06, + "logits/chosen": 0.09526326507329941, + "logits/rejected": 0.1360977292060852, + "logps/chosen": -0.7659902572631836, + "logps/rejected": -4.108973026275635, + "loss": 0.6252, + "odds_ratio_loss": 0.395515114068985, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07659903168678284, + "rewards/margins": 0.3342982530593872, + "rewards/rejected": -0.41089728474617004, + "sft_loss": 0.7659902572631836, + "step": 824 + }, + { + "epoch": 1.1930585683297181, + "grad_norm": 2.948269959006474, + "learning_rate": 7.382759355410666e-06, + "logits/chosen": 0.178156316280365, + "logits/rejected": 0.2407047152519226, + "logps/chosen": -0.5645558834075928, + "logps/rejected": -2.222418785095215, + "loss": 0.549, + "odds_ratio_loss": 0.2562811076641083, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0564555898308754, + "rewards/margins": 0.16578629612922668, + "rewards/rejected": -0.22224187850952148, + "sft_loss": 0.5645558834075928, + "step": 825 + }, + { + "epoch": 1.1945046999276934, + "grad_norm": 2.874090383621196, + "learning_rate": 7.381100808849063e-06, + "logits/chosen": 0.10233473777770996, + "logits/rejected": 0.19571229815483093, + "logps/chosen": -0.39320099353790283, + "logps/rejected": -2.9694581031799316, + "loss": 0.5638, + "odds_ratio_loss": 0.1621977984905243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03932010382413864, + "rewards/margins": 0.2576256990432739, + "rewards/rejected": -0.29694584012031555, + "sft_loss": 0.39320099353790283, + "step": 826 + }, + { + "epoch": 1.1959508315256688, + "grad_norm": 2.9133806547736727, + "learning_rate": 7.379440223809189e-06, + "logits/chosen": 0.23779156804084778, + "logits/rejected": 0.08777079731225967, + "logps/chosen": -0.49866682291030884, + "logps/rejected": -2.9157798290252686, + "loss": 0.6402, + "odds_ratio_loss": 0.27906733751296997, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.049866683781147, + "rewards/margins": 0.24171128869056702, + "rewards/rejected": -0.2915779948234558, + "sft_loss": 0.49866682291030884, + "step": 827 + }, + { + "epoch": 1.1973969631236443, + "grad_norm": 2.2335902201771214, + "learning_rate": 7.377777601292219e-06, + "logits/chosen": 0.2867448031902313, + "logits/rejected": 0.14885875582695007, + "logps/chosen": -0.5436640381813049, + "logps/rejected": -3.3524608612060547, + "loss": 0.6432, + "odds_ratio_loss": 0.27895376086235046, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05436640605330467, + "rewards/margins": 0.2808796763420105, + "rewards/rejected": -0.3352460563182831, + "sft_loss": 0.5436640381813049, + "step": 828 + }, + { + "epoch": 1.1988430947216198, + "grad_norm": 3.199011954022099, + "learning_rate": 7.376112942300552e-06, + "logits/chosen": 0.015715528279542923, + "logits/rejected": 0.06243829429149628, + "logps/chosen": -0.7299444675445557, + "logps/rejected": -1.2420305013656616, + "loss": 0.6999, + "odds_ratio_loss": 0.4474690854549408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07299444824457169, + "rewards/margins": 0.051208604127168655, + "rewards/rejected": -0.12420305609703064, + "sft_loss": 0.7299444675445557, + "step": 829 + }, + { + "epoch": 1.200289226319595, + "grad_norm": 2.4971244149041527, + "learning_rate": 7.374446247837818e-06, + "logits/chosen": 0.08180706202983856, + "logits/rejected": 0.046968236565589905, + "logps/chosen": -0.6631790399551392, + "logps/rejected": -1.8908724784851074, + "loss": 0.6524, + "odds_ratio_loss": 0.3295513987541199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06631790101528168, + "rewards/margins": 0.12276935577392578, + "rewards/rejected": -0.18908724188804626, + "sft_loss": 0.6631790399551392, + "step": 830 + }, + { + "epoch": 1.2017353579175705, + "grad_norm": 2.174741373156066, + "learning_rate": 7.372777518908874e-06, + "logits/chosen": 0.08737226575613022, + "logits/rejected": 0.004193238914012909, + "logps/chosen": -0.6524173021316528, + "logps/rejected": -2.7812983989715576, + "loss": 0.5948, + "odds_ratio_loss": 0.36923202872276306, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.065241739153862, + "rewards/margins": 0.21288813650608063, + "rewards/rejected": -0.2781298756599426, + "sft_loss": 0.6524173021316528, + "step": 831 + }, + { + "epoch": 1.203181489515546, + "grad_norm": 2.7310295441448114, + "learning_rate": 7.371106756519802e-06, + "logits/chosen": 0.173319473862648, + "logits/rejected": 0.08408096432685852, + "logps/chosen": -0.6770601272583008, + "logps/rejected": -2.8480212688446045, + "loss": 0.6284, + "odds_ratio_loss": 0.3516097664833069, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06770601868629456, + "rewards/margins": 0.21709612011909485, + "rewards/rejected": -0.2848021388053894, + "sft_loss": 0.6770601272583008, + "step": 832 + }, + { + "epoch": 1.2046276211135214, + "grad_norm": 7.808474205778759, + "learning_rate": 7.369433961677911e-06, + "logits/chosen": 0.0008435901254415512, + "logits/rejected": -0.0506550632417202, + "logps/chosen": -0.8189026117324829, + "logps/rejected": -1.491754412651062, + "loss": 0.6586, + "odds_ratio_loss": 0.4845430850982666, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08189025521278381, + "rewards/margins": 0.06728518754243851, + "rewards/rejected": -0.14917545020580292, + "sft_loss": 0.8189026117324829, + "step": 833 + }, + { + "epoch": 1.2060737527114966, + "grad_norm": 2.728802488582667, + "learning_rate": 7.367759135391736e-06, + "logits/chosen": 0.08259500563144684, + "logits/rejected": 0.11314639449119568, + "logps/chosen": -0.6674097776412964, + "logps/rejected": -2.400236129760742, + "loss": 0.6741, + "odds_ratio_loss": 0.31944167613983154, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0667409747838974, + "rewards/margins": 0.17328262329101562, + "rewards/rejected": -0.24002361297607422, + "sft_loss": 0.6674097776412964, + "step": 834 + }, + { + "epoch": 1.207519884309472, + "grad_norm": 3.9207051998045026, + "learning_rate": 7.366082278671035e-06, + "logits/chosen": 0.026229331269860268, + "logits/rejected": 0.06131180375814438, + "logps/chosen": -0.5891038179397583, + "logps/rejected": -2.125972032546997, + "loss": 0.6595, + "odds_ratio_loss": 0.28627675771713257, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05891038104891777, + "rewards/margins": 0.15368682146072388, + "rewards/rejected": -0.21259720623493195, + "sft_loss": 0.5891038179397583, + "step": 835 + }, + { + "epoch": 1.2089660159074476, + "grad_norm": 2.5431982766029866, + "learning_rate": 7.364403392526792e-06, + "logits/chosen": 0.0870114117860794, + "logits/rejected": 0.11938456445932388, + "logps/chosen": -0.618964433670044, + "logps/rejected": -1.597392201423645, + "loss": 0.6361, + "odds_ratio_loss": 0.33390533924102783, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.061896443367004395, + "rewards/margins": 0.09784276783466339, + "rewards/rejected": -0.15973922610282898, + "sft_loss": 0.618964433670044, + "step": 836 + }, + { + "epoch": 1.210412147505423, + "grad_norm": 4.077563662601138, + "learning_rate": 7.362722477971212e-06, + "logits/chosen": 0.16533887386322021, + "logits/rejected": 0.19099020957946777, + "logps/chosen": -0.6469172239303589, + "logps/rejected": -1.096304178237915, + "loss": 0.6717, + "odds_ratio_loss": 0.458132803440094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06469172239303589, + "rewards/margins": 0.044938698410987854, + "rewards/rejected": -0.10963042080402374, + "sft_loss": 0.6469172239303589, + "step": 837 + }, + { + "epoch": 1.2118582791033985, + "grad_norm": 4.144574279732616, + "learning_rate": 7.3610395360177265e-06, + "logits/chosen": 0.23874233663082123, + "logits/rejected": 0.1909375786781311, + "logps/chosen": -0.4308343529701233, + "logps/rejected": -4.907786846160889, + "loss": 0.6455, + "odds_ratio_loss": 0.2326105535030365, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04308343678712845, + "rewards/margins": 0.447695255279541, + "rewards/rejected": -0.49077871441841125, + "sft_loss": 0.4308343529701233, + "step": 838 + }, + { + "epoch": 1.2133044107013737, + "grad_norm": 2.572402444131962, + "learning_rate": 7.359354567680988e-06, + "logits/chosen": 0.02161214128136635, + "logits/rejected": 1.9058585166931152e-05, + "logps/chosen": -0.6865906715393066, + "logps/rejected": -3.0387065410614014, + "loss": 0.571, + "odds_ratio_loss": 0.32448068261146545, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06865906715393066, + "rewards/margins": 0.2352115958929062, + "rewards/rejected": -0.30387064814567566, + "sft_loss": 0.6865906715393066, + "step": 839 + }, + { + "epoch": 1.2147505422993492, + "grad_norm": 2.639023550297937, + "learning_rate": 7.357667573976868e-06, + "logits/chosen": 0.1299152374267578, + "logits/rejected": 0.1438194215297699, + "logps/chosen": -0.6337484121322632, + "logps/rejected": -2.1571292877197266, + "loss": 0.6384, + "odds_ratio_loss": 0.32844048738479614, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0633748471736908, + "rewards/margins": 0.15233808755874634, + "rewards/rejected": -0.21571293473243713, + "sft_loss": 0.6337484121322632, + "step": 840 + }, + { + "epoch": 1.2161966738973247, + "grad_norm": 2.355187398625417, + "learning_rate": 7.355978555922462e-06, + "logits/chosen": 0.17293739318847656, + "logits/rejected": 0.14240939915180206, + "logps/chosen": -0.7368587255477905, + "logps/rejected": -1.4114584922790527, + "loss": 0.6926, + "odds_ratio_loss": 0.5368320345878601, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07368587702512741, + "rewards/margins": 0.06745997071266174, + "rewards/rejected": -0.14114584028720856, + "sft_loss": 0.7368587255477905, + "step": 841 + }, + { + "epoch": 1.2176428054953001, + "grad_norm": 2.3515135229234385, + "learning_rate": 7.354287514536086e-06, + "logits/chosen": -0.12064902484416962, + "logits/rejected": -0.07363397628068924, + "logps/chosen": -0.639882504940033, + "logps/rejected": -2.084611177444458, + "loss": 0.6079, + "odds_ratio_loss": 0.3934566378593445, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06398826092481613, + "rewards/margins": 0.1444728672504425, + "rewards/rejected": -0.20846112072467804, + "sft_loss": 0.639882504940033, + "step": 842 + }, + { + "epoch": 1.2190889370932756, + "grad_norm": 5.7804051421541995, + "learning_rate": 7.352594450837275e-06, + "logits/chosen": 0.14320078492164612, + "logits/rejected": 0.13621798157691956, + "logps/chosen": -0.6584324240684509, + "logps/rejected": -3.753617763519287, + "loss": 0.6103, + "odds_ratio_loss": 0.31743061542510986, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06584323942661285, + "rewards/margins": 0.3095185458660126, + "rewards/rejected": -0.37536177039146423, + "sft_loss": 0.6584324240684509, + "step": 843 + }, + { + "epoch": 1.2205350686912508, + "grad_norm": 2.2438557449285605, + "learning_rate": 7.350899365846783e-06, + "logits/chosen": 0.0793205052614212, + "logits/rejected": 0.06837757676839828, + "logps/chosen": -0.6042904853820801, + "logps/rejected": -3.4073262214660645, + "loss": 0.5716, + "odds_ratio_loss": 0.26326024532318115, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.060429058969020844, + "rewards/margins": 0.2803035378456116, + "rewards/rejected": -0.3407326340675354, + "sft_loss": 0.6042904853820801, + "step": 844 + }, + { + "epoch": 1.2219812002892263, + "grad_norm": 5.631401098009144, + "learning_rate": 7.349202260586583e-06, + "logits/chosen": 0.16910803318023682, + "logits/rejected": 0.1033957228064537, + "logps/chosen": -0.5252476930618286, + "logps/rejected": -3.475262403488159, + "loss": 0.5537, + "odds_ratio_loss": 0.36323004961013794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.052524764090776443, + "rewards/margins": 0.29500147700309753, + "rewards/rejected": -0.3475262522697449, + "sft_loss": 0.5252476930618286, + "step": 845 + }, + { + "epoch": 1.2234273318872018, + "grad_norm": 2.3919449248178375, + "learning_rate": 7.3475031360798675e-06, + "logits/chosen": 0.021080223843455315, + "logits/rejected": 0.05137103796005249, + "logps/chosen": -0.7238802909851074, + "logps/rejected": -1.834083080291748, + "loss": 0.6581, + "odds_ratio_loss": 0.3619506359100342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07238802313804626, + "rewards/margins": 0.1110202968120575, + "rewards/rejected": -0.18340831995010376, + "sft_loss": 0.7238802909851074, + "step": 846 + }, + { + "epoch": 1.2248734634851772, + "grad_norm": 2.401756254973007, + "learning_rate": 7.345801993351043e-06, + "logits/chosen": 0.1136443018913269, + "logits/rejected": 0.09337516129016876, + "logps/chosen": -0.508891761302948, + "logps/rejected": -1.8448481559753418, + "loss": 0.5723, + "odds_ratio_loss": 0.3853185772895813, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05088917538523674, + "rewards/margins": 0.13359564542770386, + "rewards/rejected": -0.1844848245382309, + "sft_loss": 0.508891761302948, + "step": 847 + }, + { + "epoch": 1.2263195950831527, + "grad_norm": 2.210151721154268, + "learning_rate": 7.344098833425736e-06, + "logits/chosen": 0.04942498356103897, + "logits/rejected": 0.1343136578798294, + "logps/chosen": -0.5119268298149109, + "logps/rejected": -3.112112522125244, + "loss": 0.6502, + "odds_ratio_loss": 0.22073283791542053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05119268596172333, + "rewards/margins": 0.26001858711242676, + "rewards/rejected": -0.3112112283706665, + "sft_loss": 0.5119268298149109, + "step": 848 + }, + { + "epoch": 1.227765726681128, + "grad_norm": 2.186487322506283, + "learning_rate": 7.342393657330786e-06, + "logits/chosen": 0.08392804116010666, + "logits/rejected": 0.09755400568246841, + "logps/chosen": -0.6143007874488831, + "logps/rejected": -2.7268826961517334, + "loss": 0.6268, + "odds_ratio_loss": 0.3477764427661896, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.061430081725120544, + "rewards/margins": 0.21125821769237518, + "rewards/rejected": -0.27268826961517334, + "sft_loss": 0.6143007874488831, + "step": 849 + }, + { + "epoch": 1.2292118582791034, + "grad_norm": 2.5756144440337003, + "learning_rate": 7.340686466094253e-06, + "logits/chosen": 0.01365756243467331, + "logits/rejected": -0.015208684839308262, + "logps/chosen": -0.6614384055137634, + "logps/rejected": -1.732398271560669, + "loss": 0.59, + "odds_ratio_loss": 0.4114297926425934, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06614383310079575, + "rewards/margins": 0.10709600150585175, + "rewards/rejected": -0.1732398420572281, + "sft_loss": 0.6614384055137634, + "step": 850 + }, + { + "epoch": 1.2306579898770789, + "grad_norm": 3.2850407881963646, + "learning_rate": 7.338977260745408e-06, + "logits/chosen": 0.06653488427400589, + "logits/rejected": 0.11970080435276031, + "logps/chosen": -0.6355282068252563, + "logps/rejected": -2.975282669067383, + "loss": 0.6661, + "odds_ratio_loss": 0.18445342779159546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06355281919240952, + "rewards/margins": 0.23397547006607056, + "rewards/rejected": -0.29752829670906067, + "sft_loss": 0.6355282068252563, + "step": 851 + }, + { + "epoch": 1.2321041214750543, + "grad_norm": 3.3087787505793074, + "learning_rate": 7.337266042314736e-06, + "logits/chosen": -0.11909240484237671, + "logits/rejected": -0.003269646316766739, + "logps/chosen": -0.5766974687576294, + "logps/rejected": -1.5956358909606934, + "loss": 0.5398, + "odds_ratio_loss": 0.3452081084251404, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0576697513461113, + "rewards/margins": 0.1018938422203064, + "rewards/rejected": -0.1595635861158371, + "sft_loss": 0.5766974687576294, + "step": 852 + }, + { + "epoch": 1.2335502530730296, + "grad_norm": 2.2720728280558182, + "learning_rate": 7.335552811833938e-06, + "logits/chosen": 0.09772266447544098, + "logits/rejected": 0.05156873166561127, + "logps/chosen": -0.7244196534156799, + "logps/rejected": -1.5867480039596558, + "loss": 0.5883, + "odds_ratio_loss": 0.4903804659843445, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.072441965341568, + "rewards/margins": 0.08623284101486206, + "rewards/rejected": -0.15867480635643005, + "sft_loss": 0.7244196534156799, + "step": 853 + }, + { + "epoch": 1.234996384671005, + "grad_norm": 4.799589917880964, + "learning_rate": 7.333837570335926e-06, + "logits/chosen": 0.17452943325042725, + "logits/rejected": 0.13533267378807068, + "logps/chosen": -0.41512399911880493, + "logps/rejected": -4.827322959899902, + "loss": 0.603, + "odds_ratio_loss": 0.2235361635684967, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04151239991188049, + "rewards/margins": 0.44121992588043213, + "rewards/rejected": -0.4827323257923126, + "sft_loss": 0.41512399911880493, + "step": 854 + }, + { + "epoch": 1.2364425162689805, + "grad_norm": 2.1998808847443017, + "learning_rate": 7.332120318854828e-06, + "logits/chosen": 0.16299928724765778, + "logits/rejected": 0.12603971362113953, + "logps/chosen": -0.6076937317848206, + "logps/rejected": -2.386244297027588, + "loss": 0.6123, + "odds_ratio_loss": 0.3471486568450928, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.060769371688365936, + "rewards/margins": 0.17785507440567017, + "rewards/rejected": -0.2386244535446167, + "sft_loss": 0.6076937317848206, + "step": 855 + }, + { + "epoch": 1.237888647866956, + "grad_norm": 4.333732122428708, + "learning_rate": 7.330401058425978e-06, + "logits/chosen": 0.1115557998418808, + "logits/rejected": 0.03767241910099983, + "logps/chosen": -0.6454582214355469, + "logps/rejected": -4.050790309906006, + "loss": 0.5584, + "odds_ratio_loss": 0.39332228899002075, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06454582512378693, + "rewards/margins": 0.34053319692611694, + "rewards/rejected": -0.4050790071487427, + "sft_loss": 0.6454582214355469, + "step": 856 + }, + { + "epoch": 1.2393347794649312, + "grad_norm": 2.325898041501977, + "learning_rate": 7.328679790085928e-06, + "logits/chosen": 0.233146071434021, + "logits/rejected": 0.11350773274898529, + "logps/chosen": -0.45263344049453735, + "logps/rejected": -2.9898529052734375, + "loss": 0.5872, + "odds_ratio_loss": 0.2763618230819702, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.045263346284627914, + "rewards/margins": 0.2537219226360321, + "rewards/rejected": -0.2989852726459503, + "sft_loss": 0.45263344049453735, + "step": 857 + }, + { + "epoch": 1.2407809110629067, + "grad_norm": 3.1302938440980874, + "learning_rate": 7.326956514872434e-06, + "logits/chosen": -0.018604222685098648, + "logits/rejected": 0.06933964788913727, + "logps/chosen": -0.48552218079566956, + "logps/rejected": -1.1437679529190063, + "loss": 0.6535, + "odds_ratio_loss": 0.3590165376663208, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.048552218824625015, + "rewards/margins": 0.06582456827163696, + "rewards/rejected": -0.11437679827213287, + "sft_loss": 0.48552218079566956, + "step": 858 + }, + { + "epoch": 1.2422270426608821, + "grad_norm": 2.353039250439809, + "learning_rate": 7.325231233824465e-06, + "logits/chosen": 0.09792876243591309, + "logits/rejected": 0.12561389803886414, + "logps/chosen": -0.5376712679862976, + "logps/rejected": -2.5611867904663086, + "loss": 0.5828, + "odds_ratio_loss": 0.2520514130592346, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.053767129778862, + "rewards/margins": 0.20235158503055573, + "rewards/rejected": -0.2561187148094177, + "sft_loss": 0.5376712679862976, + "step": 859 + }, + { + "epoch": 1.2436731742588576, + "grad_norm": 2.7402418071243697, + "learning_rate": 7.323503947982203e-06, + "logits/chosen": 0.22483845055103302, + "logits/rejected": 0.20443016290664673, + "logps/chosen": -0.34170234203338623, + "logps/rejected": -2.6574883460998535, + "loss": 0.5197, + "odds_ratio_loss": 0.19146178662776947, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.034170232713222504, + "rewards/margins": 0.23157860338687897, + "rewards/rejected": -0.2657488286495209, + "sft_loss": 0.34170234203338623, + "step": 860 + }, + { + "epoch": 1.245119305856833, + "grad_norm": 2.1857929122043616, + "learning_rate": 7.3217746583870315e-06, + "logits/chosen": 0.0867963433265686, + "logits/rejected": 0.08815450221300125, + "logps/chosen": -0.49371716380119324, + "logps/rejected": -3.03228497505188, + "loss": 0.6737, + "odds_ratio_loss": 0.2313099354505539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.049371711909770966, + "rewards/margins": 0.2538568079471588, + "rewards/rejected": -0.303228497505188, + "sft_loss": 0.49371716380119324, + "step": 861 + }, + { + "epoch": 1.2465654374548083, + "grad_norm": 4.024529157035028, + "learning_rate": 7.3200433660815474e-06, + "logits/chosen": 0.11046599596738815, + "logits/rejected": 0.059477321803569794, + "logps/chosen": -0.6523793935775757, + "logps/rejected": -1.849184513092041, + "loss": 0.6839, + "odds_ratio_loss": 0.36109447479248047, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06523793935775757, + "rewards/margins": 0.11968051642179489, + "rewards/rejected": -0.18491846323013306, + "sft_loss": 0.6523793935775757, + "step": 862 + }, + { + "epoch": 1.2480115690527838, + "grad_norm": 3.176670393350689, + "learning_rate": 7.318310072109552e-06, + "logits/chosen": 0.07282102853059769, + "logits/rejected": 0.19558274745941162, + "logps/chosen": -0.5407617688179016, + "logps/rejected": -2.6236157417297363, + "loss": 0.6409, + "odds_ratio_loss": 0.20967018604278564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0540761798620224, + "rewards/margins": 0.2082854062318802, + "rewards/rejected": -0.2623615860939026, + "sft_loss": 0.5407617688179016, + "step": 863 + }, + { + "epoch": 1.2494577006507592, + "grad_norm": 2.8304016651309754, + "learning_rate": 7.3165747775160555e-06, + "logits/chosen": 0.1903497278690338, + "logits/rejected": 0.07230844348669052, + "logps/chosen": -0.712563693523407, + "logps/rejected": -3.380535364151001, + "loss": 0.5968, + "odds_ratio_loss": 0.35233545303344727, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0712563768029213, + "rewards/margins": 0.26679715514183044, + "rewards/rejected": -0.33805355429649353, + "sft_loss": 0.712563693523407, + "step": 864 + }, + { + "epoch": 1.2509038322487347, + "grad_norm": 2.4209488023330996, + "learning_rate": 7.3148374833472746e-06, + "logits/chosen": 0.1617758572101593, + "logits/rejected": 0.08497782796621323, + "logps/chosen": -0.6493286490440369, + "logps/rejected": -3.3906636238098145, + "loss": 0.6008, + "odds_ratio_loss": 0.4235643148422241, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06493286788463593, + "rewards/margins": 0.27413350343704224, + "rewards/rejected": -0.33906635642051697, + "sft_loss": 0.6493286490440369, + "step": 865 + }, + { + "epoch": 1.2523499638467102, + "grad_norm": 2.499704939420737, + "learning_rate": 7.313098190650627e-06, + "logits/chosen": -0.037707388401031494, + "logits/rejected": 0.029249371960759163, + "logps/chosen": -0.7335867881774902, + "logps/rejected": -1.2428430318832397, + "loss": 0.6596, + "odds_ratio_loss": 0.46961063146591187, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0733586773276329, + "rewards/margins": 0.050925616174936295, + "rewards/rejected": -0.1242842972278595, + "sft_loss": 0.7335867881774902, + "step": 866 + }, + { + "epoch": 1.2537960954446854, + "grad_norm": 3.162650037572366, + "learning_rate": 7.311356900474743e-06, + "logits/chosen": 0.18778613209724426, + "logits/rejected": 0.14763246476650238, + "logps/chosen": -0.5227529406547546, + "logps/rejected": -2.191002607345581, + "loss": 0.5999, + "odds_ratio_loss": 0.2827909588813782, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.052275292575359344, + "rewards/margins": 0.16682496666908264, + "rewards/rejected": -0.2191002517938614, + "sft_loss": 0.5227529406547546, + "step": 867 + }, + { + "epoch": 1.2552422270426609, + "grad_norm": 3.1734084703080048, + "learning_rate": 7.30961361386945e-06, + "logits/chosen": 0.22620001435279846, + "logits/rejected": 0.28974610567092896, + "logps/chosen": -0.631101131439209, + "logps/rejected": -1.6538848876953125, + "loss": 0.5449, + "odds_ratio_loss": 0.3684981167316437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0631101131439209, + "rewards/margins": 0.10227837413549423, + "rewards/rejected": -0.16538847982883453, + "sft_loss": 0.631101131439209, + "step": 868 + }, + { + "epoch": 1.2566883586406363, + "grad_norm": 3.5174474662803465, + "learning_rate": 7.307868331885783e-06, + "logits/chosen": 0.15043975412845612, + "logits/rejected": 0.09274716675281525, + "logps/chosen": -0.5229210257530212, + "logps/rejected": -3.506356954574585, + "loss": 0.5961, + "odds_ratio_loss": 0.30983737111091614, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.052292101085186005, + "rewards/margins": 0.29834359884262085, + "rewards/rejected": -0.35063570737838745, + "sft_loss": 0.5229210257530212, + "step": 869 + }, + { + "epoch": 1.2581344902386118, + "grad_norm": 3.329767820424892, + "learning_rate": 7.306121055575979e-06, + "logits/chosen": 0.17091235518455505, + "logits/rejected": 0.05310794711112976, + "logps/chosen": -0.48885834217071533, + "logps/rejected": -2.341912269592285, + "loss": 0.5332, + "odds_ratio_loss": 0.3056949973106384, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04888583719730377, + "rewards/margins": 0.1853054016828537, + "rewards/rejected": -0.23419123888015747, + "sft_loss": 0.48885834217071533, + "step": 870 + }, + { + "epoch": 1.2595806218365873, + "grad_norm": 2.421934812884417, + "learning_rate": 7.304371785993478e-06, + "logits/chosen": 0.07035940140485764, + "logits/rejected": 0.057187557220458984, + "logps/chosen": -0.5910875797271729, + "logps/rejected": -3.1442012786865234, + "loss": 0.655, + "odds_ratio_loss": 0.26056718826293945, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.059108760207891464, + "rewards/margins": 0.25531139969825745, + "rewards/rejected": -0.3144201636314392, + "sft_loss": 0.5910875797271729, + "step": 871 + }, + { + "epoch": 1.2610267534345625, + "grad_norm": 5.4366957185017055, + "learning_rate": 7.302620524192919e-06, + "logits/chosen": 0.1306525021791458, + "logits/rejected": -0.046006545424461365, + "logps/chosen": -0.4835171401500702, + "logps/rejected": -3.9955079555511475, + "loss": 0.5438, + "odds_ratio_loss": 0.2657640874385834, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0483517125248909, + "rewards/margins": 0.35119912028312683, + "rewards/rejected": -0.39955076575279236, + "sft_loss": 0.4835171401500702, + "step": 872 + }, + { + "epoch": 1.262472885032538, + "grad_norm": 4.732128151149686, + "learning_rate": 7.300867271230147e-06, + "logits/chosen": -0.02715367265045643, + "logits/rejected": 0.008712584152817726, + "logps/chosen": -0.7222077250480652, + "logps/rejected": -2.355595827102661, + "loss": 0.5656, + "odds_ratio_loss": 0.2727832794189453, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07222076505422592, + "rewards/margins": 0.1633388102054596, + "rewards/rejected": -0.2355595827102661, + "sft_loss": 0.7222077250480652, + "step": 873 + }, + { + "epoch": 1.2639190166305134, + "grad_norm": 2.946079569555965, + "learning_rate": 7.299112028162202e-06, + "logits/chosen": 0.12276051938533783, + "logits/rejected": 0.09081517904996872, + "logps/chosen": -0.6572151184082031, + "logps/rejected": -3.947697877883911, + "loss": 0.6366, + "odds_ratio_loss": 0.37073010206222534, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06572151184082031, + "rewards/margins": 0.3290482759475708, + "rewards/rejected": -0.3947698175907135, + "sft_loss": 0.6572151184082031, + "step": 874 + }, + { + "epoch": 1.2653651482284887, + "grad_norm": 3.824117606600312, + "learning_rate": 7.297354796047329e-06, + "logits/chosen": 0.1744898557662964, + "logits/rejected": 0.1692255735397339, + "logps/chosen": -0.4023602604866028, + "logps/rejected": -3.4393415451049805, + "loss": 0.6831, + "odds_ratio_loss": 0.22353234887123108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04023602232336998, + "rewards/margins": 0.3036981225013733, + "rewards/rejected": -0.34393414855003357, + "sft_loss": 0.4023602604866028, + "step": 875 + }, + { + "epoch": 1.2668112798264641, + "grad_norm": 2.9230987462682227, + "learning_rate": 7.295595575944968e-06, + "logits/chosen": 0.11851765215396881, + "logits/rejected": 0.15573816001415253, + "logps/chosen": -0.5971537232398987, + "logps/rejected": -2.4105782508850098, + "loss": 0.6128, + "odds_ratio_loss": 0.3604203164577484, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05971537530422211, + "rewards/margins": 0.1813424527645111, + "rewards/rejected": -0.24105781316757202, + "sft_loss": 0.5971537232398987, + "step": 876 + }, + { + "epoch": 1.2682574114244396, + "grad_norm": 3.7641940234842592, + "learning_rate": 7.293834368915762e-06, + "logits/chosen": 0.17587612569332123, + "logits/rejected": 0.11586427688598633, + "logps/chosen": -0.6683433651924133, + "logps/rejected": -1.2942430973052979, + "loss": 0.6081, + "odds_ratio_loss": 0.40443292260169983, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06683434545993805, + "rewards/margins": 0.06258997321128845, + "rewards/rejected": -0.1294243186712265, + "sft_loss": 0.6683433651924133, + "step": 877 + }, + { + "epoch": 1.269703543022415, + "grad_norm": 2.7147629034863483, + "learning_rate": 7.292071176021546e-06, + "logits/chosen": 0.1754606068134308, + "logits/rejected": 0.17133712768554688, + "logps/chosen": -0.6982166767120361, + "logps/rejected": -2.0608537197113037, + "loss": 0.6528, + "odds_ratio_loss": 0.39273586869239807, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06982167065143585, + "rewards/margins": 0.13626369833946228, + "rewards/rejected": -0.20608535408973694, + "sft_loss": 0.6982166767120361, + "step": 878 + }, + { + "epoch": 1.2711496746203905, + "grad_norm": 2.362146428523586, + "learning_rate": 7.2903059983253575e-06, + "logits/chosen": 0.3036377429962158, + "logits/rejected": 0.21588896214962006, + "logps/chosen": -0.5209940671920776, + "logps/rejected": -2.4526965618133545, + "loss": 0.5407, + "odds_ratio_loss": 0.33587196469306946, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.052099406719207764, + "rewards/margins": 0.19317024946212769, + "rewards/rejected": -0.24526965618133545, + "sft_loss": 0.5209940671920776, + "step": 879 + }, + { + "epoch": 1.2725958062183658, + "grad_norm": 2.5454177758478496, + "learning_rate": 7.288538836891428e-06, + "logits/chosen": 0.06093733385205269, + "logits/rejected": 0.07876378297805786, + "logps/chosen": -0.6048973798751831, + "logps/rejected": -3.196272611618042, + "loss": 0.5978, + "odds_ratio_loss": 0.28094661235809326, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06048973649740219, + "rewards/margins": 0.25913751125335693, + "rewards/rejected": -0.31962722539901733, + "sft_loss": 0.6048973798751831, + "step": 880 + }, + { + "epoch": 1.2740419378163412, + "grad_norm": 4.714093464026552, + "learning_rate": 7.286769692785185e-06, + "logits/chosen": 0.12928709387779236, + "logits/rejected": 0.0929189920425415, + "logps/chosen": -0.6182774305343628, + "logps/rejected": -1.788148045539856, + "loss": 0.6916, + "odds_ratio_loss": 0.5657252073287964, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.061827752739191055, + "rewards/margins": 0.11698705703020096, + "rewards/rejected": -0.17881479859352112, + "sft_loss": 0.6182774305343628, + "step": 881 + }, + { + "epoch": 1.2754880694143167, + "grad_norm": 2.425544165899364, + "learning_rate": 7.284998567073254e-06, + "logits/chosen": 0.1605134755373001, + "logits/rejected": 0.12401594966650009, + "logps/chosen": -0.4902043640613556, + "logps/rejected": -2.4860270023345947, + "loss": 0.6171, + "odds_ratio_loss": 0.24903835356235504, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0490204393863678, + "rewards/margins": 0.1995822638273239, + "rewards/rejected": -0.2486027032136917, + "sft_loss": 0.4902043640613556, + "step": 882 + }, + { + "epoch": 1.2769342010122922, + "grad_norm": 2.4177056064353937, + "learning_rate": 7.283225460823452e-06, + "logits/chosen": 0.15278860926628113, + "logits/rejected": 0.09883365780115128, + "logps/chosen": -0.4777308702468872, + "logps/rejected": -3.172642707824707, + "loss": 0.6086, + "odds_ratio_loss": 0.2730526626110077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0477730855345726, + "rewards/margins": 0.26949119567871094, + "rewards/rejected": -0.31726425886154175, + "sft_loss": 0.4777308702468872, + "step": 883 + }, + { + "epoch": 1.2783803326102676, + "grad_norm": 2.870771462027815, + "learning_rate": 7.281450375104792e-06, + "logits/chosen": 0.11632952094078064, + "logits/rejected": 0.08604128658771515, + "logps/chosen": -0.7169528007507324, + "logps/rejected": -2.1212775707244873, + "loss": 0.6727, + "odds_ratio_loss": 0.43706566095352173, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07169528305530548, + "rewards/margins": 0.1404324769973755, + "rewards/rejected": -0.21212777495384216, + "sft_loss": 0.7169528007507324, + "step": 884 + }, + { + "epoch": 1.2798264642082429, + "grad_norm": 2.654421440453263, + "learning_rate": 7.2796733109874785e-06, + "logits/chosen": 0.16258105635643005, + "logits/rejected": 0.170535147190094, + "logps/chosen": -0.5291702747344971, + "logps/rejected": -2.99528431892395, + "loss": 0.6692, + "odds_ratio_loss": 0.2050587683916092, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05291702598333359, + "rewards/margins": 0.24661138653755188, + "rewards/rejected": -0.29952841997146606, + "sft_loss": 0.5291702747344971, + "step": 885 + }, + { + "epoch": 1.2812725958062183, + "grad_norm": 6.212461882229058, + "learning_rate": 7.277894269542912e-06, + "logits/chosen": 0.1574457436800003, + "logits/rejected": 0.021832166239619255, + "logps/chosen": -0.5372164249420166, + "logps/rejected": -3.0358946323394775, + "loss": 0.682, + "odds_ratio_loss": 0.29545292258262634, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05372164770960808, + "rewards/margins": 0.24986781179904938, + "rewards/rejected": -0.30358946323394775, + "sft_loss": 0.5372164249420166, + "step": 886 + }, + { + "epoch": 1.2827187274041938, + "grad_norm": 2.733381355486248, + "learning_rate": 7.2761132518436825e-06, + "logits/chosen": 0.1425001621246338, + "logits/rejected": 0.16708292067050934, + "logps/chosen": -0.498007595539093, + "logps/rejected": -2.4287235736846924, + "loss": 0.551, + "odds_ratio_loss": 0.21452642977237701, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04980075731873512, + "rewards/margins": 0.19307160377502441, + "rewards/rejected": -0.24287235736846924, + "sft_loss": 0.498007595539093, + "step": 887 + }, + { + "epoch": 1.2841648590021693, + "grad_norm": 2.302702730393573, + "learning_rate": 7.274330258963571e-06, + "logits/chosen": 0.2424948513507843, + "logits/rejected": 0.10700159519910812, + "logps/chosen": -0.43079856038093567, + "logps/rejected": -3.0819058418273926, + "loss": 0.6015, + "odds_ratio_loss": 0.19200022518634796, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04307985678315163, + "rewards/margins": 0.26511070132255554, + "rewards/rejected": -0.30819055438041687, + "sft_loss": 0.43079856038093567, + "step": 888 + }, + { + "epoch": 1.2856109906001447, + "grad_norm": 2.612798051257584, + "learning_rate": 7.272545291977551e-06, + "logits/chosen": 0.08881522715091705, + "logits/rejected": 0.05702493339776993, + "logps/chosen": -0.5296792387962341, + "logps/rejected": -3.994205951690674, + "loss": 0.6511, + "odds_ratio_loss": 0.289590448141098, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05296792834997177, + "rewards/margins": 0.34645265340805054, + "rewards/rejected": -0.3994206190109253, + "sft_loss": 0.5296792387962341, + "step": 889 + }, + { + "epoch": 1.28705712219812, + "grad_norm": 2.9782311108855244, + "learning_rate": 7.270758351961787e-06, + "logits/chosen": 0.026416288688778877, + "logits/rejected": 0.03395572304725647, + "logps/chosen": -0.6140929460525513, + "logps/rejected": -2.65456223487854, + "loss": 0.5723, + "odds_ratio_loss": 0.3684936761856079, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06140929460525513, + "rewards/margins": 0.20404693484306335, + "rewards/rejected": -0.2654562294483185, + "sft_loss": 0.6140929460525513, + "step": 890 + }, + { + "epoch": 1.2885032537960954, + "grad_norm": 2.589171345600719, + "learning_rate": 7.268969439993631e-06, + "logits/chosen": 0.1916169971227646, + "logits/rejected": 0.04407678171992302, + "logps/chosen": -0.5033664703369141, + "logps/rejected": -2.073901891708374, + "loss": 0.7008, + "odds_ratio_loss": 0.2652108669281006, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05033664405345917, + "rewards/margins": 0.15705353021621704, + "rewards/rejected": -0.2073901891708374, + "sft_loss": 0.5033664703369141, + "step": 891 + }, + { + "epoch": 1.289949385394071, + "grad_norm": 5.374295907094805, + "learning_rate": 7.267178557151625e-06, + "logits/chosen": 0.142109215259552, + "logits/rejected": 0.191066175699234, + "logps/chosen": -0.4820294976234436, + "logps/rejected": -2.943162441253662, + "loss": 0.6246, + "odds_ratio_loss": 0.3385387361049652, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04820295050740242, + "rewards/margins": 0.24611330032348633, + "rewards/rejected": -0.29431626200675964, + "sft_loss": 0.4820294976234436, + "step": 892 + }, + { + "epoch": 1.2913955169920464, + "grad_norm": 3.3410095444556873, + "learning_rate": 7.265385704515498e-06, + "logits/chosen": 0.05013919621706009, + "logits/rejected": 0.16960841417312622, + "logps/chosen": -0.7405865788459778, + "logps/rejected": -2.0694758892059326, + "loss": 0.6052, + "odds_ratio_loss": 0.41710177063941956, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0740586593747139, + "rewards/margins": 0.13288892805576324, + "rewards/rejected": -0.20694757997989655, + "sft_loss": 0.7405865788459778, + "step": 893 + }, + { + "epoch": 1.2928416485900218, + "grad_norm": 2.5037443157900823, + "learning_rate": 7.263590883166168e-06, + "logits/chosen": 0.10009223222732544, + "logits/rejected": 0.14507119357585907, + "logps/chosen": -0.433633029460907, + "logps/rejected": -1.8324575424194336, + "loss": 0.594, + "odds_ratio_loss": 0.27205413579940796, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0433633029460907, + "rewards/margins": 0.13988244533538818, + "rewards/rejected": -0.18324576318264008, + "sft_loss": 0.433633029460907, + "step": 894 + }, + { + "epoch": 1.294287780187997, + "grad_norm": 2.6618437917755156, + "learning_rate": 7.2617940941857395e-06, + "logits/chosen": -0.0036581484600901604, + "logits/rejected": 0.12767058610916138, + "logps/chosen": -0.5401878356933594, + "logps/rejected": -2.6623241901397705, + "loss": 0.5868, + "odds_ratio_loss": 0.21899092197418213, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0540187805891037, + "rewards/margins": 0.2122136503458023, + "rewards/rejected": -0.266232430934906, + "sft_loss": 0.5401878356933594, + "step": 895 + }, + { + "epoch": 1.2957339117859725, + "grad_norm": 3.7716031429476997, + "learning_rate": 7.259995338657504e-06, + "logits/chosen": 0.06534141302108765, + "logits/rejected": 0.06475174427032471, + "logps/chosen": -0.6003227829933167, + "logps/rejected": -2.5519890785217285, + "loss": 0.7213, + "odds_ratio_loss": 0.39002418518066406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.060032278299331665, + "rewards/margins": 0.19516664743423462, + "rewards/rejected": -0.25519895553588867, + "sft_loss": 0.6003227829933167, + "step": 896 + }, + { + "epoch": 1.297180043383948, + "grad_norm": 2.794184064220952, + "learning_rate": 7.258194617665937e-06, + "logits/chosen": 0.1161397323012352, + "logits/rejected": 0.14191779494285583, + "logps/chosen": -0.6059271097183228, + "logps/rejected": -2.318582534790039, + "loss": 0.6352, + "odds_ratio_loss": 0.42859724164009094, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.060592710971832275, + "rewards/margins": 0.17126552760601044, + "rewards/rejected": -0.2318582534790039, + "sft_loss": 0.6059271097183228, + "step": 897 + }, + { + "epoch": 1.2986261749819232, + "grad_norm": 2.3562388178644915, + "learning_rate": 7.256391932296701e-06, + "logits/chosen": -0.09829920530319214, + "logits/rejected": 0.0967685803771019, + "logps/chosen": -0.5340638160705566, + "logps/rejected": -2.656466245651245, + "loss": 0.6395, + "odds_ratio_loss": 0.22315004467964172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.053406380116939545, + "rewards/margins": 0.21224026381969452, + "rewards/rejected": -0.26564663648605347, + "sft_loss": 0.5340638160705566, + "step": 898 + }, + { + "epoch": 1.3000723065798987, + "grad_norm": 3.4290264601170573, + "learning_rate": 7.25458728363664e-06, + "logits/chosen": 0.10071888566017151, + "logits/rejected": 0.07806149125099182, + "logps/chosen": -0.6966016888618469, + "logps/rejected": -3.5601463317871094, + "loss": 0.6549, + "odds_ratio_loss": 0.31283316016197205, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06966017186641693, + "rewards/margins": 0.2863544821739197, + "rewards/rejected": -0.3560146689414978, + "sft_loss": 0.6966016888618469, + "step": 899 + }, + { + "epoch": 1.3015184381778742, + "grad_norm": 2.5914762182916586, + "learning_rate": 7.252780672773785e-06, + "logits/chosen": 0.0018359817331656814, + "logits/rejected": 0.041274283081293106, + "logps/chosen": -0.592832624912262, + "logps/rejected": -2.7108519077301025, + "loss": 0.6497, + "odds_ratio_loss": 0.2760257422924042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.059283267706632614, + "rewards/margins": 0.2118019312620163, + "rewards/rejected": -0.2710852026939392, + "sft_loss": 0.592832624912262, + "step": 900 + }, + { + "epoch": 1.3029645697758496, + "grad_norm": 2.512930115177367, + "learning_rate": 7.250972100797347e-06, + "logits/chosen": 0.14943164587020874, + "logits/rejected": 0.04612865671515465, + "logps/chosen": -0.7253419160842896, + "logps/rejected": -2.070033073425293, + "loss": 0.5876, + "odds_ratio_loss": 0.45559167861938477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07253418862819672, + "rewards/margins": 0.13446910679340363, + "rewards/rejected": -0.20700329542160034, + "sft_loss": 0.7253419160842896, + "step": 901 + }, + { + "epoch": 1.304410701373825, + "grad_norm": 3.628486682129814, + "learning_rate": 7.249161568797722e-06, + "logits/chosen": 0.04626443237066269, + "logits/rejected": -0.00022936612367630005, + "logps/chosen": -0.6973298788070679, + "logps/rejected": -2.7048377990722656, + "loss": 0.5538, + "odds_ratio_loss": 0.39978471398353577, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06973298639059067, + "rewards/margins": 0.20075079798698425, + "rewards/rejected": -0.2704837918281555, + "sft_loss": 0.6973298788070679, + "step": 902 + }, + { + "epoch": 1.3058568329718003, + "grad_norm": 2.7165152924208398, + "learning_rate": 7.247349077866486e-06, + "logits/chosen": 0.13136953115463257, + "logits/rejected": 0.08058802038431168, + "logps/chosen": -0.6339420676231384, + "logps/rejected": -2.620114326477051, + "loss": 0.5215, + "odds_ratio_loss": 0.3651321530342102, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0633942037820816, + "rewards/margins": 0.19861723482608795, + "rewards/rejected": -0.26201143860816956, + "sft_loss": 0.6339420676231384, + "step": 903 + }, + { + "epoch": 1.3073029645697758, + "grad_norm": 3.1412086704512645, + "learning_rate": 7.245534629096397e-06, + "logits/chosen": 0.06389408558607101, + "logits/rejected": 0.13316801190376282, + "logps/chosen": -0.3657557964324951, + "logps/rejected": -2.257955551147461, + "loss": 0.5489, + "odds_ratio_loss": 0.22655794024467468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03657558187842369, + "rewards/margins": 0.18921998143196106, + "rewards/rejected": -0.22579556703567505, + "sft_loss": 0.3657557964324951, + "step": 904 + }, + { + "epoch": 1.3087490961677513, + "grad_norm": 2.643111326392046, + "learning_rate": 7.243718223581391e-06, + "logits/chosen": 0.060490552335977554, + "logits/rejected": 0.008018707856535912, + "logps/chosen": -0.6461340188980103, + "logps/rejected": -2.293785572052002, + "loss": 0.659, + "odds_ratio_loss": 0.3631659746170044, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06461340188980103, + "rewards/margins": 0.1647651493549347, + "rewards/rejected": -0.22937855124473572, + "sft_loss": 0.6461340188980103, + "step": 905 + }, + { + "epoch": 1.3101952277657267, + "grad_norm": 2.9794545360738995, + "learning_rate": 7.241899862416588e-06, + "logits/chosen": 0.041527822613716125, + "logits/rejected": 0.18100914359092712, + "logps/chosen": -0.37341946363449097, + "logps/rejected": -1.93304443359375, + "loss": 0.5409, + "odds_ratio_loss": 0.19359731674194336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03734194487333298, + "rewards/margins": 0.1559625118970871, + "rewards/rejected": -0.19330444931983948, + "sft_loss": 0.37341946363449097, + "step": 906 + }, + { + "epoch": 1.3116413593637022, + "grad_norm": 3.9441615451488916, + "learning_rate": 7.240079546698284e-06, + "logits/chosen": 0.03994975611567497, + "logits/rejected": 0.034951452165842056, + "logps/chosen": -0.6786578297615051, + "logps/rejected": -3.219226360321045, + "loss": 0.5901, + "odds_ratio_loss": 0.3564690351486206, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06786578893661499, + "rewards/margins": 0.254056841135025, + "rewards/rejected": -0.32192263007164, + "sft_loss": 0.6786578297615051, + "step": 907 + }, + { + "epoch": 1.3130874909616774, + "grad_norm": 2.812788200690957, + "learning_rate": 7.238257277523955e-06, + "logits/chosen": -0.03691191226243973, + "logits/rejected": 0.054480381309986115, + "logps/chosen": -0.570400595664978, + "logps/rejected": -2.1054439544677734, + "loss": 0.593, + "odds_ratio_loss": 0.3295222520828247, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05704005807638168, + "rewards/margins": 0.15350434184074402, + "rewards/rejected": -0.2105444073677063, + "sft_loss": 0.570400595664978, + "step": 908 + }, + { + "epoch": 1.314533622559653, + "grad_norm": 2.3743896241886797, + "learning_rate": 7.23643305599225e-06, + "logits/chosen": 0.10392654687166214, + "logits/rejected": 0.07878941297531128, + "logps/chosen": -0.6637530326843262, + "logps/rejected": -2.511354446411133, + "loss": 0.6289, + "odds_ratio_loss": 0.3708875775337219, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06637530028820038, + "rewards/margins": 0.1847601681947708, + "rewards/rejected": -0.2511354684829712, + "sft_loss": 0.6637530326843262, + "step": 909 + }, + { + "epoch": 1.3159797541576284, + "grad_norm": 2.565095852577169, + "learning_rate": 7.234606883203004e-06, + "logits/chosen": -0.02966320887207985, + "logits/rejected": -0.02640990912914276, + "logps/chosen": -0.674669086933136, + "logps/rejected": -2.56156325340271, + "loss": 0.6611, + "odds_ratio_loss": 0.26282304525375366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06746691465377808, + "rewards/margins": 0.18868939578533173, + "rewards/rejected": -0.256156325340271, + "sft_loss": 0.674669086933136, + "step": 910 + }, + { + "epoch": 1.3174258857556038, + "grad_norm": 2.7836971947588776, + "learning_rate": 7.23277876025722e-06, + "logits/chosen": 0.20673127472400665, + "logits/rejected": 0.08467914164066315, + "logps/chosen": -0.353419691324234, + "logps/rejected": -3.728929042816162, + "loss": 0.429, + "odds_ratio_loss": 0.10951358824968338, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03534197062253952, + "rewards/margins": 0.33755093812942505, + "rewards/rejected": -0.37289291620254517, + "sft_loss": 0.353419691324234, + "step": 911 + }, + { + "epoch": 1.3188720173535793, + "grad_norm": 2.454408699083897, + "learning_rate": 7.230948688257083e-06, + "logits/chosen": 0.24181370437145233, + "logits/rejected": 0.17572006583213806, + "logps/chosen": -0.5705441236495972, + "logps/rejected": -1.9255404472351074, + "loss": 0.5852, + "odds_ratio_loss": 0.3566300868988037, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.057054419070482254, + "rewards/margins": 0.13549962639808655, + "rewards/rejected": -0.1925540566444397, + "sft_loss": 0.5705441236495972, + "step": 912 + }, + { + "epoch": 1.3203181489515545, + "grad_norm": 2.658117655292024, + "learning_rate": 7.2291166683059465e-06, + "logits/chosen": 0.27265405654907227, + "logits/rejected": 0.27807876467704773, + "logps/chosen": -0.3936464786529541, + "logps/rejected": -2.4130916595458984, + "loss": 0.5982, + "odds_ratio_loss": 0.26774898171424866, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03936465084552765, + "rewards/margins": 0.2019445151090622, + "rewards/rejected": -0.24130916595458984, + "sft_loss": 0.3936464786529541, + "step": 913 + }, + { + "epoch": 1.32176428054953, + "grad_norm": 2.597200659458208, + "learning_rate": 7.227282701508345e-06, + "logits/chosen": 0.1615338921546936, + "logits/rejected": 0.2152920812368393, + "logps/chosen": -0.6008027791976929, + "logps/rejected": -2.2669618129730225, + "loss": 0.5978, + "odds_ratio_loss": 0.33722782135009766, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.060080282390117645, + "rewards/margins": 0.16661590337753296, + "rewards/rejected": -0.22669617831707, + "sft_loss": 0.6008027791976929, + "step": 914 + }, + { + "epoch": 1.3232104121475055, + "grad_norm": 3.5437125466151405, + "learning_rate": 7.225446788969983e-06, + "logits/chosen": 0.03311977535486221, + "logits/rejected": 0.09436703473329544, + "logps/chosen": -0.7546591758728027, + "logps/rejected": -2.4398608207702637, + "loss": 0.6545, + "odds_ratio_loss": 0.3189430832862854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07546591758728027, + "rewards/margins": 0.16852018237113953, + "rewards/rejected": -0.2439860850572586, + "sft_loss": 0.7546591758728027, + "step": 915 + }, + { + "epoch": 1.324656543745481, + "grad_norm": 2.3380832431302094, + "learning_rate": 7.22360893179774e-06, + "logits/chosen": 0.15004731714725494, + "logits/rejected": 0.20780208706855774, + "logps/chosen": -0.5689486265182495, + "logps/rejected": -2.762641429901123, + "loss": 0.6252, + "odds_ratio_loss": 0.3046174645423889, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05689486116170883, + "rewards/margins": 0.21936927735805511, + "rewards/rejected": -0.27626413106918335, + "sft_loss": 0.5689486265182495, + "step": 916 + }, + { + "epoch": 1.3261026753434564, + "grad_norm": 2.4482868229892945, + "learning_rate": 7.221769131099664e-06, + "logits/chosen": 0.15187448263168335, + "logits/rejected": 0.10933353006839752, + "logps/chosen": -0.4541112184524536, + "logps/rejected": -1.784976601600647, + "loss": 0.5475, + "odds_ratio_loss": 0.3130142092704773, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0454111248254776, + "rewards/margins": 0.13308654725551605, + "rewards/rejected": -0.17849765717983246, + "sft_loss": 0.4541112184524536, + "step": 917 + }, + { + "epoch": 1.3275488069414316, + "grad_norm": 2.1760698534951124, + "learning_rate": 7.219927387984981e-06, + "logits/chosen": 0.30233603715896606, + "logits/rejected": 0.2338351458311081, + "logps/chosen": -0.4559868276119232, + "logps/rejected": -3.7631051540374756, + "loss": 0.5335, + "odds_ratio_loss": 0.2917534112930298, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04559868201613426, + "rewards/margins": 0.33071184158325195, + "rewards/rejected": -0.3763105273246765, + "sft_loss": 0.4559868276119232, + "step": 918 + }, + { + "epoch": 1.328994938539407, + "grad_norm": 2.4506090033415626, + "learning_rate": 7.2180837035640835e-06, + "logits/chosen": 0.04991018399596214, + "logits/rejected": 0.04235024377703667, + "logps/chosen": -0.6620197892189026, + "logps/rejected": -1.66459059715271, + "loss": 0.5907, + "odds_ratio_loss": 0.5594837665557861, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06620198488235474, + "rewards/margins": 0.10025707632303238, + "rewards/rejected": -0.16645905375480652, + "sft_loss": 0.6620197892189026, + "step": 919 + }, + { + "epoch": 1.3304410701373826, + "grad_norm": 4.1490860488057555, + "learning_rate": 7.216238078948535e-06, + "logits/chosen": 0.08021122962236404, + "logits/rejected": 0.06495549529790878, + "logps/chosen": -0.7230328321456909, + "logps/rejected": -1.6988046169281006, + "loss": 0.6312, + "odds_ratio_loss": 0.4206200838088989, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07230328768491745, + "rewards/margins": 0.09757716953754425, + "rewards/rejected": -0.1698804497718811, + "sft_loss": 0.7230328321456909, + "step": 920 + }, + { + "epoch": 1.3318872017353578, + "grad_norm": 2.1950249688867856, + "learning_rate": 7.214390515251072e-06, + "logits/chosen": 0.02401835471391678, + "logits/rejected": 0.13640937209129333, + "logps/chosen": -0.628595232963562, + "logps/rejected": -3.0257411003112793, + "loss": 0.5807, + "odds_ratio_loss": 0.2951260209083557, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06285952031612396, + "rewards/margins": 0.2397146075963974, + "rewards/rejected": -0.30257412791252136, + "sft_loss": 0.628595232963562, + "step": 921 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.8819789495190573, + "learning_rate": 7.212541013585593e-06, + "logits/chosen": 0.06210823729634285, + "logits/rejected": 0.04636611416935921, + "logps/chosen": -0.6891711950302124, + "logps/rejected": -1.5676841735839844, + "loss": 0.5844, + "odds_ratio_loss": 0.4012836515903473, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06891711801290512, + "rewards/margins": 0.08785130083560944, + "rewards/rejected": -0.15676842629909515, + "sft_loss": 0.6891711950302124, + "step": 922 + }, + { + "epoch": 1.3347794649313087, + "grad_norm": 3.2852191324633906, + "learning_rate": 7.210689575067174e-06, + "logits/chosen": 0.03468220680952072, + "logits/rejected": 0.02965848706662655, + "logps/chosen": -0.7979564070701599, + "logps/rejected": -1.7514419555664062, + "loss": 0.7483, + "odds_ratio_loss": 0.4156520962715149, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07979562878608704, + "rewards/margins": 0.09534855931997299, + "rewards/rejected": -0.17514419555664062, + "sft_loss": 0.7979564070701599, + "step": 923 + }, + { + "epoch": 1.3362255965292842, + "grad_norm": 4.75037918024199, + "learning_rate": 7.2088362008120525e-06, + "logits/chosen": 0.05110059678554535, + "logits/rejected": -0.019120914861559868, + "logps/chosen": -0.7032926082611084, + "logps/rejected": -2.6311984062194824, + "loss": 0.6983, + "odds_ratio_loss": 0.3601962625980377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07032926380634308, + "rewards/margins": 0.19279058277606964, + "rewards/rejected": -0.2631198465824127, + "sft_loss": 0.7032926082611084, + "step": 924 + }, + { + "epoch": 1.3376717281272597, + "grad_norm": 2.6554971310087008, + "learning_rate": 7.206980891937634e-06, + "logits/chosen": 0.11156775057315826, + "logits/rejected": 0.059639737010002136, + "logps/chosen": -0.6611694097518921, + "logps/rejected": -1.7360694408416748, + "loss": 0.6662, + "odds_ratio_loss": 0.388332724571228, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06611695140600204, + "rewards/margins": 0.10748998820781708, + "rewards/rejected": -0.17360693216323853, + "sft_loss": 0.6611694097518921, + "step": 925 + }, + { + "epoch": 1.339117859725235, + "grad_norm": 2.1000856373583217, + "learning_rate": 7.205123649562491e-06, + "logits/chosen": 0.048644740134477615, + "logits/rejected": 0.09357769787311554, + "logps/chosen": -0.6007069945335388, + "logps/rejected": -1.6124356985092163, + "loss": 0.5757, + "odds_ratio_loss": 0.2972748875617981, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06007070094347, + "rewards/margins": 0.10117286443710327, + "rewards/rejected": -0.16124355792999268, + "sft_loss": 0.6007069945335388, + "step": 926 + }, + { + "epoch": 1.3405639913232104, + "grad_norm": 2.4909605286439116, + "learning_rate": 7.203264474806363e-06, + "logits/chosen": 0.036845333874225616, + "logits/rejected": 0.03189665079116821, + "logps/chosen": -0.8164952993392944, + "logps/rejected": -1.740440845489502, + "loss": 0.6713, + "odds_ratio_loss": 0.4842316210269928, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0816495344042778, + "rewards/margins": 0.09239454567432404, + "rewards/rejected": -0.17404407262802124, + "sft_loss": 0.8164952993392944, + "step": 927 + }, + { + "epoch": 1.3420101229211858, + "grad_norm": 2.867368406936184, + "learning_rate": 7.201403368790153e-06, + "logits/chosen": 0.009860752150416374, + "logits/rejected": -0.03470249101519585, + "logps/chosen": -0.7068488597869873, + "logps/rejected": -1.9816157817840576, + "loss": 0.612, + "odds_ratio_loss": 0.38851380348205566, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07068488001823425, + "rewards/margins": 0.12747670710086823, + "rewards/rejected": -0.19816158711910248, + "sft_loss": 0.7068488597869873, + "step": 928 + }, + { + "epoch": 1.3434562545191613, + "grad_norm": 2.561085257631517, + "learning_rate": 7.199540332635929e-06, + "logits/chosen": 0.16725574433803558, + "logits/rejected": 0.08495479077100754, + "logps/chosen": -0.5896700024604797, + "logps/rejected": -2.8246240615844727, + "loss": 0.6371, + "odds_ratio_loss": 0.2792496085166931, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05896700173616409, + "rewards/margins": 0.22349542379379272, + "rewards/rejected": -0.2824624180793762, + "sft_loss": 0.5896700024604797, + "step": 929 + }, + { + "epoch": 1.3449023861171367, + "grad_norm": 3.454641080740334, + "learning_rate": 7.197675367466921e-06, + "logits/chosen": 0.13279861211776733, + "logits/rejected": 0.07949304580688477, + "logps/chosen": -0.5868549942970276, + "logps/rejected": -2.453871965408325, + "loss": 0.6217, + "odds_ratio_loss": 0.280853807926178, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05868549272418022, + "rewards/margins": 0.1867016851902008, + "rewards/rejected": -0.24538719654083252, + "sft_loss": 0.5868549942970276, + "step": 930 + }, + { + "epoch": 1.346348517715112, + "grad_norm": 2.357731459721896, + "learning_rate": 7.195808474407526e-06, + "logits/chosen": 0.23046283423900604, + "logits/rejected": 0.19778576493263245, + "logps/chosen": -0.5050675868988037, + "logps/rejected": -3.1081252098083496, + "loss": 0.6304, + "odds_ratio_loss": 0.292201966047287, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05050676316022873, + "rewards/margins": 0.2603057622909546, + "rewards/rejected": -0.3108125329017639, + "sft_loss": 0.5050675868988037, + "step": 931 + }, + { + "epoch": 1.3477946493130875, + "grad_norm": 2.7649321853544895, + "learning_rate": 7.193939654583298e-06, + "logits/chosen": 0.08632639050483704, + "logits/rejected": 0.1114271879196167, + "logps/chosen": -0.5845703482627869, + "logps/rejected": -3.52717924118042, + "loss": 0.6325, + "odds_ratio_loss": 0.2684318423271179, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.058457035571336746, + "rewards/margins": 0.2942608594894409, + "rewards/rejected": -0.35271787643432617, + "sft_loss": 0.5845703482627869, + "step": 932 + }, + { + "epoch": 1.349240780911063, + "grad_norm": 2.848582481930005, + "learning_rate": 7.192068909120959e-06, + "logits/chosen": 0.23675987124443054, + "logits/rejected": 0.15245428681373596, + "logps/chosen": -0.5717525482177734, + "logps/rejected": -2.8358075618743896, + "loss": 0.5965, + "odds_ratio_loss": 0.27822452783584595, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05717525631189346, + "rewards/margins": 0.22640548646450043, + "rewards/rejected": -0.2835807800292969, + "sft_loss": 0.5717525482177734, + "step": 933 + }, + { + "epoch": 1.3506869125090384, + "grad_norm": 2.2955814268257035, + "learning_rate": 7.190196239148383e-06, + "logits/chosen": 0.04583890736103058, + "logits/rejected": 0.12824928760528564, + "logps/chosen": -0.5103280544281006, + "logps/rejected": -2.381923198699951, + "loss": 0.5477, + "odds_ratio_loss": 0.2340468466281891, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05103280395269394, + "rewards/margins": 0.18715953826904297, + "rewards/rejected": -0.2381923347711563, + "sft_loss": 0.5103280544281006, + "step": 934 + }, + { + "epoch": 1.3521330441070138, + "grad_norm": 2.265898663876626, + "learning_rate": 7.188321645794614e-06, + "logits/chosen": 0.10190585255622864, + "logits/rejected": 0.072527676820755, + "logps/chosen": -0.7817630171775818, + "logps/rejected": -1.4952585697174072, + "loss": 0.7559, + "odds_ratio_loss": 0.4879745841026306, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07817629724740982, + "rewards/margins": 0.07134956121444702, + "rewards/rejected": -0.14952586591243744, + "sft_loss": 0.7817630171775818, + "step": 935 + }, + { + "epoch": 1.353579175704989, + "grad_norm": 2.720364384274246, + "learning_rate": 7.186445130189851e-06, + "logits/chosen": 0.142786905169487, + "logits/rejected": 0.1314203292131424, + "logps/chosen": -0.472442626953125, + "logps/rejected": -3.0655393600463867, + "loss": 0.6144, + "odds_ratio_loss": 0.26142001152038574, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04724426195025444, + "rewards/margins": 0.25930964946746826, + "rewards/rejected": -0.3065539002418518, + "sft_loss": 0.472442626953125, + "step": 936 + }, + { + "epoch": 1.3550253073029646, + "grad_norm": 2.175857876943466, + "learning_rate": 7.184566693465451e-06, + "logits/chosen": 0.1783963143825531, + "logits/rejected": 0.04779674485325813, + "logps/chosen": -0.7196685671806335, + "logps/rejected": -2.550107955932617, + "loss": 0.5999, + "odds_ratio_loss": 0.36530670523643494, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07196685671806335, + "rewards/margins": 0.1830439418554306, + "rewards/rejected": -0.25501078367233276, + "sft_loss": 0.7196685671806335, + "step": 937 + }, + { + "epoch": 1.35647143890094, + "grad_norm": 2.3984769925628706, + "learning_rate": 7.182686336753932e-06, + "logits/chosen": 0.025604430586099625, + "logits/rejected": 0.07506309449672699, + "logps/chosen": -0.6966655254364014, + "logps/rejected": -2.069488286972046, + "loss": 0.6765, + "odds_ratio_loss": 0.30220383405685425, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0696665421128273, + "rewards/margins": 0.13728229701519012, + "rewards/rejected": -0.20694883167743683, + "sft_loss": 0.6966655254364014, + "step": 938 + }, + { + "epoch": 1.3579175704989155, + "grad_norm": 2.0636491573130056, + "learning_rate": 7.180804061188965e-06, + "logits/chosen": 0.2072528898715973, + "logits/rejected": 0.1830851286649704, + "logps/chosen": -0.5181276798248291, + "logps/rejected": -4.006014823913574, + "loss": 0.576, + "odds_ratio_loss": 0.28068408370018005, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05181277170777321, + "rewards/margins": 0.34878870844841003, + "rewards/rejected": -0.40060147643089294, + "sft_loss": 0.5181276798248291, + "step": 939 + }, + { + "epoch": 1.359363702096891, + "grad_norm": 2.2952421934808593, + "learning_rate": 7.1789198679053835e-06, + "logits/chosen": 0.17554627358913422, + "logits/rejected": 0.06563098728656769, + "logps/chosen": -0.6369041204452515, + "logps/rejected": -3.788663387298584, + "loss": 0.6079, + "odds_ratio_loss": 0.19372212886810303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0636904165148735, + "rewards/margins": 0.3151758909225464, + "rewards/rejected": -0.3788663446903229, + "sft_loss": 0.6369041204452515, + "step": 940 + }, + { + "epoch": 1.3608098336948662, + "grad_norm": 2.335130582600968, + "learning_rate": 7.177033758039174e-06, + "logits/chosen": 0.079743891954422, + "logits/rejected": 0.04777732491493225, + "logps/chosen": -0.5177881717681885, + "logps/rejected": -4.678028106689453, + "loss": 0.5689, + "odds_ratio_loss": 0.3156812787055969, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05177881568670273, + "rewards/margins": 0.41602402925491333, + "rewards/rejected": -0.46780288219451904, + "sft_loss": 0.5177881717681885, + "step": 941 + }, + { + "epoch": 1.3622559652928417, + "grad_norm": 3.0332594020601626, + "learning_rate": 7.175145732727481e-06, + "logits/chosen": 0.23156288266181946, + "logits/rejected": 0.20619621872901917, + "logps/chosen": -0.46926769614219666, + "logps/rejected": -2.526423215866089, + "loss": 0.5919, + "odds_ratio_loss": 0.3614828288555145, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.046926770359277725, + "rewards/margins": 0.20571555197238922, + "rewards/rejected": -0.25264233350753784, + "sft_loss": 0.46926769614219666, + "step": 942 + }, + { + "epoch": 1.3637020968908171, + "grad_norm": 3.2546479492806704, + "learning_rate": 7.1732557931085986e-06, + "logits/chosen": 0.06294719874858856, + "logits/rejected": 0.19816607236862183, + "logps/chosen": -0.5671209096908569, + "logps/rejected": -1.2771296501159668, + "loss": 0.5765, + "odds_ratio_loss": 0.4410284757614136, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.056712083518505096, + "rewards/margins": 0.07100087404251099, + "rewards/rejected": -0.12771296501159668, + "sft_loss": 0.5671209096908569, + "step": 943 + }, + { + "epoch": 1.3651482284887924, + "grad_norm": 2.39680766363605, + "learning_rate": 7.17136394032198e-06, + "logits/chosen": 0.17434678971767426, + "logits/rejected": 0.10099100321531296, + "logps/chosen": -0.4977617561817169, + "logps/rejected": -2.2785232067108154, + "loss": 0.5481, + "odds_ratio_loss": 0.41038426756858826, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.04977617412805557, + "rewards/margins": 0.1780761480331421, + "rewards/rejected": -0.22785231471061707, + "sft_loss": 0.4977617561817169, + "step": 944 + }, + { + "epoch": 1.3665943600867678, + "grad_norm": 2.440564784775663, + "learning_rate": 7.16947017550823e-06, + "logits/chosen": 0.0626385509967804, + "logits/rejected": 0.029109565541148186, + "logps/chosen": -0.6074717044830322, + "logps/rejected": -3.7257726192474365, + "loss": 0.661, + "odds_ratio_loss": 0.27803951501846313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0607471764087677, + "rewards/margins": 0.3118301033973694, + "rewards/rejected": -0.3725772798061371, + "sft_loss": 0.6074717044830322, + "step": 945 + }, + { + "epoch": 1.3680404916847433, + "grad_norm": 7.122163738225102, + "learning_rate": 7.167574499809108e-06, + "logits/chosen": 0.1047414243221283, + "logits/rejected": 0.05279163271188736, + "logps/chosen": -0.6560348868370056, + "logps/rejected": -3.7505457401275635, + "loss": 0.6665, + "odds_ratio_loss": 0.3180413544178009, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06560348719358444, + "rewards/margins": 0.30945104360580444, + "rewards/rejected": -0.37505456805229187, + "sft_loss": 0.6560348868370056, + "step": 946 + }, + { + "epoch": 1.3694866232827188, + "grad_norm": 17.543573177125726, + "learning_rate": 7.165676914367522e-06, + "logits/chosen": 0.08533883094787598, + "logits/rejected": 0.14580032229423523, + "logps/chosen": -0.6218971610069275, + "logps/rejected": -3.34824275970459, + "loss": 0.5894, + "odds_ratio_loss": 0.30703213810920715, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06218971684575081, + "rewards/margins": 0.2726345658302307, + "rewards/rejected": -0.33482426404953003, + "sft_loss": 0.6218971610069275, + "step": 947 + }, + { + "epoch": 1.3709327548806942, + "grad_norm": 2.5915141272193307, + "learning_rate": 7.163777420327534e-06, + "logits/chosen": 0.1597047746181488, + "logits/rejected": 0.11482205241918564, + "logps/chosen": -0.6220844388008118, + "logps/rejected": -2.246950149536133, + "loss": 0.5908, + "odds_ratio_loss": 0.3186954855918884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06220844388008118, + "rewards/margins": 0.16248659789562225, + "rewards/rejected": -0.22469504177570343, + "sft_loss": 0.6220844388008118, + "step": 948 + }, + { + "epoch": 1.3723788864786695, + "grad_norm": 2.493790722665044, + "learning_rate": 7.161876018834357e-06, + "logits/chosen": 0.23967748880386353, + "logits/rejected": 0.19121602177619934, + "logps/chosen": -0.6689061522483826, + "logps/rejected": -2.2310941219329834, + "loss": 0.6469, + "odds_ratio_loss": 0.3700863718986511, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06689061969518661, + "rewards/margins": 0.15621879696846008, + "rewards/rejected": -0.2231094092130661, + "sft_loss": 0.6689061522483826, + "step": 949 + }, + { + "epoch": 1.373825018076645, + "grad_norm": 7.095619235390716, + "learning_rate": 7.159972711034352e-06, + "logits/chosen": 0.3317505717277527, + "logits/rejected": 0.25593841075897217, + "logps/chosen": -0.5780055522918701, + "logps/rejected": -1.6080982685089111, + "loss": 0.5444, + "odds_ratio_loss": 0.36196255683898926, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05780055373907089, + "rewards/margins": 0.10300928354263306, + "rewards/rejected": -0.16080984473228455, + "sft_loss": 0.5780055522918701, + "step": 950 + }, + { + "epoch": 1.3752711496746204, + "grad_norm": 2.4424076920707174, + "learning_rate": 7.15806749807503e-06, + "logits/chosen": 0.2837293744087219, + "logits/rejected": 0.27560287714004517, + "logps/chosen": -0.3303763270378113, + "logps/rejected": -1.49053156375885, + "loss": 0.5996, + "odds_ratio_loss": 0.24135808646678925, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03303763270378113, + "rewards/margins": 0.11601553857326508, + "rewards/rejected": -0.1490531712770462, + "sft_loss": 0.3303763270378113, + "step": 951 + }, + { + "epoch": 1.3767172812725958, + "grad_norm": 2.8452765697499354, + "learning_rate": 7.156160381105051e-06, + "logits/chosen": 0.12051470577716827, + "logits/rejected": 0.08477576822042465, + "logps/chosen": -0.5957016944885254, + "logps/rejected": -2.136902093887329, + "loss": 0.5906, + "odds_ratio_loss": 0.3252412676811218, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05957017466425896, + "rewards/margins": 0.1541200429201126, + "rewards/rejected": -0.21369022130966187, + "sft_loss": 0.5957016944885254, + "step": 952 + }, + { + "epoch": 1.3781634128705713, + "grad_norm": 2.5765936064861776, + "learning_rate": 7.154251361274225e-06, + "logits/chosen": -0.002856435254216194, + "logits/rejected": 0.07068169862031937, + "logps/chosen": -0.6157888770103455, + "logps/rejected": -1.3842309713363647, + "loss": 0.5797, + "odds_ratio_loss": 0.3665923476219177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.061578892171382904, + "rewards/margins": 0.0768442153930664, + "rewards/rejected": -0.1384231150150299, + "sft_loss": 0.6157888770103455, + "step": 953 + }, + { + "epoch": 1.3796095444685466, + "grad_norm": 2.2398769523605786, + "learning_rate": 7.152340439733504e-06, + "logits/chosen": 0.13600611686706543, + "logits/rejected": 0.10630634427070618, + "logps/chosen": -0.5609400868415833, + "logps/rejected": -1.998949408531189, + "loss": 0.5757, + "odds_ratio_loss": 0.23014606535434723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.056094009429216385, + "rewards/margins": 0.14380092918872833, + "rewards/rejected": -0.19989493489265442, + "sft_loss": 0.5609400868415833, + "step": 954 + }, + { + "epoch": 1.381055676066522, + "grad_norm": 2.613749506790682, + "learning_rate": 7.1504276176349925e-06, + "logits/chosen": 0.22844244539737701, + "logits/rejected": 0.22849160432815552, + "logps/chosen": -0.4829053282737732, + "logps/rejected": -1.5053625106811523, + "loss": 0.5361, + "odds_ratio_loss": 0.26122379302978516, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04829053580760956, + "rewards/margins": 0.10224571079015732, + "rewards/rejected": -0.15053623914718628, + "sft_loss": 0.4829053282737732, + "step": 955 + }, + { + "epoch": 1.3825018076644975, + "grad_norm": 2.4554120773971757, + "learning_rate": 7.148512896131937e-06, + "logits/chosen": 0.09752713143825531, + "logits/rejected": 0.1276295930147171, + "logps/chosen": -0.6276790499687195, + "logps/rejected": -1.9808611869812012, + "loss": 0.6139, + "odds_ratio_loss": 0.24553599953651428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06276790797710419, + "rewards/margins": 0.13531821966171265, + "rewards/rejected": -0.19808611273765564, + "sft_loss": 0.6276790499687195, + "step": 956 + }, + { + "epoch": 1.383947939262473, + "grad_norm": 4.827788032322935, + "learning_rate": 7.146596276378728e-06, + "logits/chosen": 0.2864004969596863, + "logits/rejected": 0.24811071157455444, + "logps/chosen": -0.5591001510620117, + "logps/rejected": -1.2078362703323364, + "loss": 0.4646, + "odds_ratio_loss": 0.32582181692123413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05591001361608505, + "rewards/margins": 0.06487361341714859, + "rewards/rejected": -0.12078362703323364, + "sft_loss": 0.5591001510620117, + "step": 957 + }, + { + "epoch": 1.3853940708604484, + "grad_norm": 2.9075717065401308, + "learning_rate": 7.1446777595309066e-06, + "logits/chosen": 0.21857096254825592, + "logits/rejected": 0.12259276211261749, + "logps/chosen": -0.6916131973266602, + "logps/rejected": -1.2286429405212402, + "loss": 0.6633, + "odds_ratio_loss": 0.5306054353713989, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0691613182425499, + "rewards/margins": 0.053702980279922485, + "rewards/rejected": -0.12286430597305298, + "sft_loss": 0.6916131973266602, + "step": 958 + }, + { + "epoch": 1.3868402024584237, + "grad_norm": 2.4754404143488333, + "learning_rate": 7.1427573467451515e-06, + "logits/chosen": 0.1782752275466919, + "logits/rejected": 0.18330281972885132, + "logps/chosen": -0.7293274402618408, + "logps/rejected": -1.8634823560714722, + "loss": 0.7318, + "odds_ratio_loss": 0.28276628255844116, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07293274253606796, + "rewards/margins": 0.11341549456119537, + "rewards/rejected": -0.18634825944900513, + "sft_loss": 0.7293274402618408, + "step": 959 + }, + { + "epoch": 1.3882863340563991, + "grad_norm": 2.2520057424020927, + "learning_rate": 7.140835039179288e-06, + "logits/chosen": 0.14758704602718353, + "logits/rejected": 0.1023918017745018, + "logps/chosen": -0.5791318416595459, + "logps/rejected": -1.516669511795044, + "loss": 0.5407, + "odds_ratio_loss": 0.3355981111526489, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05791318789124489, + "rewards/margins": 0.09375376999378204, + "rewards/rejected": -0.15166696906089783, + "sft_loss": 0.5791318416595459, + "step": 960 + }, + { + "epoch": 1.3897324656543746, + "grad_norm": 2.3823196591157902, + "learning_rate": 7.138910837992281e-06, + "logits/chosen": 0.15106113255023956, + "logits/rejected": 0.05747794359922409, + "logps/chosen": -0.6319140791893005, + "logps/rejected": -1.544478178024292, + "loss": 0.7169, + "odds_ratio_loss": 0.39006781578063965, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06319140642881393, + "rewards/margins": 0.09125641733407974, + "rewards/rejected": -0.15444782376289368, + "sft_loss": 0.6319140791893005, + "step": 961 + }, + { + "epoch": 1.39117859725235, + "grad_norm": 3.1485603341157806, + "learning_rate": 7.1369847443442394e-06, + "logits/chosen": 0.11428692191839218, + "logits/rejected": 0.1583063304424286, + "logps/chosen": -0.43781501054763794, + "logps/rejected": -1.672081708908081, + "loss": 0.5578, + "odds_ratio_loss": 0.20121005177497864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.043781500309705734, + "rewards/margins": 0.12342666834592819, + "rewards/rejected": -0.16720816493034363, + "sft_loss": 0.43781501054763794, + "step": 962 + }, + { + "epoch": 1.3926247288503255, + "grad_norm": 2.7598488840824196, + "learning_rate": 7.135056759396411e-06, + "logits/chosen": 0.1512761414051056, + "logits/rejected": 0.16234086453914642, + "logps/chosen": -0.47584232687950134, + "logps/rejected": -2.192073106765747, + "loss": 0.6127, + "odds_ratio_loss": 0.24130618572235107, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.047584228217601776, + "rewards/margins": 0.171623095870018, + "rewards/rejected": -0.21920731663703918, + "sft_loss": 0.47584232687950134, + "step": 963 + }, + { + "epoch": 1.3940708604483008, + "grad_norm": 2.549294497737792, + "learning_rate": 7.133126884311187e-06, + "logits/chosen": 0.15712814033031464, + "logits/rejected": 0.09098143875598907, + "logps/chosen": -0.38123244047164917, + "logps/rejected": -2.0281896591186523, + "loss": 0.4884, + "odds_ratio_loss": 0.26970845460891724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.038123246282339096, + "rewards/margins": 0.16469572484493256, + "rewards/rejected": -0.20281897485256195, + "sft_loss": 0.38123244047164917, + "step": 964 + }, + { + "epoch": 1.3955169920462762, + "grad_norm": 4.069834024322856, + "learning_rate": 7.131195120252096e-06, + "logits/chosen": -0.048756882548332214, + "logits/rejected": -0.008307691663503647, + "logps/chosen": -0.5556820631027222, + "logps/rejected": -1.4905530214309692, + "loss": 0.6153, + "odds_ratio_loss": 0.33696621656417847, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05556820333003998, + "rewards/margins": 0.09348709136247635, + "rewards/rejected": -0.14905530214309692, + "sft_loss": 0.5556820631027222, + "step": 965 + }, + { + "epoch": 1.3969631236442517, + "grad_norm": 3.272715217782266, + "learning_rate": 7.129261468383804e-06, + "logits/chosen": 0.13486486673355103, + "logits/rejected": 0.18454203009605408, + "logps/chosen": -0.454858660697937, + "logps/rejected": -1.9876313209533691, + "loss": 0.5522, + "odds_ratio_loss": 0.2198922336101532, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04548586905002594, + "rewards/margins": 0.15327726304531097, + "rewards/rejected": -0.19876313209533691, + "sft_loss": 0.454858660697937, + "step": 966 + }, + { + "epoch": 1.398409255242227, + "grad_norm": 2.9672392480045553, + "learning_rate": 7.127325929872119e-06, + "logits/chosen": 0.10959649085998535, + "logits/rejected": 0.0439496710896492, + "logps/chosen": -0.6432375907897949, + "logps/rejected": -1.7530879974365234, + "loss": 0.5961, + "odds_ratio_loss": 0.3384247124195099, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06432375311851501, + "rewards/margins": 0.11098504066467285, + "rewards/rejected": -0.17530880868434906, + "sft_loss": 0.6432375907897949, + "step": 967 + }, + { + "epoch": 1.3998553868402024, + "grad_norm": 3.7393635299161994, + "learning_rate": 7.125388505883983e-06, + "logits/chosen": 0.14931072294712067, + "logits/rejected": 0.10823297500610352, + "logps/chosen": -0.6483991742134094, + "logps/rejected": -2.3597779273986816, + "loss": 0.6251, + "odds_ratio_loss": 0.35210758447647095, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0648399218916893, + "rewards/margins": 0.17113786935806274, + "rewards/rejected": -0.23597779870033264, + "sft_loss": 0.6483991742134094, + "step": 968 + }, + { + "epoch": 1.4013015184381779, + "grad_norm": 14.620071753017426, + "learning_rate": 7.123449197587477e-06, + "logits/chosen": 0.2409316450357437, + "logits/rejected": 0.16842088103294373, + "logps/chosen": -0.4753984808921814, + "logps/rejected": -1.9124274253845215, + "loss": 0.5743, + "odds_ratio_loss": 0.3033334016799927, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0475398488342762, + "rewards/margins": 0.14370287954807281, + "rewards/rejected": -0.19124272465705872, + "sft_loss": 0.4753984808921814, + "step": 969 + }, + { + "epoch": 1.4027476500361533, + "grad_norm": 2.45277025388321, + "learning_rate": 7.121508006151817e-06, + "logits/chosen": 0.20841699838638306, + "logits/rejected": 0.15673436224460602, + "logps/chosen": -0.4456249475479126, + "logps/rejected": -1.9759278297424316, + "loss": 0.5027, + "odds_ratio_loss": 0.22154033184051514, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04456249624490738, + "rewards/margins": 0.15303027629852295, + "rewards/rejected": -0.19759276509284973, + "sft_loss": 0.4456249475479126, + "step": 970 + }, + { + "epoch": 1.4041937816341288, + "grad_norm": 3.378013540864925, + "learning_rate": 7.119564932747353e-06, + "logits/chosen": 0.1502024084329605, + "logits/rejected": 0.1177692785859108, + "logps/chosen": -0.7510949969291687, + "logps/rejected": -1.452868938446045, + "loss": 0.6338, + "odds_ratio_loss": 0.3943096399307251, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07510949671268463, + "rewards/margins": 0.07017740607261658, + "rewards/rejected": -0.1452869176864624, + "sft_loss": 0.7510949969291687, + "step": 971 + }, + { + "epoch": 1.405639913232104, + "grad_norm": 2.4471392780527443, + "learning_rate": 7.1176199785455744e-06, + "logits/chosen": 0.18596996366977692, + "logits/rejected": 0.20554053783416748, + "logps/chosen": -0.48409661650657654, + "logps/rejected": -2.281468152999878, + "loss": 0.5936, + "odds_ratio_loss": 0.18068525195121765, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04840966686606407, + "rewards/margins": 0.1797371655702591, + "rewards/rejected": -0.22814682126045227, + "sft_loss": 0.48409661650657654, + "step": 972 + }, + { + "epoch": 1.4070860448300795, + "grad_norm": 2.145919907848024, + "learning_rate": 7.115673144719098e-06, + "logits/chosen": 0.22358013689517975, + "logits/rejected": 0.1526821106672287, + "logps/chosen": -0.764935314655304, + "logps/rejected": -1.8448175191879272, + "loss": 0.6393, + "odds_ratio_loss": 0.3151146173477173, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0764935314655304, + "rewards/margins": 0.10798821598291397, + "rewards/rejected": -0.18448176980018616, + "sft_loss": 0.764935314655304, + "step": 973 + }, + { + "epoch": 1.408532176428055, + "grad_norm": 2.4421624822159766, + "learning_rate": 7.11372443244168e-06, + "logits/chosen": 0.1208169162273407, + "logits/rejected": 0.003757679834961891, + "logps/chosen": -0.6918614506721497, + "logps/rejected": -1.6558613777160645, + "loss": 0.721, + "odds_ratio_loss": 0.3674412965774536, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06918615102767944, + "rewards/margins": 0.09639999270439148, + "rewards/rejected": -0.16558612883090973, + "sft_loss": 0.6918614506721497, + "step": 974 + }, + { + "epoch": 1.4099783080260304, + "grad_norm": 2.323616865318886, + "learning_rate": 7.111773842888204e-06, + "logits/chosen": 0.08812505751848221, + "logits/rejected": 0.11653533577919006, + "logps/chosen": -0.4842926859855652, + "logps/rejected": -1.6144311428070068, + "loss": 0.6273, + "odds_ratio_loss": 0.17716938257217407, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04842927306890488, + "rewards/margins": 0.1130138412117958, + "rewards/rejected": -0.16144311428070068, + "sft_loss": 0.4842926859855652, + "step": 975 + }, + { + "epoch": 1.4114244396240059, + "grad_norm": 2.9872065011223112, + "learning_rate": 7.109821377234688e-06, + "logits/chosen": 0.057052284479141235, + "logits/rejected": 0.0494823083281517, + "logps/chosen": -0.7413144111633301, + "logps/rejected": -1.2311071157455444, + "loss": 0.6643, + "odds_ratio_loss": 0.48893070220947266, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07413144409656525, + "rewards/margins": 0.0489792674779892, + "rewards/rejected": -0.12311071157455444, + "sft_loss": 0.7413144111633301, + "step": 976 + }, + { + "epoch": 1.4128705712219811, + "grad_norm": 2.8564528181411064, + "learning_rate": 7.107867036658283e-06, + "logits/chosen": 0.12099157273769379, + "logits/rejected": 0.14747996628284454, + "logps/chosen": -0.5671837329864502, + "logps/rejected": -2.8476688861846924, + "loss": 0.6619, + "odds_ratio_loss": 0.2223397195339203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0567183718085289, + "rewards/margins": 0.22804851830005646, + "rewards/rejected": -0.28476688265800476, + "sft_loss": 0.5671837329864502, + "step": 977 + }, + { + "epoch": 1.4143167028199566, + "grad_norm": 3.8616731526720796, + "learning_rate": 7.105910822337266e-06, + "logits/chosen": 0.24294635653495789, + "logits/rejected": 0.23327285051345825, + "logps/chosen": -0.4823381304740906, + "logps/rejected": -1.6777340173721313, + "loss": 0.6131, + "odds_ratio_loss": 0.33123984932899475, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.048233818262815475, + "rewards/margins": 0.11953960359096527, + "rewards/rejected": -0.16777342557907104, + "sft_loss": 0.4823381304740906, + "step": 978 + }, + { + "epoch": 1.415762834417932, + "grad_norm": 2.2192337031178804, + "learning_rate": 7.103952735451047e-06, + "logits/chosen": 0.26770561933517456, + "logits/rejected": 0.33913475275039673, + "logps/chosen": -0.4104244112968445, + "logps/rejected": -1.8166509866714478, + "loss": 0.5658, + "odds_ratio_loss": 0.3066392242908478, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04104244336485863, + "rewards/margins": 0.14062266051769257, + "rewards/rejected": -0.1816651076078415, + "sft_loss": 0.4104244112968445, + "step": 979 + }, + { + "epoch": 1.4172089660159075, + "grad_norm": 2.734924337643227, + "learning_rate": 7.1019927771801625e-06, + "logits/chosen": 0.14210444688796997, + "logits/rejected": 0.16518303751945496, + "logps/chosen": -0.7257510423660278, + "logps/rejected": -1.8980352878570557, + "loss": 0.6179, + "odds_ratio_loss": 0.2865678369998932, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07257509976625443, + "rewards/margins": 0.1172284185886383, + "rewards/rejected": -0.18980352580547333, + "sft_loss": 0.7257510423660278, + "step": 980 + }, + { + "epoch": 1.418655097613883, + "grad_norm": 2.229503219289486, + "learning_rate": 7.10003094870628e-06, + "logits/chosen": 0.13261494040489197, + "logits/rejected": 0.061438750475645065, + "logps/chosen": -0.610904335975647, + "logps/rejected": -1.613142490386963, + "loss": 0.6427, + "odds_ratio_loss": 0.38070380687713623, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.061090435832738876, + "rewards/margins": 0.10022380948066711, + "rewards/rejected": -0.1613142490386963, + "sft_loss": 0.610904335975647, + "step": 981 + }, + { + "epoch": 1.4201012292118582, + "grad_norm": 4.653721481898526, + "learning_rate": 7.0980672512121925e-06, + "logits/chosen": 0.08789413422346115, + "logits/rejected": 0.12038551270961761, + "logps/chosen": -0.615592360496521, + "logps/rejected": -1.3449037075042725, + "loss": 0.645, + "odds_ratio_loss": 0.3649972677230835, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06155924126505852, + "rewards/margins": 0.07293112576007843, + "rewards/rejected": -0.13449037075042725, + "sft_loss": 0.615592360496521, + "step": 982 + }, + { + "epoch": 1.4215473608098337, + "grad_norm": 7.576850107740403, + "learning_rate": 7.096101685881821e-06, + "logits/chosen": 0.16142991185188293, + "logits/rejected": 0.14472821354866028, + "logps/chosen": -0.5334713459014893, + "logps/rejected": -1.8715717792510986, + "loss": 0.6974, + "odds_ratio_loss": 0.2439751923084259, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.053347136825323105, + "rewards/margins": 0.13381005823612213, + "rewards/rejected": -0.18715719878673553, + "sft_loss": 0.5334713459014893, + "step": 983 + }, + { + "epoch": 1.4229934924078091, + "grad_norm": 3.518443568999813, + "learning_rate": 7.094134253900212e-06, + "logits/chosen": 0.12732619047164917, + "logits/rejected": 0.24472256004810333, + "logps/chosen": -0.572356104850769, + "logps/rejected": -1.8468163013458252, + "loss": 0.5554, + "odds_ratio_loss": 0.30827876925468445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.057235606014728546, + "rewards/margins": 0.1274460256099701, + "rewards/rejected": -0.18468163907527924, + "sft_loss": 0.572356104850769, + "step": 984 + }, + { + "epoch": 1.4244396240057844, + "grad_norm": 2.719443297095395, + "learning_rate": 7.092164956453539e-06, + "logits/chosen": 0.23533201217651367, + "logits/rejected": 0.1809999793767929, + "logps/chosen": -0.5789510011672974, + "logps/rejected": -1.5870184898376465, + "loss": 0.6231, + "odds_ratio_loss": 0.33866560459136963, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05789510905742645, + "rewards/margins": 0.10080674290657043, + "rewards/rejected": -0.1587018370628357, + "sft_loss": 0.5789510011672974, + "step": 985 + }, + { + "epoch": 1.42588575560376, + "grad_norm": 5.778385803733022, + "learning_rate": 7.090193794729095e-06, + "logits/chosen": 0.008287119679152966, + "logits/rejected": 0.03246738016605377, + "logps/chosen": -0.5857095718383789, + "logps/rejected": -2.002685070037842, + "loss": 0.7155, + "odds_ratio_loss": 0.2693645656108856, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05857095867395401, + "rewards/margins": 0.14169755578041077, + "rewards/rejected": -0.20026850700378418, + "sft_loss": 0.5857095718383789, + "step": 986 + }, + { + "epoch": 1.4273318872017353, + "grad_norm": 2.484397723351678, + "learning_rate": 7.088220769915304e-06, + "logits/chosen": 0.04477142542600632, + "logits/rejected": 0.02228371426463127, + "logps/chosen": -0.6996700167655945, + "logps/rejected": -1.581622838973999, + "loss": 0.6102, + "odds_ratio_loss": 0.41134923696517944, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06996700912714005, + "rewards/margins": 0.08819527924060822, + "rewards/rejected": -0.15816228091716766, + "sft_loss": 0.6996700167655945, + "step": 987 + }, + { + "epoch": 1.4287780187997108, + "grad_norm": 3.1362962214467363, + "learning_rate": 7.086245883201709e-06, + "logits/chosen": -0.01240419689565897, + "logits/rejected": -0.013412795960903168, + "logps/chosen": -0.6005573272705078, + "logps/rejected": -1.2886971235275269, + "loss": 0.7403, + "odds_ratio_loss": 0.4563630223274231, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06005573272705078, + "rewards/margins": 0.0688139945268631, + "rewards/rejected": -0.12886972725391388, + "sft_loss": 0.6005573272705078, + "step": 988 + }, + { + "epoch": 1.4302241503976862, + "grad_norm": 2.7302199943502963, + "learning_rate": 7.084269135778976e-06, + "logits/chosen": 0.27440059185028076, + "logits/rejected": 0.17620548605918884, + "logps/chosen": -0.5781697034835815, + "logps/rejected": -1.6051629781723022, + "loss": 0.599, + "odds_ratio_loss": 0.3173554539680481, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.057816971093416214, + "rewards/margins": 0.10269933193922043, + "rewards/rejected": -0.16051630675792694, + "sft_loss": 0.5781697034835815, + "step": 989 + }, + { + "epoch": 1.4316702819956615, + "grad_norm": 2.211744246354969, + "learning_rate": 7.082290528838895e-06, + "logits/chosen": 0.19083952903747559, + "logits/rejected": 0.14187321066856384, + "logps/chosen": -0.6545434594154358, + "logps/rejected": -1.9486523866653442, + "loss": 0.5823, + "odds_ratio_loss": 0.32819491624832153, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06545434892177582, + "rewards/margins": 0.12941089272499084, + "rewards/rejected": -0.19486522674560547, + "sft_loss": 0.6545434594154358, + "step": 990 + }, + { + "epoch": 1.433116413593637, + "grad_norm": 2.8739397638479582, + "learning_rate": 7.080310063574374e-06, + "logits/chosen": 0.2413957715034485, + "logits/rejected": 0.2167605757713318, + "logps/chosen": -0.7022543549537659, + "logps/rejected": -1.8416874408721924, + "loss": 0.6775, + "odds_ratio_loss": 0.4233511984348297, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07022543251514435, + "rewards/margins": 0.11394332349300385, + "rewards/rejected": -0.1841687560081482, + "sft_loss": 0.7022543549537659, + "step": 991 + }, + { + "epoch": 1.4345625451916124, + "grad_norm": 4.233643812008143, + "learning_rate": 7.078327741179443e-06, + "logits/chosen": 0.31747961044311523, + "logits/rejected": 0.17522543668746948, + "logps/chosen": -0.526762843132019, + "logps/rejected": -1.6177246570587158, + "loss": 0.6189, + "odds_ratio_loss": 0.3080407977104187, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05267628654837608, + "rewards/margins": 0.10909618437290192, + "rewards/rejected": -0.1617724597454071, + "sft_loss": 0.526762843132019, + "step": 992 + }, + { + "epoch": 1.4360086767895879, + "grad_norm": 3.555697493007301, + "learning_rate": 7.076343562849253e-06, + "logits/chosen": 0.18947246670722961, + "logits/rejected": 0.11990557610988617, + "logps/chosen": -0.6089348793029785, + "logps/rejected": -3.1226184368133545, + "loss": 0.6423, + "odds_ratio_loss": 0.21913611888885498, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06089348718523979, + "rewards/margins": 0.25136834383010864, + "rewards/rejected": -0.31226181983947754, + "sft_loss": 0.6089348793029785, + "step": 993 + }, + { + "epoch": 1.4374548083875633, + "grad_norm": 2.4046589860985788, + "learning_rate": 7.074357529780071e-06, + "logits/chosen": 0.12996569275856018, + "logits/rejected": 0.2138558328151703, + "logps/chosen": -0.6799343228340149, + "logps/rejected": -1.2040917873382568, + "loss": 0.6446, + "odds_ratio_loss": 0.37377679347991943, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06799343228340149, + "rewards/margins": 0.05241573974490166, + "rewards/rejected": -0.12040917575359344, + "sft_loss": 0.6799343228340149, + "step": 994 + }, + { + "epoch": 1.4389009399855386, + "grad_norm": 2.5068905804152304, + "learning_rate": 7.072369643169284e-06, + "logits/chosen": 0.28199514746665955, + "logits/rejected": 0.1579236090183258, + "logps/chosen": -0.6818425059318542, + "logps/rejected": -1.374591588973999, + "loss": 0.6167, + "odds_ratio_loss": 0.4469667971134186, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0681842491030693, + "rewards/margins": 0.0692749172449112, + "rewards/rejected": -0.1374591588973999, + "sft_loss": 0.6818425059318542, + "step": 995 + }, + { + "epoch": 1.440347071583514, + "grad_norm": 2.8897518619748825, + "learning_rate": 7.070379904215396e-06, + "logits/chosen": 0.19853602349758148, + "logits/rejected": 0.11660370975732803, + "logps/chosen": -0.47910040616989136, + "logps/rejected": -1.618467092514038, + "loss": 0.591, + "odds_ratio_loss": 0.30910736322402954, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.047910042107105255, + "rewards/margins": 0.11393667757511139, + "rewards/rejected": -0.16184672713279724, + "sft_loss": 0.47910040616989136, + "step": 996 + }, + { + "epoch": 1.4417932031814895, + "grad_norm": 4.590190762321148, + "learning_rate": 7.0683883141180295e-06, + "logits/chosen": 0.16932472586631775, + "logits/rejected": 0.23921875655651093, + "logps/chosen": -0.5061404705047607, + "logps/rejected": -1.450538992881775, + "loss": 0.6689, + "odds_ratio_loss": 0.2410426139831543, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05061405152082443, + "rewards/margins": 0.09443984925746918, + "rewards/rejected": -0.145053893327713, + "sft_loss": 0.5061404705047607, + "step": 997 + }, + { + "epoch": 1.443239334779465, + "grad_norm": 2.3532961841410716, + "learning_rate": 7.06639487407792e-06, + "logits/chosen": 0.16211381554603577, + "logits/rejected": 0.14582449197769165, + "logps/chosen": -0.38793298602104187, + "logps/rejected": -1.8462269306182861, + "loss": 0.4933, + "odds_ratio_loss": 0.15614688396453857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038793303072452545, + "rewards/margins": 0.14582940936088562, + "rewards/rejected": -0.18462270498275757, + "sft_loss": 0.38793298602104187, + "step": 998 + }, + { + "epoch": 1.4446854663774404, + "grad_norm": 2.4403398082029564, + "learning_rate": 7.06439958529692e-06, + "logits/chosen": 0.24929878115653992, + "logits/rejected": 0.20911167562007904, + "logps/chosen": -0.5529198050498962, + "logps/rejected": -2.3418209552764893, + "loss": 0.5847, + "odds_ratio_loss": 0.20496883988380432, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.055291980504989624, + "rewards/margins": 0.17889009416103363, + "rewards/rejected": -0.23418208956718445, + "sft_loss": 0.5529198050498962, + "step": 999 + }, + { + "epoch": 1.4461315979754157, + "grad_norm": 2.645683265131475, + "learning_rate": 7.062402448977997e-06, + "logits/chosen": 0.3289748430252075, + "logits/rejected": 0.20327892899513245, + "logps/chosen": -0.5288233757019043, + "logps/rejected": -2.5970919132232666, + "loss": 0.6075, + "odds_ratio_loss": 0.23200330138206482, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05288233980536461, + "rewards/margins": 0.20682686567306519, + "rewards/rejected": -0.2597091794013977, + "sft_loss": 0.5288233757019043, + "step": 1000 + }, + { + "epoch": 1.4475777295733911, + "grad_norm": 2.7552725794710082, + "learning_rate": 7.0604034663252326e-06, + "logits/chosen": 0.13092300295829773, + "logits/rejected": 0.15963725745677948, + "logps/chosen": -0.37244975566864014, + "logps/rejected": -1.9033265113830566, + "loss": 0.5478, + "odds_ratio_loss": 0.23867227137088776, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.037244975566864014, + "rewards/margins": 0.15308767557144165, + "rewards/rejected": -0.19033265113830566, + "sft_loss": 0.37244975566864014, + "step": 1001 + }, + { + "epoch": 1.4490238611713666, + "grad_norm": 4.113907222826934, + "learning_rate": 7.058402638543819e-06, + "logits/chosen": 0.2033143788576126, + "logits/rejected": 0.23652614653110504, + "logps/chosen": -0.5085000991821289, + "logps/rejected": -1.6749879121780396, + "loss": 0.6418, + "odds_ratio_loss": 0.25362229347229004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05085001513361931, + "rewards/margins": 0.11664877831935883, + "rewards/rejected": -0.16749879717826843, + "sft_loss": 0.5085000991821289, + "step": 1002 + }, + { + "epoch": 1.450469992769342, + "grad_norm": 2.6623407856341283, + "learning_rate": 7.056399966840065e-06, + "logits/chosen": 0.18851172924041748, + "logits/rejected": 0.18449491262435913, + "logps/chosen": -0.5473241806030273, + "logps/rejected": -2.25048828125, + "loss": 0.5852, + "odds_ratio_loss": 0.2814570367336273, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.054732419550418854, + "rewards/margins": 0.1703163981437683, + "rewards/rejected": -0.22504881024360657, + "sft_loss": 0.5473241806030273, + "step": 1003 + }, + { + "epoch": 1.4519161243673175, + "grad_norm": 2.2832553471825956, + "learning_rate": 7.0543954524213885e-06, + "logits/chosen": 0.19176509976387024, + "logits/rejected": 0.17191959917545319, + "logps/chosen": -0.4636141359806061, + "logps/rejected": -2.2305266857147217, + "loss": 0.6144, + "odds_ratio_loss": 0.19034519791603088, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04636141285300255, + "rewards/margins": 0.17669126391410828, + "rewards/rejected": -0.22305268049240112, + "sft_loss": 0.4636141359806061, + "step": 1004 + }, + { + "epoch": 1.4533622559652928, + "grad_norm": 2.8776457997498053, + "learning_rate": 7.052389096496316e-06, + "logits/chosen": 0.1522367298603058, + "logits/rejected": 0.15638092160224915, + "logps/chosen": -0.7201976180076599, + "logps/rejected": -1.3056902885437012, + "loss": 0.6707, + "odds_ratio_loss": 0.4569261372089386, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07201976329088211, + "rewards/margins": 0.058549270033836365, + "rewards/rejected": -0.13056901097297668, + "sft_loss": 0.7201976180076599, + "step": 1005 + }, + { + "epoch": 1.4548083875632682, + "grad_norm": 3.901836611275651, + "learning_rate": 7.0503809002744895e-06, + "logits/chosen": 0.1361616551876068, + "logits/rejected": 0.1412799060344696, + "logps/chosen": -0.4152643084526062, + "logps/rejected": -1.949552059173584, + "loss": 0.6122, + "odds_ratio_loss": 0.2301965057849884, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0415264368057251, + "rewards/margins": 0.15342876315116882, + "rewards/rejected": -0.19495519995689392, + "sft_loss": 0.4152643084526062, + "step": 1006 + }, + { + "epoch": 1.4562545191612437, + "grad_norm": 2.2885218862113406, + "learning_rate": 7.048370864966658e-06, + "logits/chosen": 0.09770112484693527, + "logits/rejected": 0.16095119714736938, + "logps/chosen": -0.5029357671737671, + "logps/rejected": -1.595658540725708, + "loss": 0.5774, + "odds_ratio_loss": 0.27672871947288513, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05029357224702835, + "rewards/margins": 0.10927227884531021, + "rewards/rejected": -0.15956585109233856, + "sft_loss": 0.5029357671737671, + "step": 1007 + }, + { + "epoch": 1.457700650759219, + "grad_norm": 2.7412792848860024, + "learning_rate": 7.046358991784679e-06, + "logits/chosen": 0.3045928478240967, + "logits/rejected": 0.1931784301996231, + "logps/chosen": -0.5084326267242432, + "logps/rejected": -1.954370379447937, + "loss": 0.6619, + "odds_ratio_loss": 0.23789116740226746, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.050843268632888794, + "rewards/margins": 0.14459377527236938, + "rewards/rejected": -0.19543704390525818, + "sft_loss": 0.5084326267242432, + "step": 1008 + }, + { + "epoch": 1.4591467823571946, + "grad_norm": 2.6518257487154897, + "learning_rate": 7.044345281941517e-06, + "logits/chosen": 0.19667373597621918, + "logits/rejected": 0.026554403826594353, + "logps/chosen": -0.6434774398803711, + "logps/rejected": -1.9622483253479004, + "loss": 0.6395, + "odds_ratio_loss": 0.33053648471832275, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06434774398803711, + "rewards/margins": 0.1318770945072174, + "rewards/rejected": -0.19622483849525452, + "sft_loss": 0.6434774398803711, + "step": 1009 + }, + { + "epoch": 1.4605929139551699, + "grad_norm": 3.306538949734956, + "learning_rate": 7.042329736651247e-06, + "logits/chosen": 0.26415181159973145, + "logits/rejected": 0.1740158200263977, + "logps/chosen": -0.6276966333389282, + "logps/rejected": -2.046126127243042, + "loss": 0.6323, + "odds_ratio_loss": 0.3550504148006439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06276966631412506, + "rewards/margins": 0.14184294641017914, + "rewards/rejected": -0.2046126127243042, + "sft_loss": 0.6276966333389282, + "step": 1010 + }, + { + "epoch": 1.4620390455531453, + "grad_norm": 2.57776723354512, + "learning_rate": 7.040312357129047e-06, + "logits/chosen": 0.1909094750881195, + "logits/rejected": 0.24289001524448395, + "logps/chosen": -0.5022028684616089, + "logps/rejected": -1.813218116760254, + "loss": 0.581, + "odds_ratio_loss": 0.2843645215034485, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05022028833627701, + "rewards/margins": 0.1311015486717224, + "rewards/rejected": -0.18132182955741882, + "sft_loss": 0.5022028684616089, + "step": 1011 + }, + { + "epoch": 1.4634851771511208, + "grad_norm": 2.7761880864873154, + "learning_rate": 7.038293144591204e-06, + "logits/chosen": 0.20759516954421997, + "logits/rejected": 0.14761096239089966, + "logps/chosen": -0.577498733997345, + "logps/rejected": -1.8540127277374268, + "loss": 0.6301, + "odds_ratio_loss": 0.3021618127822876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.057749874889850616, + "rewards/margins": 0.1276514083147049, + "rewards/rejected": -0.18540127575397491, + "sft_loss": 0.577498733997345, + "step": 1012 + }, + { + "epoch": 1.464931308749096, + "grad_norm": 2.277520562849688, + "learning_rate": 7.036272100255109e-06, + "logits/chosen": 0.249737948179245, + "logits/rejected": 0.18658365309238434, + "logps/chosen": -0.5908975005149841, + "logps/rejected": -1.8330168724060059, + "loss": 0.6079, + "odds_ratio_loss": 0.3356654644012451, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05908975377678871, + "rewards/margins": 0.12421193718910217, + "rewards/rejected": -0.18330168724060059, + "sft_loss": 0.5908975005149841, + "step": 1013 + }, + { + "epoch": 1.4663774403470715, + "grad_norm": 2.5201576999568163, + "learning_rate": 7.034249225339255e-06, + "logits/chosen": 0.11815465986728668, + "logits/rejected": -0.03885069862008095, + "logps/chosen": -0.5782637000083923, + "logps/rejected": -2.2459661960601807, + "loss": 0.6193, + "odds_ratio_loss": 0.3164394497871399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05782637372612953, + "rewards/margins": 0.16677024960517883, + "rewards/rejected": -0.22459663450717926, + "sft_loss": 0.5782637000083923, + "step": 1014 + }, + { + "epoch": 1.467823571945047, + "grad_norm": 2.9866144671540003, + "learning_rate": 7.032224521063243e-06, + "logits/chosen": 0.23842084407806396, + "logits/rejected": 0.1553962379693985, + "logps/chosen": -0.4561951160430908, + "logps/rejected": -2.000938892364502, + "loss": 0.598, + "odds_ratio_loss": 0.20124277472496033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04561951383948326, + "rewards/margins": 0.15447436273097992, + "rewards/rejected": -0.20009386539459229, + "sft_loss": 0.4561951160430908, + "step": 1015 + }, + { + "epoch": 1.4692697035430224, + "grad_norm": 2.69333811024608, + "learning_rate": 7.030197988647774e-06, + "logits/chosen": 0.1152530089020729, + "logits/rejected": 0.0506892129778862, + "logps/chosen": -0.5420433282852173, + "logps/rejected": -2.2813198566436768, + "loss": 0.5693, + "odds_ratio_loss": 0.2778359651565552, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05420433729887009, + "rewards/margins": 0.1739276647567749, + "rewards/rejected": -0.2281319797039032, + "sft_loss": 0.5420433282852173, + "step": 1016 + }, + { + "epoch": 1.470715835140998, + "grad_norm": 2.325614102351017, + "learning_rate": 7.028169629314653e-06, + "logits/chosen": 0.10290016233921051, + "logits/rejected": 0.09782204031944275, + "logps/chosen": -0.5482643246650696, + "logps/rejected": -1.9553083181381226, + "loss": 0.6292, + "odds_ratio_loss": 0.24048736691474915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.054826442152261734, + "rewards/margins": 0.14070439338684082, + "rewards/rejected": -0.19553083181381226, + "sft_loss": 0.5482643246650696, + "step": 1017 + }, + { + "epoch": 1.4721619667389731, + "grad_norm": 2.272861105925204, + "learning_rate": 7.026139444286783e-06, + "logits/chosen": 0.12405059486627579, + "logits/rejected": 0.12237481027841568, + "logps/chosen": -0.6233857870101929, + "logps/rejected": -1.7711641788482666, + "loss": 0.5714, + "odds_ratio_loss": 0.30120688676834106, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06233857944607735, + "rewards/margins": 0.1147778332233429, + "rewards/rejected": -0.17711640894412994, + "sft_loss": 0.6233857870101929, + "step": 1018 + }, + { + "epoch": 1.4736080983369486, + "grad_norm": 3.7017997314571276, + "learning_rate": 7.0241074347881725e-06, + "logits/chosen": 0.20509743690490723, + "logits/rejected": 0.19157272577285767, + "logps/chosen": -0.5758723020553589, + "logps/rejected": -1.6775627136230469, + "loss": 0.571, + "odds_ratio_loss": 0.35033929347991943, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05758722871541977, + "rewards/margins": 0.11016905307769775, + "rewards/rejected": -0.16775627434253693, + "sft_loss": 0.5758723020553589, + "step": 1019 + }, + { + "epoch": 1.475054229934924, + "grad_norm": 3.740076627765479, + "learning_rate": 7.022073602043926e-06, + "logits/chosen": 0.31628650426864624, + "logits/rejected": 0.24296697974205017, + "logps/chosen": -0.5375529527664185, + "logps/rejected": -2.6180481910705566, + "loss": 0.6056, + "odds_ratio_loss": 0.3072022795677185, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.053755298256874084, + "rewards/margins": 0.20804953575134277, + "rewards/rejected": -0.26180481910705566, + "sft_loss": 0.5375529527664185, + "step": 1020 + }, + { + "epoch": 1.4765003615328995, + "grad_norm": 2.916082710897773, + "learning_rate": 7.020037947280249e-06, + "logits/chosen": 0.2506396174430847, + "logits/rejected": 0.11263955384492874, + "logps/chosen": -0.6686195135116577, + "logps/rejected": -1.8187741041183472, + "loss": 0.596, + "odds_ratio_loss": 0.3449232876300812, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06686195731163025, + "rewards/margins": 0.11501546204090118, + "rewards/rejected": -0.18187743425369263, + "sft_loss": 0.6686195135116577, + "step": 1021 + }, + { + "epoch": 1.477946493130875, + "grad_norm": 3.9390911957773134, + "learning_rate": 7.018000471724446e-06, + "logits/chosen": 0.10807211697101593, + "logits/rejected": 0.22087359428405762, + "logps/chosen": -0.5579795837402344, + "logps/rejected": -2.2236127853393555, + "loss": 0.6239, + "odds_ratio_loss": 0.2716135084629059, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05579796060919762, + "rewards/margins": 0.16656331717967987, + "rewards/rejected": -0.2223612666130066, + "sft_loss": 0.5579795837402344, + "step": 1022 + }, + { + "epoch": 1.4793926247288502, + "grad_norm": 2.5788918246446126, + "learning_rate": 7.01596117660492e-06, + "logits/chosen": 0.04662645608186722, + "logits/rejected": 0.1416071653366089, + "logps/chosen": -0.6649520397186279, + "logps/rejected": -1.8460800647735596, + "loss": 0.5739, + "odds_ratio_loss": 0.3914756178855896, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06649520993232727, + "rewards/margins": 0.11811280250549316, + "rewards/rejected": -0.18460801243782043, + "sft_loss": 0.6649520397186279, + "step": 1023 + }, + { + "epoch": 1.4808387563268257, + "grad_norm": 2.4501783550989713, + "learning_rate": 7.013920063151166e-06, + "logits/chosen": 0.1949535608291626, + "logits/rejected": 0.0820920318365097, + "logps/chosen": -0.6624947786331177, + "logps/rejected": -2.2682080268859863, + "loss": 0.6292, + "odds_ratio_loss": 0.3945419192314148, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06624948233366013, + "rewards/margins": 0.16057133674621582, + "rewards/rejected": -0.22682081162929535, + "sft_loss": 0.6624947786331177, + "step": 1024 + }, + { + "epoch": 1.4822848879248012, + "grad_norm": 2.2241593032463887, + "learning_rate": 7.011877132593781e-06, + "logits/chosen": 0.19115297496318817, + "logits/rejected": 0.19649362564086914, + "logps/chosen": -0.5488402843475342, + "logps/rejected": -1.9641447067260742, + "loss": 0.5042, + "odds_ratio_loss": 0.251476526260376, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05488403141498566, + "rewards/margins": 0.14153045415878296, + "rewards/rejected": -0.19641447067260742, + "sft_loss": 0.5488402843475342, + "step": 1025 + }, + { + "epoch": 1.4837310195227766, + "grad_norm": 2.9155833133769176, + "learning_rate": 7.009832386164456e-06, + "logits/chosen": 0.15561585128307343, + "logits/rejected": 0.19798195362091064, + "logps/chosen": -0.6533185243606567, + "logps/rejected": -1.3767894506454468, + "loss": 0.6188, + "odds_ratio_loss": 0.3969898521900177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0653318539261818, + "rewards/margins": 0.07234710454940796, + "rewards/rejected": -0.13767895102500916, + "sft_loss": 0.6533185243606567, + "step": 1026 + }, + { + "epoch": 1.485177151120752, + "grad_norm": 2.6169665641699913, + "learning_rate": 7.007785825095975e-06, + "logits/chosen": 0.23924939334392548, + "logits/rejected": 0.019013479351997375, + "logps/chosen": -0.521590530872345, + "logps/rejected": -2.0993692874908447, + "loss": 0.7109, + "odds_ratio_loss": 0.24000337719917297, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05215904861688614, + "rewards/margins": 0.15777787566184998, + "rewards/rejected": -0.2099369466304779, + "sft_loss": 0.521590530872345, + "step": 1027 + }, + { + "epoch": 1.4866232827187273, + "grad_norm": 2.572626461324335, + "learning_rate": 7.005737450622219e-06, + "logits/chosen": 0.299623966217041, + "logits/rejected": 0.24995410442352295, + "logps/chosen": -0.533481776714325, + "logps/rejected": -1.580140471458435, + "loss": 0.5954, + "odds_ratio_loss": 0.3327270746231079, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05334818363189697, + "rewards/margins": 0.1046658605337143, + "rewards/rejected": -0.15801402926445007, + "sft_loss": 0.533481776714325, + "step": 1028 + }, + { + "epoch": 1.4880694143167028, + "grad_norm": 3.5453397201056243, + "learning_rate": 7.003687263978158e-06, + "logits/chosen": 0.18590807914733887, + "logits/rejected": 0.2126408964395523, + "logps/chosen": -0.6051197052001953, + "logps/rejected": -2.3243978023529053, + "loss": 0.5805, + "odds_ratio_loss": 0.38674530386924744, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06051196902990341, + "rewards/margins": 0.171927809715271, + "rewards/rejected": -0.232439786195755, + "sft_loss": 0.6051197052001953, + "step": 1029 + }, + { + "epoch": 1.4895155459146783, + "grad_norm": 2.8003747097713227, + "learning_rate": 7.00163526639986e-06, + "logits/chosen": 0.14941300451755524, + "logits/rejected": 0.11249898374080658, + "logps/chosen": -0.7737295031547546, + "logps/rejected": -1.510383129119873, + "loss": 0.6962, + "odds_ratio_loss": 0.39545488357543945, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0773729458451271, + "rewards/margins": 0.07366538047790527, + "rewards/rejected": -0.15103831887245178, + "sft_loss": 0.7737295031547546, + "step": 1030 + }, + { + "epoch": 1.4909616775126535, + "grad_norm": 2.6587374392731475, + "learning_rate": 6.99958145912448e-06, + "logits/chosen": 0.18349449336528778, + "logits/rejected": 0.10890393704175949, + "logps/chosen": -0.5819097757339478, + "logps/rejected": -1.5076624155044556, + "loss": 0.5803, + "odds_ratio_loss": 0.40786978602409363, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.058190979063510895, + "rewards/margins": 0.09257525205612183, + "rewards/rejected": -0.15076623857021332, + "sft_loss": 0.5819097757339478, + "step": 1031 + }, + { + "epoch": 1.4924078091106292, + "grad_norm": 2.1458834769890203, + "learning_rate": 6.997525843390267e-06, + "logits/chosen": 0.0792083889245987, + "logits/rejected": 0.06461001932621002, + "logps/chosen": -0.6367547512054443, + "logps/rejected": -1.3028634786605835, + "loss": 0.6463, + "odds_ratio_loss": 0.3900574743747711, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06367547810077667, + "rewards/margins": 0.06661087274551392, + "rewards/rejected": -0.1302863508462906, + "sft_loss": 0.6367547512054443, + "step": 1032 + }, + { + "epoch": 1.4938539407086044, + "grad_norm": 2.4642215991009517, + "learning_rate": 6.995468420436559e-06, + "logits/chosen": 0.1517665535211563, + "logits/rejected": 0.15160293877124786, + "logps/chosen": -0.6312042474746704, + "logps/rejected": -2.0376627445220947, + "loss": 0.684, + "odds_ratio_loss": 0.3216949701309204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06312042474746704, + "rewards/margins": 0.1406458467245102, + "rewards/rejected": -0.20376628637313843, + "sft_loss": 0.6312042474746704, + "step": 1033 + }, + { + "epoch": 1.49530007230658, + "grad_norm": 2.5076819118432874, + "learning_rate": 6.993409191503783e-06, + "logits/chosen": 0.239852637052536, + "logits/rejected": 0.12884551286697388, + "logps/chosen": -0.4071224331855774, + "logps/rejected": -2.5471112728118896, + "loss": 0.6061, + "odds_ratio_loss": 0.25786054134368896, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04071224480867386, + "rewards/margins": 0.21399888396263123, + "rewards/rejected": -0.2547111213207245, + "sft_loss": 0.4071224331855774, + "step": 1034 + }, + { + "epoch": 1.4967462039045554, + "grad_norm": 4.028839475322059, + "learning_rate": 6.991348157833457e-06, + "logits/chosen": 0.10707180202007294, + "logits/rejected": 0.08338991552591324, + "logps/chosen": -0.5520052909851074, + "logps/rejected": -2.078350067138672, + "loss": 0.604, + "odds_ratio_loss": 0.3524037003517151, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05520053207874298, + "rewards/margins": 0.15263448655605316, + "rewards/rejected": -0.20783501863479614, + "sft_loss": 0.5520052909851074, + "step": 1035 + }, + { + "epoch": 1.4981923355025306, + "grad_norm": 2.3833532208842256, + "learning_rate": 6.9892853206681864e-06, + "logits/chosen": 0.15900209546089172, + "logits/rejected": 0.098246268928051, + "logps/chosen": -0.6327139139175415, + "logps/rejected": -1.6600315570831299, + "loss": 0.6718, + "odds_ratio_loss": 0.3264049291610718, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06327139586210251, + "rewards/margins": 0.10273175686597824, + "rewards/rejected": -0.16600315272808075, + "sft_loss": 0.6327139139175415, + "step": 1036 + }, + { + "epoch": 1.499638467100506, + "grad_norm": 4.421414597775642, + "learning_rate": 6.987220681251663e-06, + "logits/chosen": 0.0970354974269867, + "logits/rejected": 0.07867510616779327, + "logps/chosen": -0.5139919519424438, + "logps/rejected": -2.057021379470825, + "loss": 0.658, + "odds_ratio_loss": 0.21215665340423584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05139920115470886, + "rewards/margins": 0.1543029397726059, + "rewards/rejected": -0.20570214092731476, + "sft_loss": 0.5139919519424438, + "step": 1037 + }, + { + "epoch": 1.5010845986984815, + "grad_norm": 2.874565400720609, + "learning_rate": 6.985154240828665e-06, + "logits/chosen": 0.2262597382068634, + "logits/rejected": 0.1943497508764267, + "logps/chosen": -0.6259772181510925, + "logps/rejected": -2.125296115875244, + "loss": 0.7194, + "odds_ratio_loss": 0.4203740954399109, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06259772181510925, + "rewards/margins": 0.1499318927526474, + "rewards/rejected": -0.21252959966659546, + "sft_loss": 0.6259772181510925, + "step": 1038 + }, + { + "epoch": 1.502530730296457, + "grad_norm": 2.6558933441242214, + "learning_rate": 6.983086000645057e-06, + "logits/chosen": 0.07608388364315033, + "logits/rejected": 0.15391358733177185, + "logps/chosen": -0.7084933519363403, + "logps/rejected": -1.6010119915008545, + "loss": 0.6625, + "odds_ratio_loss": 0.30498605966567993, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07084932923316956, + "rewards/margins": 0.08925186842679977, + "rewards/rejected": -0.16010120511054993, + "sft_loss": 0.7084933519363403, + "step": 1039 + }, + { + "epoch": 1.5039768618944325, + "grad_norm": 2.6321061758591617, + "learning_rate": 6.981015961947788e-06, + "logits/chosen": 0.1412767767906189, + "logits/rejected": 0.13829348981380463, + "logps/chosen": -0.5603752136230469, + "logps/rejected": -2.2956314086914062, + "loss": 0.5655, + "odds_ratio_loss": 0.3607153296470642, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.056037526577711105, + "rewards/margins": 0.1735256314277649, + "rewards/rejected": -0.2295631468296051, + "sft_loss": 0.5603752136230469, + "step": 1040 + }, + { + "epoch": 1.5054229934924077, + "grad_norm": 3.0828053228799006, + "learning_rate": 6.978944125984895e-06, + "logits/chosen": 0.19295093417167664, + "logits/rejected": 0.14091432094573975, + "logps/chosen": -0.5975869297981262, + "logps/rejected": -2.6924171447753906, + "loss": 0.6764, + "odds_ratio_loss": 0.2563125789165497, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05975869297981262, + "rewards/margins": 0.20948302745819092, + "rewards/rejected": -0.26924172043800354, + "sft_loss": 0.5975869297981262, + "step": 1041 + }, + { + "epoch": 1.5068691250903832, + "grad_norm": 3.960558914590278, + "learning_rate": 6.976870494005492e-06, + "logits/chosen": 0.19148896634578705, + "logits/rejected": 0.0627632588148117, + "logps/chosen": -0.5534959435462952, + "logps/rejected": -2.0796003341674805, + "loss": 0.6236, + "odds_ratio_loss": 0.3466914892196655, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.055349595844745636, + "rewards/margins": 0.1526104211807251, + "rewards/rejected": -0.20796000957489014, + "sft_loss": 0.5534959435462952, + "step": 1042 + }, + { + "epoch": 1.5083152566883586, + "grad_norm": 2.361686297673546, + "learning_rate": 6.974795067259781e-06, + "logits/chosen": 0.0762028768658638, + "logits/rejected": 0.11023451387882233, + "logps/chosen": -0.5732434988021851, + "logps/rejected": -1.5401384830474854, + "loss": 0.6878, + "odds_ratio_loss": 0.29356640577316284, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0573243573307991, + "rewards/margins": 0.09668950736522675, + "rewards/rejected": -0.15401385724544525, + "sft_loss": 0.5732434988021851, + "step": 1043 + }, + { + "epoch": 1.509761388286334, + "grad_norm": 2.458530408887522, + "learning_rate": 6.972717846999046e-06, + "logits/chosen": 0.14236396551132202, + "logits/rejected": 0.10859329998493195, + "logps/chosen": -0.4144444465637207, + "logps/rejected": -1.902043104171753, + "loss": 0.622, + "odds_ratio_loss": 0.23717351257801056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04144444689154625, + "rewards/margins": 0.1487598568201065, + "rewards/rejected": -0.19020430743694305, + "sft_loss": 0.4144444465637207, + "step": 1044 + }, + { + "epoch": 1.5112075198843096, + "grad_norm": 2.3087641052869894, + "learning_rate": 6.97063883447565e-06, + "logits/chosen": 0.10838791728019714, + "logits/rejected": 0.11068686097860336, + "logps/chosen": -0.49994957447052, + "logps/rejected": -2.1742091178894043, + "loss": 0.5046, + "odds_ratio_loss": 0.2922423481941223, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04999496415257454, + "rewards/margins": 0.1674259603023529, + "rewards/rejected": -0.21742090582847595, + "sft_loss": 0.49994957447052, + "step": 1045 + }, + { + "epoch": 1.5126536514822848, + "grad_norm": 2.3997435361179327, + "learning_rate": 6.968558030943035e-06, + "logits/chosen": 0.045848749577999115, + "logits/rejected": 0.08902572095394135, + "logps/chosen": -0.6003144383430481, + "logps/rejected": -1.817967176437378, + "loss": 0.5955, + "odds_ratio_loss": 0.33355093002319336, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06003144755959511, + "rewards/margins": 0.12176527082920074, + "rewards/rejected": -0.18179671466350555, + "sft_loss": 0.6003144383430481, + "step": 1046 + }, + { + "epoch": 1.5140997830802603, + "grad_norm": 2.465062809755268, + "learning_rate": 6.966475437655728e-06, + "logits/chosen": 0.18887464702129364, + "logits/rejected": 0.1661260426044464, + "logps/chosen": -0.45803892612457275, + "logps/rejected": -1.855370044708252, + "loss": 0.5773, + "odds_ratio_loss": 0.2459450662136078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04580388963222504, + "rewards/margins": 0.13973310589790344, + "rewards/rejected": -0.18553701043128967, + "sft_loss": 0.45803892612457275, + "step": 1047 + }, + { + "epoch": 1.5155459146782357, + "grad_norm": 2.7122560174653088, + "learning_rate": 6.964391055869331e-06, + "logits/chosen": 0.20639120042324066, + "logits/rejected": 0.24513506889343262, + "logps/chosen": -0.5997314453125, + "logps/rejected": -1.2009388208389282, + "loss": 0.5093, + "odds_ratio_loss": 0.5465415120124817, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05997314304113388, + "rewards/margins": 0.06012075021862984, + "rewards/rejected": -0.12009389698505402, + "sft_loss": 0.5997314453125, + "step": 1048 + }, + { + "epoch": 1.516992046276211, + "grad_norm": 3.499958743730523, + "learning_rate": 6.962304886840526e-06, + "logits/chosen": 0.11560941487550735, + "logits/rejected": 0.1723850816488266, + "logps/chosen": -0.5111711025238037, + "logps/rejected": -1.831732153892517, + "loss": 0.6253, + "odds_ratio_loss": 0.19189368188381195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05111711472272873, + "rewards/margins": 0.1320561021566391, + "rewards/rejected": -0.18317320942878723, + "sft_loss": 0.5111711025238037, + "step": 1049 + }, + { + "epoch": 1.5184381778741867, + "grad_norm": 2.490232127901872, + "learning_rate": 6.960216931827072e-06, + "logits/chosen": 0.13617388904094696, + "logits/rejected": 0.10104858875274658, + "logps/chosen": -0.8242969512939453, + "logps/rejected": -1.2352579832077026, + "loss": 0.6404, + "odds_ratio_loss": 0.5068786144256592, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08242969959974289, + "rewards/margins": 0.04109610244631767, + "rewards/rejected": -0.12352579832077026, + "sft_loss": 0.8242969512939453, + "step": 1050 + }, + { + "epoch": 1.519884309472162, + "grad_norm": 2.796815752969695, + "learning_rate": 6.958127192087805e-06, + "logits/chosen": 0.2978774905204773, + "logits/rejected": 0.21057304739952087, + "logps/chosen": -0.498077392578125, + "logps/rejected": -1.7246334552764893, + "loss": 0.5587, + "odds_ratio_loss": 0.334107905626297, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04980773851275444, + "rewards/margins": 0.12265560030937195, + "rewards/rejected": -0.1724633276462555, + "sft_loss": 0.498077392578125, + "step": 1051 + }, + { + "epoch": 1.5213304410701374, + "grad_norm": 2.2782092009922708, + "learning_rate": 6.956035668882636e-06, + "logits/chosen": 0.010676529258489609, + "logits/rejected": 0.0010472461581230164, + "logps/chosen": -0.6882982850074768, + "logps/rejected": -2.3437445163726807, + "loss": 0.6361, + "odds_ratio_loss": 0.33234259486198425, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06882983446121216, + "rewards/margins": 0.16554459929466248, + "rewards/rejected": -0.23437443375587463, + "sft_loss": 0.6882982850074768, + "step": 1052 + }, + { + "epoch": 1.5227765726681128, + "grad_norm": 3.2140920809453655, + "learning_rate": 6.953942363472554e-06, + "logits/chosen": 0.13342228531837463, + "logits/rejected": 0.05354118347167969, + "logps/chosen": -0.6020887494087219, + "logps/rejected": -2.461318254470825, + "loss": 0.6204, + "odds_ratio_loss": 0.32788577675819397, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06020887941122055, + "rewards/margins": 0.18592296540737152, + "rewards/rejected": -0.24613183736801147, + "sft_loss": 0.6020887494087219, + "step": 1053 + }, + { + "epoch": 1.524222704266088, + "grad_norm": 2.5019712775378666, + "learning_rate": 6.951847277119618e-06, + "logits/chosen": 0.2589821219444275, + "logits/rejected": 0.16210149228572845, + "logps/chosen": -0.510164737701416, + "logps/rejected": -2.564856767654419, + "loss": 0.5104, + "odds_ratio_loss": 0.23116162419319153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05101647228002548, + "rewards/margins": 0.20546920597553253, + "rewards/rejected": -0.2564856708049774, + "sft_loss": 0.510164737701416, + "step": 1054 + }, + { + "epoch": 1.5256688358640638, + "grad_norm": 2.152655482579084, + "learning_rate": 6.949750411086965e-06, + "logits/chosen": 0.34317436814308167, + "logits/rejected": 0.10291274636983871, + "logps/chosen": -0.7262193560600281, + "logps/rejected": -1.5096874237060547, + "loss": 0.7065, + "odds_ratio_loss": 0.4528844952583313, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07262193411588669, + "rewards/margins": 0.07834681868553162, + "rewards/rejected": -0.1509687602519989, + "sft_loss": 0.7262193560600281, + "step": 1055 + }, + { + "epoch": 1.527114967462039, + "grad_norm": 3.50447530153194, + "learning_rate": 6.947651766638804e-06, + "logits/chosen": 0.12302163243293762, + "logits/rejected": 0.13556215167045593, + "logps/chosen": -0.5117670297622681, + "logps/rejected": -2.444344997406006, + "loss": 0.5551, + "odds_ratio_loss": 0.15113696455955505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.051176704466342926, + "rewards/margins": 0.19325780868530273, + "rewards/rejected": -0.24443453550338745, + "sft_loss": 0.5117670297622681, + "step": 1056 + }, + { + "epoch": 1.5285610990600145, + "grad_norm": 2.435293117738104, + "learning_rate": 6.945551345040414e-06, + "logits/chosen": 0.15510180592536926, + "logits/rejected": 0.11878678947687149, + "logps/chosen": -0.5824716687202454, + "logps/rejected": -1.9384113550186157, + "loss": 0.5609, + "odds_ratio_loss": 0.35332125425338745, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.058247171342372894, + "rewards/margins": 0.1355939656496048, + "rewards/rejected": -0.1938411295413971, + "sft_loss": 0.5824716687202454, + "step": 1057 + }, + { + "epoch": 1.53000723065799, + "grad_norm": 2.588323654734541, + "learning_rate": 6.943449147558148e-06, + "logits/chosen": 0.2299852967262268, + "logits/rejected": 0.2677266597747803, + "logps/chosen": -0.4605291187763214, + "logps/rejected": -1.5798819065093994, + "loss": 0.6116, + "odds_ratio_loss": 0.2766846716403961, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04605291038751602, + "rewards/margins": 0.11193528026342392, + "rewards/rejected": -0.15798819065093994, + "sft_loss": 0.4605291187763214, + "step": 1058 + }, + { + "epoch": 1.5314533622559652, + "grad_norm": 3.837009839441936, + "learning_rate": 6.941345175459428e-06, + "logits/chosen": 0.1960982233285904, + "logits/rejected": 0.1517314910888672, + "logps/chosen": -0.6805843114852905, + "logps/rejected": -1.9845657348632812, + "loss": 0.6312, + "odds_ratio_loss": 0.44597482681274414, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06805843114852905, + "rewards/margins": 0.13039812445640564, + "rewards/rejected": -0.1984565705060959, + "sft_loss": 0.6805843114852905, + "step": 1059 + }, + { + "epoch": 1.5328994938539409, + "grad_norm": 3.5474173037398278, + "learning_rate": 6.939239430012747e-06, + "logits/chosen": 0.1270655393600464, + "logits/rejected": 0.16382458806037903, + "logps/chosen": -0.6372203230857849, + "logps/rejected": -1.725688099861145, + "loss": 0.5645, + "odds_ratio_loss": 0.3358212411403656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06372203677892685, + "rewards/margins": 0.10884677618741989, + "rewards/rejected": -0.17256881296634674, + "sft_loss": 0.6372203230857849, + "step": 1060 + }, + { + "epoch": 1.534345625451916, + "grad_norm": 2.6777737468006597, + "learning_rate": 6.937131912487666e-06, + "logits/chosen": 0.12652812898159027, + "logits/rejected": 0.17299723625183105, + "logps/chosen": -0.5954207181930542, + "logps/rejected": -1.8514387607574463, + "loss": 0.6188, + "odds_ratio_loss": 0.2636999487876892, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05954207852482796, + "rewards/margins": 0.12560181319713593, + "rewards/rejected": -0.18514388799667358, + "sft_loss": 0.5954207181930542, + "step": 1061 + }, + { + "epoch": 1.5357917570498916, + "grad_norm": 3.721568398574073, + "learning_rate": 6.935022624154818e-06, + "logits/chosen": 0.014476214535534382, + "logits/rejected": 0.025586508214473724, + "logps/chosen": -0.6806229948997498, + "logps/rejected": -2.057115077972412, + "loss": 0.6268, + "odds_ratio_loss": 0.3748878836631775, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06806230545043945, + "rewards/margins": 0.13764920830726624, + "rewards/rejected": -0.2057115137577057, + "sft_loss": 0.6806229948997498, + "step": 1062 + }, + { + "epoch": 1.537237888647867, + "grad_norm": 3.415662609051976, + "learning_rate": 6.9329115662858965e-06, + "logits/chosen": 0.2381383776664734, + "logits/rejected": 0.13723555207252502, + "logps/chosen": -0.5097121000289917, + "logps/rejected": -2.628650188446045, + "loss": 0.617, + "odds_ratio_loss": 0.269598126411438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05097120627760887, + "rewards/margins": 0.21189382672309875, + "rewards/rejected": -0.2628650367259979, + "sft_loss": 0.5097121000289917, + "step": 1063 + }, + { + "epoch": 1.5386840202458423, + "grad_norm": 2.3924126847399028, + "learning_rate": 6.9307987401536694e-06, + "logits/chosen": 0.22002063691616058, + "logits/rejected": 0.19519548118114471, + "logps/chosen": -0.516993522644043, + "logps/rejected": -2.4856157302856445, + "loss": 0.5586, + "odds_ratio_loss": 0.23298753798007965, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05169935151934624, + "rewards/margins": 0.19686222076416016, + "rewards/rejected": -0.2485615760087967, + "sft_loss": 0.516993522644043, + "step": 1064 + }, + { + "epoch": 1.5401301518438177, + "grad_norm": 2.492354637645805, + "learning_rate": 6.928684147031967e-06, + "logits/chosen": 0.20798169076442719, + "logits/rejected": 0.20337769389152527, + "logps/chosen": -0.5130513310432434, + "logps/rejected": -1.760741949081421, + "loss": 0.6231, + "odds_ratio_loss": 0.29917263984680176, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0513051338493824, + "rewards/margins": 0.12476906925439835, + "rewards/rejected": -0.17607420682907104, + "sft_loss": 0.5130513310432434, + "step": 1065 + }, + { + "epoch": 1.5415762834417932, + "grad_norm": 2.8580063984243185, + "learning_rate": 6.926567788195683e-06, + "logits/chosen": 0.12441393733024597, + "logits/rejected": 0.12534169852733612, + "logps/chosen": -0.8271850943565369, + "logps/rejected": -1.329588770866394, + "loss": 0.6965, + "odds_ratio_loss": 0.5016704797744751, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08271851390600204, + "rewards/margins": 0.05024036392569542, + "rewards/rejected": -0.13295888900756836, + "sft_loss": 0.8271850943565369, + "step": 1066 + }, + { + "epoch": 1.5430224150397687, + "grad_norm": 2.088204535572692, + "learning_rate": 6.9244496649207814e-06, + "logits/chosen": 0.29973694682121277, + "logits/rejected": 0.19213856756687164, + "logps/chosen": -0.476146399974823, + "logps/rejected": -2.555558204650879, + "loss": 0.5292, + "odds_ratio_loss": 0.2622685432434082, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04761463776230812, + "rewards/margins": 0.2079411745071411, + "rewards/rejected": -0.25555580854415894, + "sft_loss": 0.476146399974823, + "step": 1067 + }, + { + "epoch": 1.5444685466377441, + "grad_norm": 2.6496817862444964, + "learning_rate": 6.922329778484284e-06, + "logits/chosen": 0.12427856773138046, + "logits/rejected": -0.0361967608332634, + "logps/chosen": -0.7265552282333374, + "logps/rejected": -1.805168867111206, + "loss": 0.6306, + "odds_ratio_loss": 0.512405514717102, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07265552878379822, + "rewards/margins": 0.10786135494709015, + "rewards/rejected": -0.18051689863204956, + "sft_loss": 0.7265552282333374, + "step": 1068 + }, + { + "epoch": 1.5459146782357194, + "grad_norm": 2.3428447174051024, + "learning_rate": 6.920208130164279e-06, + "logits/chosen": 0.1944482922554016, + "logits/rejected": 0.12993068993091583, + "logps/chosen": -0.5531301498413086, + "logps/rejected": -1.8621083498001099, + "loss": 0.6635, + "odds_ratio_loss": 0.31757286190986633, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05531301349401474, + "rewards/margins": 0.13089781999588013, + "rewards/rejected": -0.18621084094047546, + "sft_loss": 0.5531301498413086, + "step": 1069 + }, + { + "epoch": 1.5473608098336948, + "grad_norm": 2.4592542393577137, + "learning_rate": 6.9180847212399185e-06, + "logits/chosen": 0.11052542924880981, + "logits/rejected": 0.12636902928352356, + "logps/chosen": -0.5426141023635864, + "logps/rejected": -2.2695884704589844, + "loss": 0.6373, + "odds_ratio_loss": 0.26784971356391907, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05426141619682312, + "rewards/margins": 0.17269743978977203, + "rewards/rejected": -0.22695885598659515, + "sft_loss": 0.5426141023635864, + "step": 1070 + }, + { + "epoch": 1.5488069414316703, + "grad_norm": 2.171573456977925, + "learning_rate": 6.91595955299141e-06, + "logits/chosen": 0.19434921443462372, + "logits/rejected": 0.1056637316942215, + "logps/chosen": -0.5363348722457886, + "logps/rejected": -1.7876577377319336, + "loss": 0.5617, + "odds_ratio_loss": 0.32317835092544556, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05363348871469498, + "rewards/margins": 0.12513227760791779, + "rewards/rejected": -0.17876575887203217, + "sft_loss": 0.5363348722457886, + "step": 1071 + }, + { + "epoch": 1.5502530730296455, + "grad_norm": 3.1058433998181942, + "learning_rate": 6.913832626700027e-06, + "logits/chosen": 0.02380383014678955, + "logits/rejected": 0.06460568308830261, + "logps/chosen": -0.5088622570037842, + "logps/rejected": -2.6462490558624268, + "loss": 0.5913, + "odds_ratio_loss": 0.242878720164299, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05088622868061066, + "rewards/margins": 0.21373867988586426, + "rewards/rejected": -0.2646248936653137, + "sft_loss": 0.5088622570037842, + "step": 1072 + }, + { + "epoch": 1.5516992046276212, + "grad_norm": 2.543394825286057, + "learning_rate": 6.911703943648101e-06, + "logits/chosen": 0.12439356744289398, + "logits/rejected": 0.07908182591199875, + "logps/chosen": -0.7427908182144165, + "logps/rejected": -1.9743529558181763, + "loss": 0.6456, + "odds_ratio_loss": 0.44264400005340576, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07427908480167389, + "rewards/margins": 0.12315621227025986, + "rewards/rejected": -0.19743528962135315, + "sft_loss": 0.7427908182144165, + "step": 1073 + }, + { + "epoch": 1.5531453362255965, + "grad_norm": 2.859886564508446, + "learning_rate": 6.909573505119022e-06, + "logits/chosen": 0.18945147097110748, + "logits/rejected": 0.028836995363235474, + "logps/chosen": -0.509463906288147, + "logps/rejected": -2.2200417518615723, + "loss": 0.5975, + "odds_ratio_loss": 0.354769229888916, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05094639211893082, + "rewards/margins": 0.171057790517807, + "rewards/rejected": -0.22200417518615723, + "sft_loss": 0.509463906288147, + "step": 1074 + }, + { + "epoch": 1.554591467823572, + "grad_norm": 2.4603604127071237, + "learning_rate": 6.907441312397242e-06, + "logits/chosen": 0.07920490205287933, + "logits/rejected": 0.10876142978668213, + "logps/chosen": -0.6252812743186951, + "logps/rejected": -1.5969560146331787, + "loss": 0.6832, + "odds_ratio_loss": 0.32766029238700867, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06252812594175339, + "rewards/margins": 0.0971674695611, + "rewards/rejected": -0.1596955955028534, + "sft_loss": 0.6252812743186951, + "step": 1075 + }, + { + "epoch": 1.5560375994215474, + "grad_norm": 2.718584331639608, + "learning_rate": 6.905307366768266e-06, + "logits/chosen": 0.09336543083190918, + "logits/rejected": 0.10105445981025696, + "logps/chosen": -0.7363986968994141, + "logps/rejected": -1.756248116493225, + "loss": 0.6651, + "odds_ratio_loss": 0.4169667363166809, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0736398696899414, + "rewards/margins": 0.10198494046926498, + "rewards/rejected": -0.175624817609787, + "sft_loss": 0.7363986968994141, + "step": 1076 + }, + { + "epoch": 1.5574837310195226, + "grad_norm": 2.5631268510701535, + "learning_rate": 6.903171669518657e-06, + "logits/chosen": 0.21750575304031372, + "logits/rejected": 0.13447445631027222, + "logps/chosen": -0.5367184281349182, + "logps/rejected": -3.1012187004089355, + "loss": 0.6973, + "odds_ratio_loss": 0.31506919860839844, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05367184057831764, + "rewards/margins": 0.2564500570297241, + "rewards/rejected": -0.31012189388275146, + "sft_loss": 0.5367184281349182, + "step": 1077 + }, + { + "epoch": 1.5589298626174983, + "grad_norm": 2.749705246651489, + "learning_rate": 6.901034221936037e-06, + "logits/chosen": 0.21169041097164154, + "logits/rejected": 0.12119154632091522, + "logps/chosen": -0.3760773539543152, + "logps/rejected": -2.7587168216705322, + "loss": 0.4959, + "odds_ratio_loss": 0.24344316124916077, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03760773688554764, + "rewards/margins": 0.23826396465301514, + "rewards/rejected": -0.2758716940879822, + "sft_loss": 0.3760773539543152, + "step": 1078 + }, + { + "epoch": 1.5603759942154736, + "grad_norm": 2.376515470109904, + "learning_rate": 6.898895025309078e-06, + "logits/chosen": 0.25765448808670044, + "logits/rejected": 0.2227647304534912, + "logps/chosen": -0.5679758191108704, + "logps/rejected": -2.064207077026367, + "loss": 0.6137, + "odds_ratio_loss": 0.2876105308532715, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.056797582656145096, + "rewards/margins": 0.14962315559387207, + "rewards/rejected": -0.20642071962356567, + "sft_loss": 0.5679758191108704, + "step": 1079 + }, + { + "epoch": 1.561822125813449, + "grad_norm": 2.409181380059003, + "learning_rate": 6.896754080927515e-06, + "logits/chosen": 0.25729578733444214, + "logits/rejected": 0.1790364384651184, + "logps/chosen": -0.6595759391784668, + "logps/rejected": -3.047138214111328, + "loss": 0.6201, + "odds_ratio_loss": 0.4079166352748871, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06595759838819504, + "rewards/margins": 0.2387562245130539, + "rewards/rejected": -0.30471381545066833, + "sft_loss": 0.6595759391784668, + "step": 1080 + }, + { + "epoch": 1.5632682574114245, + "grad_norm": 2.841839465101232, + "learning_rate": 6.894611390082125e-06, + "logits/chosen": 0.3018430471420288, + "logits/rejected": 0.23976141214370728, + "logps/chosen": -0.4250287413597107, + "logps/rejected": -2.774780511856079, + "loss": 0.6541, + "odds_ratio_loss": 0.2535403370857239, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04250287264585495, + "rewards/margins": 0.23497521877288818, + "rewards/rejected": -0.27747809886932373, + "sft_loss": 0.4250287413597107, + "step": 1081 + }, + { + "epoch": 1.5647143890093997, + "grad_norm": 2.3848242853645742, + "learning_rate": 6.892466954064748e-06, + "logits/chosen": 0.17624327540397644, + "logits/rejected": 0.035079099237918854, + "logps/chosen": -0.7374420166015625, + "logps/rejected": -2.1065616607666016, + "loss": 0.6227, + "odds_ratio_loss": 0.5102405548095703, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07374420762062073, + "rewards/margins": 0.13691195845603943, + "rewards/rejected": -0.21065616607666016, + "sft_loss": 0.7374420166015625, + "step": 1082 + }, + { + "epoch": 1.5661605206073754, + "grad_norm": 2.2507360178739555, + "learning_rate": 6.890320774168272e-06, + "logits/chosen": 0.09431849420070648, + "logits/rejected": 0.09603389352560043, + "logps/chosen": -0.671610414981842, + "logps/rejected": -2.1536717414855957, + "loss": 0.6743, + "odds_ratio_loss": 0.33693233132362366, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06716103851795197, + "rewards/margins": 0.14820614457130432, + "rewards/rejected": -0.2153671830892563, + "sft_loss": 0.671610414981842, + "step": 1083 + }, + { + "epoch": 1.5676066522053507, + "grad_norm": 4.758544005698199, + "learning_rate": 6.8881728516866365e-06, + "logits/chosen": 0.06953014433383942, + "logits/rejected": 0.10708625614643097, + "logps/chosen": -0.5058231353759766, + "logps/rejected": -2.6862740516662598, + "loss": 0.5229, + "odds_ratio_loss": 0.2551036477088928, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.050582315772771835, + "rewards/margins": 0.21804511547088623, + "rewards/rejected": -0.268627405166626, + "sft_loss": 0.5058231353759766, + "step": 1084 + }, + { + "epoch": 1.5690527838033261, + "grad_norm": 2.8356454390086867, + "learning_rate": 6.886023187914831e-06, + "logits/chosen": 0.1127920001745224, + "logits/rejected": 0.04135732352733612, + "logps/chosen": -0.6294536590576172, + "logps/rejected": -2.6099953651428223, + "loss": 0.5941, + "odds_ratio_loss": 0.38581383228302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06294536590576172, + "rewards/margins": 0.19805417954921722, + "rewards/rejected": -0.26099956035614014, + "sft_loss": 0.6294536590576172, + "step": 1085 + }, + { + "epoch": 1.5704989154013016, + "grad_norm": 2.409003843426103, + "learning_rate": 6.8838717841488995e-06, + "logits/chosen": 0.18301844596862793, + "logits/rejected": 0.1595679372549057, + "logps/chosen": -0.6605107188224792, + "logps/rejected": -2.446531295776367, + "loss": 0.6903, + "odds_ratio_loss": 0.2724457085132599, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06605107337236404, + "rewards/margins": 0.17860206961631775, + "rewards/rejected": -0.2446531355381012, + "sft_loss": 0.6605107188224792, + "step": 1086 + }, + { + "epoch": 1.5719450469992768, + "grad_norm": 2.439223193709018, + "learning_rate": 6.881718641685926e-06, + "logits/chosen": 0.14739452302455902, + "logits/rejected": 0.11320450901985168, + "logps/chosen": -0.5645332932472229, + "logps/rejected": -3.358351945877075, + "loss": 0.5989, + "odds_ratio_loss": 0.2953460216522217, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05645333230495453, + "rewards/margins": 0.2793818712234497, + "rewards/rejected": -0.33583518862724304, + "sft_loss": 0.5645332932472229, + "step": 1087 + }, + { + "epoch": 1.5733911785972523, + "grad_norm": 2.3436482899115942, + "learning_rate": 6.879563761824052e-06, + "logits/chosen": 0.05163494125008583, + "logits/rejected": 0.04702293127775192, + "logps/chosen": -0.7239556312561035, + "logps/rejected": -2.1285696029663086, + "loss": 0.6696, + "odds_ratio_loss": 0.37814947962760925, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07239556312561035, + "rewards/margins": 0.14046140015125275, + "rewards/rejected": -0.2128569781780243, + "sft_loss": 0.7239556312561035, + "step": 1088 + }, + { + "epoch": 1.5748373101952278, + "grad_norm": 2.36411335058339, + "learning_rate": 6.877407145862461e-06, + "logits/chosen": 0.18641921877861023, + "logits/rejected": 0.05207761377096176, + "logps/chosen": -0.5632745027542114, + "logps/rejected": -2.345430612564087, + "loss": 0.6053, + "odds_ratio_loss": 0.3601348400115967, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0563274510204792, + "rewards/margins": 0.1782156229019165, + "rewards/rejected": -0.2345430552959442, + "sft_loss": 0.5632745027542114, + "step": 1089 + }, + { + "epoch": 1.5762834417932032, + "grad_norm": 3.1847878741876943, + "learning_rate": 6.875248795101386e-06, + "logits/chosen": 0.20281922817230225, + "logits/rejected": 0.14336425065994263, + "logps/chosen": -0.582189679145813, + "logps/rejected": -1.8264280557632446, + "loss": 0.5495, + "odds_ratio_loss": 0.3409522473812103, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05821897089481354, + "rewards/margins": 0.12442383170127869, + "rewards/rejected": -0.18264278769493103, + "sft_loss": 0.582189679145813, + "step": 1090 + }, + { + "epoch": 1.5777295733911787, + "grad_norm": 3.329674278967477, + "learning_rate": 6.873088710842103e-06, + "logits/chosen": 0.07660141587257385, + "logits/rejected": 0.06340264528989792, + "logps/chosen": -0.49803632497787476, + "logps/rejected": -2.5776054859161377, + "loss": 0.575, + "odds_ratio_loss": 0.2812206745147705, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.049803633242845535, + "rewards/margins": 0.20795691013336182, + "rewards/rejected": -0.25776055455207825, + "sft_loss": 0.49803632497787476, + "step": 1091 + }, + { + "epoch": 1.579175704989154, + "grad_norm": 3.3946195116434192, + "learning_rate": 6.870926894386936e-06, + "logits/chosen": 0.14749659597873688, + "logits/rejected": 0.03280261904001236, + "logps/chosen": -0.3851412236690521, + "logps/rejected": -3.3120474815368652, + "loss": 0.5268, + "odds_ratio_loss": 0.21843570470809937, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03851412236690521, + "rewards/margins": 0.292690634727478, + "rewards/rejected": -0.33120474219322205, + "sft_loss": 0.3851412236690521, + "step": 1092 + }, + { + "epoch": 1.5806218365871294, + "grad_norm": 2.263707967091026, + "learning_rate": 6.868763347039252e-06, + "logits/chosen": 0.12011094391345978, + "logits/rejected": 0.0620017871260643, + "logps/chosen": -0.5656850934028625, + "logps/rejected": -2.0197243690490723, + "loss": 0.5298, + "odds_ratio_loss": 0.33666715025901794, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.056568510830402374, + "rewards/margins": 0.1454039216041565, + "rewards/rejected": -0.20197243988513947, + "sft_loss": 0.5656850934028625, + "step": 1093 + }, + { + "epoch": 1.5820679681851049, + "grad_norm": 3.0913874233343446, + "learning_rate": 6.8665980701034604e-06, + "logits/chosen": 0.17975221574306488, + "logits/rejected": 0.07172837853431702, + "logps/chosen": -0.4119968116283417, + "logps/rejected": -2.78731632232666, + "loss": 0.6669, + "odds_ratio_loss": 0.3397271931171417, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04119968041777611, + "rewards/margins": 0.23753196001052856, + "rewards/rejected": -0.27873164415359497, + "sft_loss": 0.4119968116283417, + "step": 1094 + }, + { + "epoch": 1.58351409978308, + "grad_norm": 3.449949589857646, + "learning_rate": 6.864431064885018e-06, + "logits/chosen": 0.10938216745853424, + "logits/rejected": 0.12328256666660309, + "logps/chosen": -0.724564254283905, + "logps/rejected": -2.3423516750335693, + "loss": 0.6826, + "odds_ratio_loss": 0.3296443819999695, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07245641946792603, + "rewards/margins": 0.1617787629365921, + "rewards/rejected": -0.23423519730567932, + "sft_loss": 0.724564254283905, + "step": 1095 + }, + { + "epoch": 1.5849602313810558, + "grad_norm": 2.585591290579989, + "learning_rate": 6.862262332690416e-06, + "logits/chosen": 0.1535097360610962, + "logits/rejected": 0.0930045023560524, + "logps/chosen": -0.5395622253417969, + "logps/rejected": -2.67804217338562, + "loss": 0.6028, + "odds_ratio_loss": 0.2423020452260971, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05395621806383133, + "rewards/margins": 0.2138480246067047, + "rewards/rejected": -0.26780423521995544, + "sft_loss": 0.5395622253417969, + "step": 1096 + }, + { + "epoch": 1.586406362979031, + "grad_norm": 2.731714970233483, + "learning_rate": 6.860091874827196e-06, + "logits/chosen": 0.15666761994361877, + "logits/rejected": 0.1512334793806076, + "logps/chosen": -0.49038049578666687, + "logps/rejected": -2.745338201522827, + "loss": 0.6536, + "odds_ratio_loss": 0.3623482286930084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04903804883360863, + "rewards/margins": 0.22549577057361603, + "rewards/rejected": -0.27453380823135376, + "sft_loss": 0.49038049578666687, + "step": 1097 + }, + { + "epoch": 1.5878524945770065, + "grad_norm": 2.586427523762545, + "learning_rate": 6.85791969260393e-06, + "logits/chosen": 0.18899771571159363, + "logits/rejected": 0.17124459147453308, + "logps/chosen": -0.5725775957107544, + "logps/rejected": -2.258521556854248, + "loss": 0.5769, + "odds_ratio_loss": 0.34468019008636475, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0572577565908432, + "rewards/margins": 0.1685943752527237, + "rewards/rejected": -0.2258521318435669, + "sft_loss": 0.5725775957107544, + "step": 1098 + }, + { + "epoch": 1.589298626174982, + "grad_norm": 2.359047347217126, + "learning_rate": 6.855745787330238e-06, + "logits/chosen": 0.19358870387077332, + "logits/rejected": 0.17605549097061157, + "logps/chosen": -0.5376795530319214, + "logps/rejected": -2.1253416538238525, + "loss": 0.5648, + "odds_ratio_loss": 0.33196595311164856, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05376795679330826, + "rewards/margins": 0.15876621007919312, + "rewards/rejected": -0.21253415942192078, + "sft_loss": 0.5376795530319214, + "step": 1099 + }, + { + "epoch": 1.5907447577729572, + "grad_norm": 2.9412329763848697, + "learning_rate": 6.853570160316777e-06, + "logits/chosen": 0.11650137603282928, + "logits/rejected": 0.1291441023349762, + "logps/chosen": -0.6347838640213013, + "logps/rejected": -1.5328726768493652, + "loss": 0.6059, + "odds_ratio_loss": 0.38221707940101624, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06347838789224625, + "rewards/margins": 0.0898088812828064, + "rewards/rejected": -0.15328726172447205, + "sft_loss": 0.6347838640213013, + "step": 1100 + }, + { + "epoch": 1.592190889370933, + "grad_norm": 2.3477841646172286, + "learning_rate": 6.851392812875236e-06, + "logits/chosen": 0.16941529512405396, + "logits/rejected": 0.1017862856388092, + "logps/chosen": -0.5886316299438477, + "logps/rejected": -1.5656369924545288, + "loss": 0.6633, + "odds_ratio_loss": 0.3969772756099701, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.058863162994384766, + "rewards/margins": 0.09770055115222931, + "rewards/rejected": -0.15656371414661407, + "sft_loss": 0.5886316299438477, + "step": 1101 + }, + { + "epoch": 1.5936370209689081, + "grad_norm": 2.4546950838412056, + "learning_rate": 6.84921374631835e-06, + "logits/chosen": 0.24961897730827332, + "logits/rejected": 0.03723708540201187, + "logps/chosen": -0.602834165096283, + "logps/rejected": -4.140806674957275, + "loss": 0.6096, + "odds_ratio_loss": 0.28800731897354126, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.060283418744802475, + "rewards/margins": 0.3537972569465637, + "rewards/rejected": -0.4140806794166565, + "sft_loss": 0.602834165096283, + "step": 1102 + }, + { + "epoch": 1.5950831525668836, + "grad_norm": 2.5069717615115414, + "learning_rate": 6.847032961959884e-06, + "logits/chosen": 0.15000823140144348, + "logits/rejected": 0.24704432487487793, + "logps/chosen": -0.5027948617935181, + "logps/rejected": -1.290198802947998, + "loss": 0.6305, + "odds_ratio_loss": 0.3084346652030945, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.050279490649700165, + "rewards/margins": 0.07874040305614471, + "rewards/rejected": -0.12901988625526428, + "sft_loss": 0.5027948617935181, + "step": 1103 + }, + { + "epoch": 1.596529284164859, + "grad_norm": 3.4065840087438204, + "learning_rate": 6.844850461114643e-06, + "logits/chosen": 0.22534215450286865, + "logits/rejected": 0.1937633752822876, + "logps/chosen": -0.44374150037765503, + "logps/rejected": -2.513273000717163, + "loss": 0.5895, + "odds_ratio_loss": 0.23521637916564941, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04437415301799774, + "rewards/margins": 0.20695313811302185, + "rewards/rejected": -0.2513273060321808, + "sft_loss": 0.44374150037765503, + "step": 1104 + }, + { + "epoch": 1.5979754157628343, + "grad_norm": 2.506183913454602, + "learning_rate": 6.842666245098462e-06, + "logits/chosen": 0.1503123641014099, + "logits/rejected": 0.10985986888408661, + "logps/chosen": -0.6794801950454712, + "logps/rejected": -2.6882190704345703, + "loss": 0.7458, + "odds_ratio_loss": 0.2798730134963989, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06794802099466324, + "rewards/margins": 0.20087388157844543, + "rewards/rejected": -0.26882192492485046, + "sft_loss": 0.6794801950454712, + "step": 1105 + }, + { + "epoch": 1.59942154736081, + "grad_norm": 3.2357897333744905, + "learning_rate": 6.840480315228214e-06, + "logits/chosen": 0.18373164534568787, + "logits/rejected": 0.07605935633182526, + "logps/chosen": -0.4748189449310303, + "logps/rejected": -3.4461898803710938, + "loss": 0.6415, + "odds_ratio_loss": 0.3127560615539551, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04748189449310303, + "rewards/margins": 0.2971371114253998, + "rewards/rejected": -0.3446190357208252, + "sft_loss": 0.4748189449310303, + "step": 1106 + }, + { + "epoch": 1.6008676789587852, + "grad_norm": 2.616072647630207, + "learning_rate": 6.838292672821806e-06, + "logits/chosen": 0.29636842012405396, + "logits/rejected": 0.11761577427387238, + "logps/chosen": -0.5896111726760864, + "logps/rejected": -3.0412216186523438, + "loss": 0.6376, + "odds_ratio_loss": 0.3512306213378906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05896111577749252, + "rewards/margins": 0.2451610416173935, + "rewards/rejected": -0.3041221499443054, + "sft_loss": 0.5896111726760864, + "step": 1107 + }, + { + "epoch": 1.6023138105567607, + "grad_norm": 2.846990331140939, + "learning_rate": 6.836103319198175e-06, + "logits/chosen": 0.20525681972503662, + "logits/rejected": 0.1730688512325287, + "logps/chosen": -0.68004310131073, + "logps/rejected": -1.7271138429641724, + "loss": 0.6933, + "odds_ratio_loss": 0.38755208253860474, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.068004310131073, + "rewards/margins": 0.10470708459615707, + "rewards/rejected": -0.17271138727664948, + "sft_loss": 0.68004310131073, + "step": 1108 + }, + { + "epoch": 1.6037599421547362, + "grad_norm": 2.5244611709953664, + "learning_rate": 6.833912255677289e-06, + "logits/chosen": 0.05130451172590256, + "logits/rejected": 0.19735708832740784, + "logps/chosen": -0.6317247152328491, + "logps/rejected": -2.590360164642334, + "loss": 0.6713, + "odds_ratio_loss": 0.25427278876304626, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06317247450351715, + "rewards/margins": 0.1958635449409485, + "rewards/rejected": -0.25903603434562683, + "sft_loss": 0.6317247152328491, + "step": 1109 + }, + { + "epoch": 1.6052060737527114, + "grad_norm": 2.5093017639783732, + "learning_rate": 6.8317194835801505e-06, + "logits/chosen": 0.16803748905658722, + "logits/rejected": 0.15267445147037506, + "logps/chosen": -0.4448273181915283, + "logps/rejected": -2.7201156616210938, + "loss": 0.5178, + "odds_ratio_loss": 0.2554229199886322, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04448273032903671, + "rewards/margins": 0.22752884030342102, + "rewards/rejected": -0.27201157808303833, + "sft_loss": 0.4448273181915283, + "step": 1110 + }, + { + "epoch": 1.6066522053506869, + "grad_norm": 2.686052793049035, + "learning_rate": 6.829525004228788e-06, + "logits/chosen": 0.0710282251238823, + "logits/rejected": 0.016714416444301605, + "logps/chosen": -0.4681667983531952, + "logps/rejected": -2.477705240249634, + "loss": 0.5899, + "odds_ratio_loss": 0.32242822647094727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04681668430566788, + "rewards/margins": 0.2009538859128952, + "rewards/rejected": -0.2477705478668213, + "sft_loss": 0.4681667983531952, + "step": 1111 + }, + { + "epoch": 1.6080983369486623, + "grad_norm": 2.9752745108948813, + "learning_rate": 6.827328818946263e-06, + "logits/chosen": 0.006719652563333511, + "logits/rejected": 0.06880474090576172, + "logps/chosen": -0.7231383919715881, + "logps/rejected": -2.544945478439331, + "loss": 0.6282, + "odds_ratio_loss": 0.22746002674102783, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0723138377070427, + "rewards/margins": 0.182180717587471, + "rewards/rejected": -0.2544945478439331, + "sft_loss": 0.7231383919715881, + "step": 1112 + }, + { + "epoch": 1.6095444685466378, + "grad_norm": 2.1692853490318122, + "learning_rate": 6.825130929056662e-06, + "logits/chosen": 0.14772772789001465, + "logits/rejected": 0.10956289619207382, + "logps/chosen": -0.7819063663482666, + "logps/rejected": -1.8836016654968262, + "loss": 0.7105, + "odds_ratio_loss": 0.3727669417858124, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0781906396150589, + "rewards/margins": 0.11016951501369476, + "rewards/rejected": -0.18836016952991486, + "sft_loss": 0.7819063663482666, + "step": 1113 + }, + { + "epoch": 1.6109906001446133, + "grad_norm": 2.489057678072517, + "learning_rate": 6.822931335885103e-06, + "logits/chosen": 0.1797943413257599, + "logits/rejected": 0.13633441925048828, + "logps/chosen": -0.5514078140258789, + "logps/rejected": -2.322840929031372, + "loss": 0.6057, + "odds_ratio_loss": 0.2773057520389557, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05514077842235565, + "rewards/margins": 0.17714330554008484, + "rewards/rejected": -0.2322840839624405, + "sft_loss": 0.5514078140258789, + "step": 1114 + }, + { + "epoch": 1.6124367317425885, + "grad_norm": 2.105562163406456, + "learning_rate": 6.820730040757728e-06, + "logits/chosen": 0.15017375349998474, + "logits/rejected": 0.1785978227853775, + "logps/chosen": -0.5378727316856384, + "logps/rejected": -2.191357135772705, + "loss": 0.6226, + "odds_ratio_loss": 0.2285824716091156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05378727242350578, + "rewards/margins": 0.16534847021102905, + "rewards/rejected": -0.21913574635982513, + "sft_loss": 0.5378727316856384, + "step": 1115 + }, + { + "epoch": 1.613882863340564, + "grad_norm": 2.3044143455434924, + "learning_rate": 6.818527045001705e-06, + "logits/chosen": 0.16752856969833374, + "logits/rejected": 0.16827546060085297, + "logps/chosen": -0.4314478933811188, + "logps/rejected": -2.3377957344055176, + "loss": 0.6104, + "odds_ratio_loss": 0.2084464430809021, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04314478486776352, + "rewards/margins": 0.19063478708267212, + "rewards/rejected": -0.23377957940101624, + "sft_loss": 0.4314478933811188, + "step": 1116 + }, + { + "epoch": 1.6153289949385394, + "grad_norm": 2.3550945594014574, + "learning_rate": 6.816322349945229e-06, + "logits/chosen": 0.14617860317230225, + "logits/rejected": 0.09834670275449753, + "logps/chosen": -0.5126814246177673, + "logps/rejected": -2.7283544540405273, + "loss": 0.5873, + "odds_ratio_loss": 0.17719094455242157, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05126814544200897, + "rewards/margins": 0.221567302942276, + "rewards/rejected": -0.2728354334831238, + "sft_loss": 0.5126814246177673, + "step": 1117 + }, + { + "epoch": 1.6167751265365147, + "grad_norm": 2.853329800579316, + "learning_rate": 6.81411595691752e-06, + "logits/chosen": 0.1595417559146881, + "logits/rejected": 0.09003555774688721, + "logps/chosen": -0.5111287832260132, + "logps/rejected": -4.566333293914795, + "loss": 0.5822, + "odds_ratio_loss": 0.19126391410827637, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05111287906765938, + "rewards/margins": 0.4055204391479492, + "rewards/rejected": -0.4566333293914795, + "sft_loss": 0.5111287832260132, + "step": 1118 + }, + { + "epoch": 1.6182212581344904, + "grad_norm": 4.368835734538797, + "learning_rate": 6.81190786724882e-06, + "logits/chosen": 0.13393574953079224, + "logits/rejected": 0.23001378774642944, + "logps/chosen": -0.532265305519104, + "logps/rejected": -2.221224308013916, + "loss": 0.5696, + "odds_ratio_loss": 0.27427342534065247, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0532265342772007, + "rewards/margins": 0.1688959002494812, + "rewards/rejected": -0.2221224159002304, + "sft_loss": 0.532265305519104, + "step": 1119 + }, + { + "epoch": 1.6196673897324656, + "grad_norm": 2.3014882949802646, + "learning_rate": 6.809698082270394e-06, + "logits/chosen": 0.08405912667512894, + "logits/rejected": 0.14118604362010956, + "logps/chosen": -0.742213249206543, + "logps/rejected": -2.146195888519287, + "loss": 0.6572, + "odds_ratio_loss": 0.441084086894989, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07422132790088654, + "rewards/margins": 0.14039826393127441, + "rewards/rejected": -0.21461959183216095, + "sft_loss": 0.742213249206543, + "step": 1120 + }, + { + "epoch": 1.621113521330441, + "grad_norm": 4.67097973693336, + "learning_rate": 6.80748660331453e-06, + "logits/chosen": 0.1618950515985489, + "logits/rejected": 0.035197075456380844, + "logps/chosen": -0.7067070603370667, + "logps/rejected": -1.7793760299682617, + "loss": 0.6408, + "odds_ratio_loss": 0.4611864686012268, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0706707015633583, + "rewards/margins": 0.10726691782474518, + "rewards/rejected": -0.17793762683868408, + "sft_loss": 0.7067070603370667, + "step": 1121 + }, + { + "epoch": 1.6225596529284165, + "grad_norm": 2.6910652991942787, + "learning_rate": 6.8052734317145356e-06, + "logits/chosen": 0.16007086634635925, + "logits/rejected": 0.24231232702732086, + "logps/chosen": -0.4963790774345398, + "logps/rejected": -3.519035816192627, + "loss": 0.6319, + "odds_ratio_loss": 0.2605344355106354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04963790625333786, + "rewards/margins": 0.3022657036781311, + "rewards/rejected": -0.35190361738204956, + "sft_loss": 0.4963790774345398, + "step": 1122 + }, + { + "epoch": 1.6240057845263918, + "grad_norm": 2.36256991290105, + "learning_rate": 6.803058568804742e-06, + "logits/chosen": 0.16033251583576202, + "logits/rejected": 0.12260282039642334, + "logps/chosen": -0.5150243639945984, + "logps/rejected": -2.330399513244629, + "loss": 0.4986, + "odds_ratio_loss": 0.34891653060913086, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05150243639945984, + "rewards/margins": 0.18153752386569977, + "rewards/rejected": -0.2330399751663208, + "sft_loss": 0.5150243639945984, + "step": 1123 + }, + { + "epoch": 1.6254519161243675, + "grad_norm": 2.621102035932219, + "learning_rate": 6.800842015920496e-06, + "logits/chosen": 0.13604751229286194, + "logits/rejected": 0.04528624191880226, + "logps/chosen": -0.5994237661361694, + "logps/rejected": -3.107229709625244, + "loss": 0.6421, + "odds_ratio_loss": 0.3047102689743042, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.059942372143268585, + "rewards/margins": 0.2507806122303009, + "rewards/rejected": -0.3107229471206665, + "sft_loss": 0.5994237661361694, + "step": 1124 + }, + { + "epoch": 1.6268980477223427, + "grad_norm": 3.3189950012529845, + "learning_rate": 6.798623774398169e-06, + "logits/chosen": 0.2003846913576126, + "logits/rejected": 0.11310932040214539, + "logps/chosen": -0.5863832831382751, + "logps/rejected": -1.5205748081207275, + "loss": 0.6413, + "odds_ratio_loss": 0.2823806405067444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.058638330549001694, + "rewards/margins": 0.0934191569685936, + "rewards/rejected": -0.1520574986934662, + "sft_loss": 0.5863832831382751, + "step": 1125 + }, + { + "epoch": 1.6283441793203182, + "grad_norm": 2.6471591529472693, + "learning_rate": 6.796403845575145e-06, + "logits/chosen": 0.2325344830751419, + "logits/rejected": 0.1505952775478363, + "logps/chosen": -0.40177321434020996, + "logps/rejected": -2.4784207344055176, + "loss": 0.6045, + "odds_ratio_loss": 0.22741401195526123, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.040177322924137115, + "rewards/margins": 0.20766472816467285, + "rewards/rejected": -0.24784202873706818, + "sft_loss": 0.40177321434020996, + "step": 1126 + }, + { + "epoch": 1.6297903109182936, + "grad_norm": 2.475300022674049, + "learning_rate": 6.794182230789827e-06, + "logits/chosen": 0.16944287717342377, + "logits/rejected": 0.20057857036590576, + "logps/chosen": -0.5337626338005066, + "logps/rejected": -1.8407460451126099, + "loss": 0.5948, + "odds_ratio_loss": 0.33305466175079346, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05337626859545708, + "rewards/margins": 0.1306983232498169, + "rewards/rejected": -0.18407458066940308, + "sft_loss": 0.5337626338005066, + "step": 1127 + }, + { + "epoch": 1.6312364425162689, + "grad_norm": 2.4279942191938293, + "learning_rate": 6.7919589313816355e-06, + "logits/chosen": 0.27863115072250366, + "logits/rejected": 0.20137888193130493, + "logps/chosen": -0.5784075260162354, + "logps/rejected": -2.008815288543701, + "loss": 0.5877, + "odds_ratio_loss": 0.3455614745616913, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05784075707197189, + "rewards/margins": 0.14304079115390778, + "rewards/rejected": -0.20088155567646027, + "sft_loss": 0.5784075260162354, + "step": 1128 + }, + { + "epoch": 1.6326825741142446, + "grad_norm": 2.486755988814461, + "learning_rate": 6.789733948691006e-06, + "logits/chosen": 0.12318692356348038, + "logits/rejected": 0.09866747260093689, + "logps/chosen": -0.6427417397499084, + "logps/rejected": -2.290992259979248, + "loss": 0.6232, + "odds_ratio_loss": 0.3308793306350708, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06427416950464249, + "rewards/margins": 0.16482506692409515, + "rewards/rejected": -0.22909922897815704, + "sft_loss": 0.6427417397499084, + "step": 1129 + }, + { + "epoch": 1.6341287057122198, + "grad_norm": 8.705640420104292, + "learning_rate": 6.787507284059388e-06, + "logits/chosen": 0.3377228379249573, + "logits/rejected": 0.33408480882644653, + "logps/chosen": -0.5667027235031128, + "logps/rejected": -1.6456166505813599, + "loss": 0.6168, + "odds_ratio_loss": 0.39584583044052124, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05667027086019516, + "rewards/margins": 0.10789139568805695, + "rewards/rejected": -0.1645616590976715, + "sft_loss": 0.5667027235031128, + "step": 1130 + }, + { + "epoch": 1.6355748373101953, + "grad_norm": 2.23403133866239, + "learning_rate": 6.785278938829248e-06, + "logits/chosen": 0.18635720014572144, + "logits/rejected": 0.12744563817977905, + "logps/chosen": -0.5257724523544312, + "logps/rejected": -3.5002336502075195, + "loss": 0.5913, + "odds_ratio_loss": 0.22851410508155823, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.052577245980501175, + "rewards/margins": 0.2974461317062378, + "rewards/rejected": -0.3500233590602875, + "sft_loss": 0.5257724523544312, + "step": 1131 + }, + { + "epoch": 1.6370209689081707, + "grad_norm": 2.2838844158071114, + "learning_rate": 6.7830489143440625e-06, + "logits/chosen": 0.24182364344596863, + "logits/rejected": 0.12117785215377808, + "logps/chosen": -0.5080547332763672, + "logps/rejected": -1.8937900066375732, + "loss": 0.6495, + "odds_ratio_loss": 0.305306077003479, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0508054718375206, + "rewards/margins": 0.1385735273361206, + "rewards/rejected": -0.1893789917230606, + "sft_loss": 0.5080547332763672, + "step": 1132 + }, + { + "epoch": 1.638467100506146, + "grad_norm": 2.492923349789528, + "learning_rate": 6.78081721194832e-06, + "logits/chosen": 0.12716682255268097, + "logits/rejected": 0.19468384981155396, + "logps/chosen": -0.5700826644897461, + "logps/rejected": -2.5006954669952393, + "loss": 0.7108, + "odds_ratio_loss": 0.24998816847801208, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05700826644897461, + "rewards/margins": 0.19306129217147827, + "rewards/rejected": -0.2500695586204529, + "sft_loss": 0.5700826644897461, + "step": 1133 + }, + { + "epoch": 1.6399132321041214, + "grad_norm": 2.473546132314206, + "learning_rate": 6.778583832987524e-06, + "logits/chosen": 0.21075762808322906, + "logits/rejected": 0.07315226644277573, + "logps/chosen": -0.5984592437744141, + "logps/rejected": -2.3020665645599365, + "loss": 0.6367, + "odds_ratio_loss": 0.3468858599662781, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.059845924377441406, + "rewards/margins": 0.1703607439994812, + "rewards/rejected": -0.2302066683769226, + "sft_loss": 0.5984592437744141, + "step": 1134 + }, + { + "epoch": 1.641359363702097, + "grad_norm": 3.815784113822362, + "learning_rate": 6.776348778808187e-06, + "logits/chosen": 0.2276933491230011, + "logits/rejected": 0.20840172469615936, + "logps/chosen": -0.4322483241558075, + "logps/rejected": -2.1296682357788086, + "loss": 0.5527, + "odds_ratio_loss": 0.18725760281085968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04322483763098717, + "rewards/margins": 0.16974200308322906, + "rewards/rejected": -0.21296685934066772, + "sft_loss": 0.4322483241558075, + "step": 1135 + }, + { + "epoch": 1.6428054953000724, + "grad_norm": 3.230042856607609, + "learning_rate": 6.774112050757831e-06, + "logits/chosen": 0.22059807181358337, + "logits/rejected": 0.19416289031505585, + "logps/chosen": -0.4906231760978699, + "logps/rejected": -2.541626214981079, + "loss": 0.6078, + "odds_ratio_loss": 0.32527580857276917, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04906231909990311, + "rewards/margins": 0.20510032773017883, + "rewards/rejected": -0.25416263937950134, + "sft_loss": 0.4906231760978699, + "step": 1136 + }, + { + "epoch": 1.6442516268980478, + "grad_norm": 2.507231602313924, + "learning_rate": 6.771873650184987e-06, + "logits/chosen": 0.2306595742702484, + "logits/rejected": 0.17716917395591736, + "logps/chosen": -0.6218141317367554, + "logps/rejected": -3.662498950958252, + "loss": 0.6342, + "odds_ratio_loss": 0.29755860567092896, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06218141317367554, + "rewards/margins": 0.30406850576400757, + "rewards/rejected": -0.3662499189376831, + "sft_loss": 0.6218141317367554, + "step": 1137 + }, + { + "epoch": 1.645697758496023, + "grad_norm": 2.577574144850942, + "learning_rate": 6.769633578439196e-06, + "logits/chosen": 0.22073982656002045, + "logits/rejected": 0.14277009665966034, + "logps/chosen": -0.49371635913848877, + "logps/rejected": -1.553049087524414, + "loss": 0.5629, + "odds_ratio_loss": 0.36073967814445496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.049371637403964996, + "rewards/margins": 0.105933278799057, + "rewards/rejected": -0.1553049087524414, + "sft_loss": 0.49371635913848877, + "step": 1138 + }, + { + "epoch": 1.6471438900939985, + "grad_norm": 2.5217495034916904, + "learning_rate": 6.767391836871006e-06, + "logits/chosen": 0.25824111700057983, + "logits/rejected": 0.2234843224287033, + "logps/chosen": -0.48958465456962585, + "logps/rejected": -2.2131435871124268, + "loss": 0.5043, + "odds_ratio_loss": 0.2931782007217407, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.048958465456962585, + "rewards/margins": 0.17235590517520905, + "rewards/rejected": -0.22131435573101044, + "sft_loss": 0.48958465456962585, + "step": 1139 + }, + { + "epoch": 1.648590021691974, + "grad_norm": 3.0287506272655373, + "learning_rate": 6.76514842683197e-06, + "logits/chosen": 0.08717759698629379, + "logits/rejected": 0.17295514047145844, + "logps/chosen": -0.5925130844116211, + "logps/rejected": -2.349668264389038, + "loss": 0.6824, + "odds_ratio_loss": 0.28350913524627686, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05925130844116211, + "rewards/margins": 0.17571553587913513, + "rewards/rejected": -0.23496684432029724, + "sft_loss": 0.5925130844116211, + "step": 1140 + }, + { + "epoch": 1.6500361532899492, + "grad_norm": 2.4412616506760427, + "learning_rate": 6.7629033496746485e-06, + "logits/chosen": 0.20098035037517548, + "logits/rejected": 0.20969319343566895, + "logps/chosen": -0.5476840138435364, + "logps/rejected": -2.703965663909912, + "loss": 0.6569, + "odds_ratio_loss": 0.23726245760917664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0547684021294117, + "rewards/margins": 0.21562819182872772, + "rewards/rejected": -0.2703965902328491, + "sft_loss": 0.5476840138435364, + "step": 1141 + }, + { + "epoch": 1.651482284887925, + "grad_norm": 3.32524625474085, + "learning_rate": 6.760656606752608e-06, + "logits/chosen": 0.19986768066883087, + "logits/rejected": 0.2439718395471573, + "logps/chosen": -0.5939256548881531, + "logps/rejected": -3.133114814758301, + "loss": 0.6345, + "odds_ratio_loss": 0.3041260540485382, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05939256399869919, + "rewards/margins": 0.25391891598701477, + "rewards/rejected": -0.31331151723861694, + "sft_loss": 0.5939256548881531, + "step": 1142 + }, + { + "epoch": 1.6529284164859002, + "grad_norm": 2.9130226961401613, + "learning_rate": 6.758408199420418e-06, + "logits/chosen": 0.1656503677368164, + "logits/rejected": 0.16937926411628723, + "logps/chosen": -0.4139644503593445, + "logps/rejected": -2.241116762161255, + "loss": 0.5212, + "odds_ratio_loss": 0.20857828855514526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04139644652605057, + "rewards/margins": 0.18271522223949432, + "rewards/rejected": -0.2241116613149643, + "sft_loss": 0.4139644503593445, + "step": 1143 + }, + { + "epoch": 1.6543745480838756, + "grad_norm": 3.058127435624459, + "learning_rate": 6.75615812903365e-06, + "logits/chosen": 0.23680804669857025, + "logits/rejected": 0.1664726287126541, + "logps/chosen": -0.6967654228210449, + "logps/rejected": -1.6403802633285522, + "loss": 0.571, + "odds_ratio_loss": 0.4093691408634186, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06967654079198837, + "rewards/margins": 0.09436149895191193, + "rewards/rejected": -0.1640380322933197, + "sft_loss": 0.6967654228210449, + "step": 1144 + }, + { + "epoch": 1.655820679681851, + "grad_norm": 2.1730215190277935, + "learning_rate": 6.7539063969488825e-06, + "logits/chosen": 0.18492895364761353, + "logits/rejected": 0.26141905784606934, + "logps/chosen": -0.5463985800743103, + "logps/rejected": -1.4454739093780518, + "loss": 0.6343, + "odds_ratio_loss": 0.34149205684661865, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05463985726237297, + "rewards/margins": 0.08990754932165146, + "rewards/rejected": -0.14454740285873413, + "sft_loss": 0.5463985800743103, + "step": 1145 + }, + { + "epoch": 1.6572668112798263, + "grad_norm": 3.374659455042283, + "learning_rate": 6.75165300452369e-06, + "logits/chosen": 0.11994520574808121, + "logits/rejected": 0.09264302998781204, + "logps/chosen": -0.6896888613700867, + "logps/rejected": -2.469841480255127, + "loss": 0.5971, + "odds_ratio_loss": 0.4484861493110657, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06896888464689255, + "rewards/margins": 0.17801526188850403, + "rewards/rejected": -0.24698415398597717, + "sft_loss": 0.6896888613700867, + "step": 1146 + }, + { + "epoch": 1.658712942877802, + "grad_norm": 2.7494237048983106, + "learning_rate": 6.749397953116654e-06, + "logits/chosen": 0.21989181637763977, + "logits/rejected": 0.13300833106040955, + "logps/chosen": -0.5863887071609497, + "logps/rejected": -2.8034584522247314, + "loss": 0.5857, + "odds_ratio_loss": 0.419039785861969, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05863887071609497, + "rewards/margins": 0.22170698642730713, + "rewards/rejected": -0.2803458273410797, + "sft_loss": 0.5863887071609497, + "step": 1147 + }, + { + "epoch": 1.6601590744757773, + "grad_norm": 2.464174573411112, + "learning_rate": 6.747141244087352e-06, + "logits/chosen": 0.2038121223449707, + "logits/rejected": 0.08812229335308075, + "logps/chosen": -0.5297778844833374, + "logps/rejected": -2.8625032901763916, + "loss": 0.6452, + "odds_ratio_loss": 0.19093012809753418, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0529777929186821, + "rewards/margins": 0.23327253758907318, + "rewards/rejected": -0.2862503230571747, + "sft_loss": 0.5297778844833374, + "step": 1148 + }, + { + "epoch": 1.6616052060737527, + "grad_norm": 2.275862629901525, + "learning_rate": 6.744882878796362e-06, + "logits/chosen": 0.2904704511165619, + "logits/rejected": 0.29210197925567627, + "logps/chosen": -0.47496497631073, + "logps/rejected": -2.142214298248291, + "loss": 0.5979, + "odds_ratio_loss": 0.3119732737541199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.047496501356363297, + "rewards/margins": 0.16672493517398834, + "rewards/rejected": -0.21422143280506134, + "sft_loss": 0.47496497631073, + "step": 1149 + }, + { + "epoch": 1.6630513376717282, + "grad_norm": 2.4348073492732447, + "learning_rate": 6.742622858605262e-06, + "logits/chosen": 0.06364156305789948, + "logits/rejected": 0.03990757837891579, + "logps/chosen": -0.48284998536109924, + "logps/rejected": -4.278642654418945, + "loss": 0.6012, + "odds_ratio_loss": 0.14767947793006897, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.048285000026226044, + "rewards/margins": 0.37957924604415894, + "rewards/rejected": -0.42786428332328796, + "sft_loss": 0.48284998536109924, + "step": 1150 + }, + { + "epoch": 1.6644974692697034, + "grad_norm": 2.5895585584395073, + "learning_rate": 6.740361184876625e-06, + "logits/chosen": 0.1997375339269638, + "logits/rejected": 0.17401760816574097, + "logps/chosen": -0.4806000888347626, + "logps/rejected": -1.8395509719848633, + "loss": 0.5594, + "odds_ratio_loss": 0.27543121576309204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.048060011118650436, + "rewards/margins": 0.13589510321617126, + "rewards/rejected": -0.1839551031589508, + "sft_loss": 0.4806000888347626, + "step": 1151 + }, + { + "epoch": 1.6659436008676791, + "grad_norm": 4.701884489199648, + "learning_rate": 6.738097858974024e-06, + "logits/chosen": 0.19600707292556763, + "logits/rejected": 0.16965457797050476, + "logps/chosen": -0.6757255792617798, + "logps/rejected": -1.1500645875930786, + "loss": 0.6207, + "odds_ratio_loss": 0.6113768815994263, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06757256388664246, + "rewards/margins": 0.04743390157818794, + "rewards/rejected": -0.1150064542889595, + "sft_loss": 0.6757255792617798, + "step": 1152 + }, + { + "epoch": 1.6673897324656544, + "grad_norm": 3.175165956336856, + "learning_rate": 6.735832882262026e-06, + "logits/chosen": 0.08555848896503448, + "logits/rejected": 0.10591746121644974, + "logps/chosen": -0.4486284852027893, + "logps/rejected": -2.222221612930298, + "loss": 0.5803, + "odds_ratio_loss": 0.1979960799217224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04486284777522087, + "rewards/margins": 0.17735931277275085, + "rewards/rejected": -0.22222216427326202, + "sft_loss": 0.4486284852027893, + "step": 1153 + }, + { + "epoch": 1.6688358640636298, + "grad_norm": 2.720757064167482, + "learning_rate": 6.733566256106193e-06, + "logits/chosen": 0.4870707392692566, + "logits/rejected": 0.2844332158565521, + "logps/chosen": -0.6252628564834595, + "logps/rejected": -2.486476421356201, + "loss": 0.6303, + "odds_ratio_loss": 0.3258809447288513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06252628564834595, + "rewards/margins": 0.1861213743686676, + "rewards/rejected": -0.24864766001701355, + "sft_loss": 0.6252628564834595, + "step": 1154 + }, + { + "epoch": 1.6702819956616053, + "grad_norm": 5.187324442141512, + "learning_rate": 6.731297981873086e-06, + "logits/chosen": 0.1666257381439209, + "logits/rejected": 0.1535015106201172, + "logps/chosen": -0.6065875887870789, + "logps/rejected": -2.051072359085083, + "loss": 0.6106, + "odds_ratio_loss": 0.43608808517456055, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.060658760368824005, + "rewards/margins": 0.14444848895072937, + "rewards/rejected": -0.20510724186897278, + "sft_loss": 0.6065875887870789, + "step": 1155 + }, + { + "epoch": 1.6717281272595805, + "grad_norm": 2.1684485335228354, + "learning_rate": 6.729028060930251e-06, + "logits/chosen": 0.358548104763031, + "logits/rejected": 0.1484028548002243, + "logps/chosen": -0.485321581363678, + "logps/rejected": -3.6741294860839844, + "loss": 0.589, + "odds_ratio_loss": 0.25323089957237244, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0485321581363678, + "rewards/margins": 0.3188807964324951, + "rewards/rejected": -0.3674129545688629, + "sft_loss": 0.485321581363678, + "step": 1156 + }, + { + "epoch": 1.673174258857556, + "grad_norm": 2.46870459613307, + "learning_rate": 6.726756494646235e-06, + "logits/chosen": 0.16199025511741638, + "logits/rejected": 0.1509033590555191, + "logps/chosen": -0.6030094027519226, + "logps/rejected": -1.4246817827224731, + "loss": 0.5609, + "odds_ratio_loss": 0.36534902453422546, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06030093878507614, + "rewards/margins": 0.08216723799705505, + "rewards/rejected": -0.1424681842327118, + "sft_loss": 0.6030094027519226, + "step": 1157 + }, + { + "epoch": 1.6746203904555315, + "grad_norm": 2.4726592245004633, + "learning_rate": 6.7244832843905725e-06, + "logits/chosen": 0.17424005270004272, + "logits/rejected": 0.15898647904396057, + "logps/chosen": -0.5712554454803467, + "logps/rejected": -2.1054673194885254, + "loss": 0.5444, + "odds_ratio_loss": 0.21559767425060272, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05712554603815079, + "rewards/margins": 0.15342120826244354, + "rewards/rejected": -0.21054676175117493, + "sft_loss": 0.5712554454803467, + "step": 1158 + }, + { + "epoch": 1.676066522053507, + "grad_norm": 3.5416468577213345, + "learning_rate": 6.72220843153379e-06, + "logits/chosen": 0.2255643904209137, + "logits/rejected": 0.13521726429462433, + "logps/chosen": -0.5240362882614136, + "logps/rejected": -2.168708324432373, + "loss": 0.6036, + "odds_ratio_loss": 0.3438939154148102, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.052403632551431656, + "rewards/margins": 0.16446718573570251, + "rewards/rejected": -0.21687081456184387, + "sft_loss": 0.5240362882614136, + "step": 1159 + }, + { + "epoch": 1.6775126536514824, + "grad_norm": 3.254244381492882, + "learning_rate": 6.719931937447407e-06, + "logits/chosen": 0.3357830047607422, + "logits/rejected": 0.259264200925827, + "logps/chosen": -0.542752206325531, + "logps/rejected": -3.2315144538879395, + "loss": 0.707, + "odds_ratio_loss": 0.31859856843948364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05427522212266922, + "rewards/margins": 0.26887619495391846, + "rewards/rejected": -0.3231514096260071, + "sft_loss": 0.542752206325531, + "step": 1160 + }, + { + "epoch": 1.6789587852494576, + "grad_norm": 5.511096848650093, + "learning_rate": 6.717653803503928e-06, + "logits/chosen": 0.22886249423027039, + "logits/rejected": 0.11704693734645844, + "logps/chosen": -0.5869529843330383, + "logps/rejected": -2.572026252746582, + "loss": 0.5864, + "odds_ratio_loss": 0.27222052216529846, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05869529768824577, + "rewards/margins": 0.19850732386112213, + "rewards/rejected": -0.2572026252746582, + "sft_loss": 0.5869529843330383, + "step": 1161 + }, + { + "epoch": 1.680404916847433, + "grad_norm": 2.5036032853106014, + "learning_rate": 6.71537403107685e-06, + "logits/chosen": 0.15184980630874634, + "logits/rejected": 0.1305277943611145, + "logps/chosen": -0.5590934157371521, + "logps/rejected": -2.7704367637634277, + "loss": 0.5799, + "odds_ratio_loss": 0.2579389810562134, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05590933561325073, + "rewards/margins": 0.22113433480262756, + "rewards/rejected": -0.2770436704158783, + "sft_loss": 0.5590934157371521, + "step": 1162 + }, + { + "epoch": 1.6818510484454086, + "grad_norm": 2.6005042356521564, + "learning_rate": 6.713092621540655e-06, + "logits/chosen": 0.28715190291404724, + "logits/rejected": 0.11853550374507904, + "logps/chosen": -0.7113485932350159, + "logps/rejected": -2.0638599395751953, + "loss": 0.6124, + "odds_ratio_loss": 0.4370325803756714, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07113486528396606, + "rewards/margins": 0.13525114953517914, + "rewards/rejected": -0.206385999917984, + "sft_loss": 0.7113485932350159, + "step": 1163 + }, + { + "epoch": 1.6832971800433838, + "grad_norm": 2.3848381421994724, + "learning_rate": 6.7108095762708136e-06, + "logits/chosen": 0.1905509978532791, + "logits/rejected": 0.1692703366279602, + "logps/chosen": -0.550454318523407, + "logps/rejected": -4.018769264221191, + "loss": 0.5935, + "odds_ratio_loss": 0.2899460792541504, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.055045437067747116, + "rewards/margins": 0.3468315005302429, + "rewards/rejected": -0.40187692642211914, + "sft_loss": 0.550454318523407, + "step": 1164 + }, + { + "epoch": 1.6847433116413595, + "grad_norm": 2.7571571011622105, + "learning_rate": 6.708524896643782e-06, + "logits/chosen": 0.188361257314682, + "logits/rejected": 0.0712515264749527, + "logps/chosen": -0.5353450775146484, + "logps/rejected": -2.714508533477783, + "loss": 0.634, + "odds_ratio_loss": 0.28433114290237427, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.053534507751464844, + "rewards/margins": 0.2179163247346878, + "rewards/rejected": -0.27145081758499146, + "sft_loss": 0.5353450775146484, + "step": 1165 + }, + { + "epoch": 1.6861894432393347, + "grad_norm": 2.3320851121282438, + "learning_rate": 6.706238584037003e-06, + "logits/chosen": 0.15112724900245667, + "logits/rejected": 0.12381379306316376, + "logps/chosen": -0.4415856897830963, + "logps/rejected": -3.0355641841888428, + "loss": 0.5931, + "odds_ratio_loss": 0.18794691562652588, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04415857046842575, + "rewards/margins": 0.25939783453941345, + "rewards/rejected": -0.3035564124584198, + "sft_loss": 0.4415856897830963, + "step": 1166 + }, + { + "epoch": 1.6876355748373102, + "grad_norm": 2.506114425073133, + "learning_rate": 6.703950639828903e-06, + "logits/chosen": 0.2930244207382202, + "logits/rejected": 0.28212878108024597, + "logps/chosen": -0.5601966977119446, + "logps/rejected": -2.481189489364624, + "loss": 0.627, + "odds_ratio_loss": 0.3866164982318878, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05601967126131058, + "rewards/margins": 0.19209925830364227, + "rewards/rejected": -0.24811893701553345, + "sft_loss": 0.5601966977119446, + "step": 1167 + }, + { + "epoch": 1.6890817064352857, + "grad_norm": 2.3644437186432645, + "learning_rate": 6.701661065398892e-06, + "logits/chosen": 0.17681562900543213, + "logits/rejected": 0.14664041996002197, + "logps/chosen": -0.5699477195739746, + "logps/rejected": -1.8158708810806274, + "loss": 0.6535, + "odds_ratio_loss": 0.3121950030326843, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05699477344751358, + "rewards/margins": 0.12459231168031693, + "rewards/rejected": -0.1815870851278305, + "sft_loss": 0.5699477195739746, + "step": 1168 + }, + { + "epoch": 1.690527838033261, + "grad_norm": 4.165250675002018, + "learning_rate": 6.699369862127362e-06, + "logits/chosen": 0.26053327322006226, + "logits/rejected": 0.1782192438840866, + "logps/chosen": -0.5276187658309937, + "logps/rejected": -2.0793633460998535, + "loss": 0.4889, + "odds_ratio_loss": 0.39804819226264954, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.05276188254356384, + "rewards/margins": 0.15517446398735046, + "rewards/rejected": -0.2079363316297531, + "sft_loss": 0.5276187658309937, + "step": 1169 + }, + { + "epoch": 1.6919739696312366, + "grad_norm": 2.7717506547854094, + "learning_rate": 6.69707703139569e-06, + "logits/chosen": 0.3379908800125122, + "logits/rejected": 0.1853041648864746, + "logps/chosen": -0.6339027881622314, + "logps/rejected": -2.206711530685425, + "loss": 0.6035, + "odds_ratio_loss": 0.3447216749191284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06339027732610703, + "rewards/margins": 0.15728086233139038, + "rewards/rejected": -0.220671147108078, + "sft_loss": 0.6339027881622314, + "step": 1170 + }, + { + "epoch": 1.6934201012292118, + "grad_norm": 2.876125870078734, + "learning_rate": 6.694782574586229e-06, + "logits/chosen": 0.14711743593215942, + "logits/rejected": 0.1626882553100586, + "logps/chosen": -0.43197083473205566, + "logps/rejected": -3.0947062969207764, + "loss": 0.6135, + "odds_ratio_loss": 0.23374246060848236, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04319708049297333, + "rewards/margins": 0.266273558139801, + "rewards/rejected": -0.30947065353393555, + "sft_loss": 0.43197083473205566, + "step": 1171 + }, + { + "epoch": 1.6948662328271873, + "grad_norm": 3.1337461461664975, + "learning_rate": 6.692486493082317e-06, + "logits/chosen": 0.19641858339309692, + "logits/rejected": 0.11144060641527176, + "logps/chosen": -0.6843913793563843, + "logps/rejected": -2.083221435546875, + "loss": 0.6497, + "odds_ratio_loss": 0.38456079363822937, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06843913346529007, + "rewards/margins": 0.13988301157951355, + "rewards/rejected": -0.20832215249538422, + "sft_loss": 0.6843913793563843, + "step": 1172 + }, + { + "epoch": 1.6963123644251628, + "grad_norm": 3.552236706747717, + "learning_rate": 6.690188788268273e-06, + "logits/chosen": 0.21632656455039978, + "logits/rejected": 0.115239217877388, + "logps/chosen": -0.6581906676292419, + "logps/rejected": -1.6244016885757446, + "loss": 0.66, + "odds_ratio_loss": 0.46846601366996765, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06581906229257584, + "rewards/margins": 0.09662109613418579, + "rewards/rejected": -0.16244015097618103, + "sft_loss": 0.6581906676292419, + "step": 1173 + }, + { + "epoch": 1.697758496023138, + "grad_norm": 2.557436742334174, + "learning_rate": 6.687889461529386e-06, + "logits/chosen": 0.2814568877220154, + "logits/rejected": 0.18137472867965698, + "logps/chosen": -0.5548322200775146, + "logps/rejected": -2.5669074058532715, + "loss": 0.6203, + "odds_ratio_loss": 0.26014259457588196, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.055483222007751465, + "rewards/margins": 0.20120754837989807, + "rewards/rejected": -0.25669077038764954, + "sft_loss": 0.5548322200775146, + "step": 1174 + }, + { + "epoch": 1.6992046276211137, + "grad_norm": 3.737744307891954, + "learning_rate": 6.685588514251934e-06, + "logits/chosen": 0.1883585900068283, + "logits/rejected": 0.11404610425233841, + "logps/chosen": -0.5273705124855042, + "logps/rejected": -2.0571155548095703, + "loss": 0.6086, + "odds_ratio_loss": 0.21898625791072845, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.052737053483724594, + "rewards/margins": 0.15297451615333557, + "rewards/rejected": -0.20571157336235046, + "sft_loss": 0.5273705124855042, + "step": 1175 + }, + { + "epoch": 1.700650759219089, + "grad_norm": 2.514643381538017, + "learning_rate": 6.6832859478231635e-06, + "logits/chosen": 0.1959303468465805, + "logits/rejected": 0.08744394034147263, + "logps/chosen": -0.5547116994857788, + "logps/rejected": -2.5926737785339355, + "loss": 0.6747, + "odds_ratio_loss": 0.31549614667892456, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05547117441892624, + "rewards/margins": 0.20379620790481567, + "rewards/rejected": -0.2592673897743225, + "sft_loss": 0.5547116994857788, + "step": 1176 + }, + { + "epoch": 1.7020968908170644, + "grad_norm": 2.6698203453703973, + "learning_rate": 6.680981763631303e-06, + "logits/chosen": 0.12705998122692108, + "logits/rejected": 0.08648968487977982, + "logps/chosen": -0.5197083353996277, + "logps/rejected": -3.4079229831695557, + "loss": 0.5662, + "odds_ratio_loss": 0.25399935245513916, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.051970839500427246, + "rewards/margins": 0.2888214588165283, + "rewards/rejected": -0.34079229831695557, + "sft_loss": 0.5197083353996277, + "step": 1177 + }, + { + "epoch": 1.7035430224150399, + "grad_norm": 4.278657923028944, + "learning_rate": 6.6786759630655505e-06, + "logits/chosen": -0.007065432146191597, + "logits/rejected": 0.04697701707482338, + "logps/chosen": -0.5839925408363342, + "logps/rejected": -2.6068286895751953, + "loss": 0.6194, + "odds_ratio_loss": 0.24275022745132446, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0583992525935173, + "rewards/margins": 0.20228362083435059, + "rewards/rejected": -0.2606828808784485, + "sft_loss": 0.5839925408363342, + "step": 1178 + }, + { + "epoch": 1.704989154013015, + "grad_norm": 2.3362879167020836, + "learning_rate": 6.676368547516084e-06, + "logits/chosen": 0.22390501201152802, + "logits/rejected": 0.16717907786369324, + "logps/chosen": -0.5782084465026855, + "logps/rejected": -2.7273147106170654, + "loss": 0.6032, + "odds_ratio_loss": 0.24216079711914062, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05782084912061691, + "rewards/margins": 0.214910626411438, + "rewards/rejected": -0.2727314829826355, + "sft_loss": 0.5782084465026855, + "step": 1179 + }, + { + "epoch": 1.7064352856109906, + "grad_norm": 3.2234699582467075, + "learning_rate": 6.674059518374052e-06, + "logits/chosen": 0.13133449852466583, + "logits/rejected": 0.12285110354423523, + "logps/chosen": -0.5841748118400574, + "logps/rejected": -1.5109751224517822, + "loss": 0.6516, + "odds_ratio_loss": 0.34580686688423157, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.058417484164237976, + "rewards/margins": 0.09268002957105637, + "rewards/rejected": -0.15109750628471375, + "sft_loss": 0.5841748118400574, + "step": 1180 + }, + { + "epoch": 1.707881417208966, + "grad_norm": 3.8578564678205725, + "learning_rate": 6.671748877031577e-06, + "logits/chosen": 0.17093250155448914, + "logits/rejected": 0.1525716334581375, + "logps/chosen": -0.7296780347824097, + "logps/rejected": -3.0437965393066406, + "loss": 0.6778, + "odds_ratio_loss": 0.31801509857177734, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07296780496835709, + "rewards/margins": 0.23141184449195862, + "rewards/rejected": -0.3043796420097351, + "sft_loss": 0.7296780347824097, + "step": 1181 + }, + { + "epoch": 1.7093275488069413, + "grad_norm": 2.8550366156495226, + "learning_rate": 6.6694366248817544e-06, + "logits/chosen": 0.07962992042303085, + "logits/rejected": 0.10816764831542969, + "logps/chosen": -0.5177446007728577, + "logps/rejected": -3.558342456817627, + "loss": 0.6032, + "odds_ratio_loss": 0.2068100869655609, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.051774464547634125, + "rewards/margins": 0.304059773683548, + "rewards/rejected": -0.3558342158794403, + "sft_loss": 0.5177446007728577, + "step": 1182 + }, + { + "epoch": 1.710773680404917, + "grad_norm": 2.7311591788385896, + "learning_rate": 6.667122763318648e-06, + "logits/chosen": 0.23495501279830933, + "logits/rejected": 0.21480917930603027, + "logps/chosen": -0.5341576337814331, + "logps/rejected": -2.850303888320923, + "loss": 0.6349, + "odds_ratio_loss": 0.2973088324069977, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05341576412320137, + "rewards/margins": 0.2316146194934845, + "rewards/rejected": -0.2850303649902344, + "sft_loss": 0.5341576337814331, + "step": 1183 + }, + { + "epoch": 1.7122198120028922, + "grad_norm": 2.3366324777160488, + "learning_rate": 6.664807293737293e-06, + "logits/chosen": 0.22794067859649658, + "logits/rejected": 0.20764786005020142, + "logps/chosen": -0.5206387639045715, + "logps/rejected": -2.451005458831787, + "loss": 0.6059, + "odds_ratio_loss": 0.226045623421669, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05206387862563133, + "rewards/margins": 0.19303664565086365, + "rewards/rejected": -0.24510052800178528, + "sft_loss": 0.5206387639045715, + "step": 1184 + }, + { + "epoch": 1.7136659436008677, + "grad_norm": 2.9344919439096446, + "learning_rate": 6.662490217533697e-06, + "logits/chosen": 0.18199431896209717, + "logits/rejected": 0.08673392236232758, + "logps/chosen": -0.6532351970672607, + "logps/rejected": -3.143551826477051, + "loss": 0.6625, + "odds_ratio_loss": 0.4036434292793274, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06532351672649384, + "rewards/margins": 0.2490316778421402, + "rewards/rejected": -0.31435519456863403, + "sft_loss": 0.6532351970672607, + "step": 1185 + }, + { + "epoch": 1.7151120751988431, + "grad_norm": 3.033539871570757, + "learning_rate": 6.660171536104833e-06, + "logits/chosen": 0.13163414597511292, + "logits/rejected": 0.13655593991279602, + "logps/chosen": -0.6226808428764343, + "logps/rejected": -3.9001150131225586, + "loss": 0.5912, + "odds_ratio_loss": 0.14715641736984253, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06226808577775955, + "rewards/margins": 0.32774341106414795, + "rewards/rejected": -0.3900114893913269, + "sft_loss": 0.6226808428764343, + "step": 1186 + }, + { + "epoch": 1.7165582067968184, + "grad_norm": 2.935200053719907, + "learning_rate": 6.6578512508486425e-06, + "logits/chosen": 0.17753368616104126, + "logits/rejected": 0.0922786295413971, + "logps/chosen": -0.5860269069671631, + "logps/rejected": -1.86979341506958, + "loss": 0.5886, + "odds_ratio_loss": 0.35702764987945557, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05860269442200661, + "rewards/margins": 0.12837664783000946, + "rewards/rejected": -0.18697935342788696, + "sft_loss": 0.5860269069671631, + "step": 1187 + }, + { + "epoch": 1.718004338394794, + "grad_norm": 2.983899984553299, + "learning_rate": 6.655529363164033e-06, + "logits/chosen": 0.236445814371109, + "logits/rejected": 0.2024727165699005, + "logps/chosen": -0.5069154500961304, + "logps/rejected": -2.2717783451080322, + "loss": 0.7185, + "odds_ratio_loss": 0.2174503207206726, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.050691548734903336, + "rewards/margins": 0.1764862984418869, + "rewards/rejected": -0.22717782855033875, + "sft_loss": 0.5069154500961304, + "step": 1188 + }, + { + "epoch": 1.7194504699927693, + "grad_norm": 3.5801911999906015, + "learning_rate": 6.653205874450881e-06, + "logits/chosen": 0.2863970398902893, + "logits/rejected": 0.28775012493133545, + "logps/chosen": -0.3487379848957062, + "logps/rejected": -2.2851011753082275, + "loss": 0.5455, + "odds_ratio_loss": 0.23209895193576813, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03487379476428032, + "rewards/margins": 0.19363632798194885, + "rewards/rejected": -0.22851011157035828, + "sft_loss": 0.3487379848957062, + "step": 1189 + }, + { + "epoch": 1.7208966015907448, + "grad_norm": 2.235271967911421, + "learning_rate": 6.650880786110026e-06, + "logits/chosen": 0.13730399310588837, + "logits/rejected": 0.10951994359493256, + "logps/chosen": -0.6758534908294678, + "logps/rejected": -2.470485210418701, + "loss": 0.6652, + "odds_ratio_loss": 0.3634181320667267, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06758534908294678, + "rewards/margins": 0.17946317791938782, + "rewards/rejected": -0.2470485270023346, + "sft_loss": 0.6758534908294678, + "step": 1190 + }, + { + "epoch": 1.7223427331887202, + "grad_norm": 2.135760086462582, + "learning_rate": 6.6485540995432715e-06, + "logits/chosen": 0.20422452688217163, + "logits/rejected": 0.14295119047164917, + "logps/chosen": -0.6017346978187561, + "logps/rejected": -1.865221619606018, + "loss": 0.5539, + "odds_ratio_loss": 0.22437524795532227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06017346307635307, + "rewards/margins": 0.12634870409965515, + "rewards/rejected": -0.18652215600013733, + "sft_loss": 0.6017346978187561, + "step": 1191 + }, + { + "epoch": 1.7237888647866955, + "grad_norm": 2.445534213047608, + "learning_rate": 6.6462258161533854e-06, + "logits/chosen": 0.18580156564712524, + "logits/rejected": 0.10908015072345734, + "logps/chosen": -0.5201189517974854, + "logps/rejected": -2.300976276397705, + "loss": 0.5537, + "odds_ratio_loss": 0.29101625084877014, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05201189965009689, + "rewards/margins": 0.17808572947978973, + "rewards/rejected": -0.23009763658046722, + "sft_loss": 0.5201189517974854, + "step": 1192 + }, + { + "epoch": 1.7252349963846711, + "grad_norm": 2.7089892319390536, + "learning_rate": 6.6438959373440995e-06, + "logits/chosen": 0.10451790690422058, + "logits/rejected": 0.152940571308136, + "logps/chosen": -0.445254385471344, + "logps/rejected": -2.340458393096924, + "loss": 0.5893, + "odds_ratio_loss": 0.17854472994804382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04452543705701828, + "rewards/margins": 0.18952038884162903, + "rewards/rejected": -0.2340458333492279, + "sft_loss": 0.445254385471344, + "step": 1193 + }, + { + "epoch": 1.7266811279826464, + "grad_norm": 3.341107839803183, + "learning_rate": 6.641564464520107e-06, + "logits/chosen": 0.08841860294342041, + "logits/rejected": 0.07997505366802216, + "logps/chosen": -0.9160062074661255, + "logps/rejected": -2.7940661907196045, + "loss": 0.6897, + "odds_ratio_loss": 0.35329318046569824, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.09160062670707703, + "rewards/margins": 0.18780598044395447, + "rewards/rejected": -0.2794066369533539, + "sft_loss": 0.9160062074661255, + "step": 1194 + }, + { + "epoch": 1.7281272595806219, + "grad_norm": 2.9141882593293986, + "learning_rate": 6.6392313990870606e-06, + "logits/chosen": 0.058398887515068054, + "logits/rejected": 0.1468515694141388, + "logps/chosen": -0.6274632215499878, + "logps/rejected": -1.6033984422683716, + "loss": 0.6933, + "odds_ratio_loss": 0.3369959592819214, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0627463236451149, + "rewards/margins": 0.0975935310125351, + "rewards/rejected": -0.1603398323059082, + "sft_loss": 0.6274632215499878, + "step": 1195 + }, + { + "epoch": 1.7295733911785973, + "grad_norm": 3.9876324584086067, + "learning_rate": 6.636896742451573e-06, + "logits/chosen": 0.139909565448761, + "logits/rejected": 0.08156219869852066, + "logps/chosen": -0.6069450378417969, + "logps/rejected": -2.3972322940826416, + "loss": 0.6371, + "odds_ratio_loss": 0.25904297828674316, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06069450080394745, + "rewards/margins": 0.1790287345647812, + "rewards/rejected": -0.23972323536872864, + "sft_loss": 0.6069450378417969, + "step": 1196 + }, + { + "epoch": 1.7310195227765726, + "grad_norm": 3.4932750748545254, + "learning_rate": 6.634560496021219e-06, + "logits/chosen": 0.17661643028259277, + "logits/rejected": 0.10911494493484497, + "logps/chosen": -0.524027943611145, + "logps/rejected": -1.6556886434555054, + "loss": 0.6024, + "odds_ratio_loss": 0.2998625636100769, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0524027943611145, + "rewards/margins": 0.11316606402397156, + "rewards/rejected": -0.16556887328624725, + "sft_loss": 0.524027943611145, + "step": 1197 + }, + { + "epoch": 1.7324656543745482, + "grad_norm": 2.3743801625118564, + "learning_rate": 6.632222661204529e-06, + "logits/chosen": 0.11895874887704849, + "logits/rejected": 0.021969705820083618, + "logps/chosen": -0.45106062293052673, + "logps/rejected": -2.416294574737549, + "loss": 0.5822, + "odds_ratio_loss": 0.26725825667381287, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04510606452822685, + "rewards/margins": 0.19652342796325684, + "rewards/rejected": -0.24162951111793518, + "sft_loss": 0.45106062293052673, + "step": 1198 + }, + { + "epoch": 1.7339117859725235, + "grad_norm": 4.233318357941509, + "learning_rate": 6.629883239410995e-06, + "logits/chosen": 0.3369359076023102, + "logits/rejected": 0.1930331587791443, + "logps/chosen": -0.5083121061325073, + "logps/rejected": -2.8376667499542236, + "loss": 0.6137, + "odds_ratio_loss": 0.2584281861782074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05083121359348297, + "rewards/margins": 0.23293545842170715, + "rewards/rejected": -0.28376662731170654, + "sft_loss": 0.5083121061325073, + "step": 1199 + }, + { + "epoch": 1.735357917570499, + "grad_norm": 3.619796068905745, + "learning_rate": 6.62754223205106e-06, + "logits/chosen": 0.15451756119728088, + "logits/rejected": 0.08340193331241608, + "logps/chosen": -0.6563261151313782, + "logps/rejected": -2.759770631790161, + "loss": 0.6097, + "odds_ratio_loss": 0.2600904107093811, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06563261151313782, + "rewards/margins": 0.21034446358680725, + "rewards/rejected": -0.27597707509994507, + "sft_loss": 0.6563261151313782, + "step": 1200 + }, + { + "epoch": 1.7368040491684744, + "grad_norm": 3.0571929060012524, + "learning_rate": 6.625199640536127e-06, + "logits/chosen": 0.20956231653690338, + "logits/rejected": 0.13567481935024261, + "logps/chosen": -0.5605679154396057, + "logps/rejected": -2.7481789588928223, + "loss": 0.5964, + "odds_ratio_loss": 0.3873436748981476, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05605679005384445, + "rewards/margins": 0.2187611162662506, + "rewards/rejected": -0.27481788396835327, + "sft_loss": 0.5605679154396057, + "step": 1201 + }, + { + "epoch": 1.7382501807664497, + "grad_norm": 2.9617368021465893, + "learning_rate": 6.622855466278554e-06, + "logits/chosen": 0.20532143115997314, + "logits/rejected": 0.1692730188369751, + "logps/chosen": -0.4614868462085724, + "logps/rejected": -3.6775882244110107, + "loss": 0.5834, + "odds_ratio_loss": 0.18730753660202026, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04614868387579918, + "rewards/margins": 0.32161015272140503, + "rewards/rejected": -0.3677588403224945, + "sft_loss": 0.4614868462085724, + "step": 1202 + }, + { + "epoch": 1.7396963123644251, + "grad_norm": 2.5667355858388703, + "learning_rate": 6.620509710691653e-06, + "logits/chosen": 0.23696063458919525, + "logits/rejected": 0.16394846141338348, + "logps/chosen": -0.657241702079773, + "logps/rejected": -2.8840408325195312, + "loss": 0.7009, + "odds_ratio_loss": 0.24847227334976196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06572417169809341, + "rewards/margins": 0.22267991304397583, + "rewards/rejected": -0.28840407729148865, + "sft_loss": 0.657241702079773, + "step": 1203 + }, + { + "epoch": 1.7411424439624006, + "grad_norm": 2.907884753323847, + "learning_rate": 6.618162375189687e-06, + "logits/chosen": 0.054735876619815826, + "logits/rejected": 0.011121401563286781, + "logps/chosen": -0.5988122820854187, + "logps/rejected": -1.7168229818344116, + "loss": 0.5172, + "odds_ratio_loss": 0.361611008644104, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05988123267889023, + "rewards/margins": 0.11180106550455093, + "rewards/rejected": -0.17168228328227997, + "sft_loss": 0.5988122820854187, + "step": 1204 + }, + { + "epoch": 1.7425885755603758, + "grad_norm": 2.744172673068131, + "learning_rate": 6.615813461187873e-06, + "logits/chosen": 0.1834808886051178, + "logits/rejected": 0.13022924959659576, + "logps/chosen": -0.5365532636642456, + "logps/rejected": -1.4286985397338867, + "loss": 0.5744, + "odds_ratio_loss": 0.36767858266830444, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05365532636642456, + "rewards/margins": 0.08921454101800919, + "rewards/rejected": -0.14286985993385315, + "sft_loss": 0.5365532636642456, + "step": 1205 + }, + { + "epoch": 1.7440347071583515, + "grad_norm": 2.4576341795144443, + "learning_rate": 6.6134629701023805e-06, + "logits/chosen": 0.15828804671764374, + "logits/rejected": 0.03511790186166763, + "logps/chosen": -0.8053830862045288, + "logps/rejected": -1.2731399536132812, + "loss": 0.6786, + "odds_ratio_loss": 0.47556188702583313, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0805383026599884, + "rewards/margins": 0.0467756986618042, + "rewards/rejected": -0.1273140013217926, + "sft_loss": 0.8053830862045288, + "step": 1206 + }, + { + "epoch": 1.7454808387563268, + "grad_norm": 2.7041504951107727, + "learning_rate": 6.611110903350331e-06, + "logits/chosen": 0.2982510030269623, + "logits/rejected": 0.17835605144500732, + "logps/chosen": -0.5107125043869019, + "logps/rejected": -2.9214985370635986, + "loss": 0.5927, + "odds_ratio_loss": 0.44440388679504395, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.051071248948574066, + "rewards/margins": 0.24107861518859863, + "rewards/rejected": -0.2921498715877533, + "sft_loss": 0.5107125043869019, + "step": 1207 + }, + { + "epoch": 1.7469269703543022, + "grad_norm": 2.8209460199314704, + "learning_rate": 6.608757262349792e-06, + "logits/chosen": 0.06634090840816498, + "logits/rejected": 0.13024091720581055, + "logps/chosen": -0.6781923770904541, + "logps/rejected": -1.6680748462677002, + "loss": 0.6659, + "odds_ratio_loss": 0.3676949739456177, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06781924515962601, + "rewards/margins": 0.09898824989795685, + "rewards/rejected": -0.16680750250816345, + "sft_loss": 0.6781923770904541, + "step": 1208 + }, + { + "epoch": 1.7483731019522777, + "grad_norm": 5.057921935760771, + "learning_rate": 6.606402048519783e-06, + "logits/chosen": 0.03448230028152466, + "logits/rejected": 0.09395559132099152, + "logps/chosen": -0.6829879283905029, + "logps/rejected": -2.3134713172912598, + "loss": 0.5778, + "odds_ratio_loss": 0.3463206887245178, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06829879432916641, + "rewards/margins": 0.16304834187030792, + "rewards/rejected": -0.23134714365005493, + "sft_loss": 0.6829879283905029, + "step": 1209 + }, + { + "epoch": 1.749819233550253, + "grad_norm": 2.5596838109500104, + "learning_rate": 6.604045263280273e-06, + "logits/chosen": 0.12063422054052353, + "logits/rejected": 0.09866572916507721, + "logps/chosen": -0.7138389348983765, + "logps/rejected": -2.4420652389526367, + "loss": 0.6281, + "odds_ratio_loss": 0.36622458696365356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07138389348983765, + "rewards/margins": 0.17282262444496155, + "rewards/rejected": -0.2442065179347992, + "sft_loss": 0.7138389348983765, + "step": 1210 + }, + { + "epoch": 1.7512653651482286, + "grad_norm": 3.407537712357489, + "learning_rate": 6.601686908052176e-06, + "logits/chosen": 0.10398457199335098, + "logits/rejected": 0.04173152521252632, + "logps/chosen": -0.580781102180481, + "logps/rejected": -2.662278175354004, + "loss": 0.5826, + "odds_ratio_loss": 0.3597915470600128, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0580781027674675, + "rewards/margins": 0.208149716258049, + "rewards/rejected": -0.2662278413772583, + "sft_loss": 0.580781102180481, + "step": 1211 + }, + { + "epoch": 1.7527114967462039, + "grad_norm": 2.6514455590176254, + "learning_rate": 6.599326984257351e-06, + "logits/chosen": 0.19883418083190918, + "logits/rejected": 0.188827246427536, + "logps/chosen": -0.5180667638778687, + "logps/rejected": -2.4371070861816406, + "loss": 0.575, + "odds_ratio_loss": 0.32393041253089905, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05180668085813522, + "rewards/margins": 0.19190403819084167, + "rewards/rejected": -0.2437107264995575, + "sft_loss": 0.5180667638778687, + "step": 1212 + }, + { + "epoch": 1.7541576283441793, + "grad_norm": 2.418453470557911, + "learning_rate": 6.596965493318606e-06, + "logits/chosen": 0.1408243477344513, + "logits/rejected": 0.1452597677707672, + "logps/chosen": -0.5804046988487244, + "logps/rejected": -1.683534860610962, + "loss": 0.6441, + "odds_ratio_loss": 0.35653501749038696, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.058040473610162735, + "rewards/margins": 0.11031302809715271, + "rewards/rejected": -0.16835349798202515, + "sft_loss": 0.5804046988487244, + "step": 1213 + }, + { + "epoch": 1.7556037599421548, + "grad_norm": 2.8936534377819982, + "learning_rate": 6.594602436659695e-06, + "logits/chosen": 0.29035723209381104, + "logits/rejected": 0.13355976343154907, + "logps/chosen": -0.3346521854400635, + "logps/rejected": -4.254483699798584, + "loss": 0.5783, + "odds_ratio_loss": 0.22945931553840637, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.033465221524238586, + "rewards/margins": 0.3919832110404968, + "rewards/rejected": -0.4254484176635742, + "sft_loss": 0.3346521854400635, + "step": 1214 + }, + { + "epoch": 1.75704989154013, + "grad_norm": 2.1734385221887362, + "learning_rate": 6.592237815705309e-06, + "logits/chosen": 0.2960423529148102, + "logits/rejected": 0.13452109694480896, + "logps/chosen": -0.42409226298332214, + "logps/rejected": -1.8257529735565186, + "loss": 0.5485, + "odds_ratio_loss": 0.3245893716812134, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.042409226298332214, + "rewards/margins": 0.14016607403755188, + "rewards/rejected": -0.1825753152370453, + "sft_loss": 0.42409226298332214, + "step": 1215 + }, + { + "epoch": 1.7584960231381057, + "grad_norm": 2.4855257032936935, + "learning_rate": 6.589871631881092e-06, + "logits/chosen": 0.20803236961364746, + "logits/rejected": 0.21837976574897766, + "logps/chosen": -0.6946249008178711, + "logps/rejected": -1.5261632204055786, + "loss": 0.5728, + "odds_ratio_loss": 0.3592318594455719, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06946249306201935, + "rewards/margins": 0.08315382152795792, + "rewards/rejected": -0.15261633694171906, + "sft_loss": 0.6946249008178711, + "step": 1216 + }, + { + "epoch": 1.759942154736081, + "grad_norm": 2.447862716688948, + "learning_rate": 6.587503886613619e-06, + "logits/chosen": 0.23648259043693542, + "logits/rejected": 0.26576292514801025, + "logps/chosen": -0.5033541321754456, + "logps/rejected": -2.461588144302368, + "loss": 0.6131, + "odds_ratio_loss": 0.20790676772594452, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.050335414707660675, + "rewards/margins": 0.19582340121269226, + "rewards/rejected": -0.24615880846977234, + "sft_loss": 0.5033541321754456, + "step": 1217 + }, + { + "epoch": 1.7613882863340564, + "grad_norm": 3.1404167129848672, + "learning_rate": 6.585134581330419e-06, + "logits/chosen": 0.1707933247089386, + "logits/rejected": 0.17154398560523987, + "logps/chosen": -0.5736587047576904, + "logps/rejected": -2.2020938396453857, + "loss": 0.6267, + "odds_ratio_loss": 0.22983211278915405, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05736587569117546, + "rewards/margins": 0.1628435105085373, + "rewards/rejected": -0.22020938992500305, + "sft_loss": 0.5736587047576904, + "step": 1218 + }, + { + "epoch": 1.7628344179320319, + "grad_norm": 2.398668193202477, + "learning_rate": 6.58276371745995e-06, + "logits/chosen": 0.20608599483966827, + "logits/rejected": 0.1394387036561966, + "logps/chosen": -0.4304734766483307, + "logps/rejected": -2.14104962348938, + "loss": 0.5412, + "odds_ratio_loss": 0.319057434797287, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04304734617471695, + "rewards/margins": 0.17105762660503387, + "rewards/rejected": -0.21410496532917023, + "sft_loss": 0.4304734766483307, + "step": 1219 + }, + { + "epoch": 1.7642805495300071, + "grad_norm": 2.6458674834216316, + "learning_rate": 6.580391296431617e-06, + "logits/chosen": 0.20387428998947144, + "logits/rejected": 0.04644143208861351, + "logps/chosen": -0.5995526313781738, + "logps/rejected": -1.390120506286621, + "loss": 0.662, + "odds_ratio_loss": 0.32246649265289307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05995526909828186, + "rewards/margins": 0.07905678451061249, + "rewards/rejected": -0.13901205360889435, + "sft_loss": 0.5995526313781738, + "step": 1220 + }, + { + "epoch": 1.7657266811279828, + "grad_norm": 2.30503961685648, + "learning_rate": 6.578017319675762e-06, + "logits/chosen": 0.10869728773832321, + "logits/rejected": 0.039436303079128265, + "logps/chosen": -0.6155337691307068, + "logps/rejected": -2.0125937461853027, + "loss": 0.6198, + "odds_ratio_loss": 0.3492213785648346, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06155337765812874, + "rewards/margins": 0.13970601558685303, + "rewards/rejected": -0.20125938951969147, + "sft_loss": 0.6155337691307068, + "step": 1221 + }, + { + "epoch": 1.767172812725958, + "grad_norm": 2.893877860045242, + "learning_rate": 6.5756417886236625e-06, + "logits/chosen": 0.16021108627319336, + "logits/rejected": 0.0937863439321518, + "logps/chosen": -0.6446142792701721, + "logps/rejected": -1.6120359897613525, + "loss": 0.6553, + "odds_ratio_loss": 0.35699713230133057, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06446143239736557, + "rewards/margins": 0.09674215316772461, + "rewards/rejected": -0.16120359301567078, + "sft_loss": 0.6446142792701721, + "step": 1222 + }, + { + "epoch": 1.7686189443239335, + "grad_norm": 3.040730778349726, + "learning_rate": 6.573264704707537e-06, + "logits/chosen": 0.15876901149749756, + "logits/rejected": 0.04192943871021271, + "logps/chosen": -0.5770949125289917, + "logps/rejected": -2.769012451171875, + "loss": 0.5405, + "odds_ratio_loss": 0.22492100298404694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05770949274301529, + "rewards/margins": 0.2191917598247528, + "rewards/rejected": -0.2769012451171875, + "sft_loss": 0.5770949125289917, + "step": 1223 + }, + { + "epoch": 1.770065075921909, + "grad_norm": 2.3456343253279677, + "learning_rate": 6.570886069360535e-06, + "logits/chosen": 0.1821683943271637, + "logits/rejected": 0.19179019331932068, + "logps/chosen": -0.567207396030426, + "logps/rejected": -1.8817782402038574, + "loss": 0.583, + "odds_ratio_loss": 0.2690046429634094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05672074481844902, + "rewards/margins": 0.13145706057548523, + "rewards/rejected": -0.18817779421806335, + "sft_loss": 0.567207396030426, + "step": 1224 + }, + { + "epoch": 1.7715112075198842, + "grad_norm": 2.683339111567326, + "learning_rate": 6.568505884016749e-06, + "logits/chosen": 0.13196396827697754, + "logits/rejected": 0.0811544805765152, + "logps/chosen": -0.527839183807373, + "logps/rejected": -2.4704227447509766, + "loss": 0.5611, + "odds_ratio_loss": 0.3083456754684448, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05278391391038895, + "rewards/margins": 0.19425836205482483, + "rewards/rejected": -0.24704228341579437, + "sft_loss": 0.527839183807373, + "step": 1225 + }, + { + "epoch": 1.7729573391178597, + "grad_norm": 2.136034880015943, + "learning_rate": 6.566124150111197e-06, + "logits/chosen": 0.08882991969585419, + "logits/rejected": 0.0734814703464508, + "logps/chosen": -0.6307113170623779, + "logps/rejected": -1.749953031539917, + "loss": 0.5799, + "odds_ratio_loss": 0.2958693206310272, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06307113170623779, + "rewards/margins": 0.1119241863489151, + "rewards/rejected": -0.1749953031539917, + "sft_loss": 0.6307113170623779, + "step": 1226 + }, + { + "epoch": 1.7744034707158352, + "grad_norm": 2.617436250062296, + "learning_rate": 6.56374086907984e-06, + "logits/chosen": 0.2238091230392456, + "logits/rejected": 0.1482618749141693, + "logps/chosen": -0.38627102971076965, + "logps/rejected": -4.749565601348877, + "loss": 0.5507, + "odds_ratio_loss": 0.20005583763122559, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.038627102971076965, + "rewards/margins": 0.4363294839859009, + "rewards/rejected": -0.47495660185813904, + "sft_loss": 0.38627102971076965, + "step": 1227 + }, + { + "epoch": 1.7758496023138104, + "grad_norm": 2.529425804367851, + "learning_rate": 6.561356042359563e-06, + "logits/chosen": 0.1380154937505722, + "logits/rejected": 0.11311140656471252, + "logps/chosen": -0.7418189644813538, + "logps/rejected": -1.8061516284942627, + "loss": 0.6726, + "odds_ratio_loss": 0.4076150953769684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07418189942836761, + "rewards/margins": 0.10643327236175537, + "rewards/rejected": -0.18061517179012299, + "sft_loss": 0.7418189644813538, + "step": 1228 + }, + { + "epoch": 1.777295733911786, + "grad_norm": 2.3388201206907326, + "learning_rate": 6.558969671388189e-06, + "logits/chosen": 0.35083937644958496, + "logits/rejected": 0.23928841948509216, + "logps/chosen": -0.47196346521377563, + "logps/rejected": -2.6147122383117676, + "loss": 0.6066, + "odds_ratio_loss": 0.3234768211841583, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04719635099172592, + "rewards/margins": 0.21427488327026367, + "rewards/rejected": -0.2614712119102478, + "sft_loss": 0.47196346521377563, + "step": 1229 + }, + { + "epoch": 1.7787418655097613, + "grad_norm": 2.396729475929612, + "learning_rate": 6.55658175760447e-06, + "logits/chosen": 0.20722784101963043, + "logits/rejected": 0.18240517377853394, + "logps/chosen": -0.35860294103622437, + "logps/rejected": -1.4275670051574707, + "loss": 0.5159, + "odds_ratio_loss": 0.22660349309444427, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03586029261350632, + "rewards/margins": 0.10689640045166016, + "rewards/rejected": -0.14275670051574707, + "sft_loss": 0.35860294103622437, + "step": 1230 + }, + { + "epoch": 1.7801879971077368, + "grad_norm": 2.088719841010738, + "learning_rate": 6.554192302448087e-06, + "logits/chosen": 0.18596374988555908, + "logits/rejected": 0.09559060633182526, + "logps/chosen": -0.5696253776550293, + "logps/rejected": -2.7666914463043213, + "loss": 0.5767, + "odds_ratio_loss": 0.27393674850463867, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05696253478527069, + "rewards/margins": 0.21970660984516144, + "rewards/rejected": -0.27666914463043213, + "sft_loss": 0.5696253776550293, + "step": 1231 + }, + { + "epoch": 1.7816341287057122, + "grad_norm": 3.5145170842289533, + "learning_rate": 6.551801307359653e-06, + "logits/chosen": 0.2097698450088501, + "logits/rejected": 0.1429743766784668, + "logps/chosen": -0.5215898156166077, + "logps/rejected": -2.5869057178497314, + "loss": 0.6702, + "odds_ratio_loss": 0.2010580152273178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05215898156166077, + "rewards/margins": 0.2065315991640091, + "rewards/rejected": -0.25869059562683105, + "sft_loss": 0.5215898156166077, + "step": 1232 + }, + { + "epoch": 1.7830802603036875, + "grad_norm": 2.300180857740447, + "learning_rate": 6.549408773780706e-06, + "logits/chosen": 0.16524893045425415, + "logits/rejected": 0.10692907124757767, + "logps/chosen": -0.552807629108429, + "logps/rejected": -3.7471842765808105, + "loss": 0.5814, + "odds_ratio_loss": 0.2621445655822754, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.055280767381191254, + "rewards/margins": 0.3194376826286316, + "rewards/rejected": -0.37471842765808105, + "sft_loss": 0.552807629108429, + "step": 1233 + }, + { + "epoch": 1.7845263919016632, + "grad_norm": 3.1641439654511587, + "learning_rate": 6.5470147031537134e-06, + "logits/chosen": 0.11416902393102646, + "logits/rejected": 0.1004326269030571, + "logps/chosen": -0.6147681474685669, + "logps/rejected": -3.4372403621673584, + "loss": 0.6441, + "odds_ratio_loss": 0.20263680815696716, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06147681921720505, + "rewards/margins": 0.28224724531173706, + "rewards/rejected": -0.3437240421772003, + "sft_loss": 0.6147681474685669, + "step": 1234 + }, + { + "epoch": 1.7859725234996384, + "grad_norm": 2.4720750541905154, + "learning_rate": 6.544619096922071e-06, + "logits/chosen": 0.07047523558139801, + "logits/rejected": 0.19488292932510376, + "logps/chosen": -0.6933853030204773, + "logps/rejected": -1.8614749908447266, + "loss": 0.7145, + "odds_ratio_loss": 0.314864844083786, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06933853030204773, + "rewards/margins": 0.1168089509010315, + "rewards/rejected": -0.18614749610424042, + "sft_loss": 0.6933853030204773, + "step": 1235 + }, + { + "epoch": 1.7874186550976139, + "grad_norm": 2.6188623851621187, + "learning_rate": 6.542221956530099e-06, + "logits/chosen": 0.3162124752998352, + "logits/rejected": 0.10680659115314484, + "logps/chosen": -0.819428563117981, + "logps/rejected": -2.5189719200134277, + "loss": 0.7024, + "odds_ratio_loss": 0.48061349987983704, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0819428488612175, + "rewards/margins": 0.1699543297290802, + "rewards/rejected": -0.2518971860408783, + "sft_loss": 0.819428563117981, + "step": 1236 + }, + { + "epoch": 1.7888647866955893, + "grad_norm": 2.2671401247198686, + "learning_rate": 6.539823283423041e-06, + "logits/chosen": 0.1904761642217636, + "logits/rejected": 0.22632457315921783, + "logps/chosen": -0.7415924072265625, + "logps/rejected": -2.621068000793457, + "loss": 0.5859, + "odds_ratio_loss": 0.3327312469482422, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07415923476219177, + "rewards/margins": 0.1879475712776184, + "rewards/rejected": -0.2621068060398102, + "sft_loss": 0.7415924072265625, + "step": 1237 + }, + { + "epoch": 1.7903109182935646, + "grad_norm": 2.466767010915428, + "learning_rate": 6.537423079047064e-06, + "logits/chosen": 0.25809118151664734, + "logits/rejected": 0.24010270833969116, + "logps/chosen": -0.5368831157684326, + "logps/rejected": -2.134767770767212, + "loss": 0.5884, + "odds_ratio_loss": 0.36311042308807373, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05368831753730774, + "rewards/margins": 0.15978845953941345, + "rewards/rejected": -0.2134767770767212, + "sft_loss": 0.5368831157684326, + "step": 1238 + }, + { + "epoch": 1.7917570498915403, + "grad_norm": 2.6957973304314486, + "learning_rate": 6.5350213448492645e-06, + "logits/chosen": 0.10721335560083389, + "logits/rejected": 0.15869364142417908, + "logps/chosen": -0.532486081123352, + "logps/rejected": -1.3220763206481934, + "loss": 0.5876, + "odds_ratio_loss": 0.4261801838874817, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.053248606622219086, + "rewards/margins": 0.07895903289318085, + "rewards/rejected": -0.13220764696598053, + "sft_loss": 0.532486081123352, + "step": 1239 + }, + { + "epoch": 1.7932031814895155, + "grad_norm": 2.4573630337591537, + "learning_rate": 6.532618082277654e-06, + "logits/chosen": 0.20036396384239197, + "logits/rejected": 0.13425952196121216, + "logps/chosen": -0.4425850212574005, + "logps/rejected": -2.856372833251953, + "loss": 0.5854, + "odds_ratio_loss": 0.1853070706129074, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04425850138068199, + "rewards/margins": 0.2413787692785263, + "rewards/rejected": -0.2856372594833374, + "sft_loss": 0.4425850212574005, + "step": 1240 + }, + { + "epoch": 1.794649313087491, + "grad_norm": 2.7306661642453274, + "learning_rate": 6.5302132927811695e-06, + "logits/chosen": 0.1338634490966797, + "logits/rejected": 0.06804355978965759, + "logps/chosen": -0.7215875387191772, + "logps/rejected": -1.886422872543335, + "loss": 0.646, + "odds_ratio_loss": 0.38540273904800415, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07215875387191772, + "rewards/margins": 0.11648353934288025, + "rewards/rejected": -0.18864230811595917, + "sft_loss": 0.7215875387191772, + "step": 1241 + }, + { + "epoch": 1.7960954446854664, + "grad_norm": 2.728754891284883, + "learning_rate": 6.527806977809667e-06, + "logits/chosen": 0.030140642076730728, + "logits/rejected": 0.05486735701560974, + "logps/chosen": -0.7998731136322021, + "logps/rejected": -1.3197684288024902, + "loss": 0.659, + "odds_ratio_loss": 0.4492250084877014, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07998731732368469, + "rewards/margins": 0.051989540457725525, + "rewards/rejected": -0.13197685778141022, + "sft_loss": 0.7998731136322021, + "step": 1242 + }, + { + "epoch": 1.7975415762834417, + "grad_norm": 2.9607327160654076, + "learning_rate": 6.525399138813923e-06, + "logits/chosen": 0.13286061584949493, + "logits/rejected": 0.1681133359670639, + "logps/chosen": -0.7185732126235962, + "logps/rejected": -1.5053967237472534, + "loss": 0.7056, + "odds_ratio_loss": 0.6407934427261353, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07185731083154678, + "rewards/margins": 0.07868236303329468, + "rewards/rejected": -0.15053966641426086, + "sft_loss": 0.7185732126235962, + "step": 1243 + }, + { + "epoch": 1.7989877078814174, + "grad_norm": 2.3600545000552464, + "learning_rate": 6.522989777245632e-06, + "logits/chosen": 0.29583603143692017, + "logits/rejected": 0.11987383663654327, + "logps/chosen": -0.46510180830955505, + "logps/rejected": -1.4625095129013062, + "loss": 0.6672, + "odds_ratio_loss": 0.403626024723053, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.046510182321071625, + "rewards/margins": 0.09974077343940735, + "rewards/rejected": -0.14625096321105957, + "sft_loss": 0.46510180830955505, + "step": 1244 + }, + { + "epoch": 1.8004338394793926, + "grad_norm": 2.551051522105615, + "learning_rate": 6.5205788945574084e-06, + "logits/chosen": 0.0593152791261673, + "logits/rejected": 0.020416993647813797, + "logps/chosen": -0.5379363894462585, + "logps/rejected": -2.7240047454833984, + "loss": 0.5762, + "odds_ratio_loss": 0.3037707209587097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.053793638944625854, + "rewards/margins": 0.2186068594455719, + "rewards/rejected": -0.27240049839019775, + "sft_loss": 0.5379363894462585, + "step": 1245 + }, + { + "epoch": 1.801879971077368, + "grad_norm": 2.8134198982380667, + "learning_rate": 6.518166492202781e-06, + "logits/chosen": 0.09442038834095001, + "logits/rejected": 0.13767805695533752, + "logps/chosen": -0.6870551109313965, + "logps/rejected": -2.2993569374084473, + "loss": 0.7367, + "odds_ratio_loss": 0.3380257189273834, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06870551407337189, + "rewards/margins": 0.161230206489563, + "rewards/rejected": -0.22993570566177368, + "sft_loss": 0.6870551109313965, + "step": 1246 + }, + { + "epoch": 1.8033261026753435, + "grad_norm": 2.7343970380232827, + "learning_rate": 6.5157525716361975e-06, + "logits/chosen": 0.06337403506040573, + "logits/rejected": 0.032632552087306976, + "logps/chosen": -0.7269030809402466, + "logps/rejected": -1.3690733909606934, + "loss": 0.6109, + "odds_ratio_loss": 0.4972846508026123, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07269031554460526, + "rewards/margins": 0.06421701610088348, + "rewards/rejected": -0.13690733909606934, + "sft_loss": 0.7269030809402466, + "step": 1247 + }, + { + "epoch": 1.8047722342733188, + "grad_norm": 3.0777568617671682, + "learning_rate": 6.513337134313019e-06, + "logits/chosen": 0.14764419198036194, + "logits/rejected": 0.12649638950824738, + "logps/chosen": -0.5623830556869507, + "logps/rejected": -2.9645442962646484, + "loss": 0.5848, + "odds_ratio_loss": 0.21953721344470978, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05623830854892731, + "rewards/margins": 0.24021612107753754, + "rewards/rejected": -0.29645445942878723, + "sft_loss": 0.5623830556869507, + "step": 1248 + }, + { + "epoch": 1.8062183658712943, + "grad_norm": 2.732656690768169, + "learning_rate": 6.5109201816895204e-06, + "logits/chosen": 0.07473330199718475, + "logits/rejected": 0.08126173168420792, + "logps/chosen": -0.6446163058280945, + "logps/rejected": -2.1125130653381348, + "loss": 0.6244, + "odds_ratio_loss": 0.3653605580329895, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06446163356304169, + "rewards/margins": 0.14678969979286194, + "rewards/rejected": -0.21125133335590363, + "sft_loss": 0.6446163058280945, + "step": 1249 + }, + { + "epoch": 1.8076644974692697, + "grad_norm": 2.6894663583372616, + "learning_rate": 6.508501715222895e-06, + "logits/chosen": 0.1402525007724762, + "logits/rejected": 0.057519227266311646, + "logps/chosen": -0.5445053577423096, + "logps/rejected": -2.487497568130493, + "loss": 0.6504, + "odds_ratio_loss": 0.2824597656726837, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.054450541734695435, + "rewards/margins": 0.19429922103881836, + "rewards/rejected": -0.2487497478723526, + "sft_loss": 0.5445053577423096, + "step": 1250 + }, + { + "epoch": 1.809110629067245, + "grad_norm": 2.860737934069698, + "learning_rate": 6.506081736371241e-06, + "logits/chosen": 0.12336266785860062, + "logits/rejected": 0.1148252934217453, + "logps/chosen": -0.5299996137619019, + "logps/rejected": -2.73490047454834, + "loss": 0.5286, + "odds_ratio_loss": 0.2534593641757965, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.052999965846538544, + "rewards/margins": 0.22049006819725037, + "rewards/rejected": -0.2734900414943695, + "sft_loss": 0.5299996137619019, + "step": 1251 + }, + { + "epoch": 1.8105567606652206, + "grad_norm": 2.0169373424978363, + "learning_rate": 6.503660246593574e-06, + "logits/chosen": 0.10713335871696472, + "logits/rejected": 0.052206408232450485, + "logps/chosen": -0.6295415759086609, + "logps/rejected": -2.7591824531555176, + "loss": 0.5845, + "odds_ratio_loss": 0.279104620218277, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06295415759086609, + "rewards/margins": 0.21296407282352448, + "rewards/rejected": -0.27591824531555176, + "sft_loss": 0.6295415759086609, + "step": 1252 + }, + { + "epoch": 1.8120028922631959, + "grad_norm": 3.6466169903218963, + "learning_rate": 6.50123724734982e-06, + "logits/chosen": 0.2897685766220093, + "logits/rejected": 0.18411767482757568, + "logps/chosen": -0.4855603575706482, + "logps/rejected": -3.1457629203796387, + "loss": 0.5398, + "odds_ratio_loss": 0.24788373708724976, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04855603724718094, + "rewards/margins": 0.2660202383995056, + "rewards/rejected": -0.31457629799842834, + "sft_loss": 0.4855603575706482, + "step": 1253 + }, + { + "epoch": 1.8134490238611713, + "grad_norm": 2.810165738303995, + "learning_rate": 6.498812740100815e-06, + "logits/chosen": 0.08351315557956696, + "logits/rejected": 0.05155708268284798, + "logps/chosen": -0.7639514207839966, + "logps/rejected": -1.6764659881591797, + "loss": 0.714, + "odds_ratio_loss": 0.48868927359580994, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07639515399932861, + "rewards/margins": 0.09125145524740219, + "rewards/rejected": -0.1676466017961502, + "sft_loss": 0.7639514207839966, + "step": 1254 + }, + { + "epoch": 1.8148951554591468, + "grad_norm": 4.447263370216222, + "learning_rate": 6.496386726308301e-06, + "logits/chosen": 0.21318885684013367, + "logits/rejected": 0.1973818689584732, + "logps/chosen": -0.6246731877326965, + "logps/rejected": -2.5932018756866455, + "loss": 0.618, + "odds_ratio_loss": 0.3143559396266937, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06246732175350189, + "rewards/margins": 0.1968528926372528, + "rewards/rejected": -0.2593201994895935, + "sft_loss": 0.6246731877326965, + "step": 1255 + }, + { + "epoch": 1.816341287057122, + "grad_norm": 2.1529265886576345, + "learning_rate": 6.493959207434934e-06, + "logits/chosen": 0.19203035533428192, + "logits/rejected": 0.2155546247959137, + "logps/chosen": -0.6295595765113831, + "logps/rejected": -2.2532200813293457, + "loss": 0.691, + "odds_ratio_loss": 0.2799103856086731, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06295596063137054, + "rewards/margins": 0.16236604750156403, + "rewards/rejected": -0.22532200813293457, + "sft_loss": 0.6295595765113831, + "step": 1256 + }, + { + "epoch": 1.8177874186550977, + "grad_norm": 2.483176547201568, + "learning_rate": 6.491530184944272e-06, + "logits/chosen": 0.18721802532672882, + "logits/rejected": 0.003281711135059595, + "logps/chosen": -0.4738430380821228, + "logps/rejected": -4.528861045837402, + "loss": 0.5752, + "odds_ratio_loss": 0.24636805057525635, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04738430306315422, + "rewards/margins": 0.4055017828941345, + "rewards/rejected": -0.45288610458374023, + "sft_loss": 0.4738430380821228, + "step": 1257 + }, + { + "epoch": 1.819233550253073, + "grad_norm": 2.4614873082947146, + "learning_rate": 6.48909966030078e-06, + "logits/chosen": 0.20001399517059326, + "logits/rejected": 0.04840739816427231, + "logps/chosen": -0.7521265745162964, + "logps/rejected": -2.608405828475952, + "loss": 0.5783, + "odds_ratio_loss": 0.3967435956001282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07521265745162964, + "rewards/margins": 0.18562793731689453, + "rewards/rejected": -0.26084059476852417, + "sft_loss": 0.7521265745162964, + "step": 1258 + }, + { + "epoch": 1.8206796818510484, + "grad_norm": 2.989994442994001, + "learning_rate": 6.4866676349698334e-06, + "logits/chosen": 0.23923185467720032, + "logits/rejected": 0.3120484948158264, + "logps/chosen": -0.4351702630519867, + "logps/rejected": -1.782483458518982, + "loss": 0.5075, + "odds_ratio_loss": 0.2519921362400055, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04351702705025673, + "rewards/margins": 0.13473132252693176, + "rewards/rejected": -0.1782483607530594, + "sft_loss": 0.4351702630519867, + "step": 1259 + }, + { + "epoch": 1.822125813449024, + "grad_norm": 2.9573254382883682, + "learning_rate": 6.484234110417709e-06, + "logits/chosen": 0.16368578374385834, + "logits/rejected": 0.11058539897203445, + "logps/chosen": -0.7165206074714661, + "logps/rejected": -1.5497503280639648, + "loss": 0.7133, + "odds_ratio_loss": 0.4126344323158264, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07165206223726273, + "rewards/margins": 0.08332297950983047, + "rewards/rejected": -0.1549750417470932, + "sft_loss": 0.7165206074714661, + "step": 1260 + }, + { + "epoch": 1.8235719450469992, + "grad_norm": 2.9020896841357358, + "learning_rate": 6.481799088111588e-06, + "logits/chosen": 0.22353379428386688, + "logits/rejected": 0.06521797180175781, + "logps/chosen": -0.6017415523529053, + "logps/rejected": -2.608680248260498, + "loss": 0.5805, + "odds_ratio_loss": 0.3434939980506897, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.060174159705638885, + "rewards/margins": 0.20069387555122375, + "rewards/rejected": -0.26086804270744324, + "sft_loss": 0.6017415523529053, + "step": 1261 + }, + { + "epoch": 1.8250180766449748, + "grad_norm": 2.4314449265663725, + "learning_rate": 6.4793625695195525e-06, + "logits/chosen": 0.1931626796722412, + "logits/rejected": 0.15090645849704742, + "logps/chosen": -0.5738415122032166, + "logps/rejected": -2.4022316932678223, + "loss": 0.6279, + "odds_ratio_loss": 0.21190491318702698, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.057384148240089417, + "rewards/margins": 0.18283900618553162, + "rewards/rejected": -0.24022316932678223, + "sft_loss": 0.5738415122032166, + "step": 1262 + }, + { + "epoch": 1.82646420824295, + "grad_norm": 2.6562218026914826, + "learning_rate": 6.476924556110589e-06, + "logits/chosen": 0.13250023126602173, + "logits/rejected": 0.06583386659622192, + "logps/chosen": -0.6148098707199097, + "logps/rejected": -2.1790497303009033, + "loss": 0.5669, + "odds_ratio_loss": 0.25489309430122375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.061480991542339325, + "rewards/margins": 0.15642398595809937, + "rewards/rejected": -0.2179049700498581, + "sft_loss": 0.6148098707199097, + "step": 1263 + }, + { + "epoch": 1.8279103398409255, + "grad_norm": 2.863752051638278, + "learning_rate": 6.474485049354587e-06, + "logits/chosen": 0.08306652307510376, + "logits/rejected": 0.08208741247653961, + "logps/chosen": -0.5236082673072815, + "logps/rejected": -3.5160677433013916, + "loss": 0.636, + "odds_ratio_loss": 0.21715307235717773, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05236082896590233, + "rewards/margins": 0.2992459535598755, + "rewards/rejected": -0.3516067862510681, + "sft_loss": 0.5236082673072815, + "step": 1264 + }, + { + "epoch": 1.829356471438901, + "grad_norm": 2.9820042330346967, + "learning_rate": 6.4720440507223314e-06, + "logits/chosen": 0.14499638974666595, + "logits/rejected": 0.05099831521511078, + "logps/chosen": -0.5267215967178345, + "logps/rejected": -3.3054587841033936, + "loss": 0.5866, + "odds_ratio_loss": 0.247196227312088, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05267215520143509, + "rewards/margins": 0.2778737545013428, + "rewards/rejected": -0.33054590225219727, + "sft_loss": 0.5267215967178345, + "step": 1265 + }, + { + "epoch": 1.8308026030368763, + "grad_norm": 2.3343928851027855, + "learning_rate": 6.469601561685512e-06, + "logits/chosen": 0.1534407138824463, + "logits/rejected": 0.02046523243188858, + "logps/chosen": -0.5311034917831421, + "logps/rejected": -2.802785873413086, + "loss": 0.614, + "odds_ratio_loss": 0.2740580141544342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05311034992337227, + "rewards/margins": 0.2271682620048523, + "rewards/rejected": -0.28027862310409546, + "sft_loss": 0.5311034917831421, + "step": 1266 + }, + { + "epoch": 1.8322487346348517, + "grad_norm": 2.2828013182128624, + "learning_rate": 6.467157583716712e-06, + "logits/chosen": 0.0914180725812912, + "logits/rejected": 0.07987754046916962, + "logps/chosen": -0.693658709526062, + "logps/rejected": -1.6100306510925293, + "loss": 0.6032, + "odds_ratio_loss": 0.3653374910354614, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06936587393283844, + "rewards/margins": 0.09163719415664673, + "rewards/rejected": -0.16100306808948517, + "sft_loss": 0.693658709526062, + "step": 1267 + }, + { + "epoch": 1.8336948662328272, + "grad_norm": 2.7682382563234533, + "learning_rate": 6.464712118289418e-06, + "logits/chosen": 0.09248815476894379, + "logits/rejected": 0.09033681452274323, + "logps/chosen": -0.6010981798171997, + "logps/rejected": -1.8052887916564941, + "loss": 0.5682, + "odds_ratio_loss": 0.3576500415802002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06010981649160385, + "rewards/margins": 0.12041905522346497, + "rewards/rejected": -0.18052886426448822, + "sft_loss": 0.6010981798171997, + "step": 1268 + }, + { + "epoch": 1.8351409978308026, + "grad_norm": 2.191095438710336, + "learning_rate": 6.462265166878006e-06, + "logits/chosen": 0.1091507226228714, + "logits/rejected": 0.11014742404222488, + "logps/chosen": -0.547124981880188, + "logps/rejected": -2.7218005657196045, + "loss": 0.5802, + "odds_ratio_loss": 0.2503480017185211, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.054712504148483276, + "rewards/margins": 0.2174675613641739, + "rewards/rejected": -0.27218008041381836, + "sft_loss": 0.547124981880188, + "step": 1269 + }, + { + "epoch": 1.836587129428778, + "grad_norm": 2.3132104589718154, + "learning_rate": 6.459816730957756e-06, + "logits/chosen": 0.11147591471672058, + "logits/rejected": 0.04091275855898857, + "logps/chosen": -0.6386622190475464, + "logps/rejected": -1.4583815336227417, + "loss": 0.5806, + "odds_ratio_loss": 0.48449695110321045, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06386622041463852, + "rewards/margins": 0.08197192847728729, + "rewards/rejected": -0.1458381563425064, + "sft_loss": 0.6386622190475464, + "step": 1270 + }, + { + "epoch": 1.8380332610267534, + "grad_norm": 2.6157790210799816, + "learning_rate": 6.457366812004837e-06, + "logits/chosen": 0.18917174637317657, + "logits/rejected": 0.10391932725906372, + "logps/chosen": -0.4687146246433258, + "logps/rejected": -3.772552728652954, + "loss": 0.6262, + "odds_ratio_loss": 0.1897575557231903, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04687146842479706, + "rewards/margins": 0.3303838074207306, + "rewards/rejected": -0.37725526094436646, + "sft_loss": 0.4687146246433258, + "step": 1271 + }, + { + "epoch": 1.8394793926247288, + "grad_norm": 3.0406032129446814, + "learning_rate": 6.4549154114963155e-06, + "logits/chosen": -0.025457292795181274, + "logits/rejected": 0.042470939457416534, + "logps/chosen": -0.5330725312232971, + "logps/rejected": -2.3417413234710693, + "loss": 0.5894, + "odds_ratio_loss": 0.34376680850982666, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05330725386738777, + "rewards/margins": 0.18086686730384827, + "rewards/rejected": -0.23417413234710693, + "sft_loss": 0.5330725312232971, + "step": 1272 + }, + { + "epoch": 1.8409255242227043, + "grad_norm": 2.4045595864109015, + "learning_rate": 6.452462530910148e-06, + "logits/chosen": 0.23581625521183014, + "logits/rejected": 0.13572809100151062, + "logps/chosen": -0.47560757398605347, + "logps/rejected": -3.1639788150787354, + "loss": 0.6038, + "odds_ratio_loss": 0.24863043427467346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.047560758888721466, + "rewards/margins": 0.2688371539115906, + "rewards/rejected": -0.31639787554740906, + "sft_loss": 0.47560757398605347, + "step": 1273 + }, + { + "epoch": 1.8423716558206795, + "grad_norm": 2.593914431539175, + "learning_rate": 6.4500081717251874e-06, + "logits/chosen": 0.19233283400535583, + "logits/rejected": 0.12806811928749084, + "logps/chosen": -0.5220276117324829, + "logps/rejected": -2.091871976852417, + "loss": 0.6913, + "odds_ratio_loss": 0.29791516065597534, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05220276862382889, + "rewards/margins": 0.15698441863059998, + "rewards/rejected": -0.20918720960617065, + "sft_loss": 0.5220276117324829, + "step": 1274 + }, + { + "epoch": 1.8438177874186552, + "grad_norm": 2.2558787495791526, + "learning_rate": 6.447552335421175e-06, + "logits/chosen": 0.15237554907798767, + "logits/rejected": 0.09454990923404694, + "logps/chosen": -0.46872109174728394, + "logps/rejected": -3.8205952644348145, + "loss": 0.5324, + "odds_ratio_loss": 0.2774191200733185, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04687211290001869, + "rewards/margins": 0.3351873755455017, + "rewards/rejected": -0.3820595145225525, + "sft_loss": 0.46872109174728394, + "step": 1275 + }, + { + "epoch": 1.8452639190166304, + "grad_norm": 2.803061466485772, + "learning_rate": 6.4450950234787445e-06, + "logits/chosen": 0.18218779563903809, + "logits/rejected": 0.1619456708431244, + "logps/chosen": -0.4943692684173584, + "logps/rejected": -2.0796353816986084, + "loss": 0.583, + "odds_ratio_loss": 0.33534127473831177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04943692684173584, + "rewards/margins": 0.15852662920951843, + "rewards/rejected": -0.20796355605125427, + "sft_loss": 0.4943692684173584, + "step": 1276 + }, + { + "epoch": 1.846710050614606, + "grad_norm": 2.877597865288587, + "learning_rate": 6.442636237379417e-06, + "logits/chosen": 0.16758492588996887, + "logits/rejected": 0.027434751391410828, + "logps/chosen": -0.5394682884216309, + "logps/rejected": -3.1126046180725098, + "loss": 0.5703, + "odds_ratio_loss": 0.30693167448043823, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.053946830332279205, + "rewards/margins": 0.25731363892555237, + "rewards/rejected": -0.311260461807251, + "sft_loss": 0.5394682884216309, + "step": 1277 + }, + { + "epoch": 1.8481561822125814, + "grad_norm": 2.952969926759398, + "learning_rate": 6.440175978605605e-06, + "logits/chosen": 0.1769622266292572, + "logits/rejected": 0.18280473351478577, + "logps/chosen": -0.46669626235961914, + "logps/rejected": -2.5780460834503174, + "loss": 0.5815, + "odds_ratio_loss": 0.2586829960346222, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.046669624745845795, + "rewards/margins": 0.21113497018814087, + "rewards/rejected": -0.25780460238456726, + "sft_loss": 0.46669626235961914, + "step": 1278 + }, + { + "epoch": 1.8496023138105566, + "grad_norm": 2.457451794775069, + "learning_rate": 6.437714248640608e-06, + "logits/chosen": 0.19974656403064728, + "logits/rejected": 0.11863253265619278, + "logps/chosen": -0.6293502449989319, + "logps/rejected": -2.235076427459717, + "loss": 0.542, + "odds_ratio_loss": 0.3751421570777893, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06293503195047379, + "rewards/margins": 0.16057263314723969, + "rewards/rejected": -0.22350764274597168, + "sft_loss": 0.6293502449989319, + "step": 1279 + }, + { + "epoch": 1.8510484454085323, + "grad_norm": 2.59547059999874, + "learning_rate": 6.435251048968611e-06, + "logits/chosen": 0.07283070683479309, + "logits/rejected": 0.07713057100772858, + "logps/chosen": -0.5544098019599915, + "logps/rejected": -2.1915676593780518, + "loss": 0.6361, + "odds_ratio_loss": 0.24329525232315063, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.055440980941057205, + "rewards/margins": 0.16371577978134155, + "rewards/rejected": -0.21915677189826965, + "sft_loss": 0.5544098019599915, + "step": 1280 + }, + { + "epoch": 1.8524945770065075, + "grad_norm": 3.4915957074434916, + "learning_rate": 6.432786381074686e-06, + "logits/chosen": 0.17533960938453674, + "logits/rejected": 0.16550162434577942, + "logps/chosen": -0.541409432888031, + "logps/rejected": -1.7533543109893799, + "loss": 0.5982, + "odds_ratio_loss": 0.4174317419528961, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05414094403386116, + "rewards/margins": 0.12119448930025101, + "rewards/rejected": -0.17533543705940247, + "sft_loss": 0.541409432888031, + "step": 1281 + }, + { + "epoch": 1.853940708604483, + "grad_norm": 2.211714065977566, + "learning_rate": 6.430320246444793e-06, + "logits/chosen": 0.25783729553222656, + "logits/rejected": 0.13165737688541412, + "logps/chosen": -0.3393861651420593, + "logps/rejected": -3.9043896198272705, + "loss": 0.5108, + "odds_ratio_loss": 0.2384585589170456, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03393861651420593, + "rewards/margins": 0.35650038719177246, + "rewards/rejected": -0.390438973903656, + "sft_loss": 0.3393861651420593, + "step": 1282 + }, + { + "epoch": 1.8553868402024585, + "grad_norm": 2.241084791507465, + "learning_rate": 6.427852646565771e-06, + "logits/chosen": 0.27508994936943054, + "logits/rejected": 0.23502448201179504, + "logps/chosen": -0.541483461856842, + "logps/rejected": -2.7447710037231445, + "loss": 0.5657, + "odds_ratio_loss": 0.3992454707622528, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.054148346185684204, + "rewards/margins": 0.22032874822616577, + "rewards/rejected": -0.27447709441185, + "sft_loss": 0.541483461856842, + "step": 1283 + }, + { + "epoch": 1.8568329718004337, + "grad_norm": 2.6425814567398365, + "learning_rate": 6.425383582925345e-06, + "logits/chosen": 0.1416487693786621, + "logits/rejected": 0.12380547821521759, + "logps/chosen": -0.5558061599731445, + "logps/rejected": -1.249847412109375, + "loss": 0.6625, + "odds_ratio_loss": 0.3178814649581909, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05558061599731445, + "rewards/margins": 0.06940412521362305, + "rewards/rejected": -0.1249847412109375, + "sft_loss": 0.5558061599731445, + "step": 1284 + }, + { + "epoch": 1.8582791033984094, + "grad_norm": 3.655460612608677, + "learning_rate": 6.4229130570121255e-06, + "logits/chosen": 0.13592982292175293, + "logits/rejected": 0.1365361362695694, + "logps/chosen": -0.6427457928657532, + "logps/rejected": -2.124833822250366, + "loss": 0.6917, + "odds_ratio_loss": 0.3293951749801636, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06427457928657532, + "rewards/margins": 0.1482088267803192, + "rewards/rejected": -0.21248340606689453, + "sft_loss": 0.6427457928657532, + "step": 1285 + }, + { + "epoch": 1.8597252349963846, + "grad_norm": 2.4326228387465503, + "learning_rate": 6.420441070315599e-06, + "logits/chosen": 0.2087874710559845, + "logits/rejected": 0.11434569954872131, + "logps/chosen": -0.5145087838172913, + "logps/rejected": -2.841360092163086, + "loss": 0.6276, + "odds_ratio_loss": 0.31676119565963745, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.051450878381729126, + "rewards/margins": 0.23268508911132812, + "rewards/rejected": -0.28413599729537964, + "sft_loss": 0.5145087838172913, + "step": 1286 + }, + { + "epoch": 1.86117136659436, + "grad_norm": 2.443729267777704, + "learning_rate": 6.417967624326136e-06, + "logits/chosen": -0.01459294743835926, + "logits/rejected": -0.00034431740641593933, + "logps/chosen": -0.7525098323822021, + "logps/rejected": -2.1756627559661865, + "loss": 0.6109, + "odds_ratio_loss": 0.3534230887889862, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07525098323822021, + "rewards/margins": 0.14231529831886292, + "rewards/rejected": -0.21756626665592194, + "sft_loss": 0.7525098323822021, + "step": 1287 + }, + { + "epoch": 1.8626174981923356, + "grad_norm": 2.7232168710965805, + "learning_rate": 6.415492720534988e-06, + "logits/chosen": 0.11816499382257462, + "logits/rejected": 0.13505424559116364, + "logps/chosen": -0.7191867232322693, + "logps/rejected": -2.004683494567871, + "loss": 0.6509, + "odds_ratio_loss": 0.3239571750164032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07191868126392365, + "rewards/margins": 0.12854966521263123, + "rewards/rejected": -0.20046836137771606, + "sft_loss": 0.7191867232322693, + "step": 1288 + }, + { + "epoch": 1.8640636297903108, + "grad_norm": 2.3846639533778924, + "learning_rate": 6.413016360434282e-06, + "logits/chosen": 0.17629383504390717, + "logits/rejected": 0.07719264179468155, + "logps/chosen": -0.6611538529396057, + "logps/rejected": -1.976793646812439, + "loss": 0.5742, + "odds_ratio_loss": 0.4173961281776428, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0661153793334961, + "rewards/margins": 0.1315639764070511, + "rewards/rejected": -0.19767937064170837, + "sft_loss": 0.6611538529396057, + "step": 1289 + }, + { + "epoch": 1.8655097613882863, + "grad_norm": 2.540831158377396, + "learning_rate": 6.410538545517026e-06, + "logits/chosen": 0.22472399473190308, + "logits/rejected": 0.1605992168188095, + "logps/chosen": -0.4334484934806824, + "logps/rejected": -3.1653952598571777, + "loss": 0.6147, + "odds_ratio_loss": 0.2583789527416229, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04334484785795212, + "rewards/margins": 0.27319473028182983, + "rewards/rejected": -0.31653958559036255, + "sft_loss": 0.4334484934806824, + "step": 1290 + }, + { + "epoch": 1.8669558929862617, + "grad_norm": 3.0085220127490437, + "learning_rate": 6.408059277277102e-06, + "logits/chosen": 0.05498197674751282, + "logits/rejected": 0.14584404230117798, + "logps/chosen": -0.7110510468482971, + "logps/rejected": -1.6284350156784058, + "loss": 0.676, + "odds_ratio_loss": 0.3031470477581024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07110510021448135, + "rewards/margins": 0.09173840284347534, + "rewards/rejected": -0.1628435105085373, + "sft_loss": 0.7110510468482971, + "step": 1291 + }, + { + "epoch": 1.8684020245842372, + "grad_norm": 2.4136664476356846, + "learning_rate": 6.4055785572092715e-06, + "logits/chosen": 0.14040209352970123, + "logits/rejected": 0.006925854831933975, + "logps/chosen": -0.5283491015434265, + "logps/rejected": -3.145298480987549, + "loss": 0.6115, + "odds_ratio_loss": 0.244784876704216, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05283490940928459, + "rewards/margins": 0.26169493794441223, + "rewards/rejected": -0.3145298361778259, + "sft_loss": 0.5283491015434265, + "step": 1292 + }, + { + "epoch": 1.8698481561822127, + "grad_norm": 2.153459001550623, + "learning_rate": 6.40309638680917e-06, + "logits/chosen": 0.07764284312725067, + "logits/rejected": 0.1401481181383133, + "logps/chosen": -0.47514212131500244, + "logps/rejected": -1.8801544904708862, + "loss": 0.5984, + "odds_ratio_loss": 0.3149198889732361, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.047514207661151886, + "rewards/margins": 0.1405012607574463, + "rewards/rejected": -0.18801546096801758, + "sft_loss": 0.47514212131500244, + "step": 1293 + }, + { + "epoch": 1.871294287780188, + "grad_norm": 3.0673694959311733, + "learning_rate": 6.400612767573306e-06, + "logits/chosen": 0.10028165578842163, + "logits/rejected": 0.045115672051906586, + "logps/chosen": -0.6269665360450745, + "logps/rejected": -3.0080552101135254, + "loss": 0.6535, + "odds_ratio_loss": 0.3017050623893738, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06269665062427521, + "rewards/margins": 0.23810890316963196, + "rewards/rejected": -0.30080553889274597, + "sft_loss": 0.6269665360450745, + "step": 1294 + }, + { + "epoch": 1.8727404193781634, + "grad_norm": 2.6138455099005022, + "learning_rate": 6.398127700999064e-06, + "logits/chosen": 0.04801095277070999, + "logits/rejected": 0.027741190046072006, + "logps/chosen": -0.5394948124885559, + "logps/rejected": -2.7543153762817383, + "loss": 0.5348, + "odds_ratio_loss": 0.3202323913574219, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05394947901368141, + "rewards/margins": 0.2214820683002472, + "rewards/rejected": -0.2754315733909607, + "sft_loss": 0.5394948124885559, + "step": 1295 + }, + { + "epoch": 1.8741865509761388, + "grad_norm": 2.6384210048459638, + "learning_rate": 6.395641188584699e-06, + "logits/chosen": 0.05813899263739586, + "logits/rejected": 0.067634716629982, + "logps/chosen": -0.47403401136398315, + "logps/rejected": -2.7208306789398193, + "loss": 0.6532, + "odds_ratio_loss": 0.35417869687080383, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.047403402626514435, + "rewards/margins": 0.22467969357967377, + "rewards/rejected": -0.2720831036567688, + "sft_loss": 0.47403401136398315, + "step": 1296 + }, + { + "epoch": 1.875632682574114, + "grad_norm": 2.443336263784716, + "learning_rate": 6.393153231829341e-06, + "logits/chosen": 0.1275801658630371, + "logits/rejected": 0.07651050388813019, + "logps/chosen": -0.5848337411880493, + "logps/rejected": -3.3461596965789795, + "loss": 0.5843, + "odds_ratio_loss": 0.3497033417224884, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05848337337374687, + "rewards/margins": 0.27613261342048645, + "rewards/rejected": -0.3346160054206848, + "sft_loss": 0.5848337411880493, + "step": 1297 + }, + { + "epoch": 1.8770788141720898, + "grad_norm": 3.6724827423590862, + "learning_rate": 6.390663832232985e-06, + "logits/chosen": 0.16066929697990417, + "logits/rejected": 0.13951550424098969, + "logps/chosen": -0.6801087856292725, + "logps/rejected": -2.111433744430542, + "loss": 0.6903, + "odds_ratio_loss": 0.39818501472473145, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06801088154315948, + "rewards/margins": 0.14313249289989471, + "rewards/rejected": -0.2111433744430542, + "sft_loss": 0.6801087856292725, + "step": 1298 + }, + { + "epoch": 1.878524945770065, + "grad_norm": 2.4997279980859366, + "learning_rate": 6.3881729912965006e-06, + "logits/chosen": 0.18264645338058472, + "logits/rejected": 0.10208077728748322, + "logps/chosen": -0.5250051021575928, + "logps/rejected": -3.1066970825195312, + "loss": 0.5647, + "odds_ratio_loss": 0.3017498254776001, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.052500516176223755, + "rewards/margins": 0.25816917419433594, + "rewards/rejected": -0.3106696903705597, + "sft_loss": 0.5250051021575928, + "step": 1299 + }, + { + "epoch": 1.8799710773680405, + "grad_norm": 3.1095920261150978, + "learning_rate": 6.385680710521624e-06, + "logits/chosen": 0.1434398889541626, + "logits/rejected": 0.07962372153997421, + "logps/chosen": -0.5778891444206238, + "logps/rejected": -3.653951406478882, + "loss": 0.5938, + "odds_ratio_loss": 0.24841637909412384, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0577889159321785, + "rewards/margins": 0.30760622024536133, + "rewards/rejected": -0.36539512872695923, + "sft_loss": 0.5778891444206238, + "step": 1300 + }, + { + "epoch": 1.881417208966016, + "grad_norm": 2.381125913980573, + "learning_rate": 6.383186991410964e-06, + "logits/chosen": 0.1245647668838501, + "logits/rejected": 0.010384336113929749, + "logps/chosen": -0.7906035780906677, + "logps/rejected": -1.5489108562469482, + "loss": 0.6896, + "odds_ratio_loss": 0.4644030034542084, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07906036078929901, + "rewards/margins": 0.07583072781562805, + "rewards/rejected": -0.15489107370376587, + "sft_loss": 0.7906035780906677, + "step": 1301 + }, + { + "epoch": 1.8828633405639912, + "grad_norm": 2.2563598663691558, + "learning_rate": 6.38069183546799e-06, + "logits/chosen": 0.20938679575920105, + "logits/rejected": 0.15651705861091614, + "logps/chosen": -0.7142879366874695, + "logps/rejected": -3.187528371810913, + "loss": 0.5619, + "odds_ratio_loss": 0.44818058609962463, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07142879068851471, + "rewards/margins": 0.24732404947280884, + "rewards/rejected": -0.31875282526016235, + "sft_loss": 0.7142879366874695, + "step": 1302 + }, + { + "epoch": 1.8843094721619669, + "grad_norm": 2.9268345750498637, + "learning_rate": 6.378195244197042e-06, + "logits/chosen": 0.08001869916915894, + "logits/rejected": 0.06469704955816269, + "logps/chosen": -0.6338894367218018, + "logps/rejected": -3.332848072052002, + "loss": 0.6408, + "odds_ratio_loss": 0.320722758769989, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06338894367218018, + "rewards/margins": 0.26989588141441345, + "rewards/rejected": -0.33328482508659363, + "sft_loss": 0.6338894367218018, + "step": 1303 + }, + { + "epoch": 1.8857556037599421, + "grad_norm": 2.980340525854677, + "learning_rate": 6.3756972191033244e-06, + "logits/chosen": 0.002490525133907795, + "logits/rejected": 0.008151497691869736, + "logps/chosen": -0.6807718873023987, + "logps/rejected": -2.1325416564941406, + "loss": 0.608, + "odds_ratio_loss": 0.3046268820762634, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0680771917104721, + "rewards/margins": 0.1451769769191742, + "rewards/rejected": -0.2132541537284851, + "sft_loss": 0.6807718873023987, + "step": 1304 + }, + { + "epoch": 1.8872017353579176, + "grad_norm": 2.3608345245842104, + "learning_rate": 6.373197761692905e-06, + "logits/chosen": 0.13456328213214874, + "logits/rejected": 0.11488880962133408, + "logps/chosen": -0.5167301893234253, + "logps/rejected": -1.9352409839630127, + "loss": 0.5643, + "odds_ratio_loss": 0.4348183870315552, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05167302116751671, + "rewards/margins": 0.14185106754302979, + "rewards/rejected": -0.1935240924358368, + "sft_loss": 0.5167301893234253, + "step": 1305 + }, + { + "epoch": 1.888647866955893, + "grad_norm": 2.7186771660256435, + "learning_rate": 6.370696873472715e-06, + "logits/chosen": 0.25382813811302185, + "logits/rejected": 0.13196587562561035, + "logps/chosen": -0.5394778251647949, + "logps/rejected": -2.625441789627075, + "loss": 0.5759, + "odds_ratio_loss": 0.37160181999206543, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05394778400659561, + "rewards/margins": 0.2085963934659958, + "rewards/rejected": -0.262544184923172, + "sft_loss": 0.5394778251647949, + "step": 1306 + }, + { + "epoch": 1.8900939985538683, + "grad_norm": 2.425429249657979, + "learning_rate": 6.368194555950552e-06, + "logits/chosen": 0.12498855590820312, + "logits/rejected": 0.08333387970924377, + "logps/chosen": -0.6716573238372803, + "logps/rejected": -2.1420390605926514, + "loss": 0.6824, + "odds_ratio_loss": 0.3968257009983063, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06716573238372803, + "rewards/margins": 0.14703817665576935, + "rewards/rejected": -0.21420392394065857, + "sft_loss": 0.6716573238372803, + "step": 1307 + }, + { + "epoch": 1.891540130151844, + "grad_norm": 2.8376022318759104, + "learning_rate": 6.365690810635072e-06, + "logits/chosen": 0.18870142102241516, + "logits/rejected": -0.01581069827079773, + "logps/chosen": -0.5011342763900757, + "logps/rejected": -3.2665085792541504, + "loss": 0.6059, + "odds_ratio_loss": 0.31004223227500916, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05011342838406563, + "rewards/margins": 0.2765374481678009, + "rewards/rejected": -0.3266508877277374, + "sft_loss": 0.5011342763900757, + "step": 1308 + }, + { + "epoch": 1.8929862617498192, + "grad_norm": 2.551693877142118, + "learning_rate": 6.363185639035791e-06, + "logits/chosen": 0.2654971778392792, + "logits/rejected": 0.09708055108785629, + "logps/chosen": -0.5613073110580444, + "logps/rejected": -2.9234206676483154, + "loss": 0.5443, + "odds_ratio_loss": 0.3154491186141968, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05613073334097862, + "rewards/margins": 0.23621134459972382, + "rewards/rejected": -0.29234206676483154, + "sft_loss": 0.5613073110580444, + "step": 1309 + }, + { + "epoch": 1.8944323933477947, + "grad_norm": 2.7390713645360782, + "learning_rate": 6.360679042663085e-06, + "logits/chosen": 0.16496847569942474, + "logits/rejected": 0.13807308673858643, + "logps/chosen": -0.594467043876648, + "logps/rejected": -2.4058032035827637, + "loss": 0.6005, + "odds_ratio_loss": 0.3193429708480835, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.059446703642606735, + "rewards/margins": 0.18113362789154053, + "rewards/rejected": -0.24058032035827637, + "sft_loss": 0.594467043876648, + "step": 1310 + }, + { + "epoch": 1.8958785249457701, + "grad_norm": 2.1716752579277094, + "learning_rate": 6.3581710230281935e-06, + "logits/chosen": 0.05845704302191734, + "logits/rejected": 0.05051730200648308, + "logps/chosen": -0.7212303876876831, + "logps/rejected": -2.8155322074890137, + "loss": 0.6325, + "odds_ratio_loss": 0.3787981867790222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07212303578853607, + "rewards/margins": 0.20943017303943634, + "rewards/rejected": -0.2815532088279724, + "sft_loss": 0.7212303876876831, + "step": 1311 + }, + { + "epoch": 1.8973246565437454, + "grad_norm": 3.4651701286123586, + "learning_rate": 6.355661581643209e-06, + "logits/chosen": 0.16038264334201813, + "logits/rejected": 0.09201866388320923, + "logps/chosen": -0.6783140897750854, + "logps/rejected": -2.29701566696167, + "loss": 0.7313, + "odds_ratio_loss": 0.44825479388237, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06783141195774078, + "rewards/margins": 0.16187018156051636, + "rewards/rejected": -0.22970159351825714, + "sft_loss": 0.6783140897750854, + "step": 1312 + }, + { + "epoch": 1.8987707881417208, + "grad_norm": 2.8433098050638557, + "learning_rate": 6.353150720021084e-06, + "logits/chosen": 0.1522006392478943, + "logits/rejected": 0.09702038764953613, + "logps/chosen": -0.4841322898864746, + "logps/rejected": -3.95847225189209, + "loss": 0.5584, + "odds_ratio_loss": 0.20565840601921082, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0484132282435894, + "rewards/margins": 0.34743404388427734, + "rewards/rejected": -0.39584726095199585, + "sft_loss": 0.4841322898864746, + "step": 1313 + }, + { + "epoch": 1.9002169197396963, + "grad_norm": 2.74621567318202, + "learning_rate": 6.350638439675626e-06, + "logits/chosen": -0.0204104445874691, + "logits/rejected": 0.0010041743516921997, + "logps/chosen": -0.7974636554718018, + "logps/rejected": -1.3563494682312012, + "loss": 0.6811, + "odds_ratio_loss": 0.5164413452148438, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07974636554718018, + "rewards/margins": 0.055888570845127106, + "rewards/rejected": -0.13563494384288788, + "sft_loss": 0.7974636554718018, + "step": 1314 + }, + { + "epoch": 1.9016630513376718, + "grad_norm": 3.2230719639122585, + "learning_rate": 6.348124742121497e-06, + "logits/chosen": 0.19585879147052765, + "logits/rejected": 0.12719781696796417, + "logps/chosen": -0.47810548543930054, + "logps/rejected": -3.516087055206299, + "loss": 0.5758, + "odds_ratio_loss": 0.18576735258102417, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04781055450439453, + "rewards/margins": 0.3037981390953064, + "rewards/rejected": -0.3516086935997009, + "sft_loss": 0.47810548543930054, + "step": 1315 + }, + { + "epoch": 1.9031091829356472, + "grad_norm": 2.3405960760811966, + "learning_rate": 6.345609628874216e-06, + "logits/chosen": 0.07255364954471588, + "logits/rejected": 0.1415763944387436, + "logps/chosen": -0.5264100432395935, + "logps/rejected": -2.2038300037384033, + "loss": 0.6285, + "odds_ratio_loss": 0.3210826516151428, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05264100432395935, + "rewards/margins": 0.16774199903011322, + "rewards/rejected": -0.22038300335407257, + "sft_loss": 0.5264100432395935, + "step": 1316 + }, + { + "epoch": 1.9045553145336225, + "grad_norm": 4.048582332028306, + "learning_rate": 6.3430931014501546e-06, + "logits/chosen": 0.1995735615491867, + "logits/rejected": 0.09358995407819748, + "logps/chosen": -0.6856232285499573, + "logps/rejected": -1.2745683193206787, + "loss": 0.5704, + "odds_ratio_loss": 0.48545917868614197, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0685623288154602, + "rewards/margins": 0.058894507586956024, + "rewards/rejected": -0.12745684385299683, + "sft_loss": 0.6856232285499573, + "step": 1317 + }, + { + "epoch": 1.906001446131598, + "grad_norm": 2.1869595703546247, + "learning_rate": 6.340575161366536e-06, + "logits/chosen": 0.19228671491146088, + "logits/rejected": 0.07713081687688828, + "logps/chosen": -0.6164018511772156, + "logps/rejected": -2.873373508453369, + "loss": 0.6288, + "odds_ratio_loss": 0.27122145891189575, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.061640188097953796, + "rewards/margins": 0.22569714486598969, + "rewards/rejected": -0.28733736276626587, + "sft_loss": 0.6164018511772156, + "step": 1318 + }, + { + "epoch": 1.9074475777295734, + "grad_norm": 2.5534553089237977, + "learning_rate": 6.338055810141433e-06, + "logits/chosen": 0.10392449051141739, + "logits/rejected": 0.09642945230007172, + "logps/chosen": -0.5694040656089783, + "logps/rejected": -3.596972942352295, + "loss": 0.6739, + "odds_ratio_loss": 0.22980889678001404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05694040656089783, + "rewards/margins": 0.3027569055557251, + "rewards/rejected": -0.35969728231430054, + "sft_loss": 0.5694040656089783, + "step": 1319 + }, + { + "epoch": 1.9088937093275486, + "grad_norm": 2.8970511977043274, + "learning_rate": 6.335535049293776e-06, + "logits/chosen": 0.18930402398109436, + "logits/rejected": 0.11592543870210648, + "logps/chosen": -0.5666981935501099, + "logps/rejected": -2.564934253692627, + "loss": 0.5861, + "odds_ratio_loss": 0.340043842792511, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05666981637477875, + "rewards/margins": 0.19982360303401947, + "rewards/rejected": -0.2564934194087982, + "sft_loss": 0.5666981935501099, + "step": 1320 + }, + { + "epoch": 1.9103398409255243, + "grad_norm": 3.1968079824765163, + "learning_rate": 6.333012880343339e-06, + "logits/chosen": 0.13316094875335693, + "logits/rejected": 0.09439437091350555, + "logps/chosen": -0.5278730392456055, + "logps/rejected": -1.6159777641296387, + "loss": 0.5628, + "odds_ratio_loss": 0.32977184653282166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05278730392456055, + "rewards/margins": 0.10881047695875168, + "rewards/rejected": -0.16159778833389282, + "sft_loss": 0.5278730392456055, + "step": 1321 + }, + { + "epoch": 1.9117859725234996, + "grad_norm": 2.250481769511735, + "learning_rate": 6.330489304810747e-06, + "logits/chosen": 0.18630638718605042, + "logits/rejected": 0.07742702960968018, + "logps/chosen": -0.6264196634292603, + "logps/rejected": -2.363523244857788, + "loss": 0.6726, + "odds_ratio_loss": 0.37035539746284485, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06264197081327438, + "rewards/margins": 0.17371037602424622, + "rewards/rejected": -0.23635233938694, + "sft_loss": 0.6264196634292603, + "step": 1322 + }, + { + "epoch": 1.913232104121475, + "grad_norm": 2.3131159606518175, + "learning_rate": 6.327964324217474e-06, + "logits/chosen": 0.13363704085350037, + "logits/rejected": 0.026563134044408798, + "logps/chosen": -0.5144316554069519, + "logps/rejected": -2.5232536792755127, + "loss": 0.5649, + "odds_ratio_loss": 0.3306152820587158, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05144317075610161, + "rewards/margins": 0.200882226228714, + "rewards/rejected": -0.2523253858089447, + "sft_loss": 0.5144316554069519, + "step": 1323 + }, + { + "epoch": 1.9146782357194505, + "grad_norm": 2.839594699946253, + "learning_rate": 6.325437940085839e-06, + "logits/chosen": 0.10742896050214767, + "logits/rejected": 0.08482453227043152, + "logps/chosen": -0.4993060231208801, + "logps/rejected": -3.533346176147461, + "loss": 0.5296, + "odds_ratio_loss": 0.31563615798950195, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04993060231208801, + "rewards/margins": 0.3034040331840515, + "rewards/rejected": -0.35333460569381714, + "sft_loss": 0.4993060231208801, + "step": 1324 + }, + { + "epoch": 1.9161243673174257, + "grad_norm": 2.500177382400634, + "learning_rate": 6.32291015393901e-06, + "logits/chosen": 0.1152525320649147, + "logits/rejected": 0.0816318467259407, + "logps/chosen": -0.48759815096855164, + "logps/rejected": -2.2603230476379395, + "loss": 0.5793, + "odds_ratio_loss": 0.20722666382789612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0487598180770874, + "rewards/margins": 0.1772724837064743, + "rewards/rejected": -0.2260323166847229, + "sft_loss": 0.48759815096855164, + "step": 1325 + }, + { + "epoch": 1.9175704989154014, + "grad_norm": 2.533564293757888, + "learning_rate": 6.320380967300996e-06, + "logits/chosen": 0.19266647100448608, + "logits/rejected": 0.022887878119945526, + "logps/chosen": -0.5034148097038269, + "logps/rejected": -3.819204092025757, + "loss": 0.5919, + "odds_ratio_loss": 0.2226625382900238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05034147948026657, + "rewards/margins": 0.33157891035079956, + "rewards/rejected": -0.38192039728164673, + "sft_loss": 0.5034148097038269, + "step": 1326 + }, + { + "epoch": 1.9190166305133767, + "grad_norm": 2.921269282131583, + "learning_rate": 6.317850381696657e-06, + "logits/chosen": 0.2217884510755539, + "logits/rejected": 0.01924968883395195, + "logps/chosen": -0.4081079363822937, + "logps/rejected": -4.638675689697266, + "loss": 0.5773, + "odds_ratio_loss": 0.2604500651359558, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04081079363822937, + "rewards/margins": 0.4230567514896393, + "rewards/rejected": -0.46386754512786865, + "sft_loss": 0.4081079363822937, + "step": 1327 + }, + { + "epoch": 1.9204627621113521, + "grad_norm": 2.3841994801404214, + "learning_rate": 6.31531839865169e-06, + "logits/chosen": 0.08403509855270386, + "logits/rejected": 0.02859571948647499, + "logps/chosen": -0.5892415642738342, + "logps/rejected": -3.3557963371276855, + "loss": 0.6229, + "odds_ratio_loss": 0.41986650228500366, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05892416089773178, + "rewards/margins": 0.2766554653644562, + "rewards/rejected": -0.33557960391044617, + "sft_loss": 0.5892415642738342, + "step": 1328 + }, + { + "epoch": 1.9219088937093276, + "grad_norm": 4.499602679189735, + "learning_rate": 6.3127850196926365e-06, + "logits/chosen": 0.02900572493672371, + "logits/rejected": 0.0013004057109355927, + "logps/chosen": -0.49205368757247925, + "logps/rejected": -3.4965710639953613, + "loss": 0.5968, + "odds_ratio_loss": 0.22342023253440857, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.049205370247364044, + "rewards/margins": 0.30045175552368164, + "rewards/rejected": -0.3496571183204651, + "sft_loss": 0.49205368757247925, + "step": 1329 + }, + { + "epoch": 1.9233550253073028, + "grad_norm": 2.51797982968202, + "learning_rate": 6.31025024634688e-06, + "logits/chosen": 0.2063034176826477, + "logits/rejected": 0.13551932573318481, + "logps/chosen": -0.6059345006942749, + "logps/rejected": -3.6007964611053467, + "loss": 0.5624, + "odds_ratio_loss": 0.19518573582172394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06059345230460167, + "rewards/margins": 0.2994861900806427, + "rewards/rejected": -0.36007964611053467, + "sft_loss": 0.6059345006942749, + "step": 1330 + }, + { + "epoch": 1.9248011569052785, + "grad_norm": 3.447774341323875, + "learning_rate": 6.307714080142648e-06, + "logits/chosen": 0.11149915307760239, + "logits/rejected": 0.19055677950382233, + "logps/chosen": -0.4802173376083374, + "logps/rejected": -3.084043025970459, + "loss": 0.644, + "odds_ratio_loss": 0.44687554240226746, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04802173376083374, + "rewards/margins": 0.26038259267807007, + "rewards/rejected": -0.3084043264389038, + "sft_loss": 0.4802173376083374, + "step": 1331 + }, + { + "epoch": 1.9262472885032538, + "grad_norm": 3.9481918292240095, + "learning_rate": 6.305176522609001e-06, + "logits/chosen": 0.10856892168521881, + "logits/rejected": 0.09776373207569122, + "logps/chosen": -0.5553045272827148, + "logps/rejected": -2.3343453407287598, + "loss": 0.6648, + "odds_ratio_loss": 0.25999295711517334, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05553045868873596, + "rewards/margins": 0.17790409922599792, + "rewards/rejected": -0.2334345579147339, + "sft_loss": 0.5553045272827148, + "step": 1332 + }, + { + "epoch": 1.9276934201012292, + "grad_norm": 2.152983324315511, + "learning_rate": 6.302637575275842e-06, + "logits/chosen": 0.14477403461933136, + "logits/rejected": 0.12126276642084122, + "logps/chosen": -0.6557214260101318, + "logps/rejected": -1.5948143005371094, + "loss": 0.6043, + "odds_ratio_loss": 0.44841280579566956, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06557214260101318, + "rewards/margins": 0.09390929341316223, + "rewards/rejected": -0.15948143601417542, + "sft_loss": 0.6557214260101318, + "step": 1333 + }, + { + "epoch": 1.9291395516992047, + "grad_norm": 3.301517579732635, + "learning_rate": 6.300097239673915e-06, + "logits/chosen": 0.17113935947418213, + "logits/rejected": 0.16045597195625305, + "logps/chosen": -0.5597264766693115, + "logps/rejected": -1.5276334285736084, + "loss": 0.5554, + "odds_ratio_loss": 0.41156938672065735, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.055972643196582794, + "rewards/margins": 0.09679071605205536, + "rewards/rejected": -0.15276335179805756, + "sft_loss": 0.5597264766693115, + "step": 1334 + }, + { + "epoch": 1.93058568329718, + "grad_norm": 2.5923227720502537, + "learning_rate": 6.297555517334794e-06, + "logits/chosen": 0.04513344168663025, + "logits/rejected": 0.0036378642544150352, + "logps/chosen": -0.7068974375724792, + "logps/rejected": -1.3616538047790527, + "loss": 0.7021, + "odds_ratio_loss": 0.46141964197158813, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07068974524736404, + "rewards/margins": 0.06547562777996063, + "rewards/rejected": -0.13616538047790527, + "sft_loss": 0.7068974375724792, + "step": 1335 + }, + { + "epoch": 1.9320318148951554, + "grad_norm": 2.5908665206389503, + "learning_rate": 6.295012409790896e-06, + "logits/chosen": 0.07112018764019012, + "logits/rejected": 0.14503023028373718, + "logps/chosen": -0.4185260236263275, + "logps/rejected": -2.6089491844177246, + "loss": 0.6039, + "odds_ratio_loss": 0.3201354146003723, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04185260087251663, + "rewards/margins": 0.2190423458814621, + "rewards/rejected": -0.2608949542045593, + "sft_loss": 0.4185260236263275, + "step": 1336 + }, + { + "epoch": 1.9334779464931309, + "grad_norm": 2.5569688339171632, + "learning_rate": 6.2924679185754684e-06, + "logits/chosen": 0.17154423892498016, + "logits/rejected": 0.0873059630393982, + "logps/chosen": -0.6691811084747314, + "logps/rejected": -1.198056936264038, + "loss": 0.6591, + "odds_ratio_loss": 0.3893122673034668, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06691811233758926, + "rewards/margins": 0.05288759991526604, + "rewards/rejected": -0.119805708527565, + "sft_loss": 0.6691811084747314, + "step": 1337 + }, + { + "epoch": 1.9349240780911063, + "grad_norm": 4.174361185121562, + "learning_rate": 6.289922045222594e-06, + "logits/chosen": 0.0211828351020813, + "logits/rejected": 0.07563084363937378, + "logps/chosen": -0.5859172940254211, + "logps/rejected": -2.894932270050049, + "loss": 0.5613, + "odds_ratio_loss": 0.22862787544727325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.058591730892658234, + "rewards/margins": 0.23090147972106934, + "rewards/rejected": -0.28949323296546936, + "sft_loss": 0.5859172940254211, + "step": 1338 + }, + { + "epoch": 1.9363702096890818, + "grad_norm": 2.338603996374857, + "learning_rate": 6.28737479126719e-06, + "logits/chosen": 0.2083335667848587, + "logits/rejected": 0.13322553038597107, + "logps/chosen": -0.37943235039711, + "logps/rejected": -3.214156150817871, + "loss": 0.555, + "odds_ratio_loss": 0.2722666561603546, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03794323652982712, + "rewards/margins": 0.2834724187850952, + "rewards/rejected": -0.32141566276550293, + "sft_loss": 0.37943235039711, + "step": 1339 + }, + { + "epoch": 1.937816341287057, + "grad_norm": 11.15462592256021, + "learning_rate": 6.284826158245005e-06, + "logits/chosen": 0.06666195392608643, + "logits/rejected": 0.0736611858010292, + "logps/chosen": -0.7771252989768982, + "logps/rejected": -1.527148962020874, + "loss": 0.6956, + "odds_ratio_loss": 0.3552093803882599, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0777125358581543, + "rewards/margins": 0.07500237226486206, + "rewards/rejected": -0.15271490812301636, + "sft_loss": 0.7771252989768982, + "step": 1340 + }, + { + "epoch": 1.9392624728850325, + "grad_norm": 2.82099850918232, + "learning_rate": 6.28227614769262e-06, + "logits/chosen": 0.15024471282958984, + "logits/rejected": 0.096395343542099, + "logps/chosen": -0.5148159861564636, + "logps/rejected": -2.127951145172119, + "loss": 0.599, + "odds_ratio_loss": 0.26387378573417664, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05148159712553024, + "rewards/margins": 0.16131353378295898, + "rewards/rejected": -0.21279510855674744, + "sft_loss": 0.5148159861564636, + "step": 1341 + }, + { + "epoch": 1.940708604483008, + "grad_norm": 2.379780310975218, + "learning_rate": 6.279724761147445e-06, + "logits/chosen": 0.09975308179855347, + "logits/rejected": 0.05713462457060814, + "logps/chosen": -0.3695414662361145, + "logps/rejected": -3.6172609329223633, + "loss": 0.5196, + "odds_ratio_loss": 0.2230728268623352, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03695414215326309, + "rewards/margins": 0.3247719407081604, + "rewards/rejected": -0.3617260754108429, + "sft_loss": 0.3695414662361145, + "step": 1342 + }, + { + "epoch": 1.9421547360809832, + "grad_norm": 3.000388458997451, + "learning_rate": 6.2771720001477216e-06, + "logits/chosen": 0.16662685573101044, + "logits/rejected": 0.060050975531339645, + "logps/chosen": -0.6600314974784851, + "logps/rejected": -4.327170372009277, + "loss": 0.5381, + "odds_ratio_loss": 0.2681175768375397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06600315123796463, + "rewards/margins": 0.36671388149261475, + "rewards/rejected": -0.43271705508232117, + "sft_loss": 0.6600314974784851, + "step": 1343 + }, + { + "epoch": 1.943600867678959, + "grad_norm": 4.460845864755663, + "learning_rate": 6.2746178662325176e-06, + "logits/chosen": 0.2377113699913025, + "logits/rejected": 0.05871110036969185, + "logps/chosen": -0.52542644739151, + "logps/rejected": -3.663177251815796, + "loss": 0.6501, + "odds_ratio_loss": 0.24479225277900696, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05254264175891876, + "rewards/margins": 0.31377506256103516, + "rewards/rejected": -0.3663177192211151, + "sft_loss": 0.52542644739151, + "step": 1344 + }, + { + "epoch": 1.9450469992769341, + "grad_norm": 2.6669621761593154, + "learning_rate": 6.2720623609417315e-06, + "logits/chosen": 0.1335269808769226, + "logits/rejected": 0.08458232134580612, + "logps/chosen": -0.49498024582862854, + "logps/rejected": -3.2982349395751953, + "loss": 0.5686, + "odds_ratio_loss": 0.20877742767333984, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.049498021602630615, + "rewards/margins": 0.2803254723548889, + "rewards/rejected": -0.3298235237598419, + "sft_loss": 0.49498024582862854, + "step": 1345 + }, + { + "epoch": 1.9464931308749096, + "grad_norm": 2.3682656918705414, + "learning_rate": 6.269505485816084e-06, + "logits/chosen": 0.05494852364063263, + "logits/rejected": 0.10123536735773087, + "logps/chosen": -0.5091966986656189, + "logps/rejected": -2.195608615875244, + "loss": 0.5588, + "odds_ratio_loss": 0.2521820366382599, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05091967061161995, + "rewards/margins": 0.16864119470119476, + "rewards/rejected": -0.21956084668636322, + "sft_loss": 0.5091966986656189, + "step": 1346 + }, + { + "epoch": 1.947939262472885, + "grad_norm": 2.6512013563553287, + "learning_rate": 6.266947242397129e-06, + "logits/chosen": -0.017354674637317657, + "logits/rejected": 0.13395394384860992, + "logps/chosen": -0.38874998688697815, + "logps/rejected": -2.445746898651123, + "loss": 0.4976, + "odds_ratio_loss": 0.14418071508407593, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038874998688697815, + "rewards/margins": 0.20569969713687897, + "rewards/rejected": -0.24457469582557678, + "sft_loss": 0.38874998688697815, + "step": 1347 + }, + { + "epoch": 1.9493853940708603, + "grad_norm": 2.1651142096226916, + "learning_rate": 6.264387632227237e-06, + "logits/chosen": 0.15398314595222473, + "logits/rejected": 0.08990249782800674, + "logps/chosen": -0.5985828638076782, + "logps/rejected": -3.3371827602386475, + "loss": 0.5798, + "odds_ratio_loss": 0.4579890966415405, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0598582923412323, + "rewards/margins": 0.27385997772216797, + "rewards/rejected": -0.33371827006340027, + "sft_loss": 0.5985828638076782, + "step": 1348 + }, + { + "epoch": 1.950831525668836, + "grad_norm": 2.1798011502799253, + "learning_rate": 6.261826656849608e-06, + "logits/chosen": 0.07891194522380829, + "logits/rejected": 0.01930670440196991, + "logps/chosen": -0.6371974945068359, + "logps/rejected": -3.8084051609039307, + "loss": 0.6145, + "odds_ratio_loss": 0.38298970460891724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06371975690126419, + "rewards/margins": 0.3171207010746002, + "rewards/rejected": -0.3808404505252838, + "sft_loss": 0.6371974945068359, + "step": 1349 + }, + { + "epoch": 1.9522776572668112, + "grad_norm": 2.6217473434650818, + "learning_rate": 6.259264317808265e-06, + "logits/chosen": 0.15782146155834198, + "logits/rejected": 0.04384998604655266, + "logps/chosen": -0.597080409526825, + "logps/rejected": -1.9142271280288696, + "loss": 0.6438, + "odds_ratio_loss": 0.3650417923927307, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.059708043932914734, + "rewards/margins": 0.13171467185020447, + "rewards/rejected": -0.19142268598079681, + "sft_loss": 0.597080409526825, + "step": 1350 + }, + { + "epoch": 1.9537237888647867, + "grad_norm": 2.260839269208856, + "learning_rate": 6.256700616648049e-06, + "logits/chosen": 0.02536643110215664, + "logits/rejected": -0.08087426424026489, + "logps/chosen": -0.6350283026695251, + "logps/rejected": -3.553034782409668, + "loss": 0.6017, + "odds_ratio_loss": 0.40396812558174133, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06350283324718475, + "rewards/margins": 0.2918006479740143, + "rewards/rejected": -0.3553035259246826, + "sft_loss": 0.6350283026695251, + "step": 1351 + }, + { + "epoch": 1.9551699204627622, + "grad_norm": 2.755672651848687, + "learning_rate": 6.254135554914628e-06, + "logits/chosen": 0.10055305063724518, + "logits/rejected": 0.028218144550919533, + "logps/chosen": -0.6204773187637329, + "logps/rejected": -2.3922319412231445, + "loss": 0.5954, + "odds_ratio_loss": 0.26733872294425964, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06204773113131523, + "rewards/margins": 0.17717546224594116, + "rewards/rejected": -0.2392231971025467, + "sft_loss": 0.6204773187637329, + "step": 1352 + }, + { + "epoch": 1.9566160520607374, + "grad_norm": 4.454723264605439, + "learning_rate": 6.251569134154482e-06, + "logits/chosen": 0.10258796811103821, + "logits/rejected": 0.07780814170837402, + "logps/chosen": -0.6633048057556152, + "logps/rejected": -2.7840261459350586, + "loss": 0.615, + "odds_ratio_loss": 0.28764694929122925, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06633048504590988, + "rewards/margins": 0.21207213401794434, + "rewards/rejected": -0.2784026265144348, + "sft_loss": 0.6633048057556152, + "step": 1353 + }, + { + "epoch": 1.958062183658713, + "grad_norm": 2.7603204339165597, + "learning_rate": 6.2490013559149215e-06, + "logits/chosen": 0.10790781676769257, + "logits/rejected": 0.08358579128980637, + "logps/chosen": -0.6533021330833435, + "logps/rejected": -1.621474027633667, + "loss": 0.6408, + "odds_ratio_loss": 0.42883116006851196, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06533021479845047, + "rewards/margins": 0.09681720286607742, + "rewards/rejected": -0.1621474176645279, + "sft_loss": 0.6533021330833435, + "step": 1354 + }, + { + "epoch": 1.9595083152566883, + "grad_norm": 2.125375605273302, + "learning_rate": 6.246432221744068e-06, + "logits/chosen": 0.1881817877292633, + "logits/rejected": 0.1281939148902893, + "logps/chosen": -0.5085922479629517, + "logps/rejected": -3.4811835289001465, + "loss": 0.5576, + "odds_ratio_loss": 0.25084131956100464, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05085922032594681, + "rewards/margins": 0.297259122133255, + "rewards/rejected": -0.3481183350086212, + "sft_loss": 0.5085922479629517, + "step": 1355 + }, + { + "epoch": 1.9609544468546638, + "grad_norm": 2.20344338079031, + "learning_rate": 6.2438617331908616e-06, + "logits/chosen": 0.11778862029314041, + "logits/rejected": 0.0436352975666523, + "logps/chosen": -0.5298951864242554, + "logps/rejected": -2.8205578327178955, + "loss": 0.5737, + "odds_ratio_loss": 0.2629122734069824, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.052989523857831955, + "rewards/margins": 0.22906626760959625, + "rewards/rejected": -0.2820557951927185, + "sft_loss": 0.5298951864242554, + "step": 1356 + }, + { + "epoch": 1.9624005784526393, + "grad_norm": 2.1196838584709465, + "learning_rate": 6.241289891805059e-06, + "logits/chosen": 0.017896192148327827, + "logits/rejected": 0.0749349296092987, + "logps/chosen": -0.5861757397651672, + "logps/rejected": -2.2477173805236816, + "loss": 0.589, + "odds_ratio_loss": 0.31650716066360474, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05861757695674896, + "rewards/margins": 0.166154146194458, + "rewards/rejected": -0.22477173805236816, + "sft_loss": 0.5861757397651672, + "step": 1357 + }, + { + "epoch": 1.9638467100506145, + "grad_norm": 2.4776207306510662, + "learning_rate": 6.238716699137233e-06, + "logits/chosen": 0.0893697589635849, + "logits/rejected": -0.046085745096206665, + "logps/chosen": -0.6362526416778564, + "logps/rejected": -3.7252695560455322, + "loss": 0.5203, + "odds_ratio_loss": 0.29234158992767334, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0636252611875534, + "rewards/margins": 0.30890169739723206, + "rewards/rejected": -0.3725269138813019, + "sft_loss": 0.6362526416778564, + "step": 1358 + }, + { + "epoch": 1.96529284164859, + "grad_norm": 2.6960848499910863, + "learning_rate": 6.23614215673877e-06, + "logits/chosen": 0.12152606248855591, + "logits/rejected": 0.026878921315073967, + "logps/chosen": -0.6179218888282776, + "logps/rejected": -1.8269139528274536, + "loss": 0.6289, + "odds_ratio_loss": 0.3888968229293823, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06179218739271164, + "rewards/margins": 0.12089921534061432, + "rewards/rejected": -0.18269139528274536, + "sft_loss": 0.6179218888282776, + "step": 1359 + }, + { + "epoch": 1.9667389732465654, + "grad_norm": 2.598237230889151, + "learning_rate": 6.233566266161874e-06, + "logits/chosen": 0.09245370328426361, + "logits/rejected": 0.12901175022125244, + "logps/chosen": -0.6184776425361633, + "logps/rejected": -2.0056989192962646, + "loss": 0.569, + "odds_ratio_loss": 0.3535006642341614, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.061847761273384094, + "rewards/margins": 0.13872212171554565, + "rewards/rejected": -0.20056991279125214, + "sft_loss": 0.6184776425361633, + "step": 1360 + }, + { + "epoch": 1.968185104844541, + "grad_norm": 2.58726516689127, + "learning_rate": 6.230989028959558e-06, + "logits/chosen": -0.00898485817015171, + "logits/rejected": 0.036827899515628815, + "logps/chosen": -0.7747241258621216, + "logps/rejected": -2.6281819343566895, + "loss": 0.6434, + "odds_ratio_loss": 0.3512718677520752, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07747241109609604, + "rewards/margins": 0.18534579873085022, + "rewards/rejected": -0.26281821727752686, + "sft_loss": 0.7747241258621216, + "step": 1361 + }, + { + "epoch": 1.9696312364425164, + "grad_norm": 2.662372117558744, + "learning_rate": 6.228410446685645e-06, + "logits/chosen": 0.06157606095075607, + "logits/rejected": -0.07763740420341492, + "logps/chosen": -0.8153557777404785, + "logps/rejected": -3.0472562313079834, + "loss": 0.7715, + "odds_ratio_loss": 0.40258070826530457, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08153557777404785, + "rewards/margins": 0.223190039396286, + "rewards/rejected": -0.30472564697265625, + "sft_loss": 0.8153557777404785, + "step": 1362 + }, + { + "epoch": 1.9710773680404916, + "grad_norm": 2.437914328745462, + "learning_rate": 6.225830520894776e-06, + "logits/chosen": 0.045589592307806015, + "logits/rejected": 0.004592648707330227, + "logps/chosen": -0.6443754434585571, + "logps/rejected": -3.426424741744995, + "loss": 0.6514, + "odds_ratio_loss": 0.39619654417037964, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06443755328655243, + "rewards/margins": 0.27820488810539246, + "rewards/rejected": -0.3426424562931061, + "sft_loss": 0.6443754434585571, + "step": 1363 + }, + { + "epoch": 1.972523499638467, + "grad_norm": 3.9002817508631162, + "learning_rate": 6.2232492531423945e-06, + "logits/chosen": 0.17493419349193573, + "logits/rejected": 0.04828786849975586, + "logps/chosen": -0.6880456209182739, + "logps/rejected": -3.114243984222412, + "loss": 0.615, + "odds_ratio_loss": 0.3803248405456543, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06880456209182739, + "rewards/margins": 0.2426198124885559, + "rewards/rejected": -0.3114243745803833, + "sft_loss": 0.6880456209182739, + "step": 1364 + }, + { + "epoch": 1.9739696312364425, + "grad_norm": 2.7356981494522916, + "learning_rate": 6.22066664498476e-06, + "logits/chosen": 0.09148862957954407, + "logits/rejected": 0.030430784448981285, + "logps/chosen": -0.6174599528312683, + "logps/rejected": -1.8549262285232544, + "loss": 0.6543, + "odds_ratio_loss": 0.3864895701408386, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06174599006772041, + "rewards/margins": 0.12374662607908249, + "rewards/rejected": -0.1854926198720932, + "sft_loss": 0.6174599528312683, + "step": 1365 + }, + { + "epoch": 1.9754157628344178, + "grad_norm": 2.2797866265947957, + "learning_rate": 6.218082697978934e-06, + "logits/chosen": 0.20202934741973877, + "logits/rejected": 0.13954401016235352, + "logps/chosen": -0.5369247198104858, + "logps/rejected": -2.2839553356170654, + "loss": 0.5875, + "odds_ratio_loss": 0.3428112864494324, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.053692467510700226, + "rewards/margins": 0.17470306158065796, + "rewards/rejected": -0.2283955216407776, + "sft_loss": 0.5369247198104858, + "step": 1366 + }, + { + "epoch": 1.9768618944323935, + "grad_norm": 3.3190838962022573, + "learning_rate": 6.215497413682786e-06, + "logits/chosen": 0.17240914702415466, + "logits/rejected": 0.08776585757732391, + "logps/chosen": -0.6855835914611816, + "logps/rejected": -2.02882981300354, + "loss": 0.6874, + "odds_ratio_loss": 0.3461560606956482, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06855836510658264, + "rewards/margins": 0.13432462513446808, + "rewards/rejected": -0.20288299024105072, + "sft_loss": 0.6855835914611816, + "step": 1367 + }, + { + "epoch": 1.9783080260303687, + "grad_norm": 3.256355322872998, + "learning_rate": 6.212910793654999e-06, + "logits/chosen": 0.061617545783519745, + "logits/rejected": 0.03875245153903961, + "logps/chosen": -0.5626049637794495, + "logps/rejected": -2.379176616668701, + "loss": 0.6345, + "odds_ratio_loss": 0.30977410078048706, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.056260496377944946, + "rewards/margins": 0.18165718019008636, + "rewards/rejected": -0.23791766166687012, + "sft_loss": 0.5626049637794495, + "step": 1368 + }, + { + "epoch": 1.9797541576283442, + "grad_norm": 2.622988724915116, + "learning_rate": 6.2103228394550515e-06, + "logits/chosen": 0.06107413023710251, + "logits/rejected": 0.10450088977813721, + "logps/chosen": -0.6807230710983276, + "logps/rejected": -2.148853302001953, + "loss": 0.5965, + "odds_ratio_loss": 0.32345831394195557, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06807231903076172, + "rewards/margins": 0.14681300520896912, + "rewards/rejected": -0.21488532423973083, + "sft_loss": 0.6807230710983276, + "step": 1369 + }, + { + "epoch": 1.9812002892263196, + "grad_norm": 2.682008253337646, + "learning_rate": 6.207733552643231e-06, + "logits/chosen": 0.1140088215470314, + "logits/rejected": 0.08846345543861389, + "logps/chosen": -0.5670576691627502, + "logps/rejected": -1.7721221446990967, + "loss": 0.6365, + "odds_ratio_loss": 0.4360979199409485, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0567057728767395, + "rewards/margins": 0.12050645053386688, + "rewards/rejected": -0.1772122085094452, + "sft_loss": 0.5670576691627502, + "step": 1370 + }, + { + "epoch": 1.9826464208242949, + "grad_norm": 2.952914326179049, + "learning_rate": 6.205142934780632e-06, + "logits/chosen": 0.09416116774082184, + "logits/rejected": 0.07221397757530212, + "logps/chosen": -0.5539889931678772, + "logps/rejected": -3.7700295448303223, + "loss": 0.5511, + "odds_ratio_loss": 0.27834856510162354, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05539890378713608, + "rewards/margins": 0.32160407304763794, + "rewards/rejected": -0.3770029544830322, + "sft_loss": 0.5539889931678772, + "step": 1371 + }, + { + "epoch": 1.9840925524222706, + "grad_norm": 2.4206064871782917, + "learning_rate": 6.202550987429142e-06, + "logits/chosen": 0.029989825561642647, + "logits/rejected": -0.0034413645043969154, + "logps/chosen": -0.8195029497146606, + "logps/rejected": -2.111359119415283, + "loss": 0.6337, + "odds_ratio_loss": 0.43149256706237793, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08195029199123383, + "rewards/margins": 0.12918563187122345, + "rewards/rejected": -0.21113592386245728, + "sft_loss": 0.8195029497146606, + "step": 1372 + }, + { + "epoch": 1.9855386840202458, + "grad_norm": 2.5068243281617093, + "learning_rate": 6.1999577121514595e-06, + "logits/chosen": 0.15260952711105347, + "logits/rejected": 0.10358820110559464, + "logps/chosen": -0.6531432867050171, + "logps/rejected": -2.2045655250549316, + "loss": 0.6213, + "odds_ratio_loss": 0.37667161226272583, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06531432271003723, + "rewards/margins": 0.15514221787452698, + "rewards/rejected": -0.2204565405845642, + "sft_loss": 0.6531432867050171, + "step": 1373 + }, + { + "epoch": 1.9869848156182213, + "grad_norm": 2.5425623444210954, + "learning_rate": 6.197363110511078e-06, + "logits/chosen": 0.1662987619638443, + "logits/rejected": 0.020264727994799614, + "logps/chosen": -0.485507607460022, + "logps/rejected": -3.171699047088623, + "loss": 0.6155, + "odds_ratio_loss": 0.26024413108825684, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04855076223611832, + "rewards/margins": 0.2686191499233246, + "rewards/rejected": -0.3171699047088623, + "sft_loss": 0.485507607460022, + "step": 1374 + }, + { + "epoch": 1.9884309472161967, + "grad_norm": 3.1329413200606866, + "learning_rate": 6.194767184072296e-06, + "logits/chosen": 0.12748688459396362, + "logits/rejected": 0.06533270329236984, + "logps/chosen": -0.6616427302360535, + "logps/rejected": -3.427320718765259, + "loss": 0.5622, + "odds_ratio_loss": 0.3506993353366852, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06616427004337311, + "rewards/margins": 0.27656784653663635, + "rewards/rejected": -0.34273210167884827, + "sft_loss": 0.6616427302360535, + "step": 1375 + }, + { + "epoch": 1.989877078814172, + "grad_norm": 2.888890196942516, + "learning_rate": 6.192169934400202e-06, + "logits/chosen": 0.14859473705291748, + "logits/rejected": 0.07147763669490814, + "logps/chosen": -0.39317965507507324, + "logps/rejected": -3.1786046028137207, + "loss": 0.5502, + "odds_ratio_loss": 0.2858788073062897, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.039317965507507324, + "rewards/margins": 0.27854251861572266, + "rewards/rejected": -0.31786048412323, + "sft_loss": 0.39317965507507324, + "step": 1376 + }, + { + "epoch": 1.9913232104121477, + "grad_norm": 2.493676042638736, + "learning_rate": 6.189571363060691e-06, + "logits/chosen": 0.10260988026857376, + "logits/rejected": 0.022080600261688232, + "logps/chosen": -0.37363216280937195, + "logps/rejected": -3.6419897079467773, + "loss": 0.6115, + "odds_ratio_loss": 0.17057818174362183, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.037363216280937195, + "rewards/margins": 0.3268357515335083, + "rewards/rejected": -0.3641989827156067, + "sft_loss": 0.37363216280937195, + "step": 1377 + }, + { + "epoch": 1.992769342010123, + "grad_norm": 2.5488337687194655, + "learning_rate": 6.18697147162045e-06, + "logits/chosen": 0.1628027707338333, + "logits/rejected": 0.15313595533370972, + "logps/chosen": -0.5107381343841553, + "logps/rejected": -3.4903712272644043, + "loss": 0.5691, + "odds_ratio_loss": 0.26869532465934753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05107381194829941, + "rewards/margins": 0.29796332120895386, + "rewards/rejected": -0.3490371108055115, + "sft_loss": 0.5107381343841553, + "step": 1378 + }, + { + "epoch": 1.9942154736080984, + "grad_norm": 2.990348500728237, + "learning_rate": 6.184370261646964e-06, + "logits/chosen": 0.25367701053619385, + "logits/rejected": 0.05598670244216919, + "logps/chosen": -0.5730169415473938, + "logps/rejected": -4.140180587768555, + "loss": 0.6438, + "odds_ratio_loss": 0.31483423709869385, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05730169266462326, + "rewards/margins": 0.3567163944244385, + "rewards/rejected": -0.41401803493499756, + "sft_loss": 0.5730169415473938, + "step": 1379 + }, + { + "epoch": 1.9956616052060738, + "grad_norm": 2.4464817090945754, + "learning_rate": 6.181767734708512e-06, + "logits/chosen": 0.13885369896888733, + "logits/rejected": 0.1410420686006546, + "logps/chosen": -0.7233635187149048, + "logps/rejected": -1.2208565473556519, + "loss": 0.6577, + "odds_ratio_loss": 0.4889734983444214, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.072336345911026, + "rewards/margins": 0.04974930360913277, + "rewards/rejected": -0.12208565324544907, + "sft_loss": 0.7233635187149048, + "step": 1380 + }, + { + "epoch": 1.997107736804049, + "grad_norm": 2.624728985693753, + "learning_rate": 6.179163892374164e-06, + "logits/chosen": 0.14182862639427185, + "logits/rejected": -0.06210314854979515, + "logps/chosen": -0.7554327845573425, + "logps/rejected": -3.2970387935638428, + "loss": 0.6579, + "odds_ratio_loss": 0.3780558109283447, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07554327696561813, + "rewards/margins": 0.254160612821579, + "rewards/rejected": -0.3297038972377777, + "sft_loss": 0.7554327845573425, + "step": 1381 + }, + { + "epoch": 1.9985538684020245, + "grad_norm": 2.500628814454199, + "learning_rate": 6.176558736213793e-06, + "logits/chosen": 0.09440597891807556, + "logits/rejected": 0.0984802171587944, + "logps/chosen": -0.535467267036438, + "logps/rejected": -2.6558072566986084, + "loss": 0.531, + "odds_ratio_loss": 0.30303633213043213, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0535467267036438, + "rewards/margins": 0.21203403174877167, + "rewards/rejected": -0.2655807435512543, + "sft_loss": 0.535467267036438, + "step": 1382 + }, + { + "epoch": 2.0, + "grad_norm": 3.28712021596782, + "learning_rate": 6.173952267798052e-06, + "logits/chosen": 0.2147250473499298, + "logits/rejected": 0.11980797350406647, + "logps/chosen": -0.5156925916671753, + "logps/rejected": -3.986497402191162, + "loss": 0.5173, + "odds_ratio_loss": 0.28765448927879333, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05156925693154335, + "rewards/margins": 0.34708043932914734, + "rewards/rejected": -0.3986497223377228, + "sft_loss": 0.5156925916671753, + "step": 1383 + }, + { + "epoch": 2.0014461315979752, + "grad_norm": 3.2315398678148486, + "learning_rate": 6.171344488698393e-06, + "logits/chosen": 0.12111207842826843, + "logits/rejected": 0.17495957016944885, + "logps/chosen": -0.3766588866710663, + "logps/rejected": -2.8388314247131348, + "loss": 0.382, + "odds_ratio_loss": 0.17233169078826904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03766588866710663, + "rewards/margins": 0.246217280626297, + "rewards/rejected": -0.28388315439224243, + "sft_loss": 0.3766588866710663, + "step": 1384 + }, + { + "epoch": 2.002892263195951, + "grad_norm": 3.189280598961191, + "learning_rate": 6.168735400487054e-06, + "logits/chosen": 0.03559612110257149, + "logits/rejected": 0.03742482513189316, + "logps/chosen": -0.5957620739936829, + "logps/rejected": -2.5465335845947266, + "loss": 0.475, + "odds_ratio_loss": 0.32061469554901123, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.059576209634542465, + "rewards/margins": 0.19507713615894318, + "rewards/rejected": -0.25465336441993713, + "sft_loss": 0.5957620739936829, + "step": 1385 + }, + { + "epoch": 2.004338394793926, + "grad_norm": 3.0677554248229746, + "learning_rate": 6.166125004737065e-06, + "logits/chosen": -0.16084589064121246, + "logits/rejected": -0.14135894179344177, + "logps/chosen": -0.32411518692970276, + "logps/rejected": -2.226219415664673, + "loss": 0.3491, + "odds_ratio_loss": 0.1918307989835739, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.032411519438028336, + "rewards/margins": 0.19021041691303253, + "rewards/rejected": -0.22262193262577057, + "sft_loss": 0.32411518692970276, + "step": 1386 + }, + { + "epoch": 2.005784526391902, + "grad_norm": 2.1849423111948445, + "learning_rate": 6.163513303022243e-06, + "logits/chosen": -0.19721360504627228, + "logits/rejected": -0.12379816919565201, + "logps/chosen": -0.26573044061660767, + "logps/rejected": -4.349565505981445, + "loss": 0.3741, + "odds_ratio_loss": 0.1065993681550026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026573043316602707, + "rewards/margins": 0.40838348865509033, + "rewards/rejected": -0.43495655059814453, + "sft_loss": 0.26573044061660767, + "step": 1387 + }, + { + "epoch": 2.007230657989877, + "grad_norm": 2.3768163135616973, + "learning_rate": 6.160900296917193e-06, + "logits/chosen": -0.33068203926086426, + "logits/rejected": -0.24483588337898254, + "logps/chosen": -0.3788696527481079, + "logps/rejected": -2.304666519165039, + "loss": 0.3719, + "odds_ratio_loss": 0.1597655564546585, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03788696601986885, + "rewards/margins": 0.19257968664169312, + "rewards/rejected": -0.23046666383743286, + "sft_loss": 0.3788696527481079, + "step": 1388 + }, + { + "epoch": 2.0086767895878523, + "grad_norm": 2.5696151099744426, + "learning_rate": 6.158285987997306e-06, + "logits/chosen": -0.3501858115196228, + "logits/rejected": -0.3419753909111023, + "logps/chosen": -0.3169471323490143, + "logps/rejected": -3.3578648567199707, + "loss": 0.3397, + "odds_ratio_loss": 0.16776108741760254, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03169471397995949, + "rewards/margins": 0.30409175157546997, + "rewards/rejected": -0.33578646183013916, + "sft_loss": 0.3169471323490143, + "step": 1389 + }, + { + "epoch": 2.010122921185828, + "grad_norm": 3.936520281332721, + "learning_rate": 6.155670377838758e-06, + "logits/chosen": -0.6566485166549683, + "logits/rejected": -0.5000695586204529, + "logps/chosen": -0.3887198567390442, + "logps/rejected": -3.2703821659088135, + "loss": 0.4191, + "odds_ratio_loss": 0.18304914236068726, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03887198492884636, + "rewards/margins": 0.28816622495651245, + "rewards/rejected": -0.3270382285118103, + "sft_loss": 0.3887198567390442, + "step": 1390 + }, + { + "epoch": 2.0115690527838033, + "grad_norm": 6.474734950245381, + "learning_rate": 6.153053468018511e-06, + "logits/chosen": -0.536896288394928, + "logits/rejected": -0.40524619817733765, + "logps/chosen": -0.3791639804840088, + "logps/rejected": -2.7076427936553955, + "loss": 0.4267, + "odds_ratio_loss": 0.10726868361234665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.037916399538517, + "rewards/margins": 0.2328478991985321, + "rewards/rejected": -0.2707642912864685, + "sft_loss": 0.3791639804840088, + "step": 1391 + }, + { + "epoch": 2.013015184381779, + "grad_norm": 6.726741776150189, + "learning_rate": 6.15043526011431e-06, + "logits/chosen": -0.6585301756858826, + "logits/rejected": -0.3784843683242798, + "logps/chosen": -0.44925493001937866, + "logps/rejected": -2.590120792388916, + "loss": 0.4351, + "odds_ratio_loss": 0.19256196916103363, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.044925495982170105, + "rewards/margins": 0.21408656239509583, + "rewards/rejected": -0.2590120732784271, + "sft_loss": 0.44925493001937866, + "step": 1392 + }, + { + "epoch": 2.014461315979754, + "grad_norm": 3.9009213533091125, + "learning_rate": 6.14781575570468e-06, + "logits/chosen": -0.5194917321205139, + "logits/rejected": -0.2270890772342682, + "logps/chosen": -0.31589779257774353, + "logps/rejected": -5.69291877746582, + "loss": 0.3387, + "odds_ratio_loss": 0.0934254378080368, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03158978000283241, + "rewards/margins": 0.5377020239830017, + "rewards/rejected": -0.5692918300628662, + "sft_loss": 0.31589779257774353, + "step": 1393 + }, + { + "epoch": 2.0159074475777294, + "grad_norm": 2.9063940794517955, + "learning_rate": 6.145194956368932e-06, + "logits/chosen": -0.52901291847229, + "logits/rejected": -0.3962056636810303, + "logps/chosen": -0.24508774280548096, + "logps/rejected": -4.180789470672607, + "loss": 0.3288, + "odds_ratio_loss": 0.12533631920814514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024508774280548096, + "rewards/margins": 0.3935701847076416, + "rewards/rejected": -0.4180789589881897, + "sft_loss": 0.24508774280548096, + "step": 1394 + }, + { + "epoch": 2.017353579175705, + "grad_norm": 3.1811792443833067, + "learning_rate": 6.142572863687157e-06, + "logits/chosen": -0.24593108892440796, + "logits/rejected": -0.17790842056274414, + "logps/chosen": -0.28362900018692017, + "logps/rejected": -3.8639004230499268, + "loss": 0.4598, + "odds_ratio_loss": 0.1365346908569336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028362900018692017, + "rewards/margins": 0.3580271601676941, + "rewards/rejected": -0.3863900601863861, + "sft_loss": 0.28362900018692017, + "step": 1395 + }, + { + "epoch": 2.0187997107736804, + "grad_norm": 2.742354366098211, + "learning_rate": 6.13994947924022e-06, + "logits/chosen": -0.4542756974697113, + "logits/rejected": -0.5005182027816772, + "logps/chosen": -0.4230310916900635, + "logps/rejected": -4.989255905151367, + "loss": 0.3642, + "odds_ratio_loss": 0.2016012817621231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04230311140418053, + "rewards/margins": 0.45662248134613037, + "rewards/rejected": -0.4989256262779236, + "sft_loss": 0.4230310916900635, + "step": 1396 + }, + { + "epoch": 2.0202458423716556, + "grad_norm": 2.683983578822834, + "learning_rate": 6.137324804609774e-06, + "logits/chosen": -0.19099709391593933, + "logits/rejected": -0.16608811914920807, + "logps/chosen": -0.2979000210762024, + "logps/rejected": -4.687519550323486, + "loss": 0.4058, + "odds_ratio_loss": 0.09216830134391785, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029789999127388, + "rewards/margins": 0.4389619529247284, + "rewards/rejected": -0.4687519669532776, + "sft_loss": 0.2979000210762024, + "step": 1397 + }, + { + "epoch": 2.0216919739696313, + "grad_norm": 3.327282280218847, + "learning_rate": 6.134698841378243e-06, + "logits/chosen": -0.3360438644886017, + "logits/rejected": -0.18919801712036133, + "logps/chosen": -0.4767918288707733, + "logps/rejected": -3.2631969451904297, + "loss": 0.3656, + "odds_ratio_loss": 0.15634480118751526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04767918586730957, + "rewards/margins": 0.278640478849411, + "rewards/rejected": -0.32631969451904297, + "sft_loss": 0.4767918288707733, + "step": 1398 + }, + { + "epoch": 2.0231381055676065, + "grad_norm": 2.2233945899908285, + "learning_rate": 6.132071591128829e-06, + "logits/chosen": -0.2488166093826294, + "logits/rejected": -0.12372294068336487, + "logps/chosen": -0.39679771661758423, + "logps/rejected": -3.140080451965332, + "loss": 0.3152, + "odds_ratio_loss": 0.1701274961233139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03967977315187454, + "rewards/margins": 0.2743282914161682, + "rewards/rejected": -0.31400808691978455, + "sft_loss": 0.39679771661758423, + "step": 1399 + }, + { + "epoch": 2.0245842371655822, + "grad_norm": 2.6076662390403236, + "learning_rate": 6.129443055445512e-06, + "logits/chosen": -0.29236453771591187, + "logits/rejected": -0.30255061388015747, + "logps/chosen": -0.315972238779068, + "logps/rejected": -3.8869261741638184, + "loss": 0.3984, + "odds_ratio_loss": 0.1734756976366043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03159722685813904, + "rewards/margins": 0.3570953607559204, + "rewards/rejected": -0.38869261741638184, + "sft_loss": 0.315972238779068, + "step": 1400 + }, + { + "epoch": 2.0260303687635575, + "grad_norm": 2.3586036539147797, + "learning_rate": 6.1268132359130475e-06, + "logits/chosen": -0.5538555383682251, + "logits/rejected": -0.3169947564601898, + "logps/chosen": -0.4706674814224243, + "logps/rejected": -3.450441360473633, + "loss": 0.3946, + "odds_ratio_loss": 0.12920330464839935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04706674814224243, + "rewards/margins": 0.29797738790512085, + "rewards/rejected": -0.34504416584968567, + "sft_loss": 0.4706674814224243, + "step": 1401 + }, + { + "epoch": 2.0274765003615327, + "grad_norm": 2.4006021186149544, + "learning_rate": 6.12418213411696e-06, + "logits/chosen": -0.32977399230003357, + "logits/rejected": -0.2614250183105469, + "logps/chosen": -0.234191432595253, + "logps/rejected": -2.427402973175049, + "loss": 0.3291, + "odds_ratio_loss": 0.10665269196033478, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02341914176940918, + "rewards/margins": 0.21932116150856018, + "rewards/rejected": -0.24274030327796936, + "sft_loss": 0.234191432595253, + "step": 1402 + }, + { + "epoch": 2.0289226319595084, + "grad_norm": 2.4765029159664924, + "learning_rate": 6.121549751643554e-06, + "logits/chosen": -0.4366395175457001, + "logits/rejected": -0.2806912660598755, + "logps/chosen": -0.3349638879299164, + "logps/rejected": -2.964669704437256, + "loss": 0.3604, + "odds_ratio_loss": 0.10340587794780731, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03349638730287552, + "rewards/margins": 0.26297056674957275, + "rewards/rejected": -0.2964669466018677, + "sft_loss": 0.3349638879299164, + "step": 1403 + }, + { + "epoch": 2.0303687635574836, + "grad_norm": 2.285256352398014, + "learning_rate": 6.118916090079901e-06, + "logits/chosen": -0.35529831051826477, + "logits/rejected": -0.43684202432632446, + "logps/chosen": -0.19356098771095276, + "logps/rejected": -2.894097328186035, + "loss": 0.3484, + "odds_ratio_loss": 0.06699930876493454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019356099888682365, + "rewards/margins": 0.2700536251068115, + "rewards/rejected": -0.28940972685813904, + "sft_loss": 0.19356098771095276, + "step": 1404 + }, + { + "epoch": 2.0318148951554593, + "grad_norm": 2.7019259490766685, + "learning_rate": 6.116281151013846e-06, + "logits/chosen": -0.5889612436294556, + "logits/rejected": -0.3919902443885803, + "logps/chosen": -0.31004148721694946, + "logps/rejected": -4.649913311004639, + "loss": 0.3577, + "odds_ratio_loss": 0.11446838825941086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031004149466753006, + "rewards/margins": 0.43398720026016235, + "rewards/rejected": -0.46499139070510864, + "sft_loss": 0.31004148721694946, + "step": 1405 + }, + { + "epoch": 2.0332610267534346, + "grad_norm": 2.244900017647067, + "learning_rate": 6.113644936034002e-06, + "logits/chosen": -0.5037296414375305, + "logits/rejected": -0.39371100068092346, + "logps/chosen": -0.43020951747894287, + "logps/rejected": -2.473050832748413, + "loss": 0.3748, + "odds_ratio_loss": 0.1621953845024109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04302094876766205, + "rewards/margins": 0.20428414642810822, + "rewards/rejected": -0.24730511009693146, + "sft_loss": 0.43020951747894287, + "step": 1406 + }, + { + "epoch": 2.03470715835141, + "grad_norm": 2.474297866515971, + "learning_rate": 6.111007446729754e-06, + "logits/chosen": -0.8331849575042725, + "logits/rejected": -0.4606708288192749, + "logps/chosen": -0.2885475754737854, + "logps/rejected": -4.700456619262695, + "loss": 0.3241, + "odds_ratio_loss": 0.06865033507347107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02885475754737854, + "rewards/margins": 0.4411908984184265, + "rewards/rejected": -0.47004562616348267, + "sft_loss": 0.2885475754737854, + "step": 1407 + }, + { + "epoch": 2.0361532899493855, + "grad_norm": 4.328881723447609, + "learning_rate": 6.108368684691255e-06, + "logits/chosen": -0.3607381284236908, + "logits/rejected": -0.3933161497116089, + "logps/chosen": -0.5243850946426392, + "logps/rejected": -2.1831188201904297, + "loss": 0.3454, + "odds_ratio_loss": 0.21166378259658813, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05243851616978645, + "rewards/margins": 0.16587337851524353, + "rewards/rejected": -0.2183118760585785, + "sft_loss": 0.5243850946426392, + "step": 1408 + }, + { + "epoch": 2.0375994215473607, + "grad_norm": 2.6476216733494073, + "learning_rate": 6.105728651509423e-06, + "logits/chosen": -0.26458385586738586, + "logits/rejected": -0.1522885262966156, + "logps/chosen": -0.3708568513393402, + "logps/rejected": -3.2693662643432617, + "loss": 0.3426, + "odds_ratio_loss": 0.14628729224205017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03708568960428238, + "rewards/margins": 0.28985095024108887, + "rewards/rejected": -0.32693663239479065, + "sft_loss": 0.3708568513393402, + "step": 1409 + }, + { + "epoch": 2.0390455531453364, + "grad_norm": 2.284226764361494, + "learning_rate": 6.103087348775945e-06, + "logits/chosen": -0.2937602996826172, + "logits/rejected": -0.39900609850883484, + "logps/chosen": -0.5060720443725586, + "logps/rejected": -2.6557836532592773, + "loss": 0.3781, + "odds_ratio_loss": 0.194400817155838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05060720071196556, + "rewards/margins": 0.2149711549282074, + "rewards/rejected": -0.26557838916778564, + "sft_loss": 0.5060720443725586, + "step": 1410 + }, + { + "epoch": 2.0404916847433117, + "grad_norm": 2.6244235607829243, + "learning_rate": 6.100444778083271e-06, + "logits/chosen": -0.46006476879119873, + "logits/rejected": -0.16463829576969147, + "logps/chosen": -0.4287043809890747, + "logps/rejected": -2.332899332046509, + "loss": 0.4342, + "odds_ratio_loss": 0.13950975239276886, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04287043958902359, + "rewards/margins": 0.1904195100069046, + "rewards/rejected": -0.2332899272441864, + "sft_loss": 0.4287043809890747, + "step": 1411 + }, + { + "epoch": 2.041937816341287, + "grad_norm": 2.2183067171761057, + "learning_rate": 6.097800941024618e-06, + "logits/chosen": -0.4148803651332855, + "logits/rejected": -0.4651082754135132, + "logps/chosen": -0.39564990997314453, + "logps/rejected": -2.487851858139038, + "loss": 0.3184, + "odds_ratio_loss": 0.15196757018566132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03956499695777893, + "rewards/margins": 0.20922018587589264, + "rewards/rejected": -0.24878518283367157, + "sft_loss": 0.39564990997314453, + "step": 1412 + }, + { + "epoch": 2.0433839479392626, + "grad_norm": 2.5032801033265715, + "learning_rate": 6.095155839193964e-06, + "logits/chosen": -0.2763396203517914, + "logits/rejected": -0.12382631003856659, + "logps/chosen": -0.28163591027259827, + "logps/rejected": -4.132618427276611, + "loss": 0.3437, + "odds_ratio_loss": 0.08366397768259048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028163593262434006, + "rewards/margins": 0.3850982189178467, + "rewards/rejected": -0.4132618308067322, + "sft_loss": 0.28163591027259827, + "step": 1413 + }, + { + "epoch": 2.044830079537238, + "grad_norm": 2.667578076549321, + "learning_rate": 6.092509474186052e-06, + "logits/chosen": -0.40817150473594666, + "logits/rejected": -0.4003009796142578, + "logps/chosen": -0.459774911403656, + "logps/rejected": -1.49889075756073, + "loss": 0.4754, + "odds_ratio_loss": 0.22171664237976074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04597749188542366, + "rewards/margins": 0.10391158610582352, + "rewards/rejected": -0.14988906681537628, + "sft_loss": 0.459774911403656, + "step": 1414 + }, + { + "epoch": 2.0462762111352135, + "grad_norm": 4.948341598400315, + "learning_rate": 6.089861847596385e-06, + "logits/chosen": -0.3838014006614685, + "logits/rejected": -0.32643434405326843, + "logps/chosen": -0.42405328154563904, + "logps/rejected": -3.091123104095459, + "loss": 0.4266, + "odds_ratio_loss": 0.1393635869026184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04240532964468002, + "rewards/margins": 0.2667069733142853, + "rewards/rejected": -0.3091123402118683, + "sft_loss": 0.42405328154563904, + "step": 1415 + }, + { + "epoch": 2.0477223427331888, + "grad_norm": 2.333675906831936, + "learning_rate": 6.087212961021226e-06, + "logits/chosen": -0.361023873090744, + "logits/rejected": -0.31981074810028076, + "logps/chosen": -0.2632179856300354, + "logps/rejected": -3.056180000305176, + "loss": 0.3845, + "odds_ratio_loss": 0.1083202064037323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02632179856300354, + "rewards/margins": 0.2792961597442627, + "rewards/rejected": -0.3056179881095886, + "sft_loss": 0.2632179856300354, + "step": 1416 + }, + { + "epoch": 2.049168474331164, + "grad_norm": 2.291013338955284, + "learning_rate": 6.084562816057599e-06, + "logits/chosen": -0.35837870836257935, + "logits/rejected": -0.29917028546333313, + "logps/chosen": -0.22934255003929138, + "logps/rejected": -3.9777276515960693, + "loss": 0.3472, + "odds_ratio_loss": 0.0663267970085144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02293425425887108, + "rewards/margins": 0.37483853101730347, + "rewards/rejected": -0.39777275919914246, + "sft_loss": 0.22934255003929138, + "step": 1417 + }, + { + "epoch": 2.0506146059291397, + "grad_norm": 2.361170250846157, + "learning_rate": 6.081911414303286e-06, + "logits/chosen": -0.3557944595813751, + "logits/rejected": -0.2875556945800781, + "logps/chosen": -0.33980539441108704, + "logps/rejected": -2.679823398590088, + "loss": 0.3357, + "odds_ratio_loss": 0.13775214552879333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033980537205934525, + "rewards/margins": 0.23400181531906128, + "rewards/rejected": -0.2679823637008667, + "sft_loss": 0.33980539441108704, + "step": 1418 + }, + { + "epoch": 2.052060737527115, + "grad_norm": 2.658491704323656, + "learning_rate": 6.0792587573568285e-06, + "logits/chosen": -0.3484780192375183, + "logits/rejected": -0.21401377022266388, + "logps/chosen": -0.3252895474433899, + "logps/rejected": -3.0992331504821777, + "loss": 0.3577, + "odds_ratio_loss": 0.08834249526262283, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03252895176410675, + "rewards/margins": 0.2773943543434143, + "rewards/rejected": -0.30992335081100464, + "sft_loss": 0.3252895474433899, + "step": 1419 + }, + { + "epoch": 2.05350686912509, + "grad_norm": 2.505632092772328, + "learning_rate": 6.076604846817522e-06, + "logits/chosen": -0.46591854095458984, + "logits/rejected": -0.24125395715236664, + "logps/chosen": -0.39621445536613464, + "logps/rejected": -3.352841854095459, + "loss": 0.3723, + "odds_ratio_loss": 0.10687874257564545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039621446281671524, + "rewards/margins": 0.2956627607345581, + "rewards/rejected": -0.33528417348861694, + "sft_loss": 0.39621445536613464, + "step": 1420 + }, + { + "epoch": 2.054953000723066, + "grad_norm": 2.2675780954478486, + "learning_rate": 6.073949684285419e-06, + "logits/chosen": -0.2978203296661377, + "logits/rejected": -0.15997031331062317, + "logps/chosen": -0.3860008716583252, + "logps/rejected": -2.842381000518799, + "loss": 0.332, + "odds_ratio_loss": 0.17229007184505463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03860008716583252, + "rewards/margins": 0.24563804268836975, + "rewards/rejected": -0.28423812985420227, + "sft_loss": 0.3860008716583252, + "step": 1421 + }, + { + "epoch": 2.056399132321041, + "grad_norm": 2.3078172574413096, + "learning_rate": 6.071293271361327e-06, + "logits/chosen": -0.36375710368156433, + "logits/rejected": -0.291500061750412, + "logps/chosen": -0.37237662076950073, + "logps/rejected": -3.426270008087158, + "loss": 0.3802, + "odds_ratio_loss": 0.14352582395076752, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.037237659096717834, + "rewards/margins": 0.3053893446922302, + "rewards/rejected": -0.34262698888778687, + "sft_loss": 0.37237662076950073, + "step": 1422 + }, + { + "epoch": 2.057845263919017, + "grad_norm": 1.983808931410135, + "learning_rate": 6.068635609646808e-06, + "logits/chosen": -0.41622287034988403, + "logits/rejected": -0.169271320104599, + "logps/chosen": -0.3853681683540344, + "logps/rejected": -4.620948791503906, + "loss": 0.3248, + "odds_ratio_loss": 0.05672647804021835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03853682056069374, + "rewards/margins": 0.4235580861568451, + "rewards/rejected": -0.4620949327945709, + "sft_loss": 0.3853681683540344, + "step": 1423 + }, + { + "epoch": 2.059291395516992, + "grad_norm": 2.335324198672471, + "learning_rate": 6.065976700744174e-06, + "logits/chosen": -0.231398805975914, + "logits/rejected": -0.194851815700531, + "logps/chosen": -0.39015644788742065, + "logps/rejected": -4.776452541351318, + "loss": 0.3797, + "odds_ratio_loss": 0.1511651873588562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039015647023916245, + "rewards/margins": 0.4386296272277832, + "rewards/rejected": -0.47764530777931213, + "sft_loss": 0.39015644788742065, + "step": 1424 + }, + { + "epoch": 2.0607375271149673, + "grad_norm": 1.9881483716659663, + "learning_rate": 6.063316546256494e-06, + "logits/chosen": -0.4184289574623108, + "logits/rejected": -0.23517775535583496, + "logps/chosen": -0.2277361899614334, + "logps/rejected": -3.0598249435424805, + "loss": 0.3115, + "odds_ratio_loss": 0.08884076774120331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0227736197412014, + "rewards/margins": 0.2832088768482208, + "rewards/rejected": -0.3059825003147125, + "sft_loss": 0.2277361899614334, + "step": 1425 + }, + { + "epoch": 2.062183658712943, + "grad_norm": 3.2525491666588797, + "learning_rate": 6.060655147787583e-06, + "logits/chosen": -0.6589328646659851, + "logits/rejected": -0.32301610708236694, + "logps/chosen": -0.2796526551246643, + "logps/rejected": -2.624934673309326, + "loss": 0.3279, + "odds_ratio_loss": 0.08123660087585449, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02796526625752449, + "rewards/margins": 0.23452815413475037, + "rewards/rejected": -0.26249343156814575, + "sft_loss": 0.2796526551246643, + "step": 1426 + }, + { + "epoch": 2.063629790310918, + "grad_norm": 2.345153662950682, + "learning_rate": 6.057992506942011e-06, + "logits/chosen": -0.40048494935035706, + "logits/rejected": -0.2785194218158722, + "logps/chosen": -0.248963862657547, + "logps/rejected": -2.053396224975586, + "loss": 0.3094, + "odds_ratio_loss": 0.08460384607315063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02489638887345791, + "rewards/margins": 0.18044325709342957, + "rewards/rejected": -0.20533964037895203, + "sft_loss": 0.248963862657547, + "step": 1427 + }, + { + "epoch": 2.065075921908894, + "grad_norm": 2.48367237998288, + "learning_rate": 6.05532862532509e-06, + "logits/chosen": -0.39829522371292114, + "logits/rejected": -0.17285127937793732, + "logps/chosen": -0.3011299669742584, + "logps/rejected": -2.470442295074463, + "loss": 0.3419, + "odds_ratio_loss": 0.060505881905555725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03011299856007099, + "rewards/margins": 0.21693125367164612, + "rewards/rejected": -0.24704423546791077, + "sft_loss": 0.3011299669742584, + "step": 1428 + }, + { + "epoch": 2.066522053506869, + "grad_norm": 2.6112651161735503, + "learning_rate": 6.052663504542885e-06, + "logits/chosen": -0.23426532745361328, + "logits/rejected": -0.22717681527137756, + "logps/chosen": -0.4564788341522217, + "logps/rejected": -3.6011054515838623, + "loss": 0.363, + "odds_ratio_loss": 0.15331970155239105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04564788565039635, + "rewards/margins": 0.31446266174316406, + "rewards/rejected": -0.3601105511188507, + "sft_loss": 0.4564788341522217, + "step": 1429 + }, + { + "epoch": 2.0679681851048444, + "grad_norm": 2.6695083429349715, + "learning_rate": 6.049997146202209e-06, + "logits/chosen": -0.2030077427625656, + "logits/rejected": -0.09532079100608826, + "logps/chosen": -0.28371351957321167, + "logps/rejected": -3.22945499420166, + "loss": 0.3147, + "odds_ratio_loss": 0.10474459081888199, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.028371348977088928, + "rewards/margins": 0.29457414150238037, + "rewards/rejected": -0.3229454755783081, + "sft_loss": 0.28371351957321167, + "step": 1430 + }, + { + "epoch": 2.06941431670282, + "grad_norm": 2.0842316658257376, + "learning_rate": 6.047329551910618e-06, + "logits/chosen": -0.42209625244140625, + "logits/rejected": -0.3098366856575012, + "logps/chosen": -0.44695577025413513, + "logps/rejected": -4.16575813293457, + "loss": 0.3566, + "odds_ratio_loss": 0.1542935073375702, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04469558224081993, + "rewards/margins": 0.3718802332878113, + "rewards/rejected": -0.41657575964927673, + "sft_loss": 0.44695577025413513, + "step": 1431 + }, + { + "epoch": 2.0708604483007953, + "grad_norm": 2.558852343608781, + "learning_rate": 6.044660723276416e-06, + "logits/chosen": -0.49510613083839417, + "logits/rejected": -0.3418270945549011, + "logps/chosen": -0.4254521131515503, + "logps/rejected": -3.873826742172241, + "loss": 0.3418, + "odds_ratio_loss": 0.10353498160839081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04254521429538727, + "rewards/margins": 0.3448374569416046, + "rewards/rejected": -0.38738271594047546, + "sft_loss": 0.4254521131515503, + "step": 1432 + }, + { + "epoch": 2.072306579898771, + "grad_norm": 2.2228242975974393, + "learning_rate": 6.0419906619086485e-06, + "logits/chosen": -0.4619770646095276, + "logits/rejected": -0.21492421627044678, + "logps/chosen": -0.23521602153778076, + "logps/rejected": -3.90215802192688, + "loss": 0.3316, + "odds_ratio_loss": 0.0810660645365715, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023521605879068375, + "rewards/margins": 0.36669421195983887, + "rewards/rejected": -0.39021581411361694, + "sft_loss": 0.23521602153778076, + "step": 1433 + }, + { + "epoch": 2.0737527114967462, + "grad_norm": 2.5406277424592143, + "learning_rate": 6.0393193694171055e-06, + "logits/chosen": -0.32404616475105286, + "logits/rejected": -0.1625586450099945, + "logps/chosen": -0.26816701889038086, + "logps/rejected": -4.354163646697998, + "loss": 0.4009, + "odds_ratio_loss": 0.08176937699317932, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026816701516509056, + "rewards/margins": 0.4085996747016907, + "rewards/rejected": -0.43541640043258667, + "sft_loss": 0.26816701889038086, + "step": 1434 + }, + { + "epoch": 2.0751988430947215, + "grad_norm": 2.2669801066940907, + "learning_rate": 6.03664684741232e-06, + "logits/chosen": -0.5940994024276733, + "logits/rejected": -0.299757182598114, + "logps/chosen": -0.3598395586013794, + "logps/rejected": -3.4677209854125977, + "loss": 0.3712, + "odds_ratio_loss": 0.1251244992017746, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03598395735025406, + "rewards/margins": 0.3107881546020508, + "rewards/rejected": -0.34677213430404663, + "sft_loss": 0.3598395586013794, + "step": 1435 + }, + { + "epoch": 2.076644974692697, + "grad_norm": 2.1993829720352602, + "learning_rate": 6.033973097505564e-06, + "logits/chosen": -0.197866290807724, + "logits/rejected": -0.18637999892234802, + "logps/chosen": -0.3552109897136688, + "logps/rejected": -4.12385892868042, + "loss": 0.3592, + "odds_ratio_loss": 0.19755232334136963, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03552110120654106, + "rewards/margins": 0.3768647611141205, + "rewards/rejected": -0.41238588094711304, + "sft_loss": 0.3552109897136688, + "step": 1436 + }, + { + "epoch": 2.0780911062906724, + "grad_norm": 2.4029872434354282, + "learning_rate": 6.031298121308852e-06, + "logits/chosen": -0.24159984290599823, + "logits/rejected": -0.18060526251792908, + "logps/chosen": -0.2984180152416229, + "logps/rejected": -3.943260908126831, + "loss": 0.3652, + "odds_ratio_loss": 0.1224970817565918, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.029841801151633263, + "rewards/margins": 0.3644842803478241, + "rewards/rejected": -0.3943260908126831, + "sft_loss": 0.2984180152416229, + "step": 1437 + }, + { + "epoch": 2.0795372378886476, + "grad_norm": 2.058087744744759, + "learning_rate": 6.028621920434938e-06, + "logits/chosen": -0.18899688124656677, + "logits/rejected": -0.17238955199718475, + "logps/chosen": -0.4569585621356964, + "logps/rejected": -2.871255397796631, + "loss": 0.3386, + "odds_ratio_loss": 0.1777493953704834, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04569585993885994, + "rewards/margins": 0.24142968654632568, + "rewards/rejected": -0.28712552785873413, + "sft_loss": 0.4569585621356964, + "step": 1438 + }, + { + "epoch": 2.0809833694866233, + "grad_norm": 2.7713822465891975, + "learning_rate": 6.025944496497313e-06, + "logits/chosen": -0.2562764286994934, + "logits/rejected": -0.2689245939254761, + "logps/chosen": -0.3553205728530884, + "logps/rejected": -3.7182862758636475, + "loss": 0.3025, + "odds_ratio_loss": 0.16768378019332886, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03553205728530884, + "rewards/margins": 0.33629655838012695, + "rewards/rejected": -0.3718286156654358, + "sft_loss": 0.3553205728530884, + "step": 1439 + }, + { + "epoch": 2.0824295010845986, + "grad_norm": 2.104472864919027, + "learning_rate": 6.023265851110206e-06, + "logits/chosen": -0.4192018210887909, + "logits/rejected": -0.4185807406902313, + "logps/chosen": -0.29165175557136536, + "logps/rejected": -3.254685878753662, + "loss": 0.3423, + "odds_ratio_loss": 0.07294730842113495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029165174812078476, + "rewards/margins": 0.2963034212589264, + "rewards/rejected": -0.32546859979629517, + "sft_loss": 0.29165175557136536, + "step": 1440 + }, + { + "epoch": 2.0838756326825743, + "grad_norm": 3.2957542835935714, + "learning_rate": 6.0205859858885815e-06, + "logits/chosen": -0.38088786602020264, + "logits/rejected": -0.3934791088104248, + "logps/chosen": -0.3900391757488251, + "logps/rejected": -3.510143756866455, + "loss": 0.3766, + "odds_ratio_loss": 0.16783195734024048, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.039003919810056686, + "rewards/margins": 0.31201043725013733, + "rewards/rejected": -0.3510143458843231, + "sft_loss": 0.3900391757488251, + "step": 1441 + }, + { + "epoch": 2.0853217642805495, + "grad_norm": 2.578364476031665, + "learning_rate": 6.01790490244814e-06, + "logits/chosen": -0.4493975043296814, + "logits/rejected": -0.4192134737968445, + "logps/chosen": -0.5234689712524414, + "logps/rejected": -4.9513840675354, + "loss": 0.4525, + "odds_ratio_loss": 0.19005721807479858, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05234689638018608, + "rewards/margins": 0.44279155135154724, + "rewards/rejected": -0.4951384365558624, + "sft_loss": 0.5234689712524414, + "step": 1442 + }, + { + "epoch": 2.0867678958785247, + "grad_norm": 2.396996991834125, + "learning_rate": 6.015222602405318e-06, + "logits/chosen": -0.337810754776001, + "logits/rejected": -0.22689113020896912, + "logps/chosen": -0.2907380759716034, + "logps/rejected": -3.4063007831573486, + "loss": 0.3455, + "odds_ratio_loss": 0.12362713366746902, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0290738046169281, + "rewards/margins": 0.31155624985694885, + "rewards/rejected": -0.34063005447387695, + "sft_loss": 0.2907380759716034, + "step": 1443 + }, + { + "epoch": 2.0882140274765004, + "grad_norm": 2.5822318052964803, + "learning_rate": 6.012539087377284e-06, + "logits/chosen": -0.3404845595359802, + "logits/rejected": -0.3742299973964691, + "logps/chosen": -0.4546850323677063, + "logps/rejected": -2.706575870513916, + "loss": 0.4444, + "odds_ratio_loss": 0.17479830980300903, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04546850547194481, + "rewards/margins": 0.22518908977508545, + "rewards/rejected": -0.27065759897232056, + "sft_loss": 0.4546850323677063, + "step": 1444 + }, + { + "epoch": 2.0896601590744757, + "grad_norm": 2.504565887985123, + "learning_rate": 6.009854358981938e-06, + "logits/chosen": -0.27679046988487244, + "logits/rejected": -0.3251495361328125, + "logps/chosen": -0.38980117440223694, + "logps/rejected": -3.832094192504883, + "loss": 0.3848, + "odds_ratio_loss": 0.14260970056056976, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03898011893033981, + "rewards/margins": 0.3442293107509613, + "rewards/rejected": -0.3832094073295593, + "sft_loss": 0.38980117440223694, + "step": 1445 + }, + { + "epoch": 2.0911062906724514, + "grad_norm": 2.287041085168791, + "learning_rate": 6.007168418837913e-06, + "logits/chosen": -0.342728853225708, + "logits/rejected": -0.27378779649734497, + "logps/chosen": -0.31904324889183044, + "logps/rejected": -4.140244007110596, + "loss": 0.3465, + "odds_ratio_loss": 0.11946651339530945, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03190432861447334, + "rewards/margins": 0.38212013244628906, + "rewards/rejected": -0.4140244126319885, + "sft_loss": 0.31904324889183044, + "step": 1446 + }, + { + "epoch": 2.0925524222704266, + "grad_norm": 2.4471257161139164, + "learning_rate": 6.004481268564573e-06, + "logits/chosen": -0.32866159081459045, + "logits/rejected": -0.27318888902664185, + "logps/chosen": -0.34691765904426575, + "logps/rejected": -4.629284858703613, + "loss": 0.2858, + "odds_ratio_loss": 0.07531234622001648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034691765904426575, + "rewards/margins": 0.428236722946167, + "rewards/rejected": -0.4629284739494324, + "sft_loss": 0.34691765904426575, + "step": 1447 + }, + { + "epoch": 2.093998553868402, + "grad_norm": 2.5341871999397068, + "learning_rate": 6.001792909782012e-06, + "logits/chosen": -0.17833292484283447, + "logits/rejected": -0.31797996163368225, + "logps/chosen": -0.4669630229473114, + "logps/rejected": -2.347073793411255, + "loss": 0.4726, + "odds_ratio_loss": 0.23080308735370636, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04669630154967308, + "rewards/margins": 0.18801110982894897, + "rewards/rejected": -0.23470738530158997, + "sft_loss": 0.4669630229473114, + "step": 1448 + }, + { + "epoch": 2.0954446854663775, + "grad_norm": 2.0149281802026104, + "learning_rate": 5.999103344111049e-06, + "logits/chosen": -0.282225638628006, + "logits/rejected": -0.33347731828689575, + "logps/chosen": -0.3228495419025421, + "logps/rejected": -4.6756062507629395, + "loss": 0.312, + "odds_ratio_loss": 0.15524053573608398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03228495270013809, + "rewards/margins": 0.4352756440639496, + "rewards/rejected": -0.4675605893135071, + "sft_loss": 0.3228495419025421, + "step": 1449 + }, + { + "epoch": 2.0968908170643528, + "grad_norm": 2.4196352705276296, + "learning_rate": 5.996412573173233e-06, + "logits/chosen": -0.23691165447235107, + "logits/rejected": -0.2045428454875946, + "logps/chosen": -0.3596211075782776, + "logps/rejected": -3.006347179412842, + "loss": 0.4048, + "odds_ratio_loss": 0.17852452397346497, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03596211224794388, + "rewards/margins": 0.2646726071834564, + "rewards/rejected": -0.3006346821784973, + "sft_loss": 0.3596211075782776, + "step": 1450 + }, + { + "epoch": 2.0983369486623284, + "grad_norm": 4.531344093999202, + "learning_rate": 5.993720598590844e-06, + "logits/chosen": -0.40281057357788086, + "logits/rejected": -0.30446839332580566, + "logps/chosen": -0.3508280813694, + "logps/rejected": -1.9880702495574951, + "loss": 0.3769, + "odds_ratio_loss": 0.14897975325584412, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03508280962705612, + "rewards/margins": 0.16372422873973846, + "rewards/rejected": -0.198807030916214, + "sft_loss": 0.3508280813694, + "step": 1451 + }, + { + "epoch": 2.0997830802603037, + "grad_norm": 3.298538569192066, + "learning_rate": 5.99102742198688e-06, + "logits/chosen": -0.3555454909801483, + "logits/rejected": -0.2171410322189331, + "logps/chosen": -0.2860006093978882, + "logps/rejected": -3.838912010192871, + "loss": 0.4106, + "odds_ratio_loss": 0.12151093780994415, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.028600063174962997, + "rewards/margins": 0.3552911877632141, + "rewards/rejected": -0.383891224861145, + "sft_loss": 0.2860006093978882, + "step": 1452 + }, + { + "epoch": 2.101229211858279, + "grad_norm": 3.7818647661180673, + "learning_rate": 5.988333044985067e-06, + "logits/chosen": -0.2495804727077484, + "logits/rejected": -0.17438268661499023, + "logps/chosen": -0.24221912026405334, + "logps/rejected": -2.4278950691223145, + "loss": 0.304, + "odds_ratio_loss": 0.12932734191417694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024221913889050484, + "rewards/margins": 0.21856757998466492, + "rewards/rejected": -0.24278950691223145, + "sft_loss": 0.24221912026405334, + "step": 1453 + }, + { + "epoch": 2.1026753434562546, + "grad_norm": 3.043471182443482, + "learning_rate": 5.985637469209855e-06, + "logits/chosen": -0.2824966609477997, + "logits/rejected": -0.23998260498046875, + "logps/chosen": -0.22306549549102783, + "logps/rejected": -3.71874737739563, + "loss": 0.3302, + "odds_ratio_loss": 0.0718691423535347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022306548431515694, + "rewards/margins": 0.3495681881904602, + "rewards/rejected": -0.37187474966049194, + "sft_loss": 0.22306549549102783, + "step": 1454 + }, + { + "epoch": 2.10412147505423, + "grad_norm": 2.343444348205682, + "learning_rate": 5.98294069628642e-06, + "logits/chosen": -0.3612365126609802, + "logits/rejected": -0.3207022547721863, + "logps/chosen": -0.4752647578716278, + "logps/rejected": -2.5952839851379395, + "loss": 0.4237, + "odds_ratio_loss": 0.17657390236854553, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04752647876739502, + "rewards/margins": 0.21200190484523773, + "rewards/rejected": -0.25952839851379395, + "sft_loss": 0.4752647578716278, + "step": 1455 + }, + { + "epoch": 2.1055676066522055, + "grad_norm": 2.4742086731738633, + "learning_rate": 5.980242727840653e-06, + "logits/chosen": -0.5590451955795288, + "logits/rejected": -0.43150633573532104, + "logps/chosen": -0.3560371696949005, + "logps/rejected": -2.1207826137542725, + "loss": 0.3667, + "odds_ratio_loss": 0.12519478797912598, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03560371696949005, + "rewards/margins": 0.17647455632686615, + "rewards/rejected": -0.2120782434940338, + "sft_loss": 0.3560371696949005, + "step": 1456 + }, + { + "epoch": 2.107013738250181, + "grad_norm": 3.0913457397602664, + "learning_rate": 5.9775435654991695e-06, + "logits/chosen": -0.321668416261673, + "logits/rejected": -0.2340753823518753, + "logps/chosen": -0.3525116443634033, + "logps/rejected": -1.9900145530700684, + "loss": 0.3869, + "odds_ratio_loss": 0.17652732133865356, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03525116667151451, + "rewards/margins": 0.1637502759695053, + "rewards/rejected": -0.19900144636631012, + "sft_loss": 0.3525116443634033, + "step": 1457 + }, + { + "epoch": 2.108459869848156, + "grad_norm": 2.3776487240970665, + "learning_rate": 5.974843210889306e-06, + "logits/chosen": -0.3362390398979187, + "logits/rejected": -0.26001688838005066, + "logps/chosen": -0.3976631462574005, + "logps/rejected": -3.968487024307251, + "loss": 0.3903, + "odds_ratio_loss": 0.1420903205871582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03976631909608841, + "rewards/margins": 0.3570823669433594, + "rewards/rejected": -0.3968486487865448, + "sft_loss": 0.3976631462574005, + "step": 1458 + }, + { + "epoch": 2.1099060014461317, + "grad_norm": 2.1848053409122445, + "learning_rate": 5.972141665639116e-06, + "logits/chosen": -0.35201942920684814, + "logits/rejected": -0.48823869228363037, + "logps/chosen": -0.3853376507759094, + "logps/rejected": -2.537548542022705, + "loss": 0.3747, + "odds_ratio_loss": 0.12692534923553467, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038533765822649, + "rewards/margins": 0.2152210772037506, + "rewards/rejected": -0.2537548542022705, + "sft_loss": 0.3853376507759094, + "step": 1459 + }, + { + "epoch": 2.111352133044107, + "grad_norm": 2.33061695960309, + "learning_rate": 5.969438931377368e-06, + "logits/chosen": -0.3261950612068176, + "logits/rejected": -0.22967484593391418, + "logps/chosen": -0.28880342841148376, + "logps/rejected": -2.5488944053649902, + "loss": 0.3145, + "odds_ratio_loss": 0.11012070626020432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028880342841148376, + "rewards/margins": 0.2260090857744217, + "rewards/rejected": -0.25488942861557007, + "sft_loss": 0.28880342841148376, + "step": 1460 + }, + { + "epoch": 2.112798264642082, + "grad_norm": 2.415827008373336, + "learning_rate": 5.966735009733555e-06, + "logits/chosen": -0.15220189094543457, + "logits/rejected": -0.09951989352703094, + "logps/chosen": -0.2214992344379425, + "logps/rejected": -3.8833041191101074, + "loss": 0.3814, + "odds_ratio_loss": 0.15547379851341248, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.02214992418885231, + "rewards/margins": 0.3661804795265198, + "rewards/rejected": -0.3883304297924042, + "sft_loss": 0.2214992344379425, + "step": 1461 + }, + { + "epoch": 2.114244396240058, + "grad_norm": 2.60807066935678, + "learning_rate": 5.964029902337881e-06, + "logits/chosen": -0.2591196298599243, + "logits/rejected": -0.23985594511032104, + "logps/chosen": -0.3921408951282501, + "logps/rejected": -1.9706977605819702, + "loss": 0.4367, + "odds_ratio_loss": 0.20559149980545044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03921408951282501, + "rewards/margins": 0.15785571932792664, + "rewards/rejected": -0.19706979393959045, + "sft_loss": 0.3921408951282501, + "step": 1462 + }, + { + "epoch": 2.115690527838033, + "grad_norm": 2.414112330767816, + "learning_rate": 5.961323610821263e-06, + "logits/chosen": -0.4906052350997925, + "logits/rejected": -0.37421441078186035, + "logps/chosen": -0.33694005012512207, + "logps/rejected": -2.2455477714538574, + "loss": 0.3451, + "odds_ratio_loss": 0.12408454716205597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033694006502628326, + "rewards/margins": 0.19086073338985443, + "rewards/rejected": -0.22455476224422455, + "sft_loss": 0.33694005012512207, + "step": 1463 + }, + { + "epoch": 2.117136659436009, + "grad_norm": 2.341502677921619, + "learning_rate": 5.9586161368153345e-06, + "logits/chosen": -0.23700487613677979, + "logits/rejected": -0.24998445808887482, + "logps/chosen": -0.32917293906211853, + "logps/rejected": -2.9448232650756836, + "loss": 0.338, + "odds_ratio_loss": 0.17031408846378326, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03291729465126991, + "rewards/margins": 0.26156502962112427, + "rewards/rejected": -0.2944823205471039, + "sft_loss": 0.32917293906211853, + "step": 1464 + }, + { + "epoch": 2.118582791033984, + "grad_norm": 2.8595164304602863, + "learning_rate": 5.955907481952444e-06, + "logits/chosen": -0.3763851523399353, + "logits/rejected": -0.27793562412261963, + "logps/chosen": -0.49938255548477173, + "logps/rejected": -4.238722324371338, + "loss": 0.3731, + "odds_ratio_loss": 0.14284662902355194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04993825778365135, + "rewards/margins": 0.37393397092819214, + "rewards/rejected": -0.4238722324371338, + "sft_loss": 0.49938255548477173, + "step": 1465 + }, + { + "epoch": 2.1200289226319593, + "grad_norm": 5.9339196597381045, + "learning_rate": 5.953197647865646e-06, + "logits/chosen": -0.30029064416885376, + "logits/rejected": -0.2469271421432495, + "logps/chosen": -0.2845560908317566, + "logps/rejected": -5.233601093292236, + "loss": 0.3089, + "odds_ratio_loss": 0.06689879298210144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028455613180994987, + "rewards/margins": 0.494904488325119, + "rewards/rejected": -0.5233600735664368, + "sft_loss": 0.2845560908317566, + "step": 1466 + }, + { + "epoch": 2.121475054229935, + "grad_norm": 5.434321085409511, + "learning_rate": 5.950486636188713e-06, + "logits/chosen": -0.36320340633392334, + "logits/rejected": -0.23057013750076294, + "logps/chosen": -0.3520301580429077, + "logps/rejected": -4.251169204711914, + "loss": 0.353, + "odds_ratio_loss": 0.1139448881149292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03520301356911659, + "rewards/margins": 0.3899139165878296, + "rewards/rejected": -0.4251168966293335, + "sft_loss": 0.3520301580429077, + "step": 1467 + }, + { + "epoch": 2.1229211858279102, + "grad_norm": 2.4788779481832988, + "learning_rate": 5.947774448556123e-06, + "logits/chosen": -0.2565758526325226, + "logits/rejected": -0.16842395067214966, + "logps/chosen": -0.3628906011581421, + "logps/rejected": -3.6560139656066895, + "loss": 0.3845, + "odds_ratio_loss": 0.1105215921998024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03628905862569809, + "rewards/margins": 0.3293122947216034, + "rewards/rejected": -0.3656013607978821, + "sft_loss": 0.3628906011581421, + "step": 1468 + }, + { + "epoch": 2.124367317425886, + "grad_norm": 2.086746606792482, + "learning_rate": 5.9450610866030635e-06, + "logits/chosen": -0.24368371069431305, + "logits/rejected": -0.16405165195465088, + "logps/chosen": -0.41052013635635376, + "logps/rejected": -3.302800416946411, + "loss": 0.3025, + "odds_ratio_loss": 0.16078418493270874, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.041052013635635376, + "rewards/margins": 0.28922802209854126, + "rewards/rejected": -0.330280065536499, + "sft_loss": 0.41052013635635376, + "step": 1469 + }, + { + "epoch": 2.125813449023861, + "grad_norm": 3.696865328122287, + "learning_rate": 5.94234655196543e-06, + "logits/chosen": -0.3694218397140503, + "logits/rejected": -0.20579954981803894, + "logps/chosen": -0.2565337121486664, + "logps/rejected": -3.989961624145508, + "loss": 0.3041, + "odds_ratio_loss": 0.09575256705284119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02565336972475052, + "rewards/margins": 0.3733427822589874, + "rewards/rejected": -0.39899617433547974, + "sft_loss": 0.2565337121486664, + "step": 1470 + }, + { + "epoch": 2.1272595806218364, + "grad_norm": 2.1564596951388095, + "learning_rate": 5.939630846279828e-06, + "logits/chosen": -0.40536144375801086, + "logits/rejected": -0.37199535965919495, + "logps/chosen": -0.379259318113327, + "logps/rejected": -4.299976825714111, + "loss": 0.3549, + "odds_ratio_loss": 0.14220932126045227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03792593628168106, + "rewards/margins": 0.3920717239379883, + "rewards/rejected": -0.42999768257141113, + "sft_loss": 0.379259318113327, + "step": 1471 + }, + { + "epoch": 2.128705712219812, + "grad_norm": 2.2649955437637272, + "learning_rate": 5.936913971183562e-06, + "logits/chosen": -0.29743102192878723, + "logits/rejected": -0.19528326392173767, + "logps/chosen": -0.336579829454422, + "logps/rejected": -4.535792350769043, + "loss": 0.3418, + "odds_ratio_loss": 0.11933722347021103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0336579829454422, + "rewards/margins": 0.41992127895355225, + "rewards/rejected": -0.45357927680015564, + "sft_loss": 0.336579829454422, + "step": 1472 + }, + { + "epoch": 2.1301518438177873, + "grad_norm": 2.1566457466271616, + "learning_rate": 5.93419592831465e-06, + "logits/chosen": -0.24219074845314026, + "logits/rejected": -0.10176640748977661, + "logps/chosen": -0.296125590801239, + "logps/rejected": -3.7083725929260254, + "loss": 0.3531, + "odds_ratio_loss": 0.14927887916564941, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02961255982518196, + "rewards/margins": 0.341224730014801, + "rewards/rejected": -0.3708372712135315, + "sft_loss": 0.296125590801239, + "step": 1473 + }, + { + "epoch": 2.131597975415763, + "grad_norm": 2.641384013878491, + "learning_rate": 5.9314767193118104e-06, + "logits/chosen": -0.28196150064468384, + "logits/rejected": -0.15774741768836975, + "logps/chosen": -0.3431814908981323, + "logps/rejected": -3.5857958793640137, + "loss": 0.3147, + "odds_ratio_loss": 0.1359584480524063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03431814908981323, + "rewards/margins": 0.3242614269256592, + "rewards/rejected": -0.3585796058177948, + "sft_loss": 0.3431814908981323, + "step": 1474 + }, + { + "epoch": 2.1330441070137383, + "grad_norm": 2.1937742000667018, + "learning_rate": 5.928756345814462e-06, + "logits/chosen": -0.15431058406829834, + "logits/rejected": -0.1452016532421112, + "logps/chosen": -0.38092872500419617, + "logps/rejected": -3.261220932006836, + "loss": 0.3528, + "odds_ratio_loss": 0.1584779918193817, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.038092873990535736, + "rewards/margins": 0.2880292236804962, + "rewards/rejected": -0.32612210512161255, + "sft_loss": 0.38092872500419617, + "step": 1475 + }, + { + "epoch": 2.1344902386117135, + "grad_norm": 2.7322141667155297, + "learning_rate": 5.926034809462729e-06, + "logits/chosen": -0.2722333073616028, + "logits/rejected": -0.28116655349731445, + "logps/chosen": -0.3736817240715027, + "logps/rejected": -3.1287436485290527, + "loss": 0.3954, + "odds_ratio_loss": 0.16877394914627075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03736817464232445, + "rewards/margins": 0.2755061984062195, + "rewards/rejected": -0.31287437677383423, + "sft_loss": 0.3736817240715027, + "step": 1476 + }, + { + "epoch": 2.135936370209689, + "grad_norm": 2.3384100846293454, + "learning_rate": 5.923312111897437e-06, + "logits/chosen": -0.20643384754657745, + "logits/rejected": -0.19397512078285217, + "logps/chosen": -0.3226177394390106, + "logps/rejected": -3.1748647689819336, + "loss": 0.3832, + "odds_ratio_loss": 0.13731184601783752, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03226177394390106, + "rewards/margins": 0.2852247357368469, + "rewards/rejected": -0.3174864947795868, + "sft_loss": 0.3226177394390106, + "step": 1477 + }, + { + "epoch": 2.1373825018076644, + "grad_norm": 2.691733868567432, + "learning_rate": 5.92058825476011e-06, + "logits/chosen": -0.3043122887611389, + "logits/rejected": -0.2385656237602234, + "logps/chosen": -0.39572885632514954, + "logps/rejected": -2.609856128692627, + "loss": 0.3954, + "odds_ratio_loss": 0.11852793395519257, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.039572883397340775, + "rewards/margins": 0.2214127480983734, + "rewards/rejected": -0.2609856128692627, + "sft_loss": 0.39572885632514954, + "step": 1478 + }, + { + "epoch": 2.13882863340564, + "grad_norm": 3.5489740621337864, + "learning_rate": 5.917863239692969e-06, + "logits/chosen": -0.20991955697536469, + "logits/rejected": -0.19747528433799744, + "logps/chosen": -0.34138545393943787, + "logps/rejected": -3.09604549407959, + "loss": 0.4404, + "odds_ratio_loss": 0.13940584659576416, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03413854539394379, + "rewards/margins": 0.27546602487564087, + "rewards/rejected": -0.30960455536842346, + "sft_loss": 0.34138545393943787, + "step": 1479 + }, + { + "epoch": 2.1402747650036154, + "grad_norm": 2.1460462624407914, + "learning_rate": 5.91513706833894e-06, + "logits/chosen": -0.24288667738437653, + "logits/rejected": -0.15092161297798157, + "logps/chosen": -0.2745353877544403, + "logps/rejected": -2.8250560760498047, + "loss": 0.3642, + "odds_ratio_loss": 0.09830009937286377, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02745353803038597, + "rewards/margins": 0.25505203008651733, + "rewards/rejected": -0.2825055718421936, + "sft_loss": 0.2745353877544403, + "step": 1480 + }, + { + "epoch": 2.1417208966015906, + "grad_norm": 2.89276313645248, + "learning_rate": 5.912409742341639e-06, + "logits/chosen": -0.2257198840379715, + "logits/rejected": -0.28929704427719116, + "logps/chosen": -0.5430160760879517, + "logps/rejected": -2.884078025817871, + "loss": 0.3624, + "odds_ratio_loss": 0.18084710836410522, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05430160462856293, + "rewards/margins": 0.23410621285438538, + "rewards/rejected": -0.2884078323841095, + "sft_loss": 0.5430160760879517, + "step": 1481 + }, + { + "epoch": 2.1431670281995663, + "grad_norm": 2.12580286870257, + "learning_rate": 5.909681263345382e-06, + "logits/chosen": -0.26066428422927856, + "logits/rejected": -0.2458367943763733, + "logps/chosen": -0.46075600385665894, + "logps/rejected": -2.822026252746582, + "loss": 0.4077, + "odds_ratio_loss": 0.18911300599575043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04607560113072395, + "rewards/margins": 0.23612701892852783, + "rewards/rejected": -0.2822026312351227, + "sft_loss": 0.46075600385665894, + "step": 1482 + }, + { + "epoch": 2.1446131597975415, + "grad_norm": 2.339258086988403, + "learning_rate": 5.906951632995179e-06, + "logits/chosen": -0.22002863883972168, + "logits/rejected": -0.16895297169685364, + "logps/chosen": -0.39282089471817017, + "logps/rejected": -4.94498872756958, + "loss": 0.3477, + "odds_ratio_loss": 0.11626096069812775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039282094687223434, + "rewards/margins": 0.45521676540374756, + "rewards/rejected": -0.4944988489151001, + "sft_loss": 0.39282089471817017, + "step": 1483 + }, + { + "epoch": 2.1460592913955168, + "grad_norm": 2.4373005337957516, + "learning_rate": 5.904220852936733e-06, + "logits/chosen": -0.253675639629364, + "logits/rejected": -0.33716779947280884, + "logps/chosen": -0.2549727261066437, + "logps/rejected": -3.052334785461426, + "loss": 0.2846, + "odds_ratio_loss": 0.13065417110919952, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02549727074801922, + "rewards/margins": 0.2797362208366394, + "rewards/rejected": -0.3052334785461426, + "sft_loss": 0.2549727261066437, + "step": 1484 + }, + { + "epoch": 2.1475054229934925, + "grad_norm": 3.249695223924806, + "learning_rate": 5.901488924816444e-06, + "logits/chosen": -0.3323328495025635, + "logits/rejected": -0.2721770405769348, + "logps/chosen": -0.4279077351093292, + "logps/rejected": -2.4622364044189453, + "loss": 0.4213, + "odds_ratio_loss": 0.1369471549987793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04279077425599098, + "rewards/margins": 0.2034328728914261, + "rewards/rejected": -0.24622364342212677, + "sft_loss": 0.4279077351093292, + "step": 1485 + }, + { + "epoch": 2.1489515545914677, + "grad_norm": 2.2438085957378293, + "learning_rate": 5.8987558502814e-06, + "logits/chosen": -0.3333353102207184, + "logits/rejected": -0.27355486154556274, + "logps/chosen": -0.482401043176651, + "logps/rejected": -3.146805763244629, + "loss": 0.4227, + "odds_ratio_loss": 0.19827474653720856, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04824010655283928, + "rewards/margins": 0.2664404511451721, + "rewards/rejected": -0.3146805763244629, + "sft_loss": 0.482401043176651, + "step": 1486 + }, + { + "epoch": 2.1503976861894434, + "grad_norm": 2.104417928949804, + "learning_rate": 5.896021630979382e-06, + "logits/chosen": -0.35297325253486633, + "logits/rejected": -0.2755223512649536, + "logps/chosen": -0.3497896194458008, + "logps/rejected": -2.348954439163208, + "loss": 0.3253, + "odds_ratio_loss": 0.1740923374891281, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0349789634346962, + "rewards/margins": 0.19991648197174072, + "rewards/rejected": -0.23489545285701752, + "sft_loss": 0.3497896194458008, + "step": 1487 + }, + { + "epoch": 2.1518438177874186, + "grad_norm": 2.600438723649929, + "learning_rate": 5.89328626855886e-06, + "logits/chosen": -0.3274020850658417, + "logits/rejected": -0.19287540018558502, + "logps/chosen": -0.2810543477535248, + "logps/rejected": -3.3650529384613037, + "loss": 0.3803, + "odds_ratio_loss": 0.06862045079469681, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02810543403029442, + "rewards/margins": 0.30839988589286804, + "rewards/rejected": -0.33650532364845276, + "sft_loss": 0.2810543477535248, + "step": 1488 + }, + { + "epoch": 2.153289949385394, + "grad_norm": 2.4433889180529382, + "learning_rate": 5.890549764668996e-06, + "logits/chosen": -0.1400214284658432, + "logits/rejected": -0.08871597051620483, + "logps/chosen": -0.2968231737613678, + "logps/rejected": -2.3680202960968018, + "loss": 0.2994, + "odds_ratio_loss": 0.1344771385192871, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02968231588602066, + "rewards/margins": 0.20711973309516907, + "rewards/rejected": -0.23680204153060913, + "sft_loss": 0.2968231737613678, + "step": 1489 + }, + { + "epoch": 2.1547360809833696, + "grad_norm": 2.2686277694593144, + "learning_rate": 5.88781212095964e-06, + "logits/chosen": -0.4212700426578522, + "logits/rejected": -0.21845176815986633, + "logps/chosen": -0.2443973571062088, + "logps/rejected": -5.354547500610352, + "loss": 0.3728, + "odds_ratio_loss": 0.08256202936172485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02443973906338215, + "rewards/margins": 0.5110150575637817, + "rewards/rejected": -0.5354547500610352, + "sft_loss": 0.2443973571062088, + "step": 1490 + }, + { + "epoch": 2.156182212581345, + "grad_norm": 2.32926426146957, + "learning_rate": 5.885073339081323e-06, + "logits/chosen": -0.28272032737731934, + "logits/rejected": -0.24804599583148956, + "logps/chosen": -0.3671835660934448, + "logps/rejected": -3.007756233215332, + "loss": 0.3437, + "odds_ratio_loss": 0.12326133251190186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03671835735440254, + "rewards/margins": 0.2640572488307953, + "rewards/rejected": -0.3007756173610687, + "sft_loss": 0.3671835660934448, + "step": 1491 + }, + { + "epoch": 2.1576283441793205, + "grad_norm": 3.877286822110988, + "learning_rate": 5.882333420685269e-06, + "logits/chosen": -0.5978209376335144, + "logits/rejected": -0.43089497089385986, + "logps/chosen": -0.2918180227279663, + "logps/rejected": -3.2058868408203125, + "loss": 0.3835, + "odds_ratio_loss": 0.0905960351228714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02918180450797081, + "rewards/margins": 0.29140689969062805, + "rewards/rejected": -0.32058870792388916, + "sft_loss": 0.2918180227279663, + "step": 1492 + }, + { + "epoch": 2.1590744757772957, + "grad_norm": 2.453498067923628, + "learning_rate": 5.879592367423386e-06, + "logits/chosen": -0.2288958728313446, + "logits/rejected": -0.15676091611385345, + "logps/chosen": -0.4409548044204712, + "logps/rejected": -2.5945446491241455, + "loss": 0.4106, + "odds_ratio_loss": 0.132335364818573, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.044095478951931, + "rewards/margins": 0.21535900235176086, + "rewards/rejected": -0.25945448875427246, + "sft_loss": 0.4409548044204712, + "step": 1493 + }, + { + "epoch": 2.160520607375271, + "grad_norm": 2.689912629165323, + "learning_rate": 5.876850180948265e-06, + "logits/chosen": -0.30421751737594604, + "logits/rejected": -0.23876774311065674, + "logps/chosen": -0.3396947979927063, + "logps/rejected": -3.2614731788635254, + "loss": 0.405, + "odds_ratio_loss": 0.12139807641506195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03396947681903839, + "rewards/margins": 0.29217785596847534, + "rewards/rejected": -0.32614731788635254, + "sft_loss": 0.3396947979927063, + "step": 1494 + }, + { + "epoch": 2.1619667389732466, + "grad_norm": 2.4501357659326675, + "learning_rate": 5.8741068629131785e-06, + "logits/chosen": -0.18467435240745544, + "logits/rejected": -0.16658593714237213, + "logps/chosen": -0.29923176765441895, + "logps/rejected": -6.522655487060547, + "loss": 0.3052, + "odds_ratio_loss": 0.10115586221218109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029923178255558014, + "rewards/margins": 0.6223424077033997, + "rewards/rejected": -0.6522656083106995, + "sft_loss": 0.29923176765441895, + "step": 1495 + }, + { + "epoch": 2.163412870571222, + "grad_norm": 2.286318868410751, + "learning_rate": 5.871362414972084e-06, + "logits/chosen": -0.1483822911977768, + "logits/rejected": -0.19763171672821045, + "logps/chosen": -0.3210403323173523, + "logps/rejected": -2.6171536445617676, + "loss": 0.3945, + "odds_ratio_loss": 0.14660733938217163, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03210403397679329, + "rewards/margins": 0.229611337184906, + "rewards/rejected": -0.2617153525352478, + "sft_loss": 0.3210403323173523, + "step": 1496 + }, + { + "epoch": 2.1648590021691976, + "grad_norm": 2.561828911466251, + "learning_rate": 5.8686168387796205e-06, + "logits/chosen": -0.2733272314071655, + "logits/rejected": -0.2274172306060791, + "logps/chosen": -0.33266696333885193, + "logps/rejected": -2.2120425701141357, + "loss": 0.4399, + "odds_ratio_loss": 0.178709477186203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03326670080423355, + "rewards/margins": 0.18793755769729614, + "rewards/rejected": -0.2212042361497879, + "sft_loss": 0.33266696333885193, + "step": 1497 + }, + { + "epoch": 2.166305133767173, + "grad_norm": 2.9375646923284338, + "learning_rate": 5.865870135991107e-06, + "logits/chosen": -0.30630001425743103, + "logits/rejected": -0.19453690946102142, + "logps/chosen": -0.21391445398330688, + "logps/rejected": -2.3469486236572266, + "loss": 0.3543, + "odds_ratio_loss": 0.10182206332683563, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02139144390821457, + "rewards/margins": 0.21330343186855316, + "rewards/rejected": -0.23469488322734833, + "sft_loss": 0.21391445398330688, + "step": 1498 + }, + { + "epoch": 2.167751265365148, + "grad_norm": 2.2293176746225645, + "learning_rate": 5.863122308262538e-06, + "logits/chosen": -0.36253878474235535, + "logits/rejected": -0.217836394906044, + "logps/chosen": -0.28018245100975037, + "logps/rejected": -3.587547779083252, + "loss": 0.2916, + "odds_ratio_loss": 0.07377097755670547, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028018245473504066, + "rewards/margins": 0.33073654770851135, + "rewards/rejected": -0.35875481367111206, + "sft_loss": 0.28018245100975037, + "step": 1499 + }, + { + "epoch": 2.1691973969631237, + "grad_norm": 2.1990139631710544, + "learning_rate": 5.86037335725059e-06, + "logits/chosen": -0.2960984408855438, + "logits/rejected": -0.17636118829250336, + "logps/chosen": -0.36348897218704224, + "logps/rejected": -3.5222487449645996, + "loss": 0.3755, + "odds_ratio_loss": 0.06385881453752518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.036348894238471985, + "rewards/margins": 0.31587597727775574, + "rewards/rejected": -0.35222485661506653, + "sft_loss": 0.36348897218704224, + "step": 1500 + }, + { + "epoch": 2.170643528561099, + "grad_norm": 3.0269353766132534, + "learning_rate": 5.857623284612616e-06, + "logits/chosen": -0.3736283481121063, + "logits/rejected": -0.3629686236381531, + "logps/chosen": -0.3977740406990051, + "logps/rejected": -3.777735710144043, + "loss": 0.3153, + "odds_ratio_loss": 0.112032949924469, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039777401834726334, + "rewards/margins": 0.3379961848258972, + "rewards/rejected": -0.37777355313301086, + "sft_loss": 0.3977740406990051, + "step": 1501 + }, + { + "epoch": 2.1720896601590747, + "grad_norm": 2.413695680791457, + "learning_rate": 5.854872092006645e-06, + "logits/chosen": -0.20464223623275757, + "logits/rejected": -0.19365337491035461, + "logps/chosen": -0.264778733253479, + "logps/rejected": -2.7025864124298096, + "loss": 0.2889, + "odds_ratio_loss": 0.0847976952791214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0264778733253479, + "rewards/margins": 0.24378079175949097, + "rewards/rejected": -0.27025866508483887, + "sft_loss": 0.264778733253479, + "step": 1502 + }, + { + "epoch": 2.17353579175705, + "grad_norm": 3.374590164095406, + "learning_rate": 5.852119781091381e-06, + "logits/chosen": -0.4547528922557831, + "logits/rejected": -0.34812527894973755, + "logps/chosen": -0.4338432252407074, + "logps/rejected": -3.0329198837280273, + "loss": 0.4761, + "odds_ratio_loss": 0.12955176830291748, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04338432103395462, + "rewards/margins": 0.25990769267082214, + "rewards/rejected": -0.30329203605651855, + "sft_loss": 0.4338432252407074, + "step": 1503 + }, + { + "epoch": 2.174981923355025, + "grad_norm": 2.146790054606926, + "learning_rate": 5.8493663535262045e-06, + "logits/chosen": -0.18142646551132202, + "logits/rejected": -0.1190670058131218, + "logps/chosen": -0.23194721341133118, + "logps/rejected": -3.571894645690918, + "loss": 0.3787, + "odds_ratio_loss": 0.09054180234670639, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023194722831249237, + "rewards/margins": 0.3339947462081909, + "rewards/rejected": -0.35718944668769836, + "sft_loss": 0.23194721341133118, + "step": 1504 + }, + { + "epoch": 2.176428054953001, + "grad_norm": 2.1354133121281076, + "learning_rate": 5.846611810971166e-06, + "logits/chosen": -0.32426026463508606, + "logits/rejected": -0.24918986856937408, + "logps/chosen": -0.3435249328613281, + "logps/rejected": -3.3698172569274902, + "loss": 0.3461, + "odds_ratio_loss": 0.15128874778747559, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03435249626636505, + "rewards/margins": 0.3026292324066162, + "rewards/rejected": -0.33698174357414246, + "sft_loss": 0.3435249328613281, + "step": 1505 + }, + { + "epoch": 2.177874186550976, + "grad_norm": 2.7078861801848744, + "learning_rate": 5.843856155086988e-06, + "logits/chosen": 0.006320249754935503, + "logits/rejected": -0.04288327321410179, + "logps/chosen": -0.29730698466300964, + "logps/rejected": -3.772261619567871, + "loss": 0.2973, + "odds_ratio_loss": 0.1492423266172409, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.029730698093771935, + "rewards/margins": 0.347495436668396, + "rewards/rejected": -0.3772261142730713, + "sft_loss": 0.29730698466300964, + "step": 1506 + }, + { + "epoch": 2.1793203181489513, + "grad_norm": 2.815754065971916, + "learning_rate": 5.841099387535067e-06, + "logits/chosen": -0.13005805015563965, + "logits/rejected": -0.21556347608566284, + "logps/chosen": -0.486311674118042, + "logps/rejected": -3.861595869064331, + "loss": 0.359, + "odds_ratio_loss": 0.21694286167621613, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04863116517663002, + "rewards/margins": 0.33752843737602234, + "rewards/rejected": -0.38615962862968445, + "sft_loss": 0.486311674118042, + "step": 1507 + }, + { + "epoch": 2.180766449746927, + "grad_norm": 2.365644205544063, + "learning_rate": 5.838341509977468e-06, + "logits/chosen": -0.26615601778030396, + "logits/rejected": -0.30764591693878174, + "logps/chosen": -0.38783353567123413, + "logps/rejected": -3.8921470642089844, + "loss": 0.4536, + "odds_ratio_loss": 0.13304699957370758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03878335654735565, + "rewards/margins": 0.3504313826560974, + "rewards/rejected": -0.38921472430229187, + "sft_loss": 0.38783353567123413, + "step": 1508 + }, + { + "epoch": 2.1822125813449023, + "grad_norm": 2.3090179607820875, + "learning_rate": 5.835582524076927e-06, + "logits/chosen": -0.2245335578918457, + "logits/rejected": -0.3578979969024658, + "logps/chosen": -0.40855467319488525, + "logps/rejected": -2.059465169906616, + "loss": 0.4066, + "odds_ratio_loss": 0.18744851648807526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04085546359419823, + "rewards/margins": 0.16509106755256653, + "rewards/rejected": -0.20594651997089386, + "sft_loss": 0.40855467319488525, + "step": 1509 + }, + { + "epoch": 2.183658712942878, + "grad_norm": 2.794188680111159, + "learning_rate": 5.832822431496845e-06, + "logits/chosen": -0.265776127576828, + "logits/rejected": -0.12581999599933624, + "logps/chosen": -0.4239056706428528, + "logps/rejected": -1.9903366565704346, + "loss": 0.3896, + "odds_ratio_loss": 0.1404733806848526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.042390573769807816, + "rewards/margins": 0.1566431075334549, + "rewards/rejected": -0.1990336775779724, + "sft_loss": 0.4239056706428528, + "step": 1510 + }, + { + "epoch": 2.185104844540853, + "grad_norm": 2.7143930939543486, + "learning_rate": 5.830061233901293e-06, + "logits/chosen": -0.29388314485549927, + "logits/rejected": -0.31625548005104065, + "logps/chosen": -0.39467012882232666, + "logps/rejected": -2.1236424446105957, + "loss": 0.3985, + "odds_ratio_loss": 0.17664192616939545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03946701064705849, + "rewards/margins": 0.17289723455905914, + "rewards/rejected": -0.21236424148082733, + "sft_loss": 0.39467012882232666, + "step": 1511 + }, + { + "epoch": 2.1865509761388284, + "grad_norm": 2.362745074271975, + "learning_rate": 5.827298932955006e-06, + "logits/chosen": -0.2649492025375366, + "logits/rejected": -0.14799144864082336, + "logps/chosen": -0.21747642755508423, + "logps/rejected": -5.31502103805542, + "loss": 0.3557, + "odds_ratio_loss": 0.08317440748214722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02174764685332775, + "rewards/margins": 0.509754478931427, + "rewards/rejected": -0.5315020680427551, + "sft_loss": 0.21747642755508423, + "step": 1512 + }, + { + "epoch": 2.187997107736804, + "grad_norm": 2.5515207334279997, + "learning_rate": 5.8245355303233885e-06, + "logits/chosen": -0.3026021718978882, + "logits/rejected": -0.160127192735672, + "logps/chosen": -0.29952093958854675, + "logps/rejected": -3.455519199371338, + "loss": 0.3361, + "odds_ratio_loss": 0.07680322229862213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029952093958854675, + "rewards/margins": 0.31559985876083374, + "rewards/rejected": -0.3455519378185272, + "sft_loss": 0.29952093958854675, + "step": 1513 + }, + { + "epoch": 2.1894432393347794, + "grad_norm": 2.312767228310274, + "learning_rate": 5.8217710276725034e-06, + "logits/chosen": -0.3605382740497589, + "logits/rejected": -0.20827777683734894, + "logps/chosen": -0.36337408423423767, + "logps/rejected": -3.856226921081543, + "loss": 0.3926, + "odds_ratio_loss": 0.16154375672340393, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03633740544319153, + "rewards/margins": 0.3492853045463562, + "rewards/rejected": -0.38562270998954773, + "sft_loss": 0.36337408423423767, + "step": 1514 + }, + { + "epoch": 2.190889370932755, + "grad_norm": 2.5824996018885518, + "learning_rate": 5.819005426669081e-06, + "logits/chosen": -0.45703208446502686, + "logits/rejected": -0.34461653232574463, + "logps/chosen": -0.32233983278274536, + "logps/rejected": -3.073678970336914, + "loss": 0.3253, + "odds_ratio_loss": 0.11809199303388596, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.032233983278274536, + "rewards/margins": 0.27513387799263, + "rewards/rejected": -0.30736786127090454, + "sft_loss": 0.32233983278274536, + "step": 1515 + }, + { + "epoch": 2.1923355025307303, + "grad_norm": 2.8923982163571713, + "learning_rate": 5.816238728980512e-06, + "logits/chosen": -0.12480375170707703, + "logits/rejected": -0.20347639918327332, + "logps/chosen": -0.3857441842556, + "logps/rejected": -4.365511894226074, + "loss": 0.374, + "odds_ratio_loss": 0.18619787693023682, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03857441619038582, + "rewards/margins": 0.39797675609588623, + "rewards/rejected": -0.43655121326446533, + "sft_loss": 0.3857441842556, + "step": 1516 + }, + { + "epoch": 2.1937816341287055, + "grad_norm": 2.3513904706926465, + "learning_rate": 5.81347093627485e-06, + "logits/chosen": -0.18540328741073608, + "logits/rejected": -0.2682988941669464, + "logps/chosen": -0.41352933645248413, + "logps/rejected": -2.648174285888672, + "loss": 0.3474, + "odds_ratio_loss": 0.20547065138816833, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04135293513536453, + "rewards/margins": 0.2234645038843155, + "rewards/rejected": -0.26481741666793823, + "sft_loss": 0.41352933645248413, + "step": 1517 + }, + { + "epoch": 2.195227765726681, + "grad_norm": 2.3218155921073924, + "learning_rate": 5.810702050220806e-06, + "logits/chosen": -0.26080507040023804, + "logits/rejected": -0.16710934042930603, + "logps/chosen": -0.5044231414794922, + "logps/rejected": -3.791574716567993, + "loss": 0.3948, + "odds_ratio_loss": 0.15905873477458954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05044231563806534, + "rewards/margins": 0.32871514558792114, + "rewards/rejected": -0.3791574537754059, + "sft_loss": 0.5044231414794922, + "step": 1518 + }, + { + "epoch": 2.1966738973246565, + "grad_norm": 2.523759857827364, + "learning_rate": 5.807932072487751e-06, + "logits/chosen": -0.24406197667121887, + "logits/rejected": -0.18871784210205078, + "logps/chosen": -0.37185871601104736, + "logps/rejected": -2.7142934799194336, + "loss": 0.3622, + "odds_ratio_loss": 0.21322374045848846, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03718587011098862, + "rewards/margins": 0.2342434823513031, + "rewards/rejected": -0.2714293599128723, + "sft_loss": 0.37185871601104736, + "step": 1519 + }, + { + "epoch": 2.198120028922632, + "grad_norm": 2.3677642125499387, + "learning_rate": 5.805161004745716e-06, + "logits/chosen": -0.29101336002349854, + "logits/rejected": -0.3166297674179077, + "logps/chosen": -0.4049059748649597, + "logps/rejected": -2.3067679405212402, + "loss": 0.4088, + "odds_ratio_loss": 0.19290974736213684, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04049060121178627, + "rewards/margins": 0.19018618762493134, + "rewards/rejected": -0.2306768000125885, + "sft_loss": 0.4049059748649597, + "step": 1520 + }, + { + "epoch": 2.1995661605206074, + "grad_norm": 2.3802608136855268, + "learning_rate": 5.802388848665391e-06, + "logits/chosen": -0.236686110496521, + "logits/rejected": -0.09095649421215057, + "logps/chosen": -0.30331850051879883, + "logps/rejected": -3.6234824657440186, + "loss": 0.3045, + "odds_ratio_loss": 0.0965602695941925, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030331851914525032, + "rewards/margins": 0.33201637864112854, + "rewards/rejected": -0.3623482584953308, + "sft_loss": 0.30331850051879883, + "step": 1521 + }, + { + "epoch": 2.2010122921185826, + "grad_norm": 9.350151858192348, + "learning_rate": 5.7996156059181135e-06, + "logits/chosen": -0.27046582102775574, + "logits/rejected": -0.17398667335510254, + "logps/chosen": -0.20965230464935303, + "logps/rejected": -2.9031565189361572, + "loss": 0.3707, + "odds_ratio_loss": 0.08045334368944168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020965231582522392, + "rewards/margins": 0.26935043931007385, + "rewards/rejected": -0.2903156578540802, + "sft_loss": 0.20965230464935303, + "step": 1522 + }, + { + "epoch": 2.2024584237165583, + "grad_norm": 2.9087098075463276, + "learning_rate": 5.796841278175886e-06, + "logits/chosen": -0.34366631507873535, + "logits/rejected": -0.2516658306121826, + "logps/chosen": -0.3939272165298462, + "logps/rejected": -2.5516390800476074, + "loss": 0.4125, + "odds_ratio_loss": 0.11796994507312775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03939272090792656, + "rewards/margins": 0.21577118337154388, + "rewards/rejected": -0.25516390800476074, + "sft_loss": 0.3939272165298462, + "step": 1523 + }, + { + "epoch": 2.2039045553145336, + "grad_norm": 2.3977389772644417, + "learning_rate": 5.794065867111359e-06, + "logits/chosen": -0.15821127593517303, + "logits/rejected": -0.3337038457393646, + "logps/chosen": -0.5078226923942566, + "logps/rejected": -3.0043892860412598, + "loss": 0.3828, + "odds_ratio_loss": 0.2925236225128174, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05078226700425148, + "rewards/margins": 0.24965664744377136, + "rewards/rejected": -0.30043891072273254, + "sft_loss": 0.5078226923942566, + "step": 1524 + }, + { + "epoch": 2.2053506869125092, + "grad_norm": 2.468736682512821, + "learning_rate": 5.791289374397839e-06, + "logits/chosen": -0.31990671157836914, + "logits/rejected": -0.4198131561279297, + "logps/chosen": -0.3348398804664612, + "logps/rejected": -2.396658420562744, + "loss": 0.3388, + "odds_ratio_loss": 0.14263346791267395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03348398581147194, + "rewards/margins": 0.20618188381195068, + "rewards/rejected": -0.23966586589813232, + "sft_loss": 0.3348398804664612, + "step": 1525 + }, + { + "epoch": 2.2067968185104845, + "grad_norm": 2.172638209003934, + "learning_rate": 5.788511801709283e-06, + "logits/chosen": -0.30636391043663025, + "logits/rejected": -0.2372589260339737, + "logps/chosen": -0.26845329999923706, + "logps/rejected": -4.8146281242370605, + "loss": 0.3712, + "odds_ratio_loss": 0.07393340766429901, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026845330372452736, + "rewards/margins": 0.4546175003051758, + "rewards/rejected": -0.48146283626556396, + "sft_loss": 0.26845329999923706, + "step": 1526 + }, + { + "epoch": 2.2082429501084597, + "grad_norm": 3.077739945365637, + "learning_rate": 5.785733150720301e-06, + "logits/chosen": -0.3096201717853546, + "logits/rejected": -0.1331636607646942, + "logps/chosen": -0.5265023708343506, + "logps/rejected": -4.324217796325684, + "loss": 0.358, + "odds_ratio_loss": 0.12146250903606415, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05265023931860924, + "rewards/margins": 0.3797715902328491, + "rewards/rejected": -0.4324217736721039, + "sft_loss": 0.5265023708343506, + "step": 1527 + }, + { + "epoch": 2.2096890817064354, + "grad_norm": 2.6942329260961744, + "learning_rate": 5.782953423106153e-06, + "logits/chosen": -0.1731545627117157, + "logits/rejected": -0.3327570855617523, + "logps/chosen": -0.38150539994239807, + "logps/rejected": -3.1940901279449463, + "loss": 0.384, + "odds_ratio_loss": 0.1315080225467682, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.038150541484355927, + "rewards/margins": 0.2812584340572357, + "rewards/rejected": -0.31940898299217224, + "sft_loss": 0.38150539994239807, + "step": 1528 + }, + { + "epoch": 2.2111352133044107, + "grad_norm": 2.569601628130879, + "learning_rate": 5.780172620542744e-06, + "logits/chosen": -0.3070398271083832, + "logits/rejected": -0.2711654305458069, + "logps/chosen": -0.35240018367767334, + "logps/rejected": -3.4073891639709473, + "loss": 0.3628, + "odds_ratio_loss": 0.09907615929841995, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03524002060294151, + "rewards/margins": 0.3054988980293274, + "rewards/rejected": -0.3407389223575592, + "sft_loss": 0.35240018367767334, + "step": 1529 + }, + { + "epoch": 2.212581344902386, + "grad_norm": 2.322302360573357, + "learning_rate": 5.777390744706633e-06, + "logits/chosen": -0.11148097366094589, + "logits/rejected": -0.10225170850753784, + "logps/chosen": -0.37124699354171753, + "logps/rejected": -1.8571745157241821, + "loss": 0.3813, + "odds_ratio_loss": 0.16256582736968994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.037124693393707275, + "rewards/margins": 0.1485927402973175, + "rewards/rejected": -0.18571743369102478, + "sft_loss": 0.37124699354171753, + "step": 1530 + }, + { + "epoch": 2.2140274765003616, + "grad_norm": 2.645782452973883, + "learning_rate": 5.774607797275022e-06, + "logits/chosen": -0.21809406578540802, + "logits/rejected": -0.14942537248134613, + "logps/chosen": -0.2832741439342499, + "logps/rejected": -2.8527350425720215, + "loss": 0.3689, + "odds_ratio_loss": 0.14639288187026978, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.028327414765954018, + "rewards/margins": 0.2569460868835449, + "rewards/rejected": -0.2852734923362732, + "sft_loss": 0.2832741439342499, + "step": 1531 + }, + { + "epoch": 2.215473608098337, + "grad_norm": 2.793134220684291, + "learning_rate": 5.7718237799257625e-06, + "logits/chosen": -0.1597381979227066, + "logits/rejected": -0.2850331962108612, + "logps/chosen": -0.3894084692001343, + "logps/rejected": -1.2152106761932373, + "loss": 0.3134, + "odds_ratio_loss": 0.23522335290908813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03894084692001343, + "rewards/margins": 0.08258021622896194, + "rewards/rejected": -0.12152105569839478, + "sft_loss": 0.3894084692001343, + "step": 1532 + }, + { + "epoch": 2.2169197396963125, + "grad_norm": 2.5901822894457056, + "learning_rate": 5.7690386943373446e-06, + "logits/chosen": -0.3303926885128021, + "logits/rejected": -0.15845176577568054, + "logps/chosen": -0.23409810662269592, + "logps/rejected": -4.917486190795898, + "loss": 0.2976, + "odds_ratio_loss": 0.05652255564928055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023409809917211533, + "rewards/margins": 0.46833881735801697, + "rewards/rejected": -0.4917486011981964, + "sft_loss": 0.23409810662269592, + "step": 1533 + }, + { + "epoch": 2.2183658712942878, + "grad_norm": 2.844379078494416, + "learning_rate": 5.76625254218891e-06, + "logits/chosen": -0.30775171518325806, + "logits/rejected": -0.3369847536087036, + "logps/chosen": -0.3618554472923279, + "logps/rejected": -2.403303861618042, + "loss": 0.329, + "odds_ratio_loss": 0.1528136283159256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03618554398417473, + "rewards/margins": 0.20414483547210693, + "rewards/rejected": -0.24033036828041077, + "sft_loss": 0.3618554472923279, + "step": 1534 + }, + { + "epoch": 2.219812002892263, + "grad_norm": 2.324950377218177, + "learning_rate": 5.76346532516024e-06, + "logits/chosen": -0.3371676206588745, + "logits/rejected": -0.32295674085617065, + "logps/chosen": -0.3337056636810303, + "logps/rejected": -3.918829917907715, + "loss": 0.3407, + "odds_ratio_loss": 0.09082160890102386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03337056562304497, + "rewards/margins": 0.3585124611854553, + "rewards/rejected": -0.3918830156326294, + "sft_loss": 0.3337056636810303, + "step": 1535 + }, + { + "epoch": 2.2212581344902387, + "grad_norm": 2.290719343416125, + "learning_rate": 5.760677044931757e-06, + "logits/chosen": -0.22997716069221497, + "logits/rejected": -0.30874577164649963, + "logps/chosen": -0.43819090723991394, + "logps/rejected": -2.9812252521514893, + "loss": 0.3561, + "odds_ratio_loss": 0.13137231767177582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04381909221410751, + "rewards/margins": 0.2543034553527832, + "rewards/rejected": -0.2981225252151489, + "sft_loss": 0.43819090723991394, + "step": 1536 + }, + { + "epoch": 2.222704266088214, + "grad_norm": 2.2585019090751004, + "learning_rate": 5.7578877031845265e-06, + "logits/chosen": -0.20603443682193756, + "logits/rejected": -0.17099282145500183, + "logps/chosen": -0.3480113446712494, + "logps/rejected": -4.4564995765686035, + "loss": 0.3771, + "odds_ratio_loss": 0.09290958940982819, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03480113670229912, + "rewards/margins": 0.41084882616996765, + "rewards/rejected": -0.44565001130104065, + "sft_loss": 0.3480113446712494, + "step": 1537 + }, + { + "epoch": 2.2241503976861896, + "grad_norm": 2.3447858727279933, + "learning_rate": 5.755097301600253e-06, + "logits/chosen": -0.3461535573005676, + "logits/rejected": -0.2234947383403778, + "logps/chosen": -0.3295784890651703, + "logps/rejected": -2.077148199081421, + "loss": 0.4611, + "odds_ratio_loss": 0.13608911633491516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03295784816145897, + "rewards/margins": 0.1747569590806961, + "rewards/rejected": -0.20771479606628418, + "sft_loss": 0.3295784890651703, + "step": 1538 + }, + { + "epoch": 2.225596529284165, + "grad_norm": 2.5895792850597226, + "learning_rate": 5.752305841861279e-06, + "logits/chosen": -0.2195424735546112, + "logits/rejected": -0.1914370357990265, + "logps/chosen": -0.34031060338020325, + "logps/rejected": -3.349759578704834, + "loss": 0.3642, + "odds_ratio_loss": 0.1579233705997467, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.034031059592962265, + "rewards/margins": 0.3009449243545532, + "rewards/rejected": -0.3349759578704834, + "sft_loss": 0.34031060338020325, + "step": 1539 + }, + { + "epoch": 2.22704266088214, + "grad_norm": 2.363995097567075, + "learning_rate": 5.749513325650586e-06, + "logits/chosen": -0.09926959127187729, + "logits/rejected": -0.14528873562812805, + "logps/chosen": -0.20169880986213684, + "logps/rejected": -4.050556659698486, + "loss": 0.3357, + "odds_ratio_loss": 0.07336930185556412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020169882103800774, + "rewards/margins": 0.3848857581615448, + "rewards/rejected": -0.40505561232566833, + "sft_loss": 0.20169880986213684, + "step": 1540 + }, + { + "epoch": 2.2284887924801158, + "grad_norm": 2.356577808785414, + "learning_rate": 5.746719754651795e-06, + "logits/chosen": -0.32525795698165894, + "logits/rejected": -0.2390633374452591, + "logps/chosen": -0.47373491525650024, + "logps/rejected": -2.445295572280884, + "loss": 0.3442, + "odds_ratio_loss": 0.17469418048858643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.047373492270708084, + "rewards/margins": 0.19715607166290283, + "rewards/rejected": -0.2445295751094818, + "sft_loss": 0.47373491525650024, + "step": 1541 + }, + { + "epoch": 2.229934924078091, + "grad_norm": 2.0394552640377652, + "learning_rate": 5.743925130549157e-06, + "logits/chosen": -0.2975271940231323, + "logits/rejected": -0.12008170038461685, + "logps/chosen": -0.38611143827438354, + "logps/rejected": -3.99062442779541, + "loss": 0.4142, + "odds_ratio_loss": 0.08279373496770859, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038611143827438354, + "rewards/margins": 0.3604513108730316, + "rewards/rejected": -0.39906245470046997, + "sft_loss": 0.38611143827438354, + "step": 1542 + }, + { + "epoch": 2.2313810556760667, + "grad_norm": 2.5021991247752973, + "learning_rate": 5.741129455027563e-06, + "logits/chosen": -0.22896796464920044, + "logits/rejected": -0.31885460019111633, + "logps/chosen": -0.3669053018093109, + "logps/rejected": -2.162534713745117, + "loss": 0.4192, + "odds_ratio_loss": 0.16942007839679718, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03669053316116333, + "rewards/margins": 0.17956297099590302, + "rewards/rejected": -0.21625348925590515, + "sft_loss": 0.3669053018093109, + "step": 1543 + }, + { + "epoch": 2.232827187274042, + "grad_norm": 2.4755706908681656, + "learning_rate": 5.738332729772537e-06, + "logits/chosen": -0.21915926039218903, + "logits/rejected": -0.14056669175624847, + "logps/chosen": -0.4281160533428192, + "logps/rejected": -2.106233835220337, + "loss": 0.3769, + "odds_ratio_loss": 0.18937034904956818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04281160607933998, + "rewards/margins": 0.16781175136566162, + "rewards/rejected": -0.2106233537197113, + "sft_loss": 0.4281160533428192, + "step": 1544 + }, + { + "epoch": 2.234273318872017, + "grad_norm": 2.233386700181181, + "learning_rate": 5.735534956470232e-06, + "logits/chosen": -0.3864569365978241, + "logits/rejected": -0.3033338785171509, + "logps/chosen": -0.4460701644420624, + "logps/rejected": -2.0371532440185547, + "loss": 0.29, + "odds_ratio_loss": 0.15289722383022308, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.044607020914554596, + "rewards/margins": 0.15910829603672028, + "rewards/rejected": -0.20371532440185547, + "sft_loss": 0.4460701644420624, + "step": 1545 + }, + { + "epoch": 2.235719450469993, + "grad_norm": 2.655882713565303, + "learning_rate": 5.732736136807439e-06, + "logits/chosen": -0.2490551769733429, + "logits/rejected": -0.22575412690639496, + "logps/chosen": -0.4899711608886719, + "logps/rejected": -3.2424023151397705, + "loss": 0.395, + "odds_ratio_loss": 0.18628421425819397, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.048997119069099426, + "rewards/margins": 0.2752431035041809, + "rewards/rejected": -0.32424020767211914, + "sft_loss": 0.4899711608886719, + "step": 1546 + }, + { + "epoch": 2.237165582067968, + "grad_norm": 3.28371337385153, + "learning_rate": 5.729936272471576e-06, + "logits/chosen": -0.11610257625579834, + "logits/rejected": -0.1111803650856018, + "logps/chosen": -0.24118897318840027, + "logps/rejected": -3.4827942848205566, + "loss": 0.3084, + "odds_ratio_loss": 0.11601855605840683, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024118896573781967, + "rewards/margins": 0.32416054606437683, + "rewards/rejected": -0.3482794463634491, + "sft_loss": 0.24118897318840027, + "step": 1547 + }, + { + "epoch": 2.238611713665944, + "grad_norm": 2.2346359384189554, + "learning_rate": 5.7271353651506914e-06, + "logits/chosen": -0.19794616103172302, + "logits/rejected": -0.1796175241470337, + "logps/chosen": -0.1995583176612854, + "logps/rejected": -4.046960353851318, + "loss": 0.2976, + "odds_ratio_loss": 0.07498744130134583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01995583064854145, + "rewards/margins": 0.3847401738166809, + "rewards/rejected": -0.4046960175037384, + "sft_loss": 0.1995583176612854, + "step": 1548 + }, + { + "epoch": 2.240057845263919, + "grad_norm": 2.356672314146801, + "learning_rate": 5.724333416533462e-06, + "logits/chosen": -0.30946430563926697, + "logits/rejected": -0.3228055238723755, + "logps/chosen": -0.39164984226226807, + "logps/rejected": -4.4202165603637695, + "loss": 0.3586, + "odds_ratio_loss": 0.07636836171150208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039164986461400986, + "rewards/margins": 0.402856707572937, + "rewards/rejected": -0.4420216679573059, + "sft_loss": 0.39164984226226807, + "step": 1549 + }, + { + "epoch": 2.2415039768618943, + "grad_norm": 1.9100552032789626, + "learning_rate": 5.721530428309193e-06, + "logits/chosen": -0.24844273924827576, + "logits/rejected": -0.2502165734767914, + "logps/chosen": -0.2769375443458557, + "logps/rejected": -3.178349256515503, + "loss": 0.3436, + "odds_ratio_loss": 0.13039150834083557, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02769375592470169, + "rewards/margins": 0.29014119505882263, + "rewards/rejected": -0.3178349733352661, + "sft_loss": 0.2769375443458557, + "step": 1550 + }, + { + "epoch": 2.24295010845987, + "grad_norm": 2.268645975601301, + "learning_rate": 5.718726402167819e-06, + "logits/chosen": -0.38420581817626953, + "logits/rejected": -0.2832087278366089, + "logps/chosen": -0.3715498745441437, + "logps/rejected": -2.8785829544067383, + "loss": 0.4685, + "odds_ratio_loss": 0.13134051859378815, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03715498745441437, + "rewards/margins": 0.2507033348083496, + "rewards/rejected": -0.2878583073616028, + "sft_loss": 0.3715498745441437, + "step": 1551 + }, + { + "epoch": 2.244396240057845, + "grad_norm": 2.161285650065301, + "learning_rate": 5.715921339799895e-06, + "logits/chosen": -0.18678617477416992, + "logits/rejected": -0.24380554258823395, + "logps/chosen": -0.3200456500053406, + "logps/rejected": -4.384403228759766, + "loss": 0.3791, + "odds_ratio_loss": 0.12353577464818954, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03200456500053406, + "rewards/margins": 0.4064357876777649, + "rewards/rejected": -0.43844032287597656, + "sft_loss": 0.3200456500053406, + "step": 1552 + }, + { + "epoch": 2.2458423716558205, + "grad_norm": 2.117005371321443, + "learning_rate": 5.713115242896604e-06, + "logits/chosen": -0.37839236855506897, + "logits/rejected": -0.3601863980293274, + "logps/chosen": -0.2849191427230835, + "logps/rejected": -4.144497394561768, + "loss": 0.2864, + "odds_ratio_loss": 0.09585070610046387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02849191054701805, + "rewards/margins": 0.385957807302475, + "rewards/rejected": -0.4144497215747833, + "sft_loss": 0.2849191427230835, + "step": 1553 + }, + { + "epoch": 2.247288503253796, + "grad_norm": 4.326532462909167, + "learning_rate": 5.710308113149753e-06, + "logits/chosen": -0.37594687938690186, + "logits/rejected": -0.4265070855617523, + "logps/chosen": -0.4674017131328583, + "logps/rejected": -3.600942850112915, + "loss": 0.4116, + "odds_ratio_loss": 0.2612759470939636, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.046740174293518066, + "rewards/margins": 0.31335410475730896, + "rewards/rejected": -0.360094279050827, + "sft_loss": 0.4674017131328583, + "step": 1554 + }, + { + "epoch": 2.2487346348517714, + "grad_norm": 2.9543812647551495, + "learning_rate": 5.707499952251771e-06, + "logits/chosen": -0.2548982799053192, + "logits/rejected": -0.18201807141304016, + "logps/chosen": -0.340849369764328, + "logps/rejected": -2.363661766052246, + "loss": 0.3777, + "odds_ratio_loss": 0.18882401287555695, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03408493846654892, + "rewards/margins": 0.20228123664855957, + "rewards/rejected": -0.2363661825656891, + "sft_loss": 0.340849369764328, + "step": 1555 + }, + { + "epoch": 2.250180766449747, + "grad_norm": 2.5022329892397814, + "learning_rate": 5.704690761895708e-06, + "logits/chosen": -0.2796105444431305, + "logits/rejected": -0.20917150378227234, + "logps/chosen": -0.3368090093135834, + "logps/rejected": -4.430875301361084, + "loss": 0.3872, + "odds_ratio_loss": 0.05535319447517395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03368090093135834, + "rewards/margins": 0.4094066321849823, + "rewards/rejected": -0.44308754801750183, + "sft_loss": 0.3368090093135834, + "step": 1556 + }, + { + "epoch": 2.2516268980477223, + "grad_norm": 2.354413200823632, + "learning_rate": 5.7018805437752366e-06, + "logits/chosen": -0.3531820476055145, + "logits/rejected": -0.2590283751487732, + "logps/chosen": -0.4060213267803192, + "logps/rejected": -2.9722695350646973, + "loss": 0.4336, + "odds_ratio_loss": 0.18422815203666687, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04060213267803192, + "rewards/margins": 0.25662487745285034, + "rewards/rejected": -0.29722699522972107, + "sft_loss": 0.4060213267803192, + "step": 1557 + }, + { + "epoch": 2.253073029645698, + "grad_norm": 2.082596219774037, + "learning_rate": 5.699069299584646e-06, + "logits/chosen": -0.14018461108207703, + "logits/rejected": -0.14732438325881958, + "logps/chosen": -0.30800139904022217, + "logps/rejected": -3.749959945678711, + "loss": 0.403, + "odds_ratio_loss": 0.10707899928092957, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030800141394138336, + "rewards/margins": 0.3441958427429199, + "rewards/rejected": -0.37499600648880005, + "sft_loss": 0.30800139904022217, + "step": 1558 + }, + { + "epoch": 2.2545191612436732, + "grad_norm": 2.631005758226159, + "learning_rate": 5.696257031018847e-06, + "logits/chosen": -0.20500501990318298, + "logits/rejected": -0.15662819147109985, + "logps/chosen": -0.3218449056148529, + "logps/rejected": -4.851985454559326, + "loss": 0.3504, + "odds_ratio_loss": 0.05477791652083397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03218448907136917, + "rewards/margins": 0.4530141055583954, + "rewards/rejected": -0.4851985573768616, + "sft_loss": 0.3218449056148529, + "step": 1559 + }, + { + "epoch": 2.2559652928416485, + "grad_norm": 2.153417090098337, + "learning_rate": 5.6934437397733664e-06, + "logits/chosen": -0.27325424551963806, + "logits/rejected": -0.38711655139923096, + "logps/chosen": -0.3427465856075287, + "logps/rejected": -4.2784833908081055, + "loss": 0.3354, + "odds_ratio_loss": 0.19038382172584534, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03427466005086899, + "rewards/margins": 0.39357367157936096, + "rewards/rejected": -0.42784836888313293, + "sft_loss": 0.3427465856075287, + "step": 1560 + }, + { + "epoch": 2.257411424439624, + "grad_norm": 2.6827162453990163, + "learning_rate": 5.690629427544348e-06, + "logits/chosen": -0.3731663227081299, + "logits/rejected": -0.41208338737487793, + "logps/chosen": -0.34578463435173035, + "logps/rejected": -3.8187103271484375, + "loss": 0.3752, + "odds_ratio_loss": 0.1189727634191513, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.034578464925289154, + "rewards/margins": 0.34729254245758057, + "rewards/rejected": -0.38187098503112793, + "sft_loss": 0.34578463435173035, + "step": 1561 + }, + { + "epoch": 2.2588575560375994, + "grad_norm": 2.321970247707492, + "learning_rate": 5.68781409602855e-06, + "logits/chosen": -0.13874304294586182, + "logits/rejected": -0.1839933842420578, + "logps/chosen": -0.35330072045326233, + "logps/rejected": -3.258653163909912, + "loss": 0.3224, + "odds_ratio_loss": 0.14055302739143372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.035330068320035934, + "rewards/margins": 0.29053524136543274, + "rewards/rejected": -0.3258652985095978, + "sft_loss": 0.35330072045326233, + "step": 1562 + }, + { + "epoch": 2.2603036876355747, + "grad_norm": 2.4312378589175085, + "learning_rate": 5.684997746923349e-06, + "logits/chosen": -0.36342984437942505, + "logits/rejected": -0.18977266550064087, + "logps/chosen": -0.3192789554595947, + "logps/rejected": -4.501648902893066, + "loss": 0.3177, + "odds_ratio_loss": 0.07834473252296448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03192789852619171, + "rewards/margins": 0.41823703050613403, + "rewards/rejected": -0.45016491413116455, + "sft_loss": 0.3192789554595947, + "step": 1563 + }, + { + "epoch": 2.2617498192335503, + "grad_norm": 2.2726822259954726, + "learning_rate": 5.6821803819267306e-06, + "logits/chosen": -0.3450077474117279, + "logits/rejected": -0.1680723875761032, + "logps/chosen": -0.30977359414100647, + "logps/rejected": -3.765933036804199, + "loss": 0.3865, + "odds_ratio_loss": 0.09045100957155228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030977360904216766, + "rewards/margins": 0.345615953207016, + "rewards/rejected": -0.37659329175949097, + "sft_loss": 0.30977359414100647, + "step": 1564 + }, + { + "epoch": 2.2631959508315256, + "grad_norm": 3.067578422370726, + "learning_rate": 5.679362002737295e-06, + "logits/chosen": -0.32226288318634033, + "logits/rejected": -0.35191404819488525, + "logps/chosen": -0.4810445308685303, + "logps/rejected": -3.7931578159332275, + "loss": 0.3509, + "odds_ratio_loss": 0.19164201617240906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04810445010662079, + "rewards/margins": 0.3312113285064697, + "rewards/rejected": -0.3793157935142517, + "sft_loss": 0.4810445308685303, + "step": 1565 + }, + { + "epoch": 2.2646420824295013, + "grad_norm": 3.6725044603624832, + "learning_rate": 5.676542611054253e-06, + "logits/chosen": -0.5231328010559082, + "logits/rejected": -0.33863845467567444, + "logps/chosen": -0.3771399259567261, + "logps/rejected": -2.39211368560791, + "loss": 0.3874, + "odds_ratio_loss": 0.1751105934381485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03771399334073067, + "rewards/margins": 0.20149734616279602, + "rewards/rejected": -0.23921135067939758, + "sft_loss": 0.3771399259567261, + "step": 1566 + }, + { + "epoch": 2.2660882140274765, + "grad_norm": 2.3515505758843367, + "learning_rate": 5.673722208577426e-06, + "logits/chosen": -0.18822497129440308, + "logits/rejected": -0.1882564127445221, + "logps/chosen": -0.2656404376029968, + "logps/rejected": -3.3323557376861572, + "loss": 0.3407, + "odds_ratio_loss": 0.08594746887683868, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026564043015241623, + "rewards/margins": 0.30667150020599365, + "rewards/rejected": -0.33323556184768677, + "sft_loss": 0.2656404376029968, + "step": 1567 + }, + { + "epoch": 2.2675343456254518, + "grad_norm": 2.6577713876240434, + "learning_rate": 5.670900797007246e-06, + "logits/chosen": -0.24050821363925934, + "logits/rejected": -0.2003868669271469, + "logps/chosen": -0.3192717432975769, + "logps/rejected": -2.265965700149536, + "loss": 0.411, + "odds_ratio_loss": 0.08194440603256226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03192717581987381, + "rewards/margins": 0.19466939568519592, + "rewards/rejected": -0.22659656405448914, + "sft_loss": 0.3192717432975769, + "step": 1568 + }, + { + "epoch": 2.2689804772234274, + "grad_norm": 2.2815630278049355, + "learning_rate": 5.668078378044753e-06, + "logits/chosen": -0.2181173712015152, + "logits/rejected": -0.1383255273103714, + "logps/chosen": -0.25415533781051636, + "logps/rejected": -4.44294548034668, + "loss": 0.2843, + "odds_ratio_loss": 0.06722106039524078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025415534153580666, + "rewards/margins": 0.4188790023326874, + "rewards/rejected": -0.4442945420742035, + "sft_loss": 0.25415533781051636, + "step": 1569 + }, + { + "epoch": 2.2704266088214027, + "grad_norm": 4.736083331714562, + "learning_rate": 5.665254953391593e-06, + "logits/chosen": -0.21430820226669312, + "logits/rejected": -0.34477680921554565, + "logps/chosen": -0.43496644496917725, + "logps/rejected": -3.20230770111084, + "loss": 0.3673, + "odds_ratio_loss": 0.12124226987361908, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.043496645987033844, + "rewards/margins": 0.2767341434955597, + "rewards/rejected": -0.32023078203201294, + "sft_loss": 0.43496644496917725, + "step": 1570 + }, + { + "epoch": 2.2718727404193784, + "grad_norm": 2.8236198696675725, + "learning_rate": 5.662430524750021e-06, + "logits/chosen": -0.15283435583114624, + "logits/rejected": -0.19004985690116882, + "logps/chosen": -0.5074213147163391, + "logps/rejected": -4.028660297393799, + "loss": 0.4107, + "odds_ratio_loss": 0.1295696198940277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05074213445186615, + "rewards/margins": 0.3521239459514618, + "rewards/rejected": -0.40286606550216675, + "sft_loss": 0.5074213147163391, + "step": 1571 + }, + { + "epoch": 2.2733188720173536, + "grad_norm": 2.3251385690876085, + "learning_rate": 5.659605093822891e-06, + "logits/chosen": -0.2327795922756195, + "logits/rejected": -0.1698169857263565, + "logps/chosen": -0.4730731248855591, + "logps/rejected": -3.355661392211914, + "loss": 0.4151, + "odds_ratio_loss": 0.19861802458763123, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04730731621384621, + "rewards/margins": 0.288258820772171, + "rewards/rejected": -0.33556610345840454, + "sft_loss": 0.4730731248855591, + "step": 1572 + }, + { + "epoch": 2.274765003615329, + "grad_norm": 2.4179894674242255, + "learning_rate": 5.656778662313671e-06, + "logits/chosen": -0.05721982568502426, + "logits/rejected": -0.03928186744451523, + "logps/chosen": -0.36657410860061646, + "logps/rejected": -2.0526092052459717, + "loss": 0.3521, + "odds_ratio_loss": 0.19514396786689758, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.036657411605119705, + "rewards/margins": 0.16860352456569672, + "rewards/rejected": -0.20526093244552612, + "sft_loss": 0.36657410860061646, + "step": 1573 + }, + { + "epoch": 2.2762111352133045, + "grad_norm": 2.159865190586481, + "learning_rate": 5.653951231926425e-06, + "logits/chosen": -0.19648948311805725, + "logits/rejected": -0.3218488097190857, + "logps/chosen": -0.3456001877784729, + "logps/rejected": -4.398506164550781, + "loss": 0.3824, + "odds_ratio_loss": 0.15548524260520935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03456001728773117, + "rewards/margins": 0.4052906334400177, + "rewards/rejected": -0.4398505985736847, + "sft_loss": 0.3456001877784729, + "step": 1574 + }, + { + "epoch": 2.27765726681128, + "grad_norm": 2.2373719533444305, + "learning_rate": 5.651122804365822e-06, + "logits/chosen": -0.15806661546230316, + "logits/rejected": -0.24433177709579468, + "logps/chosen": -0.28150510787963867, + "logps/rejected": -2.8424034118652344, + "loss": 0.3845, + "odds_ratio_loss": 0.10211706161499023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028150510042905807, + "rewards/margins": 0.25608983635902405, + "rewards/rejected": -0.28424033522605896, + "sft_loss": 0.28150510787963867, + "step": 1575 + }, + { + "epoch": 2.279103398409255, + "grad_norm": 3.2054290088877346, + "learning_rate": 5.6482933813371295e-06, + "logits/chosen": -0.1453315168619156, + "logits/rejected": -0.1947060525417328, + "logps/chosen": -0.5173884630203247, + "logps/rejected": -1.7582117319107056, + "loss": 0.4024, + "odds_ratio_loss": 0.24231034517288208, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05173884332180023, + "rewards/margins": 0.12408233433961868, + "rewards/rejected": -0.1758211851119995, + "sft_loss": 0.5173884630203247, + "step": 1576 + }, + { + "epoch": 2.2805495300072307, + "grad_norm": 2.8464475838685814, + "learning_rate": 5.645462964546218e-06, + "logits/chosen": -0.5189359784126282, + "logits/rejected": -0.32251614332199097, + "logps/chosen": -0.3037870228290558, + "logps/rejected": -2.479979991912842, + "loss": 0.3257, + "odds_ratio_loss": 0.12056048214435577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03037870302796364, + "rewards/margins": 0.21761931478977203, + "rewards/rejected": -0.24799801409244537, + "sft_loss": 0.3037870228290558, + "step": 1577 + }, + { + "epoch": 2.281995661605206, + "grad_norm": 2.6644325848886017, + "learning_rate": 5.642631555699557e-06, + "logits/chosen": -0.13242171704769135, + "logits/rejected": -0.20441709458827972, + "logps/chosen": -0.4285106956958771, + "logps/rejected": -3.3875656127929688, + "loss": 0.3443, + "odds_ratio_loss": 0.12020072340965271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04285107180476189, + "rewards/margins": 0.29590553045272827, + "rewards/rejected": -0.3387565612792969, + "sft_loss": 0.4285106956958771, + "step": 1578 + }, + { + "epoch": 2.2834417932031816, + "grad_norm": 2.2024351205491253, + "learning_rate": 5.639799156504215e-06, + "logits/chosen": -0.0775141790509224, + "logits/rejected": -0.14957457780838013, + "logps/chosen": -0.3367796838283539, + "logps/rejected": -3.4249143600463867, + "loss": 0.4105, + "odds_ratio_loss": 0.09568314254283905, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03367796912789345, + "rewards/margins": 0.3088134527206421, + "rewards/rejected": -0.34249138832092285, + "sft_loss": 0.3367796838283539, + "step": 1579 + }, + { + "epoch": 2.284887924801157, + "grad_norm": 3.0406782139100006, + "learning_rate": 5.636965768667852e-06, + "logits/chosen": -0.1693412810564041, + "logits/rejected": -0.20891311764717102, + "logps/chosen": -0.3991576135158539, + "logps/rejected": -3.2093253135681152, + "loss": 0.3954, + "odds_ratio_loss": 0.15770481526851654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03991575911641121, + "rewards/margins": 0.2810167670249939, + "rewards/rejected": -0.3209325075149536, + "sft_loss": 0.3991576135158539, + "step": 1580 + }, + { + "epoch": 2.2863340563991326, + "grad_norm": 2.3637053224768176, + "learning_rate": 5.6341313938987314e-06, + "logits/chosen": -0.19435811042785645, + "logits/rejected": -0.08956211805343628, + "logps/chosen": -0.40447860956192017, + "logps/rejected": -2.695289373397827, + "loss": 0.4166, + "odds_ratio_loss": 0.14535120129585266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04044786095619202, + "rewards/margins": 0.22908106446266174, + "rewards/rejected": -0.26952892541885376, + "sft_loss": 0.40447860956192017, + "step": 1581 + }, + { + "epoch": 2.287780187997108, + "grad_norm": 2.3223751156245984, + "learning_rate": 5.631296033905707e-06, + "logits/chosen": -0.20546045899391174, + "logits/rejected": -0.13516230881214142, + "logps/chosen": -0.2861219346523285, + "logps/rejected": -4.935697555541992, + "loss": 0.2542, + "odds_ratio_loss": 0.0564875528216362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02861219458281994, + "rewards/margins": 0.4649575650691986, + "rewards/rejected": -0.4935697317123413, + "sft_loss": 0.2861219346523285, + "step": 1582 + }, + { + "epoch": 2.289226319595083, + "grad_norm": 2.4551762245248807, + "learning_rate": 5.628459690398229e-06, + "logits/chosen": -0.2333042323589325, + "logits/rejected": -0.14890116453170776, + "logps/chosen": -0.3023528456687927, + "logps/rejected": -4.237617015838623, + "loss": 0.3242, + "odds_ratio_loss": 0.1010616272687912, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03023528680205345, + "rewards/margins": 0.39352643489837646, + "rewards/rejected": -0.4237616956233978, + "sft_loss": 0.3023528456687927, + "step": 1583 + }, + { + "epoch": 2.2906724511930587, + "grad_norm": 2.3849079762907697, + "learning_rate": 5.625622365086338e-06, + "logits/chosen": -0.2263105809688568, + "logits/rejected": -0.14233849942684174, + "logps/chosen": -0.4132815897464752, + "logps/rejected": -1.7932350635528564, + "loss": 0.394, + "odds_ratio_loss": 0.14649933576583862, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04132816195487976, + "rewards/margins": 0.13799536228179932, + "rewards/rejected": -0.17932350933551788, + "sft_loss": 0.4132815897464752, + "step": 1584 + }, + { + "epoch": 2.292118582791034, + "grad_norm": 2.833070998521513, + "learning_rate": 5.6227840596806685e-06, + "logits/chosen": -0.23122721910476685, + "logits/rejected": -0.17656515538692474, + "logps/chosen": -0.37628495693206787, + "logps/rejected": -2.2837910652160645, + "loss": 0.3649, + "odds_ratio_loss": 0.1681251972913742, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03762849420309067, + "rewards/margins": 0.1907506287097931, + "rewards/rejected": -0.22837910056114197, + "sft_loss": 0.37628495693206787, + "step": 1585 + }, + { + "epoch": 2.293564714389009, + "grad_norm": 2.1224928456477574, + "learning_rate": 5.6199447758924454e-06, + "logits/chosen": -0.1943625807762146, + "logits/rejected": -0.15852569043636322, + "logps/chosen": -0.31918591260910034, + "logps/rejected": -2.6646080017089844, + "loss": 0.3446, + "odds_ratio_loss": 0.10638980567455292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031918592751026154, + "rewards/margins": 0.23454220592975616, + "rewards/rejected": -0.2664608061313629, + "sft_loss": 0.31918591260910034, + "step": 1586 + }, + { + "epoch": 2.295010845986985, + "grad_norm": 2.630639755058868, + "learning_rate": 5.617104515433485e-06, + "logits/chosen": -0.36188462376594543, + "logits/rejected": -0.27973437309265137, + "logps/chosen": -0.36127543449401855, + "logps/rejected": -4.726844787597656, + "loss": 0.3132, + "odds_ratio_loss": 0.20731763541698456, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03612754866480827, + "rewards/margins": 0.43655693531036377, + "rewards/rejected": -0.47268450260162354, + "sft_loss": 0.36127543449401855, + "step": 1587 + }, + { + "epoch": 2.29645697758496, + "grad_norm": 2.327846104888335, + "learning_rate": 5.614263280016188e-06, + "logits/chosen": -0.31197673082351685, + "logits/rejected": -0.1848919838666916, + "logps/chosen": -0.346306711435318, + "logps/rejected": -2.4159696102142334, + "loss": 0.3584, + "odds_ratio_loss": 0.10952720046043396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0346306711435318, + "rewards/margins": 0.20696629583835602, + "rewards/rejected": -0.24159696698188782, + "sft_loss": 0.346306711435318, + "step": 1588 + }, + { + "epoch": 2.297903109182936, + "grad_norm": 3.011830252732662, + "learning_rate": 5.611421071353547e-06, + "logits/chosen": -0.18929818272590637, + "logits/rejected": -0.14918991923332214, + "logps/chosen": -0.23382627964019775, + "logps/rejected": -2.677581310272217, + "loss": 0.2883, + "odds_ratio_loss": 0.07694613188505173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023382626473903656, + "rewards/margins": 0.24437551200389862, + "rewards/rejected": -0.2677581310272217, + "sft_loss": 0.23382627964019775, + "step": 1589 + }, + { + "epoch": 2.299349240780911, + "grad_norm": 2.482707891186509, + "learning_rate": 5.608577891159141e-06, + "logits/chosen": -0.3832859396934509, + "logits/rejected": -0.18635818362236023, + "logps/chosen": -0.3988485038280487, + "logps/rejected": -2.1749815940856934, + "loss": 0.3525, + "odds_ratio_loss": 0.11210395395755768, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03988485038280487, + "rewards/margins": 0.17761333286762238, + "rewards/rejected": -0.21749816834926605, + "sft_loss": 0.3988485038280487, + "step": 1590 + }, + { + "epoch": 2.3007953723788863, + "grad_norm": 2.9702079948943774, + "learning_rate": 5.605733741147135e-06, + "logits/chosen": -0.1605539172887802, + "logits/rejected": -0.23468124866485596, + "logps/chosen": -0.46900007128715515, + "logps/rejected": -1.4842274188995361, + "loss": 0.4442, + "odds_ratio_loss": 0.22805261611938477, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04690001159906387, + "rewards/margins": 0.10152273625135422, + "rewards/rejected": -0.1484227478504181, + "sft_loss": 0.46900007128715515, + "step": 1591 + }, + { + "epoch": 2.302241503976862, + "grad_norm": 2.438494937693134, + "learning_rate": 5.602888623032275e-06, + "logits/chosen": -0.2626468539237976, + "logits/rejected": -0.18816189467906952, + "logps/chosen": -0.30163171887397766, + "logps/rejected": -3.5179507732391357, + "loss": 0.3645, + "odds_ratio_loss": 0.07172201573848724, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030163172632455826, + "rewards/margins": 0.32163190841674805, + "rewards/rejected": -0.3517950773239136, + "sft_loss": 0.30163171887397766, + "step": 1592 + }, + { + "epoch": 2.3036876355748372, + "grad_norm": 2.433363209494988, + "learning_rate": 5.600042538529893e-06, + "logits/chosen": -0.10985489189624786, + "logits/rejected": -0.2541528344154358, + "logps/chosen": -0.3402894139289856, + "logps/rejected": -4.231769561767578, + "loss": 0.3691, + "odds_ratio_loss": 0.15273644030094147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03402893990278244, + "rewards/margins": 0.3891479969024658, + "rewards/rejected": -0.42317694425582886, + "sft_loss": 0.3402894139289856, + "step": 1593 + }, + { + "epoch": 2.305133767172813, + "grad_norm": 2.1024763470220296, + "learning_rate": 5.597195489355907e-06, + "logits/chosen": -0.3650687336921692, + "logits/rejected": -0.36112746596336365, + "logps/chosen": -0.36865419149398804, + "logps/rejected": -2.913234233856201, + "loss": 0.3323, + "odds_ratio_loss": 0.15798337757587433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03686542063951492, + "rewards/margins": 0.2544580399990082, + "rewards/rejected": -0.2913234829902649, + "sft_loss": 0.36865419149398804, + "step": 1594 + }, + { + "epoch": 2.306579898770788, + "grad_norm": 2.426897816658698, + "learning_rate": 5.594347477226811e-06, + "logits/chosen": -0.19730783998966217, + "logits/rejected": -0.31510406732559204, + "logps/chosen": -0.39814573526382446, + "logps/rejected": -3.4227256774902344, + "loss": 0.3904, + "odds_ratio_loss": 0.11480727046728134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03981457278132439, + "rewards/margins": 0.3024579882621765, + "rewards/rejected": -0.3422725796699524, + "sft_loss": 0.39814573526382446, + "step": 1595 + }, + { + "epoch": 2.3080260303687634, + "grad_norm": 2.7323229958390973, + "learning_rate": 5.591498503859683e-06, + "logits/chosen": -0.12184923887252808, + "logits/rejected": -0.12560710310935974, + "logps/chosen": -0.3566288650035858, + "logps/rejected": -2.558873176574707, + "loss": 0.3593, + "odds_ratio_loss": 0.12249592691659927, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03566288948059082, + "rewards/margins": 0.22022442519664764, + "rewards/rejected": -0.25588732957839966, + "sft_loss": 0.3566288650035858, + "step": 1596 + }, + { + "epoch": 2.309472161966739, + "grad_norm": 3.2383620194913374, + "learning_rate": 5.58864857097218e-06, + "logits/chosen": -0.4806884527206421, + "logits/rejected": -0.3084767758846283, + "logps/chosen": -0.3511160612106323, + "logps/rejected": -2.599006175994873, + "loss": 0.357, + "odds_ratio_loss": 0.16197161376476288, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03511160612106323, + "rewards/margins": 0.22478902339935303, + "rewards/rejected": -0.25990062952041626, + "sft_loss": 0.3511160612106323, + "step": 1597 + }, + { + "epoch": 2.3109182935647143, + "grad_norm": 2.8203160493059882, + "learning_rate": 5.585797680282537e-06, + "logits/chosen": -0.17948085069656372, + "logits/rejected": -0.16148388385772705, + "logps/chosen": -0.2975139617919922, + "logps/rejected": -4.275174140930176, + "loss": 0.463, + "odds_ratio_loss": 0.1559380292892456, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.029751399531960487, + "rewards/margins": 0.3977660536766052, + "rewards/rejected": -0.4275174140930176, + "sft_loss": 0.2975139617919922, + "step": 1598 + }, + { + "epoch": 2.3123644251626896, + "grad_norm": 2.553105162778212, + "learning_rate": 5.582945833509567e-06, + "logits/chosen": -0.13604110479354858, + "logits/rejected": -0.11723095923662186, + "logps/chosen": -0.3466201424598694, + "logps/rejected": -2.9810094833374023, + "loss": 0.3345, + "odds_ratio_loss": 0.11769793182611465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03466201201081276, + "rewards/margins": 0.2634389400482178, + "rewards/rejected": -0.29810094833374023, + "sft_loss": 0.3466201424598694, + "step": 1599 + }, + { + "epoch": 2.3138105567606653, + "grad_norm": 2.2628957428181042, + "learning_rate": 5.580093032372657e-06, + "logits/chosen": -0.28234773874282837, + "logits/rejected": -0.3510010540485382, + "logps/chosen": -0.26932573318481445, + "logps/rejected": -5.100521564483643, + "loss": 0.279, + "odds_ratio_loss": 0.08029340207576752, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026932574808597565, + "rewards/margins": 0.48311957716941833, + "rewards/rejected": -0.5100522041320801, + "sft_loss": 0.26932573318481445, + "step": 1600 + }, + { + "epoch": 2.3152566883586405, + "grad_norm": 2.449902283988286, + "learning_rate": 5.577239278591773e-06, + "logits/chosen": -0.1463385969400406, + "logits/rejected": -0.12493189424276352, + "logps/chosen": -0.3543298840522766, + "logps/rejected": -3.84521222114563, + "loss": 0.3948, + "odds_ratio_loss": 0.11065103113651276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03543298691511154, + "rewards/margins": 0.34908822178840637, + "rewards/rejected": -0.3845212161540985, + "sft_loss": 0.3543298840522766, + "step": 1601 + }, + { + "epoch": 2.316702819956616, + "grad_norm": 2.2581803555255164, + "learning_rate": 5.574384573887455e-06, + "logits/chosen": -0.14903423190116882, + "logits/rejected": -0.09336914867162704, + "logps/chosen": -0.2692457437515259, + "logps/rejected": -1.9810104370117188, + "loss": 0.2907, + "odds_ratio_loss": 0.14965462684631348, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026924576610326767, + "rewards/margins": 0.1711764633655548, + "rewards/rejected": -0.19810104370117188, + "sft_loss": 0.2692457437515259, + "step": 1602 + }, + { + "epoch": 2.3181489515545914, + "grad_norm": 2.3125551075396187, + "learning_rate": 5.571528919980813e-06, + "logits/chosen": -0.2181708812713623, + "logits/rejected": -0.14055103063583374, + "logps/chosen": -0.39502519369125366, + "logps/rejected": -2.1328787803649902, + "loss": 0.3481, + "odds_ratio_loss": 0.17443543672561646, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.039502520114183426, + "rewards/margins": 0.17378535866737366, + "rewards/rejected": -0.2132878601551056, + "sft_loss": 0.39502519369125366, + "step": 1603 + }, + { + "epoch": 2.319595083152567, + "grad_norm": 2.4434556007442434, + "learning_rate": 5.568672318593532e-06, + "logits/chosen": -0.14153406023979187, + "logits/rejected": -0.12052149325609207, + "logps/chosen": -0.2865070104598999, + "logps/rejected": -1.733748197555542, + "loss": 0.365, + "odds_ratio_loss": 0.09804748743772507, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02865069918334484, + "rewards/margins": 0.14472413063049316, + "rewards/rejected": -0.17337481677532196, + "sft_loss": 0.2865070104598999, + "step": 1604 + }, + { + "epoch": 2.3210412147505424, + "grad_norm": 2.6434099915570104, + "learning_rate": 5.5658147714478674e-06, + "logits/chosen": -0.2638757824897766, + "logits/rejected": -0.15295341610908508, + "logps/chosen": -0.3959035277366638, + "logps/rejected": -2.8813705444335938, + "loss": 0.3362, + "odds_ratio_loss": 0.1301441639661789, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03959035128355026, + "rewards/margins": 0.24854671955108643, + "rewards/rejected": -0.2881370782852173, + "sft_loss": 0.3959035277366638, + "step": 1605 + }, + { + "epoch": 2.3224873463485176, + "grad_norm": 2.322832426601466, + "learning_rate": 5.5629562802666466e-06, + "logits/chosen": -0.08946573734283447, + "logits/rejected": -0.08883717656135559, + "logps/chosen": -0.22113816440105438, + "logps/rejected": -5.388749599456787, + "loss": 0.3381, + "odds_ratio_loss": 0.07277220487594604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022113818675279617, + "rewards/margins": 0.5167611837387085, + "rewards/rejected": -0.5388749837875366, + "sft_loss": 0.22113816440105438, + "step": 1606 + }, + { + "epoch": 2.3239334779464933, + "grad_norm": 2.072956257971344, + "learning_rate": 5.5600968467732624e-06, + "logits/chosen": -0.1779942810535431, + "logits/rejected": -0.30677855014801025, + "logps/chosen": -0.3421142101287842, + "logps/rejected": -3.0264923572540283, + "loss": 0.3417, + "odds_ratio_loss": 0.1177440956234932, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0342114195227623, + "rewards/margins": 0.26843780279159546, + "rewards/rejected": -0.30264922976493835, + "sft_loss": 0.3421142101287842, + "step": 1607 + }, + { + "epoch": 2.3253796095444685, + "grad_norm": 2.1562983279773458, + "learning_rate": 5.557236472691679e-06, + "logits/chosen": -0.3682592511177063, + "logits/rejected": -0.2512025237083435, + "logps/chosen": -0.4082300066947937, + "logps/rejected": -4.011061668395996, + "loss": 0.3899, + "odds_ratio_loss": 0.14864492416381836, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04082300141453743, + "rewards/margins": 0.36028316617012024, + "rewards/rejected": -0.40110617876052856, + "sft_loss": 0.4082300066947937, + "step": 1608 + }, + { + "epoch": 2.326825741142444, + "grad_norm": 2.3589668387098146, + "learning_rate": 5.554375159746426e-06, + "logits/chosen": -0.08552505075931549, + "logits/rejected": -0.08895818889141083, + "logps/chosen": -0.37686607241630554, + "logps/rejected": -4.821796894073486, + "loss": 0.3431, + "odds_ratio_loss": 0.1013682559132576, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.037686608731746674, + "rewards/margins": 0.4444930851459503, + "rewards/rejected": -0.4821796715259552, + "sft_loss": 0.37686607241630554, + "step": 1609 + }, + { + "epoch": 2.3282718727404195, + "grad_norm": 2.573254384047697, + "learning_rate": 5.551512909662601e-06, + "logits/chosen": -0.2964475452899933, + "logits/rejected": -0.2937246263027191, + "logps/chosen": -0.36214813590049744, + "logps/rejected": -4.1340742111206055, + "loss": 0.3698, + "odds_ratio_loss": 0.11668211221694946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.036214813590049744, + "rewards/margins": 0.3771926462650299, + "rewards/rejected": -0.41340741515159607, + "sft_loss": 0.36214813590049744, + "step": 1610 + }, + { + "epoch": 2.3297180043383947, + "grad_norm": 2.3020655070754286, + "learning_rate": 5.548649724165864e-06, + "logits/chosen": -0.2145930677652359, + "logits/rejected": -0.21104787290096283, + "logps/chosen": -0.43282127380371094, + "logps/rejected": -2.7367076873779297, + "loss": 0.404, + "odds_ratio_loss": 0.1472308188676834, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04328212887048721, + "rewards/margins": 0.23038864135742188, + "rewards/rejected": -0.2736707925796509, + "sft_loss": 0.43282127380371094, + "step": 1611 + }, + { + "epoch": 2.3311641359363704, + "grad_norm": 2.51857465350935, + "learning_rate": 5.545785604982441e-06, + "logits/chosen": -0.2099495381116867, + "logits/rejected": -0.15136927366256714, + "logps/chosen": -0.34850409626960754, + "logps/rejected": -3.427006244659424, + "loss": 0.3156, + "odds_ratio_loss": 0.13121187686920166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034850407391786575, + "rewards/margins": 0.3078502118587494, + "rewards/rejected": -0.34270063042640686, + "sft_loss": 0.34850409626960754, + "step": 1612 + }, + { + "epoch": 2.3326102675343456, + "grad_norm": 2.1677477412206443, + "learning_rate": 5.542920553839118e-06, + "logits/chosen": -0.13443297147750854, + "logits/rejected": -0.10968001186847687, + "logps/chosen": -0.3220829367637634, + "logps/rejected": -4.633582592010498, + "loss": 0.3408, + "odds_ratio_loss": 0.14093899726867676, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03220829367637634, + "rewards/margins": 0.431149959564209, + "rewards/rejected": -0.4633582532405853, + "sft_loss": 0.3220829367637634, + "step": 1613 + }, + { + "epoch": 2.334056399132321, + "grad_norm": 2.3359773826568775, + "learning_rate": 5.540054572463249e-06, + "logits/chosen": -0.17375683784484863, + "logits/rejected": -0.05261297523975372, + "logps/chosen": -0.3510321378707886, + "logps/rejected": -3.398996353149414, + "loss": 0.3801, + "odds_ratio_loss": 0.11157586425542831, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0351032130420208, + "rewards/margins": 0.3047964572906494, + "rewards/rejected": -0.3398996591567993, + "sft_loss": 0.3510321378707886, + "step": 1614 + }, + { + "epoch": 2.3355025307302966, + "grad_norm": 2.4381191286475516, + "learning_rate": 5.5371876625827405e-06, + "logits/chosen": -0.00956578366458416, + "logits/rejected": -0.046139203011989594, + "logps/chosen": -0.3790590167045593, + "logps/rejected": -3.913565158843994, + "loss": 0.3705, + "odds_ratio_loss": 0.16183798015117645, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03790590539574623, + "rewards/margins": 0.35345059633255005, + "rewards/rejected": -0.39135652780532837, + "sft_loss": 0.3790590167045593, + "step": 1615 + }, + { + "epoch": 2.336948662328272, + "grad_norm": 2.6120749263979466, + "learning_rate": 5.534319825926066e-06, + "logits/chosen": -0.1352277547121048, + "logits/rejected": -0.2033660113811493, + "logps/chosen": -0.46139898896217346, + "logps/rejected": -2.7106308937072754, + "loss": 0.4047, + "odds_ratio_loss": 0.11662759631872177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.046139899641275406, + "rewards/margins": 0.22492320835590363, + "rewards/rejected": -0.27106308937072754, + "sft_loss": 0.46139898896217346, + "step": 1616 + }, + { + "epoch": 2.3383947939262475, + "grad_norm": 2.5091957811194825, + "learning_rate": 5.531451064222254e-06, + "logits/chosen": -0.4063303470611572, + "logits/rejected": -0.35132384300231934, + "logps/chosen": -0.30066221952438354, + "logps/rejected": -2.8382208347320557, + "loss": 0.3784, + "odds_ratio_loss": 0.09488758444786072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030066223815083504, + "rewards/margins": 0.2537558674812317, + "rewards/rejected": -0.28382205963134766, + "sft_loss": 0.30066221952438354, + "step": 1617 + }, + { + "epoch": 2.3398409255242227, + "grad_norm": 4.2557519976659055, + "learning_rate": 5.528581379200892e-06, + "logits/chosen": -0.1894337385892868, + "logits/rejected": -0.3243355453014374, + "logps/chosen": -0.5176692605018616, + "logps/rejected": -2.7984094619750977, + "loss": 0.3693, + "odds_ratio_loss": 0.24235056340694427, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.051766932010650635, + "rewards/margins": 0.22807404398918152, + "rewards/rejected": -0.27984097599983215, + "sft_loss": 0.5176692605018616, + "step": 1618 + }, + { + "epoch": 2.341287057122198, + "grad_norm": 2.181643600483142, + "learning_rate": 5.525710772592123e-06, + "logits/chosen": -0.1662239134311676, + "logits/rejected": -0.26300936937332153, + "logps/chosen": -0.27158379554748535, + "logps/rejected": -3.557821273803711, + "loss": 0.3126, + "odds_ratio_loss": 0.13274052739143372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027158383280038834, + "rewards/margins": 0.3286237418651581, + "rewards/rejected": -0.3557821214199066, + "sft_loss": 0.27158379554748535, + "step": 1619 + }, + { + "epoch": 2.3427331887201737, + "grad_norm": 3.107249690135884, + "learning_rate": 5.522839246126646e-06, + "logits/chosen": -0.32011640071868896, + "logits/rejected": -0.22637443244457245, + "logps/chosen": -0.4642205238342285, + "logps/rejected": -2.9727323055267334, + "loss": 0.4463, + "odds_ratio_loss": 0.13931193947792053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04642205685377121, + "rewards/margins": 0.25085121393203735, + "rewards/rejected": -0.2972732186317444, + "sft_loss": 0.4642205238342285, + "step": 1620 + }, + { + "epoch": 2.344179320318149, + "grad_norm": 2.2401661446990166, + "learning_rate": 5.519966801535716e-06, + "logits/chosen": -0.3168110251426697, + "logits/rejected": -0.1873689442873001, + "logps/chosen": -0.36878371238708496, + "logps/rejected": -3.3086607456207275, + "loss": 0.4047, + "odds_ratio_loss": 0.13117769360542297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03687836974859238, + "rewards/margins": 0.2939877212047577, + "rewards/rejected": -0.33086609840393066, + "sft_loss": 0.36878371238708496, + "step": 1621 + }, + { + "epoch": 2.345625451916124, + "grad_norm": 2.477972330148887, + "learning_rate": 5.5170934405511415e-06, + "logits/chosen": -0.2881897985935211, + "logits/rejected": -0.20093801617622375, + "logps/chosen": -0.3505580425262451, + "logps/rejected": -2.470184564590454, + "loss": 0.3773, + "odds_ratio_loss": 0.11334122717380524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03505580127239227, + "rewards/margins": 0.21196265518665314, + "rewards/rejected": -0.2470184564590454, + "sft_loss": 0.3505580425262451, + "step": 1622 + }, + { + "epoch": 2.3470715835141, + "grad_norm": 2.620054583835237, + "learning_rate": 5.514219164905281e-06, + "logits/chosen": -0.19945791363716125, + "logits/rejected": -0.13795584440231323, + "logps/chosen": -0.34572529792785645, + "logps/rejected": -3.0925025939941406, + "loss": 0.3618, + "odds_ratio_loss": 0.11854077130556107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034572526812553406, + "rewards/margins": 0.27467772364616394, + "rewards/rejected": -0.30925023555755615, + "sft_loss": 0.34572529792785645, + "step": 1623 + }, + { + "epoch": 2.348517715112075, + "grad_norm": 2.6367388267693053, + "learning_rate": 5.511343976331046e-06, + "logits/chosen": -0.11850079149007797, + "logits/rejected": -0.20706413686275482, + "logps/chosen": -0.238744854927063, + "logps/rejected": -4.866823196411133, + "loss": 0.3102, + "odds_ratio_loss": 0.05267883464694023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02387448400259018, + "rewards/margins": 0.462807834148407, + "rewards/rejected": -0.48668232560157776, + "sft_loss": 0.238744854927063, + "step": 1624 + }, + { + "epoch": 2.3499638467100508, + "grad_norm": 2.4421056602770745, + "learning_rate": 5.5084678765618994e-06, + "logits/chosen": -0.037788279354572296, + "logits/rejected": -0.1496773064136505, + "logps/chosen": -0.2970682382583618, + "logps/rejected": -5.067946434020996, + "loss": 0.3392, + "odds_ratio_loss": 0.13825398683547974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02970682457089424, + "rewards/margins": 0.4770878553390503, + "rewards/rejected": -0.5067946314811707, + "sft_loss": 0.2970682382583618, + "step": 1625 + }, + { + "epoch": 2.351409978308026, + "grad_norm": 3.113000332950432, + "learning_rate": 5.505590867331852e-06, + "logits/chosen": -0.004002872854471207, + "logits/rejected": 0.004178255796432495, + "logps/chosen": -0.26595339179039, + "logps/rejected": -3.0875930786132812, + "loss": 0.3252, + "odds_ratio_loss": 0.12349444627761841, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02659534104168415, + "rewards/margins": 0.28216397762298584, + "rewards/rejected": -0.30875933170318604, + "sft_loss": 0.26595339179039, + "step": 1626 + }, + { + "epoch": 2.3528561099060017, + "grad_norm": 3.2436647624885144, + "learning_rate": 5.502712950375462e-06, + "logits/chosen": -0.27105289697647095, + "logits/rejected": -0.2777266204357147, + "logps/chosen": -0.29128292202949524, + "logps/rejected": -3.751927375793457, + "loss": 0.356, + "odds_ratio_loss": 0.08841782808303833, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029128294438123703, + "rewards/margins": 0.3460644483566284, + "rewards/rejected": -0.3751927316188812, + "sft_loss": 0.29128292202949524, + "step": 1627 + }, + { + "epoch": 2.354302241503977, + "grad_norm": 2.247157821160996, + "learning_rate": 5.499834127427839e-06, + "logits/chosen": -0.23087726533412933, + "logits/rejected": -0.21010765433311462, + "logps/chosen": -0.28846973180770874, + "logps/rejected": -2.382047653198242, + "loss": 0.3048, + "odds_ratio_loss": 0.1609227955341339, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.028846973553299904, + "rewards/margins": 0.20935779809951782, + "rewards/rejected": -0.23820477724075317, + "sft_loss": 0.28846973180770874, + "step": 1628 + }, + { + "epoch": 2.355748373101952, + "grad_norm": 2.3409386872383005, + "learning_rate": 5.4969544002246355e-06, + "logits/chosen": -0.13611923158168793, + "logits/rejected": -0.17130246758460999, + "logps/chosen": -0.38131436705589294, + "logps/rejected": -2.489086389541626, + "loss": 0.3294, + "odds_ratio_loss": 0.10580040514469147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038131434470415115, + "rewards/margins": 0.21077720820903778, + "rewards/rejected": -0.2489086389541626, + "sft_loss": 0.38131436705589294, + "step": 1629 + }, + { + "epoch": 2.357194504699928, + "grad_norm": 2.2600185027758273, + "learning_rate": 5.494073770502046e-06, + "logits/chosen": -0.17199571430683136, + "logits/rejected": -0.29529863595962524, + "logps/chosen": -0.3580894470214844, + "logps/rejected": -1.801833152770996, + "loss": 0.3892, + "odds_ratio_loss": 0.19995692372322083, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03580894321203232, + "rewards/margins": 0.14437437057495117, + "rewards/rejected": -0.1801833212375641, + "sft_loss": 0.3580894470214844, + "step": 1630 + }, + { + "epoch": 2.358640636297903, + "grad_norm": 2.1930278604061755, + "learning_rate": 5.4911922399968175e-06, + "logits/chosen": -0.2684285640716553, + "logits/rejected": -0.20364916324615479, + "logps/chosen": -0.41383859515190125, + "logps/rejected": -2.9595816135406494, + "loss": 0.3904, + "odds_ratio_loss": 0.12188836932182312, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04138386249542236, + "rewards/margins": 0.2545742988586426, + "rewards/rejected": -0.29595816135406494, + "sft_loss": 0.41383859515190125, + "step": 1631 + }, + { + "epoch": 2.3600867678958783, + "grad_norm": 2.1821309968908364, + "learning_rate": 5.488309810446233e-06, + "logits/chosen": -0.06782963871955872, + "logits/rejected": -0.2602359652519226, + "logps/chosen": -0.2693048417568207, + "logps/rejected": -6.274295330047607, + "loss": 0.3377, + "odds_ratio_loss": 0.12266825139522552, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.026930484920740128, + "rewards/margins": 0.6004990339279175, + "rewards/rejected": -0.6274295449256897, + "sft_loss": 0.2693048417568207, + "step": 1632 + }, + { + "epoch": 2.361532899493854, + "grad_norm": 2.279263386951824, + "learning_rate": 5.485426483588121e-06, + "logits/chosen": -0.25602948665618896, + "logits/rejected": -0.21806968748569489, + "logps/chosen": -0.3889194428920746, + "logps/rejected": -2.4275267124176025, + "loss": 0.369, + "odds_ratio_loss": 0.11265772581100464, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03889194503426552, + "rewards/margins": 0.20386072993278503, + "rewards/rejected": -0.24275267124176025, + "sft_loss": 0.3889194428920746, + "step": 1633 + }, + { + "epoch": 2.3629790310918293, + "grad_norm": 2.4595078931890475, + "learning_rate": 5.482542261160849e-06, + "logits/chosen": -0.17380627989768982, + "logits/rejected": -0.06503782421350479, + "logps/chosen": -0.33673372864723206, + "logps/rejected": -2.4677717685699463, + "loss": 0.3231, + "odds_ratio_loss": 0.12649573385715485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033673375844955444, + "rewards/margins": 0.21310380101203918, + "rewards/rejected": -0.24677720665931702, + "sft_loss": 0.33673372864723206, + "step": 1634 + }, + { + "epoch": 2.364425162689805, + "grad_norm": 2.1872419569650052, + "learning_rate": 5.479657144903327e-06, + "logits/chosen": -0.30153438448905945, + "logits/rejected": -0.22050045430660248, + "logps/chosen": -0.31074702739715576, + "logps/rejected": -2.7804017066955566, + "loss": 0.3438, + "odds_ratio_loss": 0.09722352027893066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031074702739715576, + "rewards/margins": 0.24696548283100128, + "rewards/rejected": -0.27804017066955566, + "sft_loss": 0.31074702739715576, + "step": 1635 + }, + { + "epoch": 2.36587129428778, + "grad_norm": 2.370571176585605, + "learning_rate": 5.476771136555002e-06, + "logits/chosen": -0.20978805422782898, + "logits/rejected": -0.404296338558197, + "logps/chosen": -0.3532348573207855, + "logps/rejected": -2.493743658065796, + "loss": 0.4171, + "odds_ratio_loss": 0.16929349303245544, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03532348573207855, + "rewards/margins": 0.21405087411403656, + "rewards/rejected": -0.2493743747472763, + "sft_loss": 0.3532348573207855, + "step": 1636 + }, + { + "epoch": 2.3673174258857554, + "grad_norm": 2.0667254729152478, + "learning_rate": 5.4738842378558596e-06, + "logits/chosen": -0.304945170879364, + "logits/rejected": -0.3427213430404663, + "logps/chosen": -0.3743901550769806, + "logps/rejected": -3.3770601749420166, + "loss": 0.2937, + "odds_ratio_loss": 0.16788852214813232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03743901476264, + "rewards/margins": 0.3002670109272003, + "rewards/rejected": -0.3377059996128082, + "sft_loss": 0.3743901550769806, + "step": 1637 + }, + { + "epoch": 2.368763557483731, + "grad_norm": 2.425960860579774, + "learning_rate": 5.470996450546419e-06, + "logits/chosen": -0.11950647830963135, + "logits/rejected": -0.08787774294614792, + "logps/chosen": -0.1783597767353058, + "logps/rejected": -3.924421787261963, + "loss": 0.3464, + "odds_ratio_loss": 0.088666170835495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01783597841858864, + "rewards/margins": 0.37460625171661377, + "rewards/rejected": -0.3924421966075897, + "sft_loss": 0.1783597767353058, + "step": 1638 + }, + { + "epoch": 2.3702096890817064, + "grad_norm": 2.4264887671943516, + "learning_rate": 5.46810777636774e-06, + "logits/chosen": -0.16909067332744598, + "logits/rejected": -0.26136305928230286, + "logps/chosen": -0.4223901629447937, + "logps/rejected": -3.608455181121826, + "loss": 0.3747, + "odds_ratio_loss": 0.06967879831790924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04223902150988579, + "rewards/margins": 0.31860652565956116, + "rewards/rejected": -0.36084550619125366, + "sft_loss": 0.4223901629447937, + "step": 1639 + }, + { + "epoch": 2.371655820679682, + "grad_norm": 2.8905963601226983, + "learning_rate": 5.465218217061415e-06, + "logits/chosen": -0.25168269872665405, + "logits/rejected": -0.36768415570259094, + "logps/chosen": -0.29154396057128906, + "logps/rejected": -4.554935455322266, + "loss": 0.3803, + "odds_ratio_loss": 0.09950277209281921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029154395684599876, + "rewards/margins": 0.42633917927742004, + "rewards/rejected": -0.4554935693740845, + "sft_loss": 0.29154396057128906, + "step": 1640 + }, + { + "epoch": 2.3731019522776573, + "grad_norm": 2.3523138704511686, + "learning_rate": 5.46232777436957e-06, + "logits/chosen": -0.3319550156593323, + "logits/rejected": -0.25737500190734863, + "logps/chosen": -0.35822594165802, + "logps/rejected": -4.630117893218994, + "loss": 0.3781, + "odds_ratio_loss": 0.08917144685983658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03582259640097618, + "rewards/margins": 0.4271891713142395, + "rewards/rejected": -0.46301180124282837, + "sft_loss": 0.35822594165802, + "step": 1641 + }, + { + "epoch": 2.3745480838756325, + "grad_norm": 2.244224679372636, + "learning_rate": 5.4594364500348635e-06, + "logits/chosen": -0.2159147709608078, + "logits/rejected": -0.1707444190979004, + "logps/chosen": -0.1780027151107788, + "logps/rejected": -5.523977279663086, + "loss": 0.3182, + "odds_ratio_loss": 0.062143001705408096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01780027337372303, + "rewards/margins": 0.5345974564552307, + "rewards/rejected": -0.5523977279663086, + "sft_loss": 0.1780027151107788, + "step": 1642 + }, + { + "epoch": 2.3759942154736082, + "grad_norm": 2.97272951082299, + "learning_rate": 5.456544245800486e-06, + "logits/chosen": -0.43455255031585693, + "logits/rejected": -0.26484546065330505, + "logps/chosen": -0.2586551010608673, + "logps/rejected": -4.144586086273193, + "loss": 0.3175, + "odds_ratio_loss": 0.0730627104640007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02586551196873188, + "rewards/margins": 0.38859307765960693, + "rewards/rejected": -0.41445863246917725, + "sft_loss": 0.2586551010608673, + "step": 1643 + }, + { + "epoch": 2.3774403470715835, + "grad_norm": 3.159824614056135, + "learning_rate": 5.453651163410157e-06, + "logits/chosen": -0.2961457669734955, + "logits/rejected": -0.3236675262451172, + "logps/chosen": -0.4048958718776703, + "logps/rejected": -5.435911178588867, + "loss": 0.3044, + "odds_ratio_loss": 0.07996393740177155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04048958793282509, + "rewards/margins": 0.5031015872955322, + "rewards/rejected": -0.5435911417007446, + "sft_loss": 0.4048958718776703, + "step": 1644 + }, + { + "epoch": 2.3788864786695587, + "grad_norm": 2.4232305825130873, + "learning_rate": 5.45075720460813e-06, + "logits/chosen": -0.2177274227142334, + "logits/rejected": -0.2560899257659912, + "logps/chosen": -0.39400598406791687, + "logps/rejected": -3.329862594604492, + "loss": 0.3572, + "odds_ratio_loss": 0.17939239740371704, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.039400599896907806, + "rewards/margins": 0.2935856580734253, + "rewards/rejected": -0.3329862654209137, + "sft_loss": 0.39400598406791687, + "step": 1645 + }, + { + "epoch": 2.3803326102675344, + "grad_norm": 2.3769012497590136, + "learning_rate": 5.4478623711391785e-06, + "logits/chosen": -0.41154927015304565, + "logits/rejected": -0.1991625726222992, + "logps/chosen": -0.26996538043022156, + "logps/rejected": -3.1177456378936768, + "loss": 0.3654, + "odds_ratio_loss": 0.10832569748163223, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.026996538043022156, + "rewards/margins": 0.28477805852890015, + "rewards/rejected": -0.3117745816707611, + "sft_loss": 0.26996538043022156, + "step": 1646 + }, + { + "epoch": 2.3817787418655096, + "grad_norm": 2.9369047044268206, + "learning_rate": 5.4449666647486125e-06, + "logits/chosen": -0.13625237345695496, + "logits/rejected": -0.08367015421390533, + "logps/chosen": -0.43870049715042114, + "logps/rejected": -2.0339059829711914, + "loss": 0.4882, + "odds_ratio_loss": 0.16639426350593567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.043870046734809875, + "rewards/margins": 0.15952055156230927, + "rewards/rejected": -0.20339059829711914, + "sft_loss": 0.43870049715042114, + "step": 1647 + }, + { + "epoch": 2.3832248734634853, + "grad_norm": 6.563565558670957, + "learning_rate": 5.4420700871822616e-06, + "logits/chosen": -0.25511327385902405, + "logits/rejected": -0.1838105469942093, + "logps/chosen": -0.46203696727752686, + "logps/rejected": -4.043597221374512, + "loss": 0.3845, + "odds_ratio_loss": 0.19478821754455566, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04620370268821716, + "rewards/margins": 0.3581559956073761, + "rewards/rejected": -0.40435969829559326, + "sft_loss": 0.46203696727752686, + "step": 1648 + }, + { + "epoch": 2.3846710050614606, + "grad_norm": 2.4494751986770646, + "learning_rate": 5.439172640186484e-06, + "logits/chosen": -0.07603715360164642, + "logits/rejected": -0.08391943573951721, + "logps/chosen": -0.47998425364494324, + "logps/rejected": -4.149824142456055, + "loss": 0.3811, + "odds_ratio_loss": 0.16126686334609985, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04799842834472656, + "rewards/margins": 0.36698397994041443, + "rewards/rejected": -0.4149824380874634, + "sft_loss": 0.47998425364494324, + "step": 1649 + }, + { + "epoch": 2.3861171366594363, + "grad_norm": 3.217436034110034, + "learning_rate": 5.436274325508164e-06, + "logits/chosen": -0.1692204475402832, + "logits/rejected": -0.18024393916130066, + "logps/chosen": -0.3294297754764557, + "logps/rejected": -2.8294403553009033, + "loss": 0.3719, + "odds_ratio_loss": 0.11639772355556488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03294298052787781, + "rewards/margins": 0.25000107288360596, + "rewards/rejected": -0.28294405341148376, + "sft_loss": 0.3294297754764557, + "step": 1650 + }, + { + "epoch": 2.3875632682574115, + "grad_norm": 2.43631615840935, + "learning_rate": 5.433375144894701e-06, + "logits/chosen": -0.21938952803611755, + "logits/rejected": -0.1646469086408615, + "logps/chosen": -0.2409743219614029, + "logps/rejected": -4.187996864318848, + "loss": 0.3565, + "odds_ratio_loss": 0.03713899105787277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02409743145108223, + "rewards/margins": 0.3947022259235382, + "rewards/rejected": -0.4187996983528137, + "sft_loss": 0.2409743219614029, + "step": 1651 + }, + { + "epoch": 2.3890093998553867, + "grad_norm": 2.6636340250404253, + "learning_rate": 5.430475100094026e-06, + "logits/chosen": -0.16366271674633026, + "logits/rejected": -0.2531028091907501, + "logps/chosen": -0.3762364983558655, + "logps/rejected": -4.51442289352417, + "loss": 0.3527, + "odds_ratio_loss": 0.1351589560508728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03762365132570267, + "rewards/margins": 0.4138185977935791, + "rewards/rejected": -0.45144224166870117, + "sft_loss": 0.3762364983558655, + "step": 1652 + }, + { + "epoch": 2.3904555314533624, + "grad_norm": 2.1635789447744815, + "learning_rate": 5.427574192854586e-06, + "logits/chosen": -0.18240058422088623, + "logits/rejected": -0.12168880552053452, + "logps/chosen": -0.3345116376876831, + "logps/rejected": -3.058701753616333, + "loss": 0.3597, + "odds_ratio_loss": 0.09098277986049652, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03345116227865219, + "rewards/margins": 0.2724190056324005, + "rewards/rejected": -0.3058701753616333, + "sft_loss": 0.3345116376876831, + "step": 1653 + }, + { + "epoch": 2.3919016630513377, + "grad_norm": 2.0656474748307363, + "learning_rate": 5.424672424925347e-06, + "logits/chosen": -0.13181257247924805, + "logits/rejected": -0.0836930125951767, + "logps/chosen": -0.2531171441078186, + "logps/rejected": -3.279249906539917, + "loss": 0.3388, + "odds_ratio_loss": 0.08992984890937805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02531171217560768, + "rewards/margins": 0.3026133179664612, + "rewards/rejected": -0.32792502641677856, + "sft_loss": 0.2531171441078186, + "step": 1654 + }, + { + "epoch": 2.393347794649313, + "grad_norm": 2.2867887869624126, + "learning_rate": 5.4217697980557986e-06, + "logits/chosen": -0.18714286386966705, + "logits/rejected": -0.2178667187690735, + "logps/chosen": -0.456027090549469, + "logps/rejected": -3.494032621383667, + "loss": 0.3659, + "odds_ratio_loss": 0.1177714616060257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0456027090549469, + "rewards/margins": 0.3038005530834198, + "rewards/rejected": -0.3494032621383667, + "sft_loss": 0.456027090549469, + "step": 1655 + }, + { + "epoch": 2.3947939262472886, + "grad_norm": 2.385763157906236, + "learning_rate": 5.418866313995942e-06, + "logits/chosen": -0.14378896355628967, + "logits/rejected": -0.02609366364777088, + "logps/chosen": -0.33504900336265564, + "logps/rejected": -2.4867379665374756, + "loss": 0.3178, + "odds_ratio_loss": 0.10914792120456696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033504899591207504, + "rewards/margins": 0.21516892313957214, + "rewards/rejected": -0.24867381155490875, + "sft_loss": 0.33504900336265564, + "step": 1656 + }, + { + "epoch": 2.396240057845264, + "grad_norm": 2.5382250015062793, + "learning_rate": 5.415961974496303e-06, + "logits/chosen": -0.27873504161834717, + "logits/rejected": -0.10101763904094696, + "logps/chosen": -0.5347107648849487, + "logps/rejected": -2.174900531768799, + "loss": 0.4443, + "odds_ratio_loss": 0.18945449590682983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05347108095884323, + "rewards/margins": 0.16401898860931396, + "rewards/rejected": -0.2174900472164154, + "sft_loss": 0.5347107648849487, + "step": 1657 + }, + { + "epoch": 2.3976861894432395, + "grad_norm": 2.094994084526692, + "learning_rate": 5.413056781307913e-06, + "logits/chosen": -0.09330153465270996, + "logits/rejected": -0.0472065694630146, + "logps/chosen": -0.34694916009902954, + "logps/rejected": -2.9058146476745605, + "loss": 0.427, + "odds_ratio_loss": 0.10710655152797699, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03469491004943848, + "rewards/margins": 0.2558865547180176, + "rewards/rejected": -0.29058146476745605, + "sft_loss": 0.34694916009902954, + "step": 1658 + }, + { + "epoch": 2.3991323210412148, + "grad_norm": 2.3038775902562425, + "learning_rate": 5.4101507361823276e-06, + "logits/chosen": -0.3557550609111786, + "logits/rejected": -0.37921270728111267, + "logps/chosen": -0.3219497799873352, + "logps/rejected": -3.9622740745544434, + "loss": 0.3468, + "odds_ratio_loss": 0.13163802027702332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03219497948884964, + "rewards/margins": 0.36403244733810425, + "rewards/rejected": -0.3962274193763733, + "sft_loss": 0.3219497799873352, + "step": 1659 + }, + { + "epoch": 2.40057845263919, + "grad_norm": 2.3419223245541727, + "learning_rate": 5.407243840871612e-06, + "logits/chosen": -0.20818328857421875, + "logits/rejected": -0.16019529104232788, + "logps/chosen": -0.39010053873062134, + "logps/rejected": -2.511380672454834, + "loss": 0.363, + "odds_ratio_loss": 0.1622430980205536, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039010051637887955, + "rewards/margins": 0.21212802827358246, + "rewards/rejected": -0.2511380910873413, + "sft_loss": 0.39010053873062134, + "step": 1660 + }, + { + "epoch": 2.4020245842371657, + "grad_norm": 4.380688001278866, + "learning_rate": 5.404336097128343e-06, + "logits/chosen": -0.2879038155078888, + "logits/rejected": -0.30223992466926575, + "logps/chosen": -0.4455604553222656, + "logps/rejected": -2.5138981342315674, + "loss": 0.3703, + "odds_ratio_loss": 0.15395966172218323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04455604776740074, + "rewards/margins": 0.20683377981185913, + "rewards/rejected": -0.25138983130455017, + "sft_loss": 0.4455604553222656, + "step": 1661 + }, + { + "epoch": 2.403470715835141, + "grad_norm": 2.3187283533283938, + "learning_rate": 5.401427506705611e-06, + "logits/chosen": -0.08860334753990173, + "logits/rejected": -0.08838482946157455, + "logps/chosen": -0.2660841941833496, + "logps/rejected": -3.3898189067840576, + "loss": 0.4052, + "odds_ratio_loss": 0.09372745454311371, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0266084186732769, + "rewards/margins": 0.31237348914146423, + "rewards/rejected": -0.33898189663887024, + "sft_loss": 0.2660841941833496, + "step": 1662 + }, + { + "epoch": 2.4049168474331166, + "grad_norm": 2.7227957969637595, + "learning_rate": 5.398518071357015e-06, + "logits/chosen": -0.1172524020075798, + "logits/rejected": -0.33546555042266846, + "logps/chosen": -0.37985527515411377, + "logps/rejected": -2.7703769207000732, + "loss": 0.4467, + "odds_ratio_loss": 0.2563801407814026, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.037985529750585556, + "rewards/margins": 0.2390521615743637, + "rewards/rejected": -0.27703768014907837, + "sft_loss": 0.37985527515411377, + "step": 1663 + }, + { + "epoch": 2.406362979031092, + "grad_norm": 2.773528286720876, + "learning_rate": 5.395607792836667e-06, + "logits/chosen": -0.20116093754768372, + "logits/rejected": -0.11985060572624207, + "logps/chosen": -0.3083297610282898, + "logps/rejected": -1.3712188005447388, + "loss": 0.3717, + "odds_ratio_loss": 0.17615623772144318, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03083297424018383, + "rewards/margins": 0.10628890991210938, + "rewards/rejected": -0.13712188601493835, + "sft_loss": 0.3083297610282898, + "step": 1664 + }, + { + "epoch": 2.407809110629067, + "grad_norm": 2.3754850371079197, + "learning_rate": 5.392696672899181e-06, + "logits/chosen": -0.3145734667778015, + "logits/rejected": -0.23199528455734253, + "logps/chosen": -0.26135680079460144, + "logps/rejected": -3.4864580631256104, + "loss": 0.3643, + "odds_ratio_loss": 0.06875382363796234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026135679334402084, + "rewards/margins": 0.32251012325286865, + "rewards/rejected": -0.34864580631256104, + "sft_loss": 0.26135680079460144, + "step": 1665 + }, + { + "epoch": 2.409255242227043, + "grad_norm": 2.497467322183816, + "learning_rate": 5.389784713299686e-06, + "logits/chosen": -0.07668744772672653, + "logits/rejected": -0.007611650973558426, + "logps/chosen": -0.21216994524002075, + "logps/rejected": -4.282060146331787, + "loss": 0.303, + "odds_ratio_loss": 0.1205163523554802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021216996014118195, + "rewards/margins": 0.40698903799057007, + "rewards/rejected": -0.4282059967517853, + "sft_loss": 0.21216994524002075, + "step": 1666 + }, + { + "epoch": 2.410701373825018, + "grad_norm": 2.1520905912288177, + "learning_rate": 5.386871915793809e-06, + "logits/chosen": -0.2075837105512619, + "logits/rejected": -0.1876615285873413, + "logps/chosen": -0.2702080309391022, + "logps/rejected": -5.139091968536377, + "loss": 0.3105, + "odds_ratio_loss": 0.1293424367904663, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027020802721381187, + "rewards/margins": 0.48688840866088867, + "rewards/rejected": -0.5139092206954956, + "sft_loss": 0.2702080309391022, + "step": 1667 + }, + { + "epoch": 2.4121475054229933, + "grad_norm": 3.2146716999123193, + "learning_rate": 5.383958282137691e-06, + "logits/chosen": -0.12676213681697845, + "logits/rejected": -0.1635175496339798, + "logps/chosen": -0.2572956681251526, + "logps/rejected": -3.4657866954803467, + "loss": 0.39, + "odds_ratio_loss": 0.10569733381271362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025729568675160408, + "rewards/margins": 0.32084912061691284, + "rewards/rejected": -0.3465786874294281, + "sft_loss": 0.2572956681251526, + "step": 1668 + }, + { + "epoch": 2.413593637020969, + "grad_norm": 2.3616969258191065, + "learning_rate": 5.381043814087968e-06, + "logits/chosen": -0.38791531324386597, + "logits/rejected": -0.27840742468833923, + "logps/chosen": -0.3243800401687622, + "logps/rejected": -3.7442002296447754, + "loss": 0.3139, + "odds_ratio_loss": 0.10507690906524658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0324380025267601, + "rewards/margins": 0.34198203682899475, + "rewards/rejected": -0.37442004680633545, + "sft_loss": 0.3243800401687622, + "step": 1669 + }, + { + "epoch": 2.415039768618944, + "grad_norm": 2.2888964516476467, + "learning_rate": 5.3781285134017865e-06, + "logits/chosen": -0.25714176893234253, + "logits/rejected": -0.1979777216911316, + "logps/chosen": -0.3115207552909851, + "logps/rejected": -3.852271556854248, + "loss": 0.3226, + "odds_ratio_loss": 0.09582686424255371, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03115207701921463, + "rewards/margins": 0.3540751039981842, + "rewards/rejected": -0.3852272033691406, + "sft_loss": 0.3115207552909851, + "step": 1670 + }, + { + "epoch": 2.41648590021692, + "grad_norm": 2.041066759979958, + "learning_rate": 5.375212381836793e-06, + "logits/chosen": -0.10201995074748993, + "logits/rejected": -0.08090163767337799, + "logps/chosen": -0.3471717834472656, + "logps/rejected": -5.151330947875977, + "loss": 0.3825, + "odds_ratio_loss": 0.08015061914920807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03471717610955238, + "rewards/margins": 0.480415940284729, + "rewards/rejected": -0.5151331424713135, + "sft_loss": 0.3471717834472656, + "step": 1671 + }, + { + "epoch": 2.417932031814895, + "grad_norm": 2.402521519996911, + "learning_rate": 5.3722954211511314e-06, + "logits/chosen": -0.20054322481155396, + "logits/rejected": -0.3008997440338135, + "logps/chosen": -0.19180569052696228, + "logps/rejected": -4.623170375823975, + "loss": 0.3376, + "odds_ratio_loss": 0.032107461243867874, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019180569797754288, + "rewards/margins": 0.4431364834308624, + "rewards/rejected": -0.4623170495033264, + "sft_loss": 0.19180569052696228, + "step": 1672 + }, + { + "epoch": 2.419378163412871, + "grad_norm": 2.8755560314390736, + "learning_rate": 5.369377633103449e-06, + "logits/chosen": -0.24951305985450745, + "logits/rejected": -0.16665250062942505, + "logps/chosen": -0.21055331826210022, + "logps/rejected": -2.9385476112365723, + "loss": 0.3952, + "odds_ratio_loss": 0.0641937106847763, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021055329591035843, + "rewards/margins": 0.27279946208000183, + "rewards/rejected": -0.2938547730445862, + "sft_loss": 0.21055331826210022, + "step": 1673 + }, + { + "epoch": 2.420824295010846, + "grad_norm": 2.2612662097686047, + "learning_rate": 5.366459019452893e-06, + "logits/chosen": -0.3434632122516632, + "logits/rejected": -0.21899384260177612, + "logps/chosen": -0.38703057169914246, + "logps/rejected": -2.9113190174102783, + "loss": 0.3515, + "odds_ratio_loss": 0.15410572290420532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038703057914972305, + "rewards/margins": 0.2524288594722748, + "rewards/rejected": -0.2911319136619568, + "sft_loss": 0.38703057169914246, + "step": 1674 + }, + { + "epoch": 2.4222704266088213, + "grad_norm": 2.1759070085509067, + "learning_rate": 5.363539581959102e-06, + "logits/chosen": -0.3367041349411011, + "logits/rejected": -0.2844616770744324, + "logps/chosen": -0.39401325583457947, + "logps/rejected": -2.856097459793091, + "loss": 0.406, + "odds_ratio_loss": 0.16076377034187317, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03940132260322571, + "rewards/margins": 0.24620842933654785, + "rewards/rejected": -0.28560978174209595, + "sft_loss": 0.39401325583457947, + "step": 1675 + }, + { + "epoch": 2.423716558206797, + "grad_norm": 4.34686480463768, + "learning_rate": 5.3606193223822215e-06, + "logits/chosen": -0.28801190853118896, + "logits/rejected": -0.31603163480758667, + "logps/chosen": -0.3019119203090668, + "logps/rejected": -3.840654134750366, + "loss": 0.3827, + "odds_ratio_loss": 0.07661937922239304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030191190540790558, + "rewards/margins": 0.35387420654296875, + "rewards/rejected": -0.3840653896331787, + "sft_loss": 0.3019119203090668, + "step": 1676 + }, + { + "epoch": 2.4251626898047722, + "grad_norm": 2.2347029036364083, + "learning_rate": 5.357698242482884e-06, + "logits/chosen": -0.30417659878730774, + "logits/rejected": -0.16208516061306, + "logps/chosen": -0.3697463870048523, + "logps/rejected": -5.748754978179932, + "loss": 0.3974, + "odds_ratio_loss": 0.1115613579750061, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03697463870048523, + "rewards/margins": 0.5379008054733276, + "rewards/rejected": -0.5748754739761353, + "sft_loss": 0.3697463870048523, + "step": 1677 + }, + { + "epoch": 2.4266088214027475, + "grad_norm": 2.746152164966934, + "learning_rate": 5.354776344022219e-06, + "logits/chosen": -0.2900420129299164, + "logits/rejected": -0.23626813292503357, + "logps/chosen": -0.2787986993789673, + "logps/rejected": -2.1767630577087402, + "loss": 0.3304, + "odds_ratio_loss": 0.120830237865448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027879871428012848, + "rewards/margins": 0.18979641795158386, + "rewards/rejected": -0.2176763117313385, + "sft_loss": 0.2787986993789673, + "step": 1678 + }, + { + "epoch": 2.428054953000723, + "grad_norm": 2.620208103065728, + "learning_rate": 5.35185362876185e-06, + "logits/chosen": -0.3657112121582031, + "logits/rejected": -0.3040888011455536, + "logps/chosen": -0.43680983781814575, + "logps/rejected": -2.43369197845459, + "loss": 0.3569, + "odds_ratio_loss": 0.1762293428182602, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04368098825216293, + "rewards/margins": 0.19968819618225098, + "rewards/rejected": -0.2433691918849945, + "sft_loss": 0.43680983781814575, + "step": 1679 + }, + { + "epoch": 2.4295010845986984, + "grad_norm": 2.4352727537772316, + "learning_rate": 5.348930098463894e-06, + "logits/chosen": -0.6010982394218445, + "logits/rejected": -0.30227330327033997, + "logps/chosen": -0.3067004978656769, + "logps/rejected": -3.6038007736206055, + "loss": 0.3692, + "odds_ratio_loss": 0.06560537964105606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030670054256916046, + "rewards/margins": 0.3297100067138672, + "rewards/rejected": -0.36038005352020264, + "sft_loss": 0.3067004978656769, + "step": 1680 + }, + { + "epoch": 2.430947216196674, + "grad_norm": 2.4907614571390337, + "learning_rate": 5.346005754890956e-06, + "logits/chosen": -0.40090346336364746, + "logits/rejected": -0.18178865313529968, + "logps/chosen": -0.3909543752670288, + "logps/rejected": -2.7736334800720215, + "loss": 0.4267, + "odds_ratio_loss": 0.09884399175643921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039095439016819, + "rewards/margins": 0.2382679283618927, + "rewards/rejected": -0.2773633599281311, + "sft_loss": 0.3909543752670288, + "step": 1681 + }, + { + "epoch": 2.4323933477946493, + "grad_norm": 2.9164477390226398, + "learning_rate": 5.3430805998061375e-06, + "logits/chosen": -0.24474173784255981, + "logits/rejected": -0.21296316385269165, + "logps/chosen": -0.4344663918018341, + "logps/rejected": -3.148064136505127, + "loss": 0.3499, + "odds_ratio_loss": 0.09581369161605835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04344664514064789, + "rewards/margins": 0.27135974168777466, + "rewards/rejected": -0.31480640172958374, + "sft_loss": 0.4344663918018341, + "step": 1682 + }, + { + "epoch": 2.4338394793926246, + "grad_norm": 2.1284693896066735, + "learning_rate": 5.340154634973023e-06, + "logits/chosen": -0.18710467219352722, + "logits/rejected": -0.18588191270828247, + "logps/chosen": -0.3472820818424225, + "logps/rejected": -2.692941427230835, + "loss": 0.4106, + "odds_ratio_loss": 0.12289082258939743, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03472820669412613, + "rewards/margins": 0.23456594347953796, + "rewards/rejected": -0.2692941427230835, + "sft_loss": 0.3472820818424225, + "step": 1683 + }, + { + "epoch": 2.4352856109906003, + "grad_norm": 2.467601888547069, + "learning_rate": 5.337227862155687e-06, + "logits/chosen": -0.13609479367733002, + "logits/rejected": -0.135749951004982, + "logps/chosen": -0.35460272431373596, + "logps/rejected": -3.266855478286743, + "loss": 0.3055, + "odds_ratio_loss": 0.12023431807756424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.035460274666547775, + "rewards/margins": 0.29122528433799744, + "rewards/rejected": -0.3266855478286743, + "sft_loss": 0.35460272431373596, + "step": 1684 + }, + { + "epoch": 2.4367317425885755, + "grad_norm": 3.181573666104813, + "learning_rate": 5.334300283118692e-06, + "logits/chosen": -0.20146742463111877, + "logits/rejected": -0.2168891727924347, + "logps/chosen": -0.5368006825447083, + "logps/rejected": -3.477538824081421, + "loss": 0.3767, + "odds_ratio_loss": 0.1835523396730423, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.053680069744586945, + "rewards/margins": 0.29407382011413574, + "rewards/rejected": -0.3477538824081421, + "sft_loss": 0.5368006825447083, + "step": 1685 + }, + { + "epoch": 2.438177874186551, + "grad_norm": 2.23714776177442, + "learning_rate": 5.331371899627088e-06, + "logits/chosen": -0.10427071899175644, + "logits/rejected": -0.18009522557258606, + "logps/chosen": -0.3216210603713989, + "logps/rejected": -4.574368000030518, + "loss": 0.3989, + "odds_ratio_loss": 0.09731189906597137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03216210752725601, + "rewards/margins": 0.42527472972869873, + "rewards/rejected": -0.45743680000305176, + "sft_loss": 0.3216210603713989, + "step": 1686 + }, + { + "epoch": 2.4396240057845264, + "grad_norm": 2.070128394841874, + "learning_rate": 5.328442713446407e-06, + "logits/chosen": -0.16747835278511047, + "logits/rejected": -0.25303810834884644, + "logps/chosen": -0.2828282415866852, + "logps/rejected": -2.099355459213257, + "loss": 0.3438, + "odds_ratio_loss": 0.1440654695034027, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.028282826766371727, + "rewards/margins": 0.1816527247428894, + "rewards/rejected": -0.20993554592132568, + "sft_loss": 0.2828282415866852, + "step": 1687 + }, + { + "epoch": 2.4410701373825017, + "grad_norm": 2.458892699445312, + "learning_rate": 5.325512726342665e-06, + "logits/chosen": -0.28189414739608765, + "logits/rejected": -0.383240282535553, + "logps/chosen": -0.3891836404800415, + "logps/rejected": -3.0360727310180664, + "loss": 0.4436, + "odds_ratio_loss": 0.16465787589550018, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03891836479306221, + "rewards/margins": 0.2646889090538025, + "rewards/rejected": -0.3036072850227356, + "sft_loss": 0.3891836404800415, + "step": 1688 + }, + { + "epoch": 2.4425162689804774, + "grad_norm": 2.337184676111808, + "learning_rate": 5.322581940082365e-06, + "logits/chosen": -0.13843964040279388, + "logits/rejected": -0.15620598196983337, + "logps/chosen": -0.20821207761764526, + "logps/rejected": -3.2480251789093018, + "loss": 0.3055, + "odds_ratio_loss": 0.05318732559680939, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020821209996938705, + "rewards/margins": 0.30398133397102356, + "rewards/rejected": -0.3248025178909302, + "sft_loss": 0.20821207761764526, + "step": 1689 + }, + { + "epoch": 2.4439624005784526, + "grad_norm": 2.4611245315908197, + "learning_rate": 5.319650356432487e-06, + "logits/chosen": -0.38569334149360657, + "logits/rejected": -0.34914782643318176, + "logps/chosen": -0.2619364261627197, + "logps/rejected": -4.187211036682129, + "loss": 0.3669, + "odds_ratio_loss": 0.11673162132501602, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026193641126155853, + "rewards/margins": 0.39252740144729614, + "rewards/rejected": -0.418721079826355, + "sft_loss": 0.2619364261627197, + "step": 1690 + }, + { + "epoch": 2.445408532176428, + "grad_norm": 2.3690510230543937, + "learning_rate": 5.316717977160495e-06, + "logits/chosen": -0.1654704362154007, + "logits/rejected": -0.24338343739509583, + "logps/chosen": -0.36776530742645264, + "logps/rejected": -4.427489757537842, + "loss": 0.3698, + "odds_ratio_loss": 0.14131522178649902, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03677653148770332, + "rewards/margins": 0.405972421169281, + "rewards/rejected": -0.4427489638328552, + "sft_loss": 0.36776530742645264, + "step": 1691 + }, + { + "epoch": 2.4468546637744035, + "grad_norm": 2.7640415911692315, + "learning_rate": 5.31378480403433e-06, + "logits/chosen": -0.11378660053014755, + "logits/rejected": -0.28160223364830017, + "logps/chosen": -0.39058995246887207, + "logps/rejected": -1.9533859491348267, + "loss": 0.3786, + "odds_ratio_loss": 0.24598926305770874, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.039058998227119446, + "rewards/margins": 0.15627959370613098, + "rewards/rejected": -0.19533859193325043, + "sft_loss": 0.39058995246887207, + "step": 1692 + }, + { + "epoch": 2.4483007953723788, + "grad_norm": 2.3969905094526816, + "learning_rate": 5.310850838822413e-06, + "logits/chosen": -0.25515154004096985, + "logits/rejected": -0.3220319151878357, + "logps/chosen": -0.2755758464336395, + "logps/rejected": -4.758266925811768, + "loss": 0.2722, + "odds_ratio_loss": 0.08147071301937103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027557585388422012, + "rewards/margins": 0.4482691287994385, + "rewards/rejected": -0.4758267104625702, + "sft_loss": 0.2755758464336395, + "step": 1693 + }, + { + "epoch": 2.4497469269703545, + "grad_norm": 2.7245877218943138, + "learning_rate": 5.307916083293643e-06, + "logits/chosen": -0.24902743101119995, + "logits/rejected": -0.173957958817482, + "logps/chosen": -0.2562694847583771, + "logps/rejected": -2.591289758682251, + "loss": 0.3452, + "odds_ratio_loss": 0.12518920004367828, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025626949965953827, + "rewards/margins": 0.23350203037261963, + "rewards/rejected": -0.25912898778915405, + "sft_loss": 0.2562694847583771, + "step": 1694 + }, + { + "epoch": 2.4511930585683297, + "grad_norm": 2.525754725939901, + "learning_rate": 5.304980539217397e-06, + "logits/chosen": -0.255185067653656, + "logits/rejected": -0.2326502501964569, + "logps/chosen": -0.3928337097167969, + "logps/rejected": -4.967213153839111, + "loss": 0.3448, + "odds_ratio_loss": 0.13604749739170074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03928337246179581, + "rewards/margins": 0.45743799209594727, + "rewards/rejected": -0.49672138690948486, + "sft_loss": 0.3928337097167969, + "step": 1695 + }, + { + "epoch": 2.4526391901663054, + "grad_norm": 2.157462237065824, + "learning_rate": 5.3020442083635225e-06, + "logits/chosen": -0.33897683024406433, + "logits/rejected": -0.36058297753334045, + "logps/chosen": -0.3103875517845154, + "logps/rejected": -4.488574981689453, + "loss": 0.3695, + "odds_ratio_loss": 0.07876772433519363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031038757413625717, + "rewards/margins": 0.41781869530677795, + "rewards/rejected": -0.44885745644569397, + "sft_loss": 0.3103875517845154, + "step": 1696 + }, + { + "epoch": 2.4540853217642806, + "grad_norm": 3.264192289306274, + "learning_rate": 5.299107092502345e-06, + "logits/chosen": -0.14832520484924316, + "logits/rejected": -0.16244558990001678, + "logps/chosen": -0.4910193979740143, + "logps/rejected": -3.7641544342041016, + "loss": 0.4286, + "odds_ratio_loss": 0.13385328650474548, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04910193756222725, + "rewards/margins": 0.32731351256370544, + "rewards/rejected": -0.376415491104126, + "sft_loss": 0.4910193979740143, + "step": 1697 + }, + { + "epoch": 2.455531453362256, + "grad_norm": 2.813819539819064, + "learning_rate": 5.296169193404664e-06, + "logits/chosen": -0.24914006888866425, + "logits/rejected": -0.20338988304138184, + "logps/chosen": -0.21061301231384277, + "logps/rejected": -4.377030849456787, + "loss": 0.3424, + "odds_ratio_loss": 0.038216181099414825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021061301231384277, + "rewards/margins": 0.4166417717933655, + "rewards/rejected": -0.43770307302474976, + "sft_loss": 0.21061301231384277, + "step": 1698 + }, + { + "epoch": 2.4569775849602316, + "grad_norm": 2.6327695248210983, + "learning_rate": 5.2932305128417484e-06, + "logits/chosen": -0.17436552047729492, + "logits/rejected": -0.16264991462230682, + "logps/chosen": -0.35267582535743713, + "logps/rejected": -3.094590663909912, + "loss": 0.3573, + "odds_ratio_loss": 0.14291812479496002, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03526758402585983, + "rewards/margins": 0.2741914987564087, + "rewards/rejected": -0.30945906043052673, + "sft_loss": 0.35267582535743713, + "step": 1699 + }, + { + "epoch": 2.458423716558207, + "grad_norm": 2.7114975370715704, + "learning_rate": 5.2902910525853406e-06, + "logits/chosen": -0.16905909776687622, + "logits/rejected": -0.12139132618904114, + "logps/chosen": -0.518578290939331, + "logps/rejected": -2.4249773025512695, + "loss": 0.4167, + "odds_ratio_loss": 0.183926060795784, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.051857829093933105, + "rewards/margins": 0.19063988327980042, + "rewards/rejected": -0.24249771237373352, + "sft_loss": 0.518578290939331, + "step": 1700 + }, + { + "epoch": 2.459869848156182, + "grad_norm": 2.209082477790294, + "learning_rate": 5.28735081440765e-06, + "logits/chosen": -0.14498230814933777, + "logits/rejected": -0.25710102915763855, + "logps/chosen": -0.37873467803001404, + "logps/rejected": -3.46140193939209, + "loss": 0.3614, + "odds_ratio_loss": 0.17570456862449646, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03787346929311752, + "rewards/margins": 0.30826669931411743, + "rewards/rejected": -0.34614014625549316, + "sft_loss": 0.37873467803001404, + "step": 1701 + }, + { + "epoch": 2.4613159797541577, + "grad_norm": 2.563412683406822, + "learning_rate": 5.284409800081359e-06, + "logits/chosen": -0.05897979438304901, + "logits/rejected": -0.1600523293018341, + "logps/chosen": -0.3373136818408966, + "logps/rejected": -2.6773574352264404, + "loss": 0.3735, + "odds_ratio_loss": 0.15935732424259186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0337313674390316, + "rewards/margins": 0.23400436341762543, + "rewards/rejected": -0.2677357494831085, + "sft_loss": 0.3373136818408966, + "step": 1702 + }, + { + "epoch": 2.462762111352133, + "grad_norm": 2.8871356309464016, + "learning_rate": 5.281468011379618e-06, + "logits/chosen": -0.12770754098892212, + "logits/rejected": -0.05635076016187668, + "logps/chosen": -0.3339754045009613, + "logps/rejected": -2.7018704414367676, + "loss": 0.4228, + "odds_ratio_loss": 0.09883897006511688, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03339754045009613, + "rewards/margins": 0.2367894947528839, + "rewards/rejected": -0.27018705010414124, + "sft_loss": 0.3339754045009613, + "step": 1703 + }, + { + "epoch": 2.4642082429501087, + "grad_norm": 2.5395367967099958, + "learning_rate": 5.278525450076038e-06, + "logits/chosen": -0.19829659163951874, + "logits/rejected": -0.25783413648605347, + "logps/chosen": -0.28885769844055176, + "logps/rejected": -3.8891334533691406, + "loss": 0.3464, + "odds_ratio_loss": 0.12367962300777435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028885772451758385, + "rewards/margins": 0.36002761125564575, + "rewards/rejected": -0.3889133930206299, + "sft_loss": 0.28885769844055176, + "step": 1704 + }, + { + "epoch": 2.465654374548084, + "grad_norm": 2.7686820913229866, + "learning_rate": 5.275582117944704e-06, + "logits/chosen": -0.20002993941307068, + "logits/rejected": -0.23674233257770538, + "logps/chosen": -0.26880428194999695, + "logps/rejected": -4.4511919021606445, + "loss": 0.382, + "odds_ratio_loss": 0.07453987747430801, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026880430057644844, + "rewards/margins": 0.4182387888431549, + "rewards/rejected": -0.4451192021369934, + "sft_loss": 0.26880428194999695, + "step": 1705 + }, + { + "epoch": 2.467100506146059, + "grad_norm": 2.482392899389993, + "learning_rate": 5.2726380167601595e-06, + "logits/chosen": -0.03449631482362747, + "logits/rejected": -0.16584265232086182, + "logps/chosen": -0.40714091062545776, + "logps/rejected": -2.2733700275421143, + "loss": 0.383, + "odds_ratio_loss": 0.18216630816459656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.040714096277952194, + "rewards/margins": 0.18662291765213013, + "rewards/rejected": -0.22733700275421143, + "sft_loss": 0.40714091062545776, + "step": 1706 + }, + { + "epoch": 2.468546637744035, + "grad_norm": 2.4272265459394418, + "learning_rate": 5.269693148297415e-06, + "logits/chosen": -0.18469105660915375, + "logits/rejected": -0.18337099254131317, + "logps/chosen": -0.276080459356308, + "logps/rejected": -3.6516079902648926, + "loss": 0.354, + "odds_ratio_loss": 0.12775567173957825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027608048170804977, + "rewards/margins": 0.3375527262687683, + "rewards/rejected": -0.3651607930660248, + "sft_loss": 0.276080459356308, + "step": 1707 + }, + { + "epoch": 2.46999276934201, + "grad_norm": 2.1862324548948315, + "learning_rate": 5.266747514331943e-06, + "logits/chosen": -0.09375106543302536, + "logits/rejected": -0.06606069207191467, + "logps/chosen": -0.3427000343799591, + "logps/rejected": -4.577975273132324, + "loss": 0.3287, + "odds_ratio_loss": 0.11397892236709595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03427000343799591, + "rewards/margins": 0.4235275089740753, + "rewards/rejected": -0.45779749751091003, + "sft_loss": 0.3427000343799591, + "step": 1708 + }, + { + "epoch": 2.4714389009399857, + "grad_norm": 3.017503213679481, + "learning_rate": 5.2638011166396765e-06, + "logits/chosen": -0.15729525685310364, + "logits/rejected": -0.31160077452659607, + "logps/chosen": -0.40494704246520996, + "logps/rejected": -3.3157148361206055, + "loss": 0.3849, + "odds_ratio_loss": 0.1344398856163025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04049470275640488, + "rewards/margins": 0.29107674956321716, + "rewards/rejected": -0.33157145977020264, + "sft_loss": 0.40494704246520996, + "step": 1709 + }, + { + "epoch": 2.472885032537961, + "grad_norm": 2.498208498215936, + "learning_rate": 5.26085395699701e-06, + "logits/chosen": -0.30746138095855713, + "logits/rejected": -0.2394457757472992, + "logps/chosen": -0.3937021493911743, + "logps/rejected": -3.2749500274658203, + "loss": 0.3325, + "odds_ratio_loss": 0.13062110543251038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03937021642923355, + "rewards/margins": 0.28812479972839355, + "rewards/rejected": -0.3274950087070465, + "sft_loss": 0.3937021493911743, + "step": 1710 + }, + { + "epoch": 2.4743311641359362, + "grad_norm": 2.347678712121523, + "learning_rate": 5.257906037180797e-06, + "logits/chosen": -0.2330562025308609, + "logits/rejected": -0.30464982986450195, + "logps/chosen": -0.3614528775215149, + "logps/rejected": -4.782719135284424, + "loss": 0.3511, + "odds_ratio_loss": 0.13488833606243134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03614528849720955, + "rewards/margins": 0.44212663173675537, + "rewards/rejected": -0.4782719016075134, + "sft_loss": 0.3614528775215149, + "step": 1711 + }, + { + "epoch": 2.475777295733912, + "grad_norm": 2.3030121406973025, + "learning_rate": 5.2549573589683494e-06, + "logits/chosen": -0.129247784614563, + "logits/rejected": -0.21539118885993958, + "logps/chosen": -0.23400619626045227, + "logps/rejected": -3.61014986038208, + "loss": 0.3575, + "odds_ratio_loss": 0.0835333988070488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023400619626045227, + "rewards/margins": 0.33761438727378845, + "rewards/rejected": -0.3610149919986725, + "sft_loss": 0.23400619626045227, + "step": 1712 + }, + { + "epoch": 2.477223427331887, + "grad_norm": 14.891392154897925, + "learning_rate": 5.252007924137435e-06, + "logits/chosen": -0.20700593292713165, + "logits/rejected": -0.1795155555009842, + "logps/chosen": -0.37232303619384766, + "logps/rejected": -3.1692867279052734, + "loss": 0.3599, + "odds_ratio_loss": 0.08920063823461533, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03723230957984924, + "rewards/margins": 0.27969637513160706, + "rewards/rejected": -0.3169286847114563, + "sft_loss": 0.37232303619384766, + "step": 1713 + }, + { + "epoch": 2.4786695589298624, + "grad_norm": 2.223481410602812, + "learning_rate": 5.24905773446628e-06, + "logits/chosen": -0.13908199965953827, + "logits/rejected": -0.13732969760894775, + "logps/chosen": -0.38454872369766235, + "logps/rejected": -4.367215156555176, + "loss": 0.3717, + "odds_ratio_loss": 0.12911780178546906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038454875349998474, + "rewards/margins": 0.39826661348342896, + "rewards/rejected": -0.4367215037345886, + "sft_loss": 0.38454872369766235, + "step": 1714 + }, + { + "epoch": 2.480115690527838, + "grad_norm": 2.161222093495033, + "learning_rate": 5.2461067917335655e-06, + "logits/chosen": -0.18323522806167603, + "logits/rejected": -0.262523353099823, + "logps/chosen": -0.3305858373641968, + "logps/rejected": -3.8417184352874756, + "loss": 0.3449, + "odds_ratio_loss": 0.11526430398225784, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03305858373641968, + "rewards/margins": 0.3511132299900055, + "rewards/rejected": -0.38417184352874756, + "sft_loss": 0.3305858373641968, + "step": 1715 + }, + { + "epoch": 2.4815618221258133, + "grad_norm": 2.1036749771049172, + "learning_rate": 5.2431550977184255e-06, + "logits/chosen": -0.11821121722459793, + "logits/rejected": -0.08068836480379105, + "logps/chosen": -0.327120840549469, + "logps/rejected": -4.980037689208984, + "loss": 0.3295, + "odds_ratio_loss": 0.12885092198848724, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03271208703517914, + "rewards/margins": 0.4652916491031647, + "rewards/rejected": -0.4980037212371826, + "sft_loss": 0.327120840549469, + "step": 1716 + }, + { + "epoch": 2.483007953723789, + "grad_norm": 2.229147869090855, + "learning_rate": 5.240202654200448e-06, + "logits/chosen": -0.24735526740550995, + "logits/rejected": -0.19834265112876892, + "logps/chosen": -0.313556432723999, + "logps/rejected": -2.68381404876709, + "loss": 0.3639, + "odds_ratio_loss": 0.12288917601108551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03135564550757408, + "rewards/margins": 0.23702575266361237, + "rewards/rejected": -0.26838141679763794, + "sft_loss": 0.313556432723999, + "step": 1717 + }, + { + "epoch": 2.4844540853217643, + "grad_norm": 3.1343885146836885, + "learning_rate": 5.237249462959671e-06, + "logits/chosen": -0.03448060154914856, + "logits/rejected": -0.14667919278144836, + "logps/chosen": -0.4028518795967102, + "logps/rejected": -3.0416674613952637, + "loss": 0.3938, + "odds_ratio_loss": 0.09556388854980469, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04028518870472908, + "rewards/margins": 0.2638815641403198, + "rewards/rejected": -0.3041667342185974, + "sft_loss": 0.4028518795967102, + "step": 1718 + }, + { + "epoch": 2.4859002169197395, + "grad_norm": 2.5207532915019506, + "learning_rate": 5.234295525776583e-06, + "logits/chosen": -0.3418402075767517, + "logits/rejected": -0.3059270977973938, + "logps/chosen": -0.42127281427383423, + "logps/rejected": -4.643973350524902, + "loss": 0.3958, + "odds_ratio_loss": 0.11073525995016098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04212728142738342, + "rewards/margins": 0.4222700595855713, + "rewards/rejected": -0.4643973410129547, + "sft_loss": 0.42127281427383423, + "step": 1719 + }, + { + "epoch": 2.487346348517715, + "grad_norm": 2.3498993433765683, + "learning_rate": 5.231340844432127e-06, + "logits/chosen": -0.07430073618888855, + "logits/rejected": -0.014699429273605347, + "logps/chosen": -0.2515304386615753, + "logps/rejected": -4.094211101531982, + "loss": 0.3621, + "odds_ratio_loss": 0.13141734898090363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025153042748570442, + "rewards/margins": 0.3842681050300598, + "rewards/rejected": -0.4094211161136627, + "sft_loss": 0.2515304386615753, + "step": 1720 + }, + { + "epoch": 2.4887924801156904, + "grad_norm": 2.257830950000833, + "learning_rate": 5.228385420707688e-06, + "logits/chosen": -0.2305629849433899, + "logits/rejected": -0.22776669263839722, + "logps/chosen": -0.2552865445613861, + "logps/rejected": -4.56567907333374, + "loss": 0.332, + "odds_ratio_loss": 0.09708862006664276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02552865631878376, + "rewards/margins": 0.4310392737388611, + "rewards/rejected": -0.4565679430961609, + "sft_loss": 0.2552865445613861, + "step": 1721 + }, + { + "epoch": 2.490238611713666, + "grad_norm": 3.0069655541822193, + "learning_rate": 5.225429256385107e-06, + "logits/chosen": -0.3482007384300232, + "logits/rejected": -0.18393605947494507, + "logps/chosen": -0.31930986046791077, + "logps/rejected": -2.396862268447876, + "loss": 0.337, + "odds_ratio_loss": 0.13777418434619904, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03193098306655884, + "rewards/margins": 0.20775523781776428, + "rewards/rejected": -0.23968622088432312, + "sft_loss": 0.31930986046791077, + "step": 1722 + }, + { + "epoch": 2.4916847433116414, + "grad_norm": 2.5308497032909116, + "learning_rate": 5.2224723532466615e-06, + "logits/chosen": -0.1514590084552765, + "logits/rejected": -0.20194968581199646, + "logps/chosen": -0.4799078702926636, + "logps/rejected": -3.5649938583374023, + "loss": 0.3802, + "odds_ratio_loss": 0.19344067573547363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04799078777432442, + "rewards/margins": 0.30850860476493835, + "rewards/rejected": -0.3564993739128113, + "sft_loss": 0.4799078702926636, + "step": 1723 + }, + { + "epoch": 2.4931308749096166, + "grad_norm": 2.264852029050408, + "learning_rate": 5.219514713075082e-06, + "logits/chosen": -0.4040144681930542, + "logits/rejected": -0.2476567029953003, + "logps/chosen": -0.3959325850009918, + "logps/rejected": -3.717017650604248, + "loss": 0.3556, + "odds_ratio_loss": 0.12017939984798431, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03959326446056366, + "rewards/margins": 0.3321084976196289, + "rewards/rejected": -0.37170177698135376, + "sft_loss": 0.3959325850009918, + "step": 1724 + }, + { + "epoch": 2.4945770065075923, + "grad_norm": 2.322670274244093, + "learning_rate": 5.216556337653538e-06, + "logits/chosen": -0.19745758175849915, + "logits/rejected": -0.2630073130130768, + "logps/chosen": -0.4968724846839905, + "logps/rejected": -4.204809188842773, + "loss": 0.4631, + "odds_ratio_loss": 0.1597743183374405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04968724772334099, + "rewards/margins": 0.3707936704158783, + "rewards/rejected": -0.4204809069633484, + "sft_loss": 0.4968724846839905, + "step": 1725 + }, + { + "epoch": 2.4960231381055675, + "grad_norm": 2.2728574888436226, + "learning_rate": 5.213597228765649e-06, + "logits/chosen": -0.25864142179489136, + "logits/rejected": -0.29847434163093567, + "logps/chosen": -0.3347530663013458, + "logps/rejected": -2.365760087966919, + "loss": 0.3878, + "odds_ratio_loss": 0.11688689887523651, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03347530961036682, + "rewards/margins": 0.20310071110725403, + "rewards/rejected": -0.23657602071762085, + "sft_loss": 0.3347530663013458, + "step": 1726 + }, + { + "epoch": 2.497469269703543, + "grad_norm": 2.259706025611289, + "learning_rate": 5.210637388195471e-06, + "logits/chosen": -0.25694119930267334, + "logits/rejected": -0.33037617802619934, + "logps/chosen": -0.42037951946258545, + "logps/rejected": -3.479623794555664, + "loss": 0.4642, + "odds_ratio_loss": 0.13307222723960876, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.042037952691316605, + "rewards/margins": 0.3059244155883789, + "rewards/rejected": -0.347962349653244, + "sft_loss": 0.42037951946258545, + "step": 1727 + }, + { + "epoch": 2.4989154013015185, + "grad_norm": 3.456435966726747, + "learning_rate": 5.207676817727501e-06, + "logits/chosen": -0.19776073098182678, + "logits/rejected": -0.1664964109659195, + "logps/chosen": -0.46457743644714355, + "logps/rejected": -3.276613473892212, + "loss": 0.4425, + "odds_ratio_loss": 0.41586270928382874, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.046457745134830475, + "rewards/margins": 0.28120362758636475, + "rewards/rejected": -0.3276613652706146, + "sft_loss": 0.46457743644714355, + "step": 1728 + }, + { + "epoch": 2.5003615328994937, + "grad_norm": 2.383659929319483, + "learning_rate": 5.204715519146681e-06, + "logits/chosen": -0.3448188602924347, + "logits/rejected": -0.32050418853759766, + "logps/chosen": -0.4060935378074646, + "logps/rejected": -2.0824034214019775, + "loss": 0.3698, + "odds_ratio_loss": 0.15973150730133057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04060935229063034, + "rewards/margins": 0.16763100028038025, + "rewards/rejected": -0.2082403302192688, + "sft_loss": 0.4060935378074646, + "step": 1729 + }, + { + "epoch": 2.5018076644974694, + "grad_norm": 2.3443217772376737, + "learning_rate": 5.201753494238388e-06, + "logits/chosen": -0.24633188545703888, + "logits/rejected": -0.14402739703655243, + "logps/chosen": -0.38479936122894287, + "logps/rejected": -2.347269058227539, + "loss": 0.3658, + "odds_ratio_loss": 0.15785683691501617, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03847993165254593, + "rewards/margins": 0.1962469518184662, + "rewards/rejected": -0.2347269058227539, + "sft_loss": 0.38479936122894287, + "step": 1730 + }, + { + "epoch": 2.5032537960954446, + "grad_norm": 2.37665885239867, + "learning_rate": 5.198790744788437e-06, + "logits/chosen": -0.21425655484199524, + "logits/rejected": -0.22479024529457092, + "logps/chosen": -0.2819811999797821, + "logps/rejected": -2.8894686698913574, + "loss": 0.2802, + "odds_ratio_loss": 0.10602892190217972, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0281981211155653, + "rewards/margins": 0.2607487440109253, + "rewards/rejected": -0.28894686698913574, + "sft_loss": 0.2819811999797821, + "step": 1731 + }, + { + "epoch": 2.5046999276934203, + "grad_norm": 2.314013984407445, + "learning_rate": 5.195827272583081e-06, + "logits/chosen": -0.26546451449394226, + "logits/rejected": -0.3177344799041748, + "logps/chosen": -0.35778549313545227, + "logps/rejected": -5.036369800567627, + "loss": 0.4006, + "odds_ratio_loss": 0.09799312800168991, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03577854856848717, + "rewards/margins": 0.4678584933280945, + "rewards/rejected": -0.5036370158195496, + "sft_loss": 0.35778549313545227, + "step": 1732 + }, + { + "epoch": 2.5061460592913956, + "grad_norm": 3.047246994936488, + "learning_rate": 5.192863079409009e-06, + "logits/chosen": -0.22313661873340607, + "logits/rejected": -0.12890973687171936, + "logps/chosen": -0.3412661850452423, + "logps/rejected": -4.243041515350342, + "loss": 0.3225, + "odds_ratio_loss": 0.12478017061948776, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03412661701440811, + "rewards/margins": 0.39017754793167114, + "rewards/rejected": -0.42430418729782104, + "sft_loss": 0.3412661850452423, + "step": 1733 + }, + { + "epoch": 2.507592190889371, + "grad_norm": 2.490857880194928, + "learning_rate": 5.189898167053344e-06, + "logits/chosen": -0.1840282380580902, + "logits/rejected": -0.06506327539682388, + "logps/chosen": -0.26957932114601135, + "logps/rejected": -4.575106143951416, + "loss": 0.3101, + "odds_ratio_loss": 0.15700577199459076, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.026957932859659195, + "rewards/margins": 0.4305526614189148, + "rewards/rejected": -0.4575105905532837, + "sft_loss": 0.26957932114601135, + "step": 1734 + }, + { + "epoch": 2.5090383224873465, + "grad_norm": 2.9187090161830582, + "learning_rate": 5.186932537303642e-06, + "logits/chosen": -0.4572184383869171, + "logits/rejected": -0.3056212067604065, + "logps/chosen": -0.40126121044158936, + "logps/rejected": -3.7890090942382812, + "loss": 0.3877, + "odds_ratio_loss": 0.1334933638572693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04012611508369446, + "rewards/margins": 0.33877480030059814, + "rewards/rejected": -0.3789009153842926, + "sft_loss": 0.40126121044158936, + "step": 1735 + }, + { + "epoch": 2.5104844540853217, + "grad_norm": 2.377849930238879, + "learning_rate": 5.183966191947893e-06, + "logits/chosen": -0.2291964888572693, + "logits/rejected": -0.21579010784626007, + "logps/chosen": -0.360879123210907, + "logps/rejected": -2.2859559059143066, + "loss": 0.3647, + "odds_ratio_loss": 0.16899463534355164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03608791530132294, + "rewards/margins": 0.19250769913196564, + "rewards/rejected": -0.22859559953212738, + "sft_loss": 0.360879123210907, + "step": 1736 + }, + { + "epoch": 2.511930585683297, + "grad_norm": 2.6284904360262606, + "learning_rate": 5.180999132774517e-06, + "logits/chosen": -0.18004505336284637, + "logits/rejected": -0.17705950140953064, + "logps/chosen": -0.40871530771255493, + "logps/rejected": -3.158888101577759, + "loss": 0.3391, + "odds_ratio_loss": 0.15241380035877228, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04087153077125549, + "rewards/margins": 0.27501729130744934, + "rewards/rejected": -0.31588882207870483, + "sft_loss": 0.40871530771255493, + "step": 1737 + }, + { + "epoch": 2.5133767172812727, + "grad_norm": 2.8519916155319454, + "learning_rate": 5.1780313615723655e-06, + "logits/chosen": -0.13925258815288544, + "logits/rejected": -0.0671801045536995, + "logps/chosen": -0.35058093070983887, + "logps/rejected": -2.5616981983184814, + "loss": 0.3681, + "odds_ratio_loss": 0.16618746519088745, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03505809232592583, + "rewards/margins": 0.2211117297410965, + "rewards/rejected": -0.2561698257923126, + "sft_loss": 0.35058093070983887, + "step": 1738 + }, + { + "epoch": 2.514822848879248, + "grad_norm": 2.4797827030221598, + "learning_rate": 5.175062880130719e-06, + "logits/chosen": -0.11228927969932556, + "logits/rejected": -0.11741993576288223, + "logps/chosen": -0.4418836832046509, + "logps/rejected": -3.7302236557006836, + "loss": 0.3823, + "odds_ratio_loss": 0.1561032235622406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04418836906552315, + "rewards/margins": 0.3288339674472809, + "rewards/rejected": -0.3730223774909973, + "sft_loss": 0.4418836832046509, + "step": 1739 + }, + { + "epoch": 2.5162689804772236, + "grad_norm": 2.2932480834373874, + "learning_rate": 5.172093690239284e-06, + "logits/chosen": -0.031644146889448166, + "logits/rejected": -0.13634979724884033, + "logps/chosen": -0.25486522912979126, + "logps/rejected": -4.379749298095703, + "loss": 0.3217, + "odds_ratio_loss": 0.09034018963575363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025486523285508156, + "rewards/margins": 0.4124884307384491, + "rewards/rejected": -0.4379749298095703, + "sft_loss": 0.25486522912979126, + "step": 1740 + }, + { + "epoch": 2.517715112075199, + "grad_norm": 2.1654095204528847, + "learning_rate": 5.1691237936881994e-06, + "logits/chosen": -0.2752266526222229, + "logits/rejected": -0.2713732421398163, + "logps/chosen": -0.34550338983535767, + "logps/rejected": -2.8938426971435547, + "loss": 0.3392, + "odds_ratio_loss": 0.15460900962352753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03455033898353577, + "rewards/margins": 0.2548339366912842, + "rewards/rejected": -0.28938430547714233, + "sft_loss": 0.34550338983535767, + "step": 1741 + }, + { + "epoch": 2.5191612436731745, + "grad_norm": 2.6936346195386016, + "learning_rate": 5.166153192268025e-06, + "logits/chosen": -0.17216843366622925, + "logits/rejected": -0.24632829427719116, + "logps/chosen": -0.44458234310150146, + "logps/rejected": -2.9181787967681885, + "loss": 0.4975, + "odds_ratio_loss": 0.2072901576757431, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04445823282003403, + "rewards/margins": 0.24735963344573975, + "rewards/rejected": -0.29181787371635437, + "sft_loss": 0.44458234310150146, + "step": 1742 + }, + { + "epoch": 2.5206073752711498, + "grad_norm": 2.3809133922547736, + "learning_rate": 5.163181887769747e-06, + "logits/chosen": -0.2375820130109787, + "logits/rejected": -0.16164171695709229, + "logps/chosen": -0.2919715642929077, + "logps/rejected": -3.8255269527435303, + "loss": 0.345, + "odds_ratio_loss": 0.11046281456947327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02919716015458107, + "rewards/margins": 0.3533555567264557, + "rewards/rejected": -0.3825526833534241, + "sft_loss": 0.2919715642929077, + "step": 1743 + }, + { + "epoch": 2.522053506869125, + "grad_norm": 1.961472431039239, + "learning_rate": 5.160209881984777e-06, + "logits/chosen": -0.13250018656253815, + "logits/rejected": -0.08406895399093628, + "logps/chosen": -0.3599141836166382, + "logps/rejected": -3.0004963874816895, + "loss": 0.3582, + "odds_ratio_loss": 0.19820904731750488, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03599141910672188, + "rewards/margins": 0.2640582323074341, + "rewards/rejected": -0.30004966259002686, + "sft_loss": 0.3599141836166382, + "step": 1744 + }, + { + "epoch": 2.5234996384671007, + "grad_norm": 2.452443666828418, + "learning_rate": 5.15723717670495e-06, + "logits/chosen": -0.30242788791656494, + "logits/rejected": -0.23525869846343994, + "logps/chosen": -0.3961702883243561, + "logps/rejected": -2.2211079597473145, + "loss": 0.3799, + "odds_ratio_loss": 0.16140399873256683, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03961702808737755, + "rewards/margins": 0.18249374628067017, + "rewards/rejected": -0.222110778093338, + "sft_loss": 0.3961702883243561, + "step": 1745 + }, + { + "epoch": 2.524945770065076, + "grad_norm": 2.28124974891748, + "learning_rate": 5.154263773722517e-06, + "logits/chosen": -0.32361429929733276, + "logits/rejected": -0.21904835104942322, + "logps/chosen": -0.40971463918685913, + "logps/rejected": -3.0012941360473633, + "loss": 0.3653, + "odds_ratio_loss": 0.15136511623859406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04097146540880203, + "rewards/margins": 0.2591579556465149, + "rewards/rejected": -0.30012941360473633, + "sft_loss": 0.40971463918685913, + "step": 1746 + }, + { + "epoch": 2.526391901663051, + "grad_norm": 4.146284761192451, + "learning_rate": 5.151289674830156e-06, + "logits/chosen": -0.2679465711116791, + "logits/rejected": -0.2800706923007965, + "logps/chosen": -0.33803799748420715, + "logps/rejected": -4.649941921234131, + "loss": 0.4659, + "odds_ratio_loss": 0.09396877884864807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033803801983594894, + "rewards/margins": 0.4311903417110443, + "rewards/rejected": -0.4649941623210907, + "sft_loss": 0.33803799748420715, + "step": 1747 + }, + { + "epoch": 2.527838033261027, + "grad_norm": 2.5255140596275494, + "learning_rate": 5.1483148818209625e-06, + "logits/chosen": -0.2974938750267029, + "logits/rejected": -0.24165314435958862, + "logps/chosen": -0.2745737135410309, + "logps/rejected": -3.838913917541504, + "loss": 0.3668, + "odds_ratio_loss": 0.09007024019956589, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02745737135410309, + "rewards/margins": 0.35643401741981506, + "rewards/rejected": -0.38389140367507935, + "sft_loss": 0.2745737135410309, + "step": 1748 + }, + { + "epoch": 2.529284164859002, + "grad_norm": 2.338760253007698, + "learning_rate": 5.145339396488451e-06, + "logits/chosen": -0.24079769849777222, + "logits/rejected": -0.19569242000579834, + "logps/chosen": -0.3212064504623413, + "logps/rejected": -3.878096580505371, + "loss": 0.408, + "odds_ratio_loss": 0.11682701855897903, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03212064504623413, + "rewards/margins": 0.35568904876708984, + "rewards/rejected": -0.387809693813324, + "sft_loss": 0.3212064504623413, + "step": 1749 + }, + { + "epoch": 2.5307302964569773, + "grad_norm": 2.828854798037771, + "learning_rate": 5.142363220626551e-06, + "logits/chosen": -0.08817961812019348, + "logits/rejected": -0.052211862057447433, + "logps/chosen": -0.31730931997299194, + "logps/rejected": -1.8730766773223877, + "loss": 0.3626, + "odds_ratio_loss": 0.10939719527959824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03173093497753143, + "rewards/margins": 0.15557675063610077, + "rewards/rejected": -0.187307670712471, + "sft_loss": 0.31730931997299194, + "step": 1750 + }, + { + "epoch": 2.532176428054953, + "grad_norm": 2.378524988191866, + "learning_rate": 5.13938635602961e-06, + "logits/chosen": -0.2921004891395569, + "logits/rejected": -0.13167288899421692, + "logps/chosen": -0.4142140746116638, + "logps/rejected": -4.806788921356201, + "loss": 0.3473, + "odds_ratio_loss": 0.15716761350631714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04142140597105026, + "rewards/margins": 0.4392574727535248, + "rewards/rejected": -0.480678915977478, + "sft_loss": 0.4142140746116638, + "step": 1751 + }, + { + "epoch": 2.5336225596529283, + "grad_norm": 2.206670746385489, + "learning_rate": 5.136408804492392e-06, + "logits/chosen": -0.16804078221321106, + "logits/rejected": -0.12704414129257202, + "logps/chosen": -0.4827273190021515, + "logps/rejected": -3.0154800415039062, + "loss": 0.415, + "odds_ratio_loss": 0.19364948570728302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04827273264527321, + "rewards/margins": 0.2532752454280853, + "rewards/rejected": -0.30154797434806824, + "sft_loss": 0.4827273190021515, + "step": 1752 + }, + { + "epoch": 2.535068691250904, + "grad_norm": 2.7522459127137004, + "learning_rate": 5.133430567810073e-06, + "logits/chosen": -0.4092782437801361, + "logits/rejected": -0.4329729378223419, + "logps/chosen": -0.40332627296447754, + "logps/rejected": -2.8373968601226807, + "loss": 0.3651, + "odds_ratio_loss": 0.175297811627388, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04033263027667999, + "rewards/margins": 0.24340707063674927, + "rewards/rejected": -0.28373971581459045, + "sft_loss": 0.40332627296447754, + "step": 1753 + }, + { + "epoch": 2.536514822848879, + "grad_norm": 2.438351487855574, + "learning_rate": 5.1304516477782444e-06, + "logits/chosen": -0.32628947496414185, + "logits/rejected": -0.30095720291137695, + "logps/chosen": -0.3710482716560364, + "logps/rejected": -4.039909362792969, + "loss": 0.3771, + "odds_ratio_loss": 0.1693974733352661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03710482642054558, + "rewards/margins": 0.3668861389160156, + "rewards/rejected": -0.4039909541606903, + "sft_loss": 0.3710482716560364, + "step": 1754 + }, + { + "epoch": 2.537960954446855, + "grad_norm": 2.222626728874631, + "learning_rate": 5.127472046192904e-06, + "logits/chosen": -0.24637198448181152, + "logits/rejected": -0.21760375797748566, + "logps/chosen": -0.5183929204940796, + "logps/rejected": -4.226579666137695, + "loss": 0.3649, + "odds_ratio_loss": 0.17385032773017883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05183929204940796, + "rewards/margins": 0.3708186745643616, + "rewards/rejected": -0.4226579964160919, + "sft_loss": 0.5183929204940796, + "step": 1755 + }, + { + "epoch": 2.53940708604483, + "grad_norm": 2.3187889318353427, + "learning_rate": 5.12449176485047e-06, + "logits/chosen": -0.32632243633270264, + "logits/rejected": -0.23128627240657806, + "logps/chosen": -0.41758835315704346, + "logps/rejected": -3.34269380569458, + "loss": 0.3577, + "odds_ratio_loss": 0.1132265031337738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.041758835315704346, + "rewards/margins": 0.2925105690956116, + "rewards/rejected": -0.3342694044113159, + "sft_loss": 0.41758835315704346, + "step": 1756 + }, + { + "epoch": 2.5408532176428054, + "grad_norm": 2.454164577145383, + "learning_rate": 5.121510805547764e-06, + "logits/chosen": -0.2939985692501068, + "logits/rejected": -0.239321768283844, + "logps/chosen": -0.34688299894332886, + "logps/rejected": -2.9085657596588135, + "loss": 0.3901, + "odds_ratio_loss": 0.1403641402721405, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.034688301384449005, + "rewards/margins": 0.25616827607154846, + "rewards/rejected": -0.29085657000541687, + "sft_loss": 0.34688299894332886, + "step": 1757 + }, + { + "epoch": 2.542299349240781, + "grad_norm": 2.434087179736664, + "learning_rate": 5.118529170082016e-06, + "logits/chosen": -0.15723645687103271, + "logits/rejected": -0.11477909237146378, + "logps/chosen": -0.4263688325881958, + "logps/rejected": -2.351132392883301, + "loss": 0.3856, + "odds_ratio_loss": 0.2027743011713028, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04263688623905182, + "rewards/margins": 0.19247636198997498, + "rewards/rejected": -0.2351132482290268, + "sft_loss": 0.4263688325881958, + "step": 1758 + }, + { + "epoch": 2.5437454808387563, + "grad_norm": 2.066090453677344, + "learning_rate": 5.115546860250865e-06, + "logits/chosen": -0.20648221671581268, + "logits/rejected": -0.14154908061027527, + "logps/chosen": -0.4318174123764038, + "logps/rejected": -2.9517130851745605, + "loss": 0.3699, + "odds_ratio_loss": 0.22002653777599335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04318173974752426, + "rewards/margins": 0.25198957324028015, + "rewards/rejected": -0.295171320438385, + "sft_loss": 0.4318174123764038, + "step": 1759 + }, + { + "epoch": 2.5451916124367315, + "grad_norm": 3.54077227747576, + "learning_rate": 5.112563877852356e-06, + "logits/chosen": -0.29718634486198425, + "logits/rejected": -0.2714141607284546, + "logps/chosen": -0.4731605350971222, + "logps/rejected": -2.4305949211120605, + "loss": 0.3938, + "odds_ratio_loss": 0.15969420969486237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0473160557448864, + "rewards/margins": 0.19574342668056488, + "rewards/rejected": -0.24305948615074158, + "sft_loss": 0.4731605350971222, + "step": 1760 + }, + { + "epoch": 2.546637744034707, + "grad_norm": 2.1618260984771016, + "learning_rate": 5.1095802246849435e-06, + "logits/chosen": -0.2292264997959137, + "logits/rejected": -0.15521396696567535, + "logps/chosen": -0.3275163173675537, + "logps/rejected": -3.1341147422790527, + "loss": 0.4518, + "odds_ratio_loss": 0.1408037543296814, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03275163471698761, + "rewards/margins": 0.28065982460975647, + "rewards/rejected": -0.3134114444255829, + "sft_loss": 0.3275163173675537, + "step": 1761 + }, + { + "epoch": 2.5480838756326825, + "grad_norm": 2.568063340628784, + "learning_rate": 5.10659590254748e-06, + "logits/chosen": -0.2893018126487732, + "logits/rejected": -0.28161394596099854, + "logps/chosen": -0.3742448687553406, + "logps/rejected": -2.4380226135253906, + "loss": 0.3964, + "odds_ratio_loss": 0.12025295197963715, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.037424489855766296, + "rewards/margins": 0.206377774477005, + "rewards/rejected": -0.2438022643327713, + "sft_loss": 0.3742448687553406, + "step": 1762 + }, + { + "epoch": 2.549530007230658, + "grad_norm": 3.318379202971589, + "learning_rate": 5.103610913239225e-06, + "logits/chosen": -0.17492324113845825, + "logits/rejected": -0.1730252504348755, + "logps/chosen": -0.3307463526725769, + "logps/rejected": -4.673501014709473, + "loss": 0.401, + "odds_ratio_loss": 0.11378967761993408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03307463601231575, + "rewards/margins": 0.43427544832229614, + "rewards/rejected": -0.4673501253128052, + "sft_loss": 0.3307463526725769, + "step": 1763 + }, + { + "epoch": 2.5509761388286334, + "grad_norm": 2.2137214503057017, + "learning_rate": 5.100625258559841e-06, + "logits/chosen": -0.09644006192684174, + "logits/rejected": -0.12039525806903839, + "logps/chosen": -0.3201032280921936, + "logps/rejected": -2.2959609031677246, + "loss": 0.3397, + "odds_ratio_loss": 0.1471930742263794, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03201032429933548, + "rewards/margins": 0.19758576154708862, + "rewards/rejected": -0.2295960783958435, + "sft_loss": 0.3201032280921936, + "step": 1764 + }, + { + "epoch": 2.552422270426609, + "grad_norm": 2.450947199520994, + "learning_rate": 5.097638940309389e-06, + "logits/chosen": -0.32353049516677856, + "logits/rejected": -0.3000898063182831, + "logps/chosen": -0.41360723972320557, + "logps/rejected": -2.5127758979797363, + "loss": 0.3678, + "odds_ratio_loss": 0.15336181223392487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.041360728442668915, + "rewards/margins": 0.2099168598651886, + "rewards/rejected": -0.2512775957584381, + "sft_loss": 0.41360723972320557, + "step": 1765 + }, + { + "epoch": 2.5538684020245843, + "grad_norm": 2.4017350682281604, + "learning_rate": 5.094651960288332e-06, + "logits/chosen": -0.1385897994041443, + "logits/rejected": -0.19996079802513123, + "logps/chosen": -0.21565288305282593, + "logps/rejected": -4.9466094970703125, + "loss": 0.3049, + "odds_ratio_loss": 0.032502688467502594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02156529203057289, + "rewards/margins": 0.4730956256389618, + "rewards/rejected": -0.494660884141922, + "sft_loss": 0.21565288305282593, + "step": 1766 + }, + { + "epoch": 2.5553145336225596, + "grad_norm": 2.5067478353690413, + "learning_rate": 5.0916643202975305e-06, + "logits/chosen": -0.20666462182998657, + "logits/rejected": -0.2589164972305298, + "logps/chosen": -0.3715488612651825, + "logps/rejected": -2.179995059967041, + "loss": 0.441, + "odds_ratio_loss": 0.16341277956962585, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03715488687157631, + "rewards/margins": 0.18084463477134705, + "rewards/rejected": -0.21799951791763306, + "sft_loss": 0.3715488612651825, + "step": 1767 + }, + { + "epoch": 2.5567606652205352, + "grad_norm": 2.0202885964031836, + "learning_rate": 5.088676022138245e-06, + "logits/chosen": -0.23196789622306824, + "logits/rejected": -0.19383683800697327, + "logps/chosen": -0.24499648809432983, + "logps/rejected": -4.9586405754089355, + "loss": 0.259, + "odds_ratio_loss": 0.10917812585830688, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024499651044607162, + "rewards/margins": 0.47136440873146057, + "rewards/rejected": -0.49586403369903564, + "sft_loss": 0.24499648809432983, + "step": 1768 + }, + { + "epoch": 2.5582067968185105, + "grad_norm": 2.1272904237171466, + "learning_rate": 5.0856870676121304e-06, + "logits/chosen": -0.23364225029945374, + "logits/rejected": -0.2448486089706421, + "logps/chosen": -0.36238884925842285, + "logps/rejected": -3.3999645709991455, + "loss": 0.3577, + "odds_ratio_loss": 0.18717606365680695, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.036238886415958405, + "rewards/margins": 0.30375754833221436, + "rewards/rejected": -0.33999642729759216, + "sft_loss": 0.36238884925842285, + "step": 1769 + }, + { + "epoch": 2.5596529284164857, + "grad_norm": 2.284375817275679, + "learning_rate": 5.082697458521241e-06, + "logits/chosen": -0.25109994411468506, + "logits/rejected": -0.22882197797298431, + "logps/chosen": -0.40674877166748047, + "logps/rejected": -1.9313181638717651, + "loss": 0.398, + "odds_ratio_loss": 0.10357539355754852, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.040674880146980286, + "rewards/margins": 0.15245693922042847, + "rewards/rejected": -0.19313181936740875, + "sft_loss": 0.40674877166748047, + "step": 1770 + }, + { + "epoch": 2.5610990600144614, + "grad_norm": 2.4387901998478596, + "learning_rate": 5.079707196668019e-06, + "logits/chosen": -0.26109209656715393, + "logits/rejected": -0.16213780641555786, + "logps/chosen": -0.3069310784339905, + "logps/rejected": -5.687686920166016, + "loss": 0.312, + "odds_ratio_loss": 0.09986848384141922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030693108215928078, + "rewards/margins": 0.5380756258964539, + "rewards/rejected": -0.5687687397003174, + "sft_loss": 0.3069310784339905, + "step": 1771 + }, + { + "epoch": 2.5625451916124367, + "grad_norm": 2.6215024856040445, + "learning_rate": 5.076716283855309e-06, + "logits/chosen": -0.1803428828716278, + "logits/rejected": -0.22443482279777527, + "logps/chosen": -0.3120989203453064, + "logps/rejected": -2.8625402450561523, + "loss": 0.3838, + "odds_ratio_loss": 0.11180431395769119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03120989166200161, + "rewards/margins": 0.2550441324710846, + "rewards/rejected": -0.28625404834747314, + "sft_loss": 0.3120989203453064, + "step": 1772 + }, + { + "epoch": 2.563991323210412, + "grad_norm": 4.162773344211327, + "learning_rate": 5.073724721886341e-06, + "logits/chosen": -0.2028559148311615, + "logits/rejected": -0.3784460425376892, + "logps/chosen": -0.37769874930381775, + "logps/rejected": -3.5124316215515137, + "loss": 0.3303, + "odds_ratio_loss": 0.12656457722187042, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.037769876420497894, + "rewards/margins": 0.31347325444221497, + "rewards/rejected": -0.35124313831329346, + "sft_loss": 0.37769874930381775, + "step": 1773 + }, + { + "epoch": 2.5654374548083876, + "grad_norm": 3.3997712168625416, + "learning_rate": 5.0707325125647395e-06, + "logits/chosen": -0.26617467403411865, + "logits/rejected": -0.3048926293849945, + "logps/chosen": -0.2930275499820709, + "logps/rejected": -4.605068683624268, + "loss": 0.3076, + "odds_ratio_loss": 0.08594189584255219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02930275723338127, + "rewards/margins": 0.4312041103839874, + "rewards/rejected": -0.4605068266391754, + "sft_loss": 0.2930275499820709, + "step": 1774 + }, + { + "epoch": 2.566883586406363, + "grad_norm": 2.6963829649064133, + "learning_rate": 5.067739657694517e-06, + "logits/chosen": -0.2267257273197174, + "logits/rejected": -0.24845337867736816, + "logps/chosen": -0.40409570932388306, + "logps/rejected": -3.9773378372192383, + "loss": 0.368, + "odds_ratio_loss": 0.11968840658664703, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.040409572422504425, + "rewards/margins": 0.3573242425918579, + "rewards/rejected": -0.39773380756378174, + "sft_loss": 0.40409570932388306, + "step": 1775 + }, + { + "epoch": 2.5683297180043385, + "grad_norm": 2.727857096900295, + "learning_rate": 5.064746159080079e-06, + "logits/chosen": -0.2270076870918274, + "logits/rejected": -0.22585347294807434, + "logps/chosen": -0.37619680166244507, + "logps/rejected": -3.0323472023010254, + "loss": 0.4573, + "odds_ratio_loss": 0.12163165211677551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03761968016624451, + "rewards/margins": 0.2656150460243225, + "rewards/rejected": -0.30323469638824463, + "sft_loss": 0.37619680166244507, + "step": 1776 + }, + { + "epoch": 2.5697758496023138, + "grad_norm": 2.065413871507, + "learning_rate": 5.061752018526217e-06, + "logits/chosen": -0.2135777473449707, + "logits/rejected": -0.1215197741985321, + "logps/chosen": -0.18856766819953918, + "logps/rejected": -5.205729007720947, + "loss": 0.2787, + "odds_ratio_loss": 0.03695458173751831, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018856767565011978, + "rewards/margins": 0.501716136932373, + "rewards/rejected": -0.5205729007720947, + "sft_loss": 0.18856766819953918, + "step": 1777 + }, + { + "epoch": 2.5712219812002894, + "grad_norm": 3.612771418317987, + "learning_rate": 5.058757237838107e-06, + "logits/chosen": -0.28231173753738403, + "logits/rejected": -0.21534737944602966, + "logps/chosen": -0.25970977544784546, + "logps/rejected": -3.527226209640503, + "loss": 0.3124, + "odds_ratio_loss": 0.08613273501396179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025970978662371635, + "rewards/margins": 0.3267516493797302, + "rewards/rejected": -0.3527226448059082, + "sft_loss": 0.25970977544784546, + "step": 1778 + }, + { + "epoch": 2.5726681127982647, + "grad_norm": 2.3176478326208265, + "learning_rate": 5.0557618188213155e-06, + "logits/chosen": -0.22027641534805298, + "logits/rejected": -0.19152703881263733, + "logps/chosen": -0.2974584400653839, + "logps/rejected": -4.103922367095947, + "loss": 0.3498, + "odds_ratio_loss": 0.09893074631690979, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02974584326148033, + "rewards/margins": 0.3806464374065399, + "rewards/rejected": -0.41039225459098816, + "sft_loss": 0.2974584400653839, + "step": 1779 + }, + { + "epoch": 2.57411424439624, + "grad_norm": 2.6204193220471637, + "learning_rate": 5.052765763281792e-06, + "logits/chosen": -0.10732519626617432, + "logits/rejected": -0.1166638433933258, + "logps/chosen": -0.3432236313819885, + "logps/rejected": -4.708727836608887, + "loss": 0.3735, + "odds_ratio_loss": 0.1429092288017273, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03432236239314079, + "rewards/margins": 0.43655040860176086, + "rewards/rejected": -0.47087281942367554, + "sft_loss": 0.3432236313819885, + "step": 1780 + }, + { + "epoch": 2.5755603759942156, + "grad_norm": 2.364122545033094, + "learning_rate": 5.049769073025869e-06, + "logits/chosen": -0.2854674458503723, + "logits/rejected": -0.20616787672042847, + "logps/chosen": -0.365227073431015, + "logps/rejected": -3.223012924194336, + "loss": 0.3999, + "odds_ratio_loss": 0.14509882032871246, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03652270510792732, + "rewards/margins": 0.28577861189842224, + "rewards/rejected": -0.32230129837989807, + "sft_loss": 0.365227073431015, + "step": 1781 + }, + { + "epoch": 2.577006507592191, + "grad_norm": 2.3993429794021894, + "learning_rate": 5.046771749860261e-06, + "logits/chosen": -0.18015708029270172, + "logits/rejected": -0.12893535196781158, + "logps/chosen": -0.32794156670570374, + "logps/rejected": -3.0708723068237305, + "loss": 0.361, + "odds_ratio_loss": 0.0962129533290863, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03279416263103485, + "rewards/margins": 0.27429306507110596, + "rewards/rejected": -0.307087242603302, + "sft_loss": 0.32794156670570374, + "step": 1782 + }, + { + "epoch": 2.578452639190166, + "grad_norm": 2.244853886626483, + "learning_rate": 5.0437737955920665e-06, + "logits/chosen": -0.13234691321849823, + "logits/rejected": -0.19031468033790588, + "logps/chosen": -0.4550319015979767, + "logps/rejected": -2.549581527709961, + "loss": 0.3881, + "odds_ratio_loss": 0.12124250829219818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04550319164991379, + "rewards/margins": 0.2094549834728241, + "rewards/rejected": -0.2549581825733185, + "sft_loss": 0.4550319015979767, + "step": 1783 + }, + { + "epoch": 2.579898770788142, + "grad_norm": 2.451189109939498, + "learning_rate": 5.040775212028764e-06, + "logits/chosen": -0.19473308324813843, + "logits/rejected": -0.3043419122695923, + "logps/chosen": -0.41355323791503906, + "logps/rejected": -3.423119068145752, + "loss": 0.4049, + "odds_ratio_loss": 0.20922225713729858, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.041355326771736145, + "rewards/margins": 0.3009566068649292, + "rewards/rejected": -0.34231194853782654, + "sft_loss": 0.41355323791503906, + "step": 1784 + }, + { + "epoch": 2.581344902386117, + "grad_norm": 2.176181910862176, + "learning_rate": 5.03777600097821e-06, + "logits/chosen": -0.19760388135910034, + "logits/rejected": -0.17856121063232422, + "logps/chosen": -0.3006751537322998, + "logps/rejected": -3.3096654415130615, + "loss": 0.2758, + "odds_ratio_loss": 0.10000480711460114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03006751835346222, + "rewards/margins": 0.30089902877807617, + "rewards/rejected": -0.330966591835022, + "sft_loss": 0.3006751537322998, + "step": 1785 + }, + { + "epoch": 2.5827910339840927, + "grad_norm": 2.485935727916885, + "learning_rate": 5.034776164248639e-06, + "logits/chosen": -0.07469113171100616, + "logits/rejected": -0.1526949256658554, + "logps/chosen": -0.39017635583877563, + "logps/rejected": -3.454286575317383, + "loss": 0.3686, + "odds_ratio_loss": 0.14497698843479156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.039017632603645325, + "rewards/margins": 0.3064110279083252, + "rewards/rejected": -0.3454286754131317, + "sft_loss": 0.39017635583877563, + "step": 1786 + }, + { + "epoch": 2.584237165582068, + "grad_norm": 7.36586680246059, + "learning_rate": 5.031775703648665e-06, + "logits/chosen": -0.15478000044822693, + "logits/rejected": -0.13516290485858917, + "logps/chosen": -0.469347208738327, + "logps/rejected": -2.471134662628174, + "loss": 0.35, + "odds_ratio_loss": 0.15310752391815186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04693472385406494, + "rewards/margins": 0.20017877221107483, + "rewards/rejected": -0.24711349606513977, + "sft_loss": 0.469347208738327, + "step": 1787 + }, + { + "epoch": 2.5856832971800436, + "grad_norm": 2.21099952495446, + "learning_rate": 5.028774620987278e-06, + "logits/chosen": -0.14791055023670197, + "logits/rejected": -0.17355845868587494, + "logps/chosen": -0.3638818860054016, + "logps/rejected": -5.218241214752197, + "loss": 0.4219, + "odds_ratio_loss": 0.11315981298685074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03638819232583046, + "rewards/margins": 0.48543596267700195, + "rewards/rejected": -0.5218241214752197, + "sft_loss": 0.3638818860054016, + "step": 1788 + }, + { + "epoch": 2.587129428778019, + "grad_norm": 2.2439650907311957, + "learning_rate": 5.025772918073839e-06, + "logits/chosen": -0.22446078062057495, + "logits/rejected": -0.3045191168785095, + "logps/chosen": -0.48371070623397827, + "logps/rejected": -1.8402783870697021, + "loss": 0.3619, + "odds_ratio_loss": 0.21439674496650696, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04837106913328171, + "rewards/margins": 0.13565678894519806, + "rewards/rejected": -0.18402786552906036, + "sft_loss": 0.48371070623397827, + "step": 1789 + }, + { + "epoch": 2.588575560375994, + "grad_norm": 2.4998273129519264, + "learning_rate": 5.0227705967180875e-06, + "logits/chosen": -0.2764180302619934, + "logits/rejected": -0.19125843048095703, + "logps/chosen": -0.33420807123184204, + "logps/rejected": -2.7277328968048096, + "loss": 0.4544, + "odds_ratio_loss": 0.10430102050304413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033420804888010025, + "rewards/margins": 0.2393524944782257, + "rewards/rejected": -0.2727733254432678, + "sft_loss": 0.33420807123184204, + "step": 1790 + }, + { + "epoch": 2.59002169197397, + "grad_norm": 2.9213180750521617, + "learning_rate": 5.019767658730133e-06, + "logits/chosen": -0.166767418384552, + "logits/rejected": -0.13819444179534912, + "logps/chosen": -0.3064895272254944, + "logps/rejected": -2.5118443965911865, + "loss": 0.3143, + "odds_ratio_loss": 0.14545565843582153, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.030648954212665558, + "rewards/margins": 0.22053547203540802, + "rewards/rejected": -0.2511844336986542, + "sft_loss": 0.3064895272254944, + "step": 1791 + }, + { + "epoch": 2.591467823571945, + "grad_norm": 2.3230275123604027, + "learning_rate": 5.016764105920462e-06, + "logits/chosen": -0.2184886634349823, + "logits/rejected": -0.24857985973358154, + "logps/chosen": -0.43511849641799927, + "logps/rejected": -3.9661684036254883, + "loss": 0.4028, + "odds_ratio_loss": 0.19005386531352997, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04351184517145157, + "rewards/margins": 0.35310500860214233, + "rewards/rejected": -0.3966168761253357, + "sft_loss": 0.43511849641799927, + "step": 1792 + }, + { + "epoch": 2.5929139551699203, + "grad_norm": 2.3592440154154897, + "learning_rate": 5.013759940099921e-06, + "logits/chosen": -0.19317913055419922, + "logits/rejected": -0.22358053922653198, + "logps/chosen": -0.2777489125728607, + "logps/rejected": -3.4764034748077393, + "loss": 0.3779, + "odds_ratio_loss": 0.07792896032333374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02777489274740219, + "rewards/margins": 0.3198654353618622, + "rewards/rejected": -0.34764033555984497, + "sft_loss": 0.2777489125728607, + "step": 1793 + }, + { + "epoch": 2.594360086767896, + "grad_norm": 2.388230189802143, + "learning_rate": 5.010755163079739e-06, + "logits/chosen": -0.12977398931980133, + "logits/rejected": -0.26473286747932434, + "logps/chosen": -0.42577025294303894, + "logps/rejected": -2.8500428199768066, + "loss": 0.4189, + "odds_ratio_loss": 0.19891750812530518, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04257702827453613, + "rewards/margins": 0.242427259683609, + "rewards/rejected": -0.28500428795814514, + "sft_loss": 0.42577025294303894, + "step": 1794 + }, + { + "epoch": 2.595806218365871, + "grad_norm": 2.622254082892315, + "learning_rate": 5.007749776671503e-06, + "logits/chosen": -0.282731294631958, + "logits/rejected": -0.17344717681407928, + "logps/chosen": -0.33832526206970215, + "logps/rejected": -2.8043172359466553, + "loss": 0.3371, + "odds_ratio_loss": 0.10820700973272324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033832527697086334, + "rewards/margins": 0.2465991973876953, + "rewards/rejected": -0.28043174743652344, + "sft_loss": 0.33832526206970215, + "step": 1795 + }, + { + "epoch": 2.5972523499638465, + "grad_norm": 2.535611097183941, + "learning_rate": 5.0047437826871745e-06, + "logits/chosen": -0.29366135597229004, + "logits/rejected": -0.29001742601394653, + "logps/chosen": -0.37644755840301514, + "logps/rejected": -2.264528274536133, + "loss": 0.3912, + "odds_ratio_loss": 0.19424787163734436, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03764475882053375, + "rewards/margins": 0.18880808353424072, + "rewards/rejected": -0.22645285725593567, + "sft_loss": 0.37644755840301514, + "step": 1796 + }, + { + "epoch": 2.598698481561822, + "grad_norm": 2.46192301019977, + "learning_rate": 5.001737182939077e-06, + "logits/chosen": -0.32444244623184204, + "logits/rejected": -0.18136954307556152, + "logps/chosen": -0.4060314893722534, + "logps/rejected": -2.0913939476013184, + "loss": 0.3662, + "odds_ratio_loss": 0.17886869609355927, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0406031496822834, + "rewards/margins": 0.1685362458229065, + "rewards/rejected": -0.2091393917798996, + "sft_loss": 0.4060314893722534, + "step": 1797 + }, + { + "epoch": 2.6001446131597974, + "grad_norm": 2.4189945586758403, + "learning_rate": 4.9987299792399014e-06, + "logits/chosen": -0.26979711651802063, + "logits/rejected": -0.22070440649986267, + "logps/chosen": -0.47620007395744324, + "logps/rejected": -4.007668972015381, + "loss": 0.4251, + "odds_ratio_loss": 0.14132900536060333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.047620005905628204, + "rewards/margins": 0.35314691066741943, + "rewards/rejected": -0.40076693892478943, + "sft_loss": 0.47620007395744324, + "step": 1798 + }, + { + "epoch": 2.601590744757773, + "grad_norm": 2.296226400698763, + "learning_rate": 4.995722173402702e-06, + "logits/chosen": -0.3933625817298889, + "logits/rejected": -0.24747397005558014, + "logps/chosen": -0.3423520028591156, + "logps/rejected": -2.0690927505493164, + "loss": 0.4247, + "odds_ratio_loss": 0.1446721851825714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03423519805073738, + "rewards/margins": 0.17267407476902008, + "rewards/rejected": -0.20690926909446716, + "sft_loss": 0.3423520028591156, + "step": 1799 + }, + { + "epoch": 2.6030368763557483, + "grad_norm": 2.337562593004367, + "learning_rate": 4.9927137672408955e-06, + "logits/chosen": -0.24255606532096863, + "logits/rejected": -0.38033175468444824, + "logps/chosen": -0.4118923246860504, + "logps/rejected": -3.219302177429199, + "loss": 0.3441, + "odds_ratio_loss": 0.22141076624393463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04118923097848892, + "rewards/margins": 0.28074100613594055, + "rewards/rejected": -0.3219302296638489, + "sft_loss": 0.4118923246860504, + "step": 1800 + }, + { + "epoch": 2.604483007953724, + "grad_norm": 2.4080451586988003, + "learning_rate": 4.989704762568262e-06, + "logits/chosen": -0.2137508988380432, + "logits/rejected": -0.2190382331609726, + "logps/chosen": -0.29253143072128296, + "logps/rejected": -5.632781982421875, + "loss": 0.3804, + "odds_ratio_loss": 0.06299417465925217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029253143817186356, + "rewards/margins": 0.5340250730514526, + "rewards/rejected": -0.5632781982421875, + "sft_loss": 0.29253143072128296, + "step": 1801 + }, + { + "epoch": 2.6059291395516992, + "grad_norm": 2.640139404715311, + "learning_rate": 4.986695161198939e-06, + "logits/chosen": -0.3997032046318054, + "logits/rejected": -0.3515118360519409, + "logps/chosen": -0.3487919569015503, + "logps/rejected": -3.5432519912719727, + "loss": 0.3935, + "odds_ratio_loss": 0.08523302525281906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03487919270992279, + "rewards/margins": 0.31944602727890015, + "rewards/rejected": -0.35432523488998413, + "sft_loss": 0.3487919569015503, + "step": 1802 + }, + { + "epoch": 2.6073752711496745, + "grad_norm": 6.59977106509988, + "learning_rate": 4.98368496494743e-06, + "logits/chosen": -0.07293814420700073, + "logits/rejected": -0.09770108759403229, + "logps/chosen": -0.3182450234889984, + "logps/rejected": -5.30604362487793, + "loss": 0.4086, + "odds_ratio_loss": 0.1032043844461441, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0318245030939579, + "rewards/margins": 0.498779833316803, + "rewards/rejected": -0.530604362487793, + "sft_loss": 0.3182450234889984, + "step": 1803 + }, + { + "epoch": 2.60882140274765, + "grad_norm": 2.1346639954836726, + "learning_rate": 4.980674175628593e-06, + "logits/chosen": -0.3480367660522461, + "logits/rejected": -0.2504662573337555, + "logps/chosen": -0.270319402217865, + "logps/rejected": -4.932693958282471, + "loss": 0.3592, + "odds_ratio_loss": 0.10022924840450287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02703193947672844, + "rewards/margins": 0.46623745560646057, + "rewards/rejected": -0.4932693839073181, + "sft_loss": 0.270319402217865, + "step": 1804 + }, + { + "epoch": 2.6102675343456254, + "grad_norm": 2.214914283556716, + "learning_rate": 4.977662795057641e-06, + "logits/chosen": -0.3243568539619446, + "logits/rejected": -0.20184937119483948, + "logps/chosen": -0.38584083318710327, + "logps/rejected": -4.922272205352783, + "loss": 0.4123, + "odds_ratio_loss": 0.14743968844413757, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03858408331871033, + "rewards/margins": 0.45364317297935486, + "rewards/rejected": -0.4922272562980652, + "sft_loss": 0.38584083318710327, + "step": 1805 + }, + { + "epoch": 2.6117136659436007, + "grad_norm": 2.396234219132774, + "learning_rate": 4.974650825050149e-06, + "logits/chosen": -0.3889038860797882, + "logits/rejected": -0.39398396015167236, + "logps/chosen": -0.4082852900028229, + "logps/rejected": -2.398747682571411, + "loss": 0.3867, + "odds_ratio_loss": 0.15101824700832367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.040828533470630646, + "rewards/margins": 0.19904625415802002, + "rewards/rejected": -0.23987479507923126, + "sft_loss": 0.4082852900028229, + "step": 1806 + }, + { + "epoch": 2.6131597975415763, + "grad_norm": 2.170326355834699, + "learning_rate": 4.971638267422046e-06, + "logits/chosen": -0.45325085520744324, + "logits/rejected": -0.3461891710758209, + "logps/chosen": -0.3370608687400818, + "logps/rejected": -3.079336166381836, + "loss": 0.3023, + "odds_ratio_loss": 0.09138752520084381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03370608389377594, + "rewards/margins": 0.2742275297641754, + "rewards/rejected": -0.30793359875679016, + "sft_loss": 0.3370608687400818, + "step": 1807 + }, + { + "epoch": 2.6146059291395516, + "grad_norm": 2.4688817277696695, + "learning_rate": 4.968625123989612e-06, + "logits/chosen": -0.28213053941726685, + "logits/rejected": -0.28659752011299133, + "logps/chosen": -0.29719579219818115, + "logps/rejected": -3.136472225189209, + "loss": 0.3744, + "odds_ratio_loss": 0.08925367891788483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029719576239585876, + "rewards/margins": 0.28392767906188965, + "rewards/rejected": -0.31364724040031433, + "sft_loss": 0.29719579219818115, + "step": 1808 + }, + { + "epoch": 2.6160520607375273, + "grad_norm": 2.7610315458080086, + "learning_rate": 4.965611396569483e-06, + "logits/chosen": -0.25537610054016113, + "logits/rejected": -0.23609262704849243, + "logps/chosen": -0.4210742712020874, + "logps/rejected": -3.389645576477051, + "loss": 0.4538, + "odds_ratio_loss": 0.15438979864120483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04210742563009262, + "rewards/margins": 0.2968571186065674, + "rewards/rejected": -0.3389645218849182, + "sft_loss": 0.4210742712020874, + "step": 1809 + }, + { + "epoch": 2.6174981923355025, + "grad_norm": 2.3541496517196685, + "learning_rate": 4.962597086978646e-06, + "logits/chosen": -0.16515159606933594, + "logits/rejected": -0.14388681948184967, + "logps/chosen": -0.48628005385398865, + "logps/rejected": -4.436060905456543, + "loss": 0.3932, + "odds_ratio_loss": 0.19803836941719055, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04862800985574722, + "rewards/margins": 0.39497804641723633, + "rewards/rejected": -0.44360604882240295, + "sft_loss": 0.48628005385398865, + "step": 1810 + }, + { + "epoch": 2.618944323933478, + "grad_norm": 2.4441155908403838, + "learning_rate": 4.959582197034442e-06, + "logits/chosen": -0.2943183183670044, + "logits/rejected": -0.2773338258266449, + "logps/chosen": -0.35410210490226746, + "logps/rejected": -4.187530994415283, + "loss": 0.311, + "odds_ratio_loss": 0.07550371438264847, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.035410210490226746, + "rewards/margins": 0.3833428919315338, + "rewards/rejected": -0.41875308752059937, + "sft_loss": 0.35410210490226746, + "step": 1811 + }, + { + "epoch": 2.6203904555314534, + "grad_norm": 11.302558835724303, + "learning_rate": 4.956566728554556e-06, + "logits/chosen": -0.1322237253189087, + "logits/rejected": -0.163202166557312, + "logps/chosen": -0.34461265802383423, + "logps/rejected": -4.301257133483887, + "loss": 0.4088, + "odds_ratio_loss": 0.13887152075767517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03446126729249954, + "rewards/margins": 0.3956645131111145, + "rewards/rejected": -0.43012574315071106, + "sft_loss": 0.34461265802383423, + "step": 1812 + }, + { + "epoch": 2.6218365871294287, + "grad_norm": 2.472796208481264, + "learning_rate": 4.953550683357027e-06, + "logits/chosen": -0.27975982427597046, + "logits/rejected": -0.222725510597229, + "logps/chosen": -0.3341180086135864, + "logps/rejected": -2.5925729274749756, + "loss": 0.3509, + "odds_ratio_loss": 0.14651554822921753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03341180086135864, + "rewards/margins": 0.22584550082683563, + "rewards/rejected": -0.25925731658935547, + "sft_loss": 0.3341180086135864, + "step": 1813 + }, + { + "epoch": 2.6232827187274044, + "grad_norm": 2.7613022352029306, + "learning_rate": 4.95053406326024e-06, + "logits/chosen": -0.28327706456184387, + "logits/rejected": -0.2542087435722351, + "logps/chosen": -0.3347577452659607, + "logps/rejected": -2.825500011444092, + "loss": 0.3515, + "odds_ratio_loss": 0.17398758232593536, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03347577154636383, + "rewards/margins": 0.24907422065734863, + "rewards/rejected": -0.28255000710487366, + "sft_loss": 0.3347577452659607, + "step": 1814 + }, + { + "epoch": 2.6247288503253796, + "grad_norm": 3.439891250550689, + "learning_rate": 4.947516870082926e-06, + "logits/chosen": -0.31636762619018555, + "logits/rejected": -0.2951328456401825, + "logps/chosen": -0.3941154479980469, + "logps/rejected": -3.2448151111602783, + "loss": 0.3978, + "odds_ratio_loss": 0.15645679831504822, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03941154479980469, + "rewards/margins": 0.28506994247436523, + "rewards/rejected": -0.32448145747184753, + "sft_loss": 0.3941154479980469, + "step": 1815 + }, + { + "epoch": 2.626174981923355, + "grad_norm": 2.899928062378467, + "learning_rate": 4.944499105644163e-06, + "logits/chosen": -0.3948509991168976, + "logits/rejected": -0.2616193890571594, + "logps/chosen": -0.4997680187225342, + "logps/rejected": -2.383761405944824, + "loss": 0.3824, + "odds_ratio_loss": 0.12019702047109604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04997680336236954, + "rewards/margins": 0.18839934468269348, + "rewards/rejected": -0.23837614059448242, + "sft_loss": 0.4997680187225342, + "step": 1816 + }, + { + "epoch": 2.6276211135213305, + "grad_norm": 2.326957425313526, + "learning_rate": 4.94148077176337e-06, + "logits/chosen": -0.3444955348968506, + "logits/rejected": -0.4002199172973633, + "logps/chosen": -0.2849689722061157, + "logps/rejected": -4.096587657928467, + "loss": 0.2927, + "odds_ratio_loss": 0.10544468462467194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028496896848082542, + "rewards/margins": 0.3811618983745575, + "rewards/rejected": -0.4096587598323822, + "sft_loss": 0.2849689722061157, + "step": 1817 + }, + { + "epoch": 2.629067245119306, + "grad_norm": 3.583265711689172, + "learning_rate": 4.938461870260314e-06, + "logits/chosen": -0.31613588333129883, + "logits/rejected": -0.24397434294223785, + "logps/chosen": -0.352634072303772, + "logps/rejected": -4.161520957946777, + "loss": 0.3471, + "odds_ratio_loss": 0.0841374322772026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03526340797543526, + "rewards/margins": 0.3808886408805847, + "rewards/rejected": -0.41615208983421326, + "sft_loss": 0.352634072303772, + "step": 1818 + }, + { + "epoch": 2.630513376717281, + "grad_norm": 2.1560310075706046, + "learning_rate": 4.9354424029551005e-06, + "logits/chosen": -0.43916022777557373, + "logits/rejected": -0.18733128905296326, + "logps/chosen": -0.25644809007644653, + "logps/rejected": -2.241384267807007, + "loss": 0.3276, + "odds_ratio_loss": 0.112449511885643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025644807144999504, + "rewards/margins": 0.1984936147928238, + "rewards/rejected": -0.22413842380046844, + "sft_loss": 0.25644809007644653, + "step": 1819 + }, + { + "epoch": 2.6319595083152567, + "grad_norm": 2.445334001860914, + "learning_rate": 4.93242237166818e-06, + "logits/chosen": -0.257645845413208, + "logits/rejected": -0.2424604594707489, + "logps/chosen": -0.39798110723495483, + "logps/rejected": -4.620807647705078, + "loss": 0.3767, + "odds_ratio_loss": 0.09533874690532684, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03979811072349548, + "rewards/margins": 0.4222826063632965, + "rewards/rejected": -0.462080717086792, + "sft_loss": 0.39798110723495483, + "step": 1820 + }, + { + "epoch": 2.633405639913232, + "grad_norm": 2.917112963148812, + "learning_rate": 4.929401778220337e-06, + "logits/chosen": -0.41579800844192505, + "logits/rejected": -0.2872324585914612, + "logps/chosen": -0.2373175173997879, + "logps/rejected": -5.390769004821777, + "loss": 0.3582, + "odds_ratio_loss": 0.05972566083073616, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02373175323009491, + "rewards/margins": 0.5153451561927795, + "rewards/rejected": -0.5390769243240356, + "sft_loss": 0.2373175173997879, + "step": 1821 + }, + { + "epoch": 2.6348517715112076, + "grad_norm": 2.2456614622953714, + "learning_rate": 4.926380624432701e-06, + "logits/chosen": -0.19419153034687042, + "logits/rejected": -0.18315161764621735, + "logps/chosen": -0.46090608835220337, + "logps/rejected": -3.4876163005828857, + "loss": 0.3824, + "odds_ratio_loss": 0.1783159077167511, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.046090610325336456, + "rewards/margins": 0.30267101526260376, + "rewards/rejected": -0.3487616181373596, + "sft_loss": 0.46090608835220337, + "step": 1822 + }, + { + "epoch": 2.636297903109183, + "grad_norm": 2.510652257091109, + "learning_rate": 4.923358912126737e-06, + "logits/chosen": -0.20722374320030212, + "logits/rejected": -0.10644985735416412, + "logps/chosen": -0.4585767388343811, + "logps/rejected": -4.008731842041016, + "loss": 0.4682, + "odds_ratio_loss": 0.17287862300872803, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04585767537355423, + "rewards/margins": 0.35501551628112793, + "rewards/rejected": -0.40087318420410156, + "sft_loss": 0.4585767388343811, + "step": 1823 + }, + { + "epoch": 2.6377440347071586, + "grad_norm": 3.897456193625065, + "learning_rate": 4.920336643124245e-06, + "logits/chosen": -0.038326654583215714, + "logits/rejected": -0.1093643456697464, + "logps/chosen": -0.27784785628318787, + "logps/rejected": -4.6210737228393555, + "loss": 0.3785, + "odds_ratio_loss": 0.08139775693416595, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.027784785255789757, + "rewards/margins": 0.4343225955963135, + "rewards/rejected": -0.4621073603630066, + "sft_loss": 0.27784785628318787, + "step": 1824 + }, + { + "epoch": 2.639190166305134, + "grad_norm": 2.560995261737236, + "learning_rate": 4.917313819247363e-06, + "logits/chosen": -0.1732892543077469, + "logits/rejected": -0.25222983956336975, + "logps/chosen": -0.3237385153770447, + "logps/rejected": -2.947514295578003, + "loss": 0.3709, + "odds_ratio_loss": 0.16104400157928467, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03237384930253029, + "rewards/margins": 0.2623775899410248, + "rewards/rejected": -0.29475143551826477, + "sft_loss": 0.3237385153770447, + "step": 1825 + }, + { + "epoch": 2.640636297903109, + "grad_norm": 2.765067172884269, + "learning_rate": 4.914290442318564e-06, + "logits/chosen": -0.31066641211509705, + "logits/rejected": -0.15391992032527924, + "logps/chosen": -0.32306790351867676, + "logps/rejected": -2.4956536293029785, + "loss": 0.3796, + "odds_ratio_loss": 0.11629980057477951, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.032306790351867676, + "rewards/margins": 0.21725855767726898, + "rewards/rejected": -0.24956536293029785, + "sft_loss": 0.32306790351867676, + "step": 1826 + }, + { + "epoch": 2.6420824295010847, + "grad_norm": 2.2647017949805015, + "learning_rate": 4.911266514160652e-06, + "logits/chosen": -0.23672957718372345, + "logits/rejected": -0.21628616750240326, + "logps/chosen": -0.3888474106788635, + "logps/rejected": -3.2323203086853027, + "loss": 0.3573, + "odds_ratio_loss": 0.22670409083366394, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03888474404811859, + "rewards/margins": 0.2843472957611084, + "rewards/rejected": -0.3232320547103882, + "sft_loss": 0.3888474106788635, + "step": 1827 + }, + { + "epoch": 2.64352856109906, + "grad_norm": 2.2611203101933013, + "learning_rate": 4.908242036596764e-06, + "logits/chosen": -0.1369129866361618, + "logits/rejected": -0.2426975816488266, + "logps/chosen": -0.4664827883243561, + "logps/rejected": -2.282384157180786, + "loss": 0.3863, + "odds_ratio_loss": 0.2692616283893585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04664827883243561, + "rewards/margins": 0.18159013986587524, + "rewards/rejected": -0.22823843359947205, + "sft_loss": 0.4664827883243561, + "step": 1828 + }, + { + "epoch": 2.6449746926970352, + "grad_norm": 3.4362278755855487, + "learning_rate": 4.905217011450371e-06, + "logits/chosen": -0.2658514976501465, + "logits/rejected": -0.13316062092781067, + "logps/chosen": -0.42111894488334656, + "logps/rejected": -3.255000591278076, + "loss": 0.3749, + "odds_ratio_loss": 0.09841261804103851, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.042111895978450775, + "rewards/margins": 0.2833881676197052, + "rewards/rejected": -0.3255000710487366, + "sft_loss": 0.42111894488334656, + "step": 1829 + }, + { + "epoch": 2.646420824295011, + "grad_norm": 2.5230284188198855, + "learning_rate": 4.902191440545269e-06, + "logits/chosen": -0.1730635017156601, + "logits/rejected": -0.08948790282011032, + "logps/chosen": -0.35513409972190857, + "logps/rejected": -2.901704788208008, + "loss": 0.428, + "odds_ratio_loss": 0.09851216524839401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03551340475678444, + "rewards/margins": 0.2546570599079132, + "rewards/rejected": -0.29017049074172974, + "sft_loss": 0.35513409972190857, + "step": 1830 + }, + { + "epoch": 2.647866955892986, + "grad_norm": 4.862374325559571, + "learning_rate": 4.899165325705588e-06, + "logits/chosen": -0.11161284148693085, + "logits/rejected": -0.31778573989868164, + "logps/chosen": -0.2354210615158081, + "logps/rejected": -4.787618637084961, + "loss": 0.3417, + "odds_ratio_loss": 0.1100718304514885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02354210615158081, + "rewards/margins": 0.45521974563598633, + "rewards/rejected": -0.4787618815898895, + "sft_loss": 0.2354210615158081, + "step": 1831 + }, + { + "epoch": 2.649313087490962, + "grad_norm": 2.260787046159271, + "learning_rate": 4.896138668755783e-06, + "logits/chosen": -0.09058425575494766, + "logits/rejected": 0.016488205641508102, + "logps/chosen": -0.2910672724246979, + "logps/rejected": -2.503805637359619, + "loss": 0.3102, + "odds_ratio_loss": 0.15633143484592438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029106728732585907, + "rewards/margins": 0.22127383947372437, + "rewards/rejected": -0.25038057565689087, + "sft_loss": 0.2910672724246979, + "step": 1832 + }, + { + "epoch": 2.650759219088937, + "grad_norm": 2.1646837648987343, + "learning_rate": 4.893111471520637e-06, + "logits/chosen": -0.2946851849555969, + "logits/rejected": -0.3171956539154053, + "logps/chosen": -0.4242430329322815, + "logps/rejected": -4.413708209991455, + "loss": 0.3374, + "odds_ratio_loss": 0.13221263885498047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04242429882287979, + "rewards/margins": 0.39894652366638184, + "rewards/rejected": -0.4413708448410034, + "sft_loss": 0.4242430329322815, + "step": 1833 + }, + { + "epoch": 2.6522053506869128, + "grad_norm": 2.3527904907517105, + "learning_rate": 4.890083735825257e-06, + "logits/chosen": -0.12634724378585815, + "logits/rejected": -0.18269193172454834, + "logps/chosen": -0.3488408625125885, + "logps/rejected": -3.032761573791504, + "loss": 0.3689, + "odds_ratio_loss": 0.16602487862110138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03488408774137497, + "rewards/margins": 0.26839208602905273, + "rewards/rejected": -0.3032761812210083, + "sft_loss": 0.3488408625125885, + "step": 1834 + }, + { + "epoch": 2.653651482284888, + "grad_norm": 2.332969487913798, + "learning_rate": 4.88705546349508e-06, + "logits/chosen": -0.41772475838661194, + "logits/rejected": -0.2163325548171997, + "logps/chosen": -0.31502100825309753, + "logps/rejected": -3.173156976699829, + "loss": 0.3645, + "odds_ratio_loss": 0.10136070847511292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03150210529565811, + "rewards/margins": 0.2858135998249054, + "rewards/rejected": -0.3173157274723053, + "sft_loss": 0.31502100825309753, + "step": 1835 + }, + { + "epoch": 2.6550976138828633, + "grad_norm": 2.353448236079133, + "learning_rate": 4.884026656355859e-06, + "logits/chosen": -0.16601859033107758, + "logits/rejected": -0.1426706314086914, + "logps/chosen": -0.29813799262046814, + "logps/rejected": -3.2178196907043457, + "loss": 0.371, + "odds_ratio_loss": 0.07204075902700424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029813801869750023, + "rewards/margins": 0.2919681668281555, + "rewards/rejected": -0.3217819929122925, + "sft_loss": 0.29813799262046814, + "step": 1836 + }, + { + "epoch": 2.656543745480839, + "grad_norm": 2.5198880328260858, + "learning_rate": 4.880997316233675e-06, + "logits/chosen": -0.08051356673240662, + "logits/rejected": -0.03858156129717827, + "logps/chosen": -0.27724963426589966, + "logps/rejected": -2.3203272819519043, + "loss": 0.3583, + "odds_ratio_loss": 0.1639135181903839, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.027724966406822205, + "rewards/margins": 0.20430777966976166, + "rewards/rejected": -0.23203276097774506, + "sft_loss": 0.27724963426589966, + "step": 1837 + }, + { + "epoch": 2.657989877078814, + "grad_norm": 2.2869035506328763, + "learning_rate": 4.877967444954928e-06, + "logits/chosen": -0.11426656693220139, + "logits/rejected": -0.21190780401229858, + "logps/chosen": -0.27625131607055664, + "logps/rejected": -3.109833002090454, + "loss": 0.3755, + "odds_ratio_loss": 0.08180101215839386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027625130489468575, + "rewards/margins": 0.2833581864833832, + "rewards/rejected": -0.3109833002090454, + "sft_loss": 0.27625131607055664, + "step": 1838 + }, + { + "epoch": 2.6594360086767894, + "grad_norm": 2.057596946172654, + "learning_rate": 4.874937044346338e-06, + "logits/chosen": -0.18351957201957703, + "logits/rejected": -0.19910424947738647, + "logps/chosen": -0.3876161575317383, + "logps/rejected": -4.693860054016113, + "loss": 0.3362, + "odds_ratio_loss": 0.13495394587516785, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03876161575317383, + "rewards/margins": 0.43062442541122437, + "rewards/rejected": -0.4693860709667206, + "sft_loss": 0.3876161575317383, + "step": 1839 + }, + { + "epoch": 2.660882140274765, + "grad_norm": 2.296001222551746, + "learning_rate": 4.871906116234946e-06, + "logits/chosen": -0.23469124734401703, + "logits/rejected": -0.22504733502864838, + "logps/chosen": -0.2928099036216736, + "logps/rejected": -5.243224143981934, + "loss": 0.342, + "odds_ratio_loss": 0.0769428163766861, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029280992224812508, + "rewards/margins": 0.49504148960113525, + "rewards/rejected": -0.524322509765625, + "sft_loss": 0.2928099036216736, + "step": 1840 + }, + { + "epoch": 2.6623282718727403, + "grad_norm": 2.686575716206401, + "learning_rate": 4.868874662448108e-06, + "logits/chosen": -0.14365555346012115, + "logits/rejected": -0.13080452382564545, + "logps/chosen": -0.29430562257766724, + "logps/rejected": -4.659070014953613, + "loss": 0.3909, + "odds_ratio_loss": 0.09934721887111664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029430562630295753, + "rewards/margins": 0.4364764392375946, + "rewards/rejected": -0.4659070074558258, + "sft_loss": 0.29430562257766724, + "step": 1841 + }, + { + "epoch": 2.6637744034707156, + "grad_norm": 2.3024896264526964, + "learning_rate": 4.865842684813501e-06, + "logits/chosen": -0.14090366661548615, + "logits/rejected": -0.2288772016763687, + "logps/chosen": -0.13625164330005646, + "logps/rejected": -4.667929649353027, + "loss": 0.2728, + "odds_ratio_loss": 0.05977560207247734, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01362516451627016, + "rewards/margins": 0.45316773653030396, + "rewards/rejected": -0.46679291129112244, + "sft_loss": 0.13625164330005646, + "step": 1842 + }, + { + "epoch": 2.6652205350686913, + "grad_norm": 2.5026158538150822, + "learning_rate": 4.862810185159115e-06, + "logits/chosen": -0.12085071206092834, + "logits/rejected": -0.1948763132095337, + "logps/chosen": -0.3157857656478882, + "logps/rejected": -4.3117899894714355, + "loss": 0.312, + "odds_ratio_loss": 0.12068923562765121, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03157857805490494, + "rewards/margins": 0.39960044622421265, + "rewards/rejected": -0.431179016828537, + "sft_loss": 0.3157857656478882, + "step": 1843 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 2.42599710631053, + "learning_rate": 4.859777165313254e-06, + "logits/chosen": -0.06984113156795502, + "logits/rejected": -0.11556413769721985, + "logps/chosen": -0.26944583654403687, + "logps/rejected": -5.107959747314453, + "loss": 0.3727, + "odds_ratio_loss": 0.06308241933584213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026944583281874657, + "rewards/margins": 0.4838513433933258, + "rewards/rejected": -0.5107959508895874, + "sft_loss": 0.26944583654403687, + "step": 1844 + }, + { + "epoch": 2.668112798264642, + "grad_norm": 2.362337955779663, + "learning_rate": 4.856743627104538e-06, + "logits/chosen": -0.13892969489097595, + "logits/rejected": -0.2013864815235138, + "logps/chosen": -0.4483214020729065, + "logps/rejected": -4.599874496459961, + "loss": 0.3419, + "odds_ratio_loss": 0.1677556186914444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04483213648200035, + "rewards/margins": 0.4151553511619568, + "rewards/rejected": -0.45998746156692505, + "sft_loss": 0.4483214020729065, + "step": 1845 + }, + { + "epoch": 2.6695589298626174, + "grad_norm": 2.174536905457461, + "learning_rate": 4.8537095723618984e-06, + "logits/chosen": -0.3669096529483795, + "logits/rejected": -0.22529584169387817, + "logps/chosen": -0.33727243542671204, + "logps/rejected": -4.307671070098877, + "loss": 0.3986, + "odds_ratio_loss": 0.07581018656492233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033727243542671204, + "rewards/margins": 0.39703983068466187, + "rewards/rejected": -0.43076711893081665, + "sft_loss": 0.33727243542671204, + "step": 1846 + }, + { + "epoch": 2.671005061460593, + "grad_norm": 2.2127792507009985, + "learning_rate": 4.850675002914579e-06, + "logits/chosen": -0.23622475564479828, + "logits/rejected": -0.3096325397491455, + "logps/chosen": -0.377718448638916, + "logps/rejected": -2.8641698360443115, + "loss": 0.372, + "odds_ratio_loss": 0.1540893316268921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03777184337377548, + "rewards/margins": 0.24864515662193298, + "rewards/rejected": -0.28641700744628906, + "sft_loss": 0.377718448638916, + "step": 1847 + }, + { + "epoch": 2.6724511930585684, + "grad_norm": 2.641085873459316, + "learning_rate": 4.847639920592131e-06, + "logits/chosen": -0.4159122705459595, + "logits/rejected": -0.47509440779685974, + "logps/chosen": -0.4206668734550476, + "logps/rejected": -3.3233633041381836, + "loss": 0.4468, + "odds_ratio_loss": 0.1505662351846695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04206668958067894, + "rewards/margins": 0.290269672870636, + "rewards/rejected": -0.3323363661766052, + "sft_loss": 0.4206668734550476, + "step": 1848 + }, + { + "epoch": 2.6738973246565436, + "grad_norm": 2.5318611255942773, + "learning_rate": 4.8446043272244174e-06, + "logits/chosen": -0.2710360586643219, + "logits/rejected": -0.3191642165184021, + "logps/chosen": -0.43725037574768066, + "logps/rejected": -3.675550699234009, + "loss": 0.3333, + "odds_ratio_loss": 0.16352544724941254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04372503608465195, + "rewards/margins": 0.3238300681114197, + "rewards/rejected": -0.36755508184432983, + "sft_loss": 0.43725037574768066, + "step": 1849 + }, + { + "epoch": 2.6753434562545193, + "grad_norm": 1.9803532713734116, + "learning_rate": 4.841568224641611e-06, + "logits/chosen": -0.2555657923221588, + "logits/rejected": -0.21312148869037628, + "logps/chosen": -0.3199189603328705, + "logps/rejected": -4.294778823852539, + "loss": 0.353, + "odds_ratio_loss": 0.11003275960683823, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03199189528822899, + "rewards/margins": 0.39748603105545044, + "rewards/rejected": -0.4294779300689697, + "sft_loss": 0.3199189603328705, + "step": 1850 + }, + { + "epoch": 2.6767895878524945, + "grad_norm": 2.3395839403914476, + "learning_rate": 4.838531614674187e-06, + "logits/chosen": -0.12954504787921906, + "logits/rejected": -0.13004763424396515, + "logps/chosen": -0.3799174427986145, + "logps/rejected": -2.644596576690674, + "loss": 0.3671, + "odds_ratio_loss": 0.11590912193059921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03799174726009369, + "rewards/margins": 0.22646790742874146, + "rewards/rejected": -0.26445966958999634, + "sft_loss": 0.3799174427986145, + "step": 1851 + }, + { + "epoch": 2.67823571945047, + "grad_norm": 2.2326525435791713, + "learning_rate": 4.835494499152929e-06, + "logits/chosen": -0.1807328760623932, + "logits/rejected": -0.1978977471590042, + "logps/chosen": -0.5129043459892273, + "logps/rejected": -4.094215393066406, + "loss": 0.4231, + "odds_ratio_loss": 0.16595354676246643, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05129043757915497, + "rewards/margins": 0.3581311106681824, + "rewards/rejected": -0.40942153334617615, + "sft_loss": 0.5129043459892273, + "step": 1852 + }, + { + "epoch": 2.6796818510484455, + "grad_norm": 2.5480902565729946, + "learning_rate": 4.832456879908925e-06, + "logits/chosen": -0.28215038776397705, + "logits/rejected": -0.2248668372631073, + "logps/chosen": -0.4055119752883911, + "logps/rejected": -3.6705069541931152, + "loss": 0.4347, + "odds_ratio_loss": 0.10765072703361511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04055120050907135, + "rewards/margins": 0.3264995217323303, + "rewards/rejected": -0.36705073714256287, + "sft_loss": 0.4055119752883911, + "step": 1853 + }, + { + "epoch": 2.6811279826464207, + "grad_norm": 2.0106016982008064, + "learning_rate": 4.829418758773569e-06, + "logits/chosen": -0.36731940507888794, + "logits/rejected": -0.3151874244213104, + "logps/chosen": -0.276762455701828, + "logps/rejected": -5.286036968231201, + "loss": 0.31, + "odds_ratio_loss": 0.052199844270944595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02767624519765377, + "rewards/margins": 0.5009274482727051, + "rewards/rejected": -0.528603732585907, + "sft_loss": 0.276762455701828, + "step": 1854 + }, + { + "epoch": 2.6825741142443964, + "grad_norm": 2.547022613146218, + "learning_rate": 4.826380137578554e-06, + "logits/chosen": -0.20882338285446167, + "logits/rejected": -0.2723812162876129, + "logps/chosen": -0.3922734260559082, + "logps/rejected": -3.59426212310791, + "loss": 0.3881, + "odds_ratio_loss": 0.13410308957099915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03922734037041664, + "rewards/margins": 0.3201989233493805, + "rewards/rejected": -0.35942625999450684, + "sft_loss": 0.3922734260559082, + "step": 1855 + }, + { + "epoch": 2.6840202458423716, + "grad_norm": 2.798533244170884, + "learning_rate": 4.823341018155876e-06, + "logits/chosen": -0.2806504964828491, + "logits/rejected": -0.3115236163139343, + "logps/chosen": -0.3371948301792145, + "logps/rejected": -1.5915367603302002, + "loss": 0.3913, + "odds_ratio_loss": 0.17964649200439453, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03371948376297951, + "rewards/margins": 0.12543419003486633, + "rewards/rejected": -0.15915367007255554, + "sft_loss": 0.3371948301792145, + "step": 1856 + }, + { + "epoch": 2.6854663774403473, + "grad_norm": 2.0924046826364595, + "learning_rate": 4.8203014023378315e-06, + "logits/chosen": -0.34187567234039307, + "logits/rejected": -0.31444215774536133, + "logps/chosen": -0.4763234853744507, + "logps/rejected": -3.6403157711029053, + "loss": 0.4091, + "odds_ratio_loss": 0.17209962010383606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04763234779238701, + "rewards/margins": 0.3163992464542389, + "rewards/rejected": -0.3640316128730774, + "sft_loss": 0.4763234853744507, + "step": 1857 + }, + { + "epoch": 2.6869125090383226, + "grad_norm": 2.692447994435838, + "learning_rate": 4.8172612919570175e-06, + "logits/chosen": -0.2727445363998413, + "logits/rejected": -0.3767099678516388, + "logps/chosen": -0.5349310636520386, + "logps/rejected": -4.6468892097473145, + "loss": 0.4726, + "odds_ratio_loss": 0.12882837653160095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05349310487508774, + "rewards/margins": 0.41119587421417236, + "rewards/rejected": -0.4646889567375183, + "sft_loss": 0.5349310636520386, + "step": 1858 + }, + { + "epoch": 2.688358640636298, + "grad_norm": 2.336635461911087, + "learning_rate": 4.814220688846326e-06, + "logits/chosen": -0.30912578105926514, + "logits/rejected": -0.11871747672557831, + "logps/chosen": -0.2691580653190613, + "logps/rejected": -2.848756790161133, + "loss": 0.3284, + "odds_ratio_loss": 0.08548923581838608, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026915807276964188, + "rewards/margins": 0.25795987248420715, + "rewards/rejected": -0.28487569093704224, + "sft_loss": 0.2691580653190613, + "step": 1859 + }, + { + "epoch": 2.6898047722342735, + "grad_norm": 2.6703621825691024, + "learning_rate": 4.811179594838949e-06, + "logits/chosen": -0.2815786898136139, + "logits/rejected": -0.1816381812095642, + "logps/chosen": -0.45205157995224, + "logps/rejected": -3.6539969444274902, + "loss": 0.4271, + "odds_ratio_loss": 0.11903213709592819, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04520515725016594, + "rewards/margins": 0.3201946020126343, + "rewards/rejected": -0.3653997480869293, + "sft_loss": 0.45205157995224, + "step": 1860 + }, + { + "epoch": 2.6912509038322487, + "grad_norm": 2.3383290717041545, + "learning_rate": 4.808138011768372e-06, + "logits/chosen": -0.35975927114486694, + "logits/rejected": -0.263312965631485, + "logps/chosen": -0.37474626302719116, + "logps/rejected": -3.156010150909424, + "loss": 0.3931, + "odds_ratio_loss": 0.09446077048778534, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.037474624812603, + "rewards/margins": 0.27812641859054565, + "rewards/rejected": -0.31560102105140686, + "sft_loss": 0.37474626302719116, + "step": 1861 + }, + { + "epoch": 2.692697035430224, + "grad_norm": 2.49353100276086, + "learning_rate": 4.805095941468379e-06, + "logits/chosen": -0.23262527585029602, + "logits/rejected": -0.14135374128818512, + "logps/chosen": -0.19227707386016846, + "logps/rejected": -6.0062255859375, + "loss": 0.3543, + "odds_ratio_loss": 0.022998683154582977, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019227707758545876, + "rewards/margins": 0.5813947916030884, + "rewards/rejected": -0.6006225347518921, + "sft_loss": 0.19227707386016846, + "step": 1862 + }, + { + "epoch": 2.6941431670281997, + "grad_norm": 2.254707565413196, + "learning_rate": 4.8020533857730446e-06, + "logits/chosen": -0.3178359866142273, + "logits/rejected": -0.42701420187950134, + "logps/chosen": -0.33618977665901184, + "logps/rejected": -5.497029781341553, + "loss": 0.3486, + "odds_ratio_loss": 0.11398158967494965, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0336189791560173, + "rewards/margins": 0.5160840153694153, + "rewards/rejected": -0.5497030019760132, + "sft_loss": 0.33618977665901184, + "step": 1863 + }, + { + "epoch": 2.695589298626175, + "grad_norm": 2.2948507334152732, + "learning_rate": 4.799010346516736e-06, + "logits/chosen": -0.199011892080307, + "logits/rejected": -0.21153825521469116, + "logps/chosen": -0.21537664532661438, + "logps/rejected": -3.909799098968506, + "loss": 0.2931, + "odds_ratio_loss": 0.09779515117406845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02153766341507435, + "rewards/margins": 0.36944228410720825, + "rewards/rejected": -0.39097991585731506, + "sft_loss": 0.21537664532661438, + "step": 1864 + }, + { + "epoch": 2.69703543022415, + "grad_norm": 2.355929773573819, + "learning_rate": 4.795966825534113e-06, + "logits/chosen": -0.2665369212627411, + "logits/rejected": -0.1927603930234909, + "logps/chosen": -0.2670694887638092, + "logps/rejected": -3.359889030456543, + "loss": 0.3798, + "odds_ratio_loss": 0.09360802173614502, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02670694887638092, + "rewards/margins": 0.30928194522857666, + "rewards/rejected": -0.3359888792037964, + "sft_loss": 0.2670694887638092, + "step": 1865 + }, + { + "epoch": 2.698481561822126, + "grad_norm": 2.361748512537748, + "learning_rate": 4.7929228246601284e-06, + "logits/chosen": -0.14877094328403473, + "logits/rejected": -0.20902572572231293, + "logps/chosen": -0.2963876724243164, + "logps/rejected": -3.6602306365966797, + "loss": 0.3352, + "odds_ratio_loss": 0.11799450218677521, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02963876724243164, + "rewards/margins": 0.33638429641723633, + "rewards/rejected": -0.36602309346199036, + "sft_loss": 0.2963876724243164, + "step": 1866 + }, + { + "epoch": 2.699927693420101, + "grad_norm": 2.690629222337708, + "learning_rate": 4.789878345730018e-06, + "logits/chosen": -0.3562997877597809, + "logits/rejected": -0.26115548610687256, + "logps/chosen": -0.44163084030151367, + "logps/rejected": -2.134089469909668, + "loss": 0.3545, + "odds_ratio_loss": 0.1546330451965332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04416308552026749, + "rewards/margins": 0.1692458689212799, + "rewards/rejected": -0.2134089469909668, + "sft_loss": 0.44163084030151367, + "step": 1867 + }, + { + "epoch": 2.7013738250180768, + "grad_norm": 3.981075125457187, + "learning_rate": 4.786833390579312e-06, + "logits/chosen": -0.20393416285514832, + "logits/rejected": -0.25781792402267456, + "logps/chosen": -0.30942434072494507, + "logps/rejected": -5.1820878982543945, + "loss": 0.3604, + "odds_ratio_loss": 0.10879567265510559, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.030942432582378387, + "rewards/margins": 0.4872663617134094, + "rewards/rejected": -0.5182087421417236, + "sft_loss": 0.30942434072494507, + "step": 1868 + }, + { + "epoch": 2.702819956616052, + "grad_norm": 6.019392366774251, + "learning_rate": 4.783787961043824e-06, + "logits/chosen": -0.21762330830097198, + "logits/rejected": -0.18777711689472198, + "logps/chosen": -0.4841902256011963, + "logps/rejected": -2.6938180923461914, + "loss": 0.3912, + "odds_ratio_loss": 0.23626786470413208, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04841902479529381, + "rewards/margins": 0.220962792634964, + "rewards/rejected": -0.2693817913532257, + "sft_loss": 0.4841902256011963, + "step": 1869 + }, + { + "epoch": 2.7042660882140277, + "grad_norm": 2.30269671228443, + "learning_rate": 4.780742058959657e-06, + "logits/chosen": -0.10956840962171555, + "logits/rejected": -0.15537512302398682, + "logps/chosen": -0.301375150680542, + "logps/rejected": -3.672297239303589, + "loss": 0.3748, + "odds_ratio_loss": 0.0992063656449318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03013751283288002, + "rewards/margins": 0.33709222078323364, + "rewards/rejected": -0.367229700088501, + "sft_loss": 0.301375150680542, + "step": 1870 + }, + { + "epoch": 2.705712219812003, + "grad_norm": 2.932343582129926, + "learning_rate": 4.777695686163193e-06, + "logits/chosen": -0.17159150540828705, + "logits/rejected": -0.2002825140953064, + "logps/chosen": -0.37887436151504517, + "logps/rejected": -1.8474010229110718, + "loss": 0.4221, + "odds_ratio_loss": 0.13345777988433838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.037887439131736755, + "rewards/margins": 0.14685267210006714, + "rewards/rejected": -0.1847401261329651, + "sft_loss": 0.37887436151504517, + "step": 1871 + }, + { + "epoch": 2.707158351409978, + "grad_norm": 2.7981338785453183, + "learning_rate": 4.774648844491103e-06, + "logits/chosen": -0.15738992393016815, + "logits/rejected": -0.13649892807006836, + "logps/chosen": -0.3411653935909271, + "logps/rejected": -3.9342665672302246, + "loss": 0.3566, + "odds_ratio_loss": 0.09266319870948792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03411654382944107, + "rewards/margins": 0.359310120344162, + "rewards/rejected": -0.39342665672302246, + "sft_loss": 0.3411653935909271, + "step": 1872 + }, + { + "epoch": 2.708604483007954, + "grad_norm": 2.162648757347475, + "learning_rate": 4.77160153578034e-06, + "logits/chosen": -0.11060208082199097, + "logits/rejected": -0.16701571643352509, + "logps/chosen": -0.3412019610404968, + "logps/rejected": -2.1302669048309326, + "loss": 0.3996, + "odds_ratio_loss": 0.18422572314739227, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03412019461393356, + "rewards/margins": 0.17890648543834686, + "rewards/rejected": -0.21302670240402222, + "sft_loss": 0.3412019610404968, + "step": 1873 + }, + { + "epoch": 2.710050614605929, + "grad_norm": 4.0674750995594, + "learning_rate": 4.7685537618681375e-06, + "logits/chosen": -0.19251331686973572, + "logits/rejected": -0.21041332185268402, + "logps/chosen": -0.431249737739563, + "logps/rejected": -1.8905670642852783, + "loss": 0.3992, + "odds_ratio_loss": 0.1726667582988739, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0431249774992466, + "rewards/margins": 0.14593173563480377, + "rewards/rejected": -0.18905670940876007, + "sft_loss": 0.431249737739563, + "step": 1874 + }, + { + "epoch": 2.7114967462039044, + "grad_norm": 2.152647508251286, + "learning_rate": 4.765505524592009e-06, + "logits/chosen": -0.10581720620393753, + "logits/rejected": -0.0880403071641922, + "logps/chosen": -0.3255786895751953, + "logps/rejected": -2.7813684940338135, + "loss": 0.3069, + "odds_ratio_loss": 0.11545327305793762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03255787119269371, + "rewards/margins": 0.24557897448539734, + "rewards/rejected": -0.27813684940338135, + "sft_loss": 0.3255786895751953, + "step": 1875 + }, + { + "epoch": 2.71294287780188, + "grad_norm": 2.099315725130296, + "learning_rate": 4.762456825789747e-06, + "logits/chosen": -0.1821872740983963, + "logits/rejected": -0.2587732672691345, + "logps/chosen": -0.5633606910705566, + "logps/rejected": -2.573119640350342, + "loss": 0.4576, + "odds_ratio_loss": 0.26334935426712036, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.056336067616939545, + "rewards/margins": 0.2009759247303009, + "rewards/rejected": -0.25731199979782104, + "sft_loss": 0.5633606910705566, + "step": 1876 + }, + { + "epoch": 2.7143890093998553, + "grad_norm": 2.197321214866364, + "learning_rate": 4.759407667299429e-06, + "logits/chosen": -0.22565627098083496, + "logits/rejected": -0.2331491857767105, + "logps/chosen": -0.46682557463645935, + "logps/rejected": -3.439094066619873, + "loss": 0.4119, + "odds_ratio_loss": 0.24814468622207642, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.046682558953762054, + "rewards/margins": 0.29722684621810913, + "rewards/rejected": -0.3439093828201294, + "sft_loss": 0.46682557463645935, + "step": 1877 + }, + { + "epoch": 2.715835140997831, + "grad_norm": 2.479655479780689, + "learning_rate": 4.756358050959398e-06, + "logits/chosen": -0.4306910037994385, + "logits/rejected": -0.4077626168727875, + "logps/chosen": -0.3527236878871918, + "logps/rejected": -2.965698003768921, + "loss": 0.4205, + "odds_ratio_loss": 0.14371559023857117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.035272371023893356, + "rewards/margins": 0.26129746437072754, + "rewards/rejected": -0.29656982421875, + "sft_loss": 0.3527236878871918, + "step": 1878 + }, + { + "epoch": 2.717281272595806, + "grad_norm": 4.301904026116894, + "learning_rate": 4.75330797860828e-06, + "logits/chosen": -0.1542791873216629, + "logits/rejected": -0.12551337480545044, + "logps/chosen": -0.3861375153064728, + "logps/rejected": -4.375819683074951, + "loss": 0.3695, + "odds_ratio_loss": 0.17965860664844513, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03861375153064728, + "rewards/margins": 0.3989682197570801, + "rewards/rejected": -0.43758198618888855, + "sft_loss": 0.3861375153064728, + "step": 1879 + }, + { + "epoch": 2.718727404193782, + "grad_norm": 2.494798707133846, + "learning_rate": 4.750257452084979e-06, + "logits/chosen": -0.2010928988456726, + "logits/rejected": -0.2024509459733963, + "logps/chosen": -0.3827640414237976, + "logps/rejected": -3.6829090118408203, + "loss": 0.305, + "odds_ratio_loss": 0.15489722788333893, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03827640041708946, + "rewards/margins": 0.33001452684402466, + "rewards/rejected": -0.36829090118408203, + "sft_loss": 0.3827640414237976, + "step": 1880 + }, + { + "epoch": 2.720173535791757, + "grad_norm": 2.424215307007899, + "learning_rate": 4.747206473228664e-06, + "logits/chosen": -0.259044885635376, + "logits/rejected": -0.16557075083255768, + "logps/chosen": -0.3136051297187805, + "logps/rejected": -3.817262649536133, + "loss": 0.3868, + "odds_ratio_loss": 0.24824589490890503, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03136051446199417, + "rewards/margins": 0.3503657579421997, + "rewards/rejected": -0.3817262649536133, + "sft_loss": 0.3136051297187805, + "step": 1881 + }, + { + "epoch": 2.7216196673897324, + "grad_norm": 2.205167059464972, + "learning_rate": 4.744155043878784e-06, + "logits/chosen": -0.23924411833286285, + "logits/rejected": -0.16571259498596191, + "logps/chosen": -0.35944584012031555, + "logps/rejected": -4.56205940246582, + "loss": 0.3075, + "odds_ratio_loss": 0.15001921355724335, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03594458848237991, + "rewards/margins": 0.4202613830566406, + "rewards/rejected": -0.45620596408843994, + "sft_loss": 0.35944584012031555, + "step": 1882 + }, + { + "epoch": 2.723065798987708, + "grad_norm": 2.3130878554020646, + "learning_rate": 4.741103165875056e-06, + "logits/chosen": -0.23174218833446503, + "logits/rejected": -0.13708661496639252, + "logps/chosen": -0.3431485593318939, + "logps/rejected": -2.549919843673706, + "loss": 0.3049, + "odds_ratio_loss": 0.10180535167455673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03431485593318939, + "rewards/margins": 0.22067713737487793, + "rewards/rejected": -0.25499197840690613, + "sft_loss": 0.3431485593318939, + "step": 1883 + }, + { + "epoch": 2.7245119305856833, + "grad_norm": 2.3552092476829634, + "learning_rate": 4.738050841057469e-06, + "logits/chosen": -0.5356272459030151, + "logits/rejected": -0.5339710116386414, + "logps/chosen": -0.35015958547592163, + "logps/rejected": -2.9828152656555176, + "loss": 0.3505, + "odds_ratio_loss": 0.09774233400821686, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03501596301794052, + "rewards/margins": 0.26326555013656616, + "rewards/rejected": -0.2982814908027649, + "sft_loss": 0.35015958547592163, + "step": 1884 + }, + { + "epoch": 2.7259580621836585, + "grad_norm": 2.4694330118824896, + "learning_rate": 4.734998071266282e-06, + "logits/chosen": -0.3343193233013153, + "logits/rejected": -0.24137598276138306, + "logps/chosen": -0.37988466024398804, + "logps/rejected": -3.0774569511413574, + "loss": 0.368, + "odds_ratio_loss": 0.10103145241737366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03798846900463104, + "rewards/margins": 0.2697572708129883, + "rewards/rejected": -0.3077457547187805, + "sft_loss": 0.37988466024398804, + "step": 1885 + }, + { + "epoch": 2.7274041937816342, + "grad_norm": 2.2085538092010206, + "learning_rate": 4.7319448583420195e-06, + "logits/chosen": -0.3982861042022705, + "logits/rejected": -0.36158859729766846, + "logps/chosen": -0.3293648958206177, + "logps/rejected": -2.769756555557251, + "loss": 0.3269, + "odds_ratio_loss": 0.09307774901390076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03293649107217789, + "rewards/margins": 0.24403917789459229, + "rewards/rejected": -0.2769756615161896, + "sft_loss": 0.3293648958206177, + "step": 1886 + }, + { + "epoch": 2.7288503253796095, + "grad_norm": 2.6220443532888407, + "learning_rate": 4.7288912041254765e-06, + "logits/chosen": -0.35197606682777405, + "logits/rejected": -0.2917653024196625, + "logps/chosen": -0.2370256781578064, + "logps/rejected": -4.931891441345215, + "loss": 0.3206, + "odds_ratio_loss": 0.08049122244119644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02370256744325161, + "rewards/margins": 0.4694865643978119, + "rewards/rejected": -0.49318912625312805, + "sft_loss": 0.2370256781578064, + "step": 1887 + }, + { + "epoch": 2.7302964569775847, + "grad_norm": 2.229589260053656, + "learning_rate": 4.72583711045771e-06, + "logits/chosen": -0.26306650042533875, + "logits/rejected": -0.3761969208717346, + "logps/chosen": -0.5922081470489502, + "logps/rejected": -3.768022298812866, + "loss": 0.4555, + "odds_ratio_loss": 0.2729063630104065, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0592208169400692, + "rewards/margins": 0.3175814151763916, + "rewards/rejected": -0.3768022358417511, + "sft_loss": 0.5922081470489502, + "step": 1888 + }, + { + "epoch": 2.7317425885755604, + "grad_norm": 2.0352388645586124, + "learning_rate": 4.722782579180048e-06, + "logits/chosen": -0.23472218215465546, + "logits/rejected": -0.15055781602859497, + "logps/chosen": -0.23011812567710876, + "logps/rejected": -3.8374698162078857, + "loss": 0.302, + "odds_ratio_loss": 0.06934744119644165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023011812940239906, + "rewards/margins": 0.3607351779937744, + "rewards/rejected": -0.3837469816207886, + "sft_loss": 0.23011812567710876, + "step": 1889 + }, + { + "epoch": 2.7331887201735356, + "grad_norm": 2.1876288211484156, + "learning_rate": 4.719727612134077e-06, + "logits/chosen": -0.24740484356880188, + "logits/rejected": -0.23094268143177032, + "logps/chosen": -0.32862362265586853, + "logps/rejected": -2.0781450271606445, + "loss": 0.3565, + "odds_ratio_loss": 0.11739077419042587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03286236524581909, + "rewards/margins": 0.17495213449001312, + "rewards/rejected": -0.2078145146369934, + "sft_loss": 0.32862362265586853, + "step": 1890 + }, + { + "epoch": 2.7346348517715113, + "grad_norm": 2.539941122203935, + "learning_rate": 4.716672211161648e-06, + "logits/chosen": -0.2699774503707886, + "logits/rejected": -0.3347630202770233, + "logps/chosen": -0.38555604219436646, + "logps/rejected": -3.047736644744873, + "loss": 0.354, + "odds_ratio_loss": 0.15888690948486328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038555607199668884, + "rewards/margins": 0.26621806621551514, + "rewards/rejected": -0.3047736883163452, + "sft_loss": 0.38555604219436646, + "step": 1891 + }, + { + "epoch": 2.7360809833694866, + "grad_norm": 2.97857518962784, + "learning_rate": 4.713616378104874e-06, + "logits/chosen": -0.15838538110256195, + "logits/rejected": -0.18540534377098083, + "logps/chosen": -0.2908265292644501, + "logps/rejected": -2.8895366191864014, + "loss": 0.3958, + "odds_ratio_loss": 0.11644550412893295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029082655906677246, + "rewards/margins": 0.2598710060119629, + "rewards/rejected": -0.28895366191864014, + "sft_loss": 0.2908265292644501, + "step": 1892 + }, + { + "epoch": 2.7375271149674623, + "grad_norm": 3.137520567531617, + "learning_rate": 4.710560114806128e-06, + "logits/chosen": -0.2546279728412628, + "logits/rejected": -0.22949475049972534, + "logps/chosen": -0.5076591372489929, + "logps/rejected": -3.7342796325683594, + "loss": 0.5022, + "odds_ratio_loss": 0.22245776653289795, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05076591670513153, + "rewards/margins": 0.32266199588775635, + "rewards/rejected": -0.3734279274940491, + "sft_loss": 0.5076591372489929, + "step": 1893 + }, + { + "epoch": 2.7389732465654375, + "grad_norm": 2.8291511708947965, + "learning_rate": 4.707503423108042e-06, + "logits/chosen": -0.24433070421218872, + "logits/rejected": -0.24968703091144562, + "logps/chosen": -0.48571792244911194, + "logps/rejected": -1.9403795003890991, + "loss": 0.4089, + "odds_ratio_loss": 0.2183419167995453, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.048571791499853134, + "rewards/margins": 0.1454661637544632, + "rewards/rejected": -0.19403795897960663, + "sft_loss": 0.48571792244911194, + "step": 1894 + }, + { + "epoch": 2.7404193781634127, + "grad_norm": 5.589426058096608, + "learning_rate": 4.7044463048535065e-06, + "logits/chosen": -0.21336159110069275, + "logits/rejected": -0.15851540863513947, + "logps/chosen": -0.3277932405471802, + "logps/rejected": -3.657266139984131, + "loss": 0.3647, + "odds_ratio_loss": 0.0775853842496872, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03277932107448578, + "rewards/margins": 0.33294734358787537, + "rewards/rejected": -0.36572664976119995, + "sft_loss": 0.3277932405471802, + "step": 1895 + }, + { + "epoch": 2.7418655097613884, + "grad_norm": 2.559094190532531, + "learning_rate": 4.70138876188567e-06, + "logits/chosen": -0.1807275265455246, + "logits/rejected": -0.25307416915893555, + "logps/chosen": -0.5286034941673279, + "logps/rejected": -4.811222553253174, + "loss": 0.4474, + "odds_ratio_loss": 0.15058596432209015, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05286034941673279, + "rewards/margins": 0.428261935710907, + "rewards/rejected": -0.4811222553253174, + "sft_loss": 0.5286034941673279, + "step": 1896 + }, + { + "epoch": 2.7433116413593637, + "grad_norm": 2.7848490528190295, + "learning_rate": 4.6983307960479386e-06, + "logits/chosen": -0.10986755788326263, + "logits/rejected": -0.18220268189907074, + "logps/chosen": -0.2580132782459259, + "logps/rejected": -3.7098939418792725, + "loss": 0.3357, + "odds_ratio_loss": 0.11131416261196136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02580132894217968, + "rewards/margins": 0.34518808126449585, + "rewards/rejected": -0.3709894120693207, + "sft_loss": 0.2580132782459259, + "step": 1897 + }, + { + "epoch": 2.744757772957339, + "grad_norm": 2.3933005373715575, + "learning_rate": 4.695272409183969e-06, + "logits/chosen": -0.36661919951438904, + "logits/rejected": -0.2121463268995285, + "logps/chosen": -0.45644479990005493, + "logps/rejected": -4.979647636413574, + "loss": 0.4361, + "odds_ratio_loss": 0.1398359090089798, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04564448446035385, + "rewards/margins": 0.45232027769088745, + "rewards/rejected": -0.4979647099971771, + "sft_loss": 0.45644479990005493, + "step": 1898 + }, + { + "epoch": 2.7462039045553146, + "grad_norm": 2.6501699786450947, + "learning_rate": 4.692213603137673e-06, + "logits/chosen": -0.15103618800640106, + "logits/rejected": -0.2331678569316864, + "logps/chosen": -0.3875581622123718, + "logps/rejected": -2.742100238800049, + "loss": 0.369, + "odds_ratio_loss": 0.20608189702033997, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03875581920146942, + "rewards/margins": 0.23545420169830322, + "rewards/rejected": -0.27421000599861145, + "sft_loss": 0.3875581622123718, + "step": 1899 + }, + { + "epoch": 2.74765003615329, + "grad_norm": 2.2818384142651658, + "learning_rate": 4.689154379753219e-06, + "logits/chosen": -0.2545371651649475, + "logits/rejected": -0.3067525327205658, + "logps/chosen": -0.3318255841732025, + "logps/rejected": -4.289620876312256, + "loss": 0.395, + "odds_ratio_loss": 0.10435459017753601, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03318255394697189, + "rewards/margins": 0.395779550075531, + "rewards/rejected": -0.4289621114730835, + "sft_loss": 0.3318255841732025, + "step": 1900 + }, + { + "epoch": 2.7490961677512655, + "grad_norm": 2.5609234326640675, + "learning_rate": 4.686094740875022e-06, + "logits/chosen": -0.2666279673576355, + "logits/rejected": -0.21327508985996246, + "logps/chosen": -0.2115049809217453, + "logps/rejected": -3.417975664138794, + "loss": 0.3291, + "odds_ratio_loss": 0.07255256921052933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02115049958229065, + "rewards/margins": 0.32064709067344666, + "rewards/rejected": -0.3417975902557373, + "sft_loss": 0.2115049809217453, + "step": 1901 + }, + { + "epoch": 2.7505422993492408, + "grad_norm": 2.3829562380839913, + "learning_rate": 4.68303468834775e-06, + "logits/chosen": -0.12061008810997009, + "logits/rejected": -0.12486258894205093, + "logps/chosen": -0.3109768033027649, + "logps/rejected": -3.7834866046905518, + "loss": 0.4206, + "odds_ratio_loss": 0.11438363790512085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03109768033027649, + "rewards/margins": 0.3472509980201721, + "rewards/rejected": -0.3783486485481262, + "sft_loss": 0.3109768033027649, + "step": 1902 + }, + { + "epoch": 2.7519884309472165, + "grad_norm": 2.862427532503618, + "learning_rate": 4.67997422401632e-06, + "logits/chosen": -0.16161176562309265, + "logits/rejected": -0.20153605937957764, + "logps/chosen": -0.41737887263298035, + "logps/rejected": -1.8366215229034424, + "loss": 0.3795, + "odds_ratio_loss": 0.2205577939748764, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.041737888008356094, + "rewards/margins": 0.14192426204681396, + "rewards/rejected": -0.18366214632987976, + "sft_loss": 0.41737887263298035, + "step": 1903 + }, + { + "epoch": 2.7534345625451917, + "grad_norm": 3.6690114295501624, + "learning_rate": 4.6769133497259006e-06, + "logits/chosen": -0.20664706826210022, + "logits/rejected": -0.026020802557468414, + "logps/chosen": -0.34037303924560547, + "logps/rejected": -3.9743449687957764, + "loss": 0.4032, + "odds_ratio_loss": 0.08332730084657669, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034037306904792786, + "rewards/margins": 0.3633972108364105, + "rewards/rejected": -0.3974345326423645, + "sft_loss": 0.34037303924560547, + "step": 1904 + }, + { + "epoch": 2.754880694143167, + "grad_norm": 2.294070706599266, + "learning_rate": 4.673852067321899e-06, + "logits/chosen": -0.18250615894794464, + "logits/rejected": -0.2798006534576416, + "logps/chosen": -0.32629454135894775, + "logps/rejected": -2.708524703979492, + "loss": 0.3524, + "odds_ratio_loss": 0.1289883852005005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.032629452645778656, + "rewards/margins": 0.23822301626205444, + "rewards/rejected": -0.2708524763584137, + "sft_loss": 0.32629454135894775, + "step": 1905 + }, + { + "epoch": 2.7563268257411426, + "grad_norm": 2.554522375851076, + "learning_rate": 4.670790378649977e-06, + "logits/chosen": -0.08530348539352417, + "logits/rejected": -0.12577472627162933, + "logps/chosen": -0.31157389283180237, + "logps/rejected": -3.460144281387329, + "loss": 0.3684, + "odds_ratio_loss": 0.1313571035861969, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031157393008470535, + "rewards/margins": 0.31485703587532043, + "rewards/rejected": -0.34601444005966187, + "sft_loss": 0.31157389283180237, + "step": 1906 + }, + { + "epoch": 2.757772957339118, + "grad_norm": 2.6229933469821898, + "learning_rate": 4.6677282855560375e-06, + "logits/chosen": -0.3336745798587799, + "logits/rejected": -0.16491839289665222, + "logps/chosen": -0.2330479621887207, + "logps/rejected": -3.2504565715789795, + "loss": 0.374, + "odds_ratio_loss": 0.054218702018260956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02330479584634304, + "rewards/margins": 0.3017408847808838, + "rewards/rejected": -0.3250456750392914, + "sft_loss": 0.2330479621887207, + "step": 1907 + }, + { + "epoch": 2.759219088937093, + "grad_norm": 3.622563710757675, + "learning_rate": 4.6646657898862284e-06, + "logits/chosen": -0.3674685060977936, + "logits/rejected": -0.3329426050186157, + "logps/chosen": -0.499603271484375, + "logps/rejected": -1.849241852760315, + "loss": 0.4121, + "odds_ratio_loss": 0.2355216145515442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04996032640337944, + "rewards/margins": 0.13496387004852295, + "rewards/rejected": -0.1849241852760315, + "sft_loss": 0.499603271484375, + "step": 1908 + }, + { + "epoch": 2.760665220535069, + "grad_norm": 3.44369490784189, + "learning_rate": 4.661602893486939e-06, + "logits/chosen": -0.24251700937747955, + "logits/rejected": -0.36239850521087646, + "logps/chosen": -0.49035611748695374, + "logps/rejected": -2.837653636932373, + "loss": 0.329, + "odds_ratio_loss": 0.20834286510944366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04903561249375343, + "rewards/margins": 0.23472976684570312, + "rewards/rejected": -0.28376537561416626, + "sft_loss": 0.49035611748695374, + "step": 1909 + }, + { + "epoch": 2.762111352133044, + "grad_norm": 2.5770843438365243, + "learning_rate": 4.6585395982048e-06, + "logits/chosen": -0.2157149314880371, + "logits/rejected": -0.2589437663555145, + "logps/chosen": -0.34972837567329407, + "logps/rejected": -2.4410877227783203, + "loss": 0.4031, + "odds_ratio_loss": 0.1218947246670723, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034972839057445526, + "rewards/margins": 0.20913594961166382, + "rewards/rejected": -0.24410878121852875, + "sft_loss": 0.34972837567329407, + "step": 1910 + }, + { + "epoch": 2.7635574837310193, + "grad_norm": 3.9997670265300695, + "learning_rate": 4.655475905886685e-06, + "logits/chosen": -0.31204700469970703, + "logits/rejected": -0.2673895061016083, + "logps/chosen": -0.3067542314529419, + "logps/rejected": -2.5147337913513184, + "loss": 0.3784, + "odds_ratio_loss": 0.11027967929840088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03067542240023613, + "rewards/margins": 0.22079795598983765, + "rewards/rejected": -0.2514733672142029, + "sft_loss": 0.3067542314529419, + "step": 1911 + }, + { + "epoch": 2.765003615328995, + "grad_norm": 2.6263995198618337, + "learning_rate": 4.652411818379706e-06, + "logits/chosen": -0.2939597964286804, + "logits/rejected": -0.22305241227149963, + "logps/chosen": -0.3409603238105774, + "logps/rejected": -2.78116774559021, + "loss": 0.3668, + "odds_ratio_loss": 0.0981820672750473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03409603238105774, + "rewards/margins": 0.2440207302570343, + "rewards/rejected": -0.27811676263809204, + "sft_loss": 0.3409603238105774, + "step": 1912 + }, + { + "epoch": 2.76644974692697, + "grad_norm": 3.1002078754602933, + "learning_rate": 4.6493473375312106e-06, + "logits/chosen": -0.1616482436656952, + "logits/rejected": -0.18430642783641815, + "logps/chosen": -0.296634316444397, + "logps/rejected": -3.99161434173584, + "loss": 0.3846, + "odds_ratio_loss": 0.1349400132894516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02966342866420746, + "rewards/margins": 0.36949801445007324, + "rewards/rejected": -0.3991614282131195, + "sft_loss": 0.296634316444397, + "step": 1913 + }, + { + "epoch": 2.767895878524946, + "grad_norm": 4.2146173773241244, + "learning_rate": 4.646282465188788e-06, + "logits/chosen": -0.3848811984062195, + "logits/rejected": -0.3408425450325012, + "logps/chosen": -0.36459803581237793, + "logps/rejected": -2.6756057739257812, + "loss": 0.3984, + "odds_ratio_loss": 0.1446124166250229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03645980730652809, + "rewards/margins": 0.23110075294971466, + "rewards/rejected": -0.26756054162979126, + "sft_loss": 0.36459803581237793, + "step": 1914 + }, + { + "epoch": 2.769342010122921, + "grad_norm": 2.4636205252773133, + "learning_rate": 4.643217203200259e-06, + "logits/chosen": -0.17520803213119507, + "logits/rejected": -0.21425886452198029, + "logps/chosen": -0.3626866638660431, + "logps/rejected": -3.3118858337402344, + "loss": 0.4663, + "odds_ratio_loss": 0.11417470127344131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.036268673837184906, + "rewards/margins": 0.2949199378490448, + "rewards/rejected": -0.3311886191368103, + "sft_loss": 0.3626866638660431, + "step": 1915 + }, + { + "epoch": 2.770788141720897, + "grad_norm": 2.1402630431394782, + "learning_rate": 4.640151553413683e-06, + "logits/chosen": -0.2816272974014282, + "logits/rejected": -0.3485042154788971, + "logps/chosen": -0.4563095271587372, + "logps/rejected": -3.226483106613159, + "loss": 0.4248, + "odds_ratio_loss": 0.1943037509918213, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04563095420598984, + "rewards/margins": 0.27701735496520996, + "rewards/rejected": -0.3226483166217804, + "sft_loss": 0.4563095271587372, + "step": 1916 + }, + { + "epoch": 2.772234273318872, + "grad_norm": 2.2310359352159654, + "learning_rate": 4.637085517677351e-06, + "logits/chosen": -0.12221245467662811, + "logits/rejected": -0.11277172714471817, + "logps/chosen": -0.4151383638381958, + "logps/rejected": -2.5533032417297363, + "loss": 0.3756, + "odds_ratio_loss": 0.20337051153182983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0415138378739357, + "rewards/margins": 0.21381649374961853, + "rewards/rejected": -0.25533032417297363, + "sft_loss": 0.4151383638381958, + "step": 1917 + }, + { + "epoch": 2.7736804049168473, + "grad_norm": 8.261017962903177, + "learning_rate": 4.634019097839788e-06, + "logits/chosen": -0.39285701513290405, + "logits/rejected": -0.25874412059783936, + "logps/chosen": -0.5222992300987244, + "logps/rejected": -2.182068347930908, + "loss": 0.4893, + "odds_ratio_loss": 0.1882818341255188, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.052229925990104675, + "rewards/margins": 0.16597692668437958, + "rewards/rejected": -0.21820686757564545, + "sft_loss": 0.5222992300987244, + "step": 1918 + }, + { + "epoch": 2.775126536514823, + "grad_norm": 2.1730397586110053, + "learning_rate": 4.630952295749749e-06, + "logits/chosen": -0.1443309485912323, + "logits/rejected": -0.4016629457473755, + "logps/chosen": -0.384902685880661, + "logps/rejected": -2.4064371585845947, + "loss": 0.4804, + "odds_ratio_loss": 0.18294137716293335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03849026933312416, + "rewards/margins": 0.20215344429016113, + "rewards/rejected": -0.240643709897995, + "sft_loss": 0.384902685880661, + "step": 1919 + }, + { + "epoch": 2.7765726681127982, + "grad_norm": 2.675044128872941, + "learning_rate": 4.627885113256223e-06, + "logits/chosen": -0.06406591832637787, + "logits/rejected": -0.044175997376441956, + "logps/chosen": -0.3621594309806824, + "logps/rejected": -2.851767063140869, + "loss": 0.334, + "odds_ratio_loss": 0.14854490756988525, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.036215946078300476, + "rewards/margins": 0.24896079301834106, + "rewards/rejected": -0.28517672419548035, + "sft_loss": 0.3621594309806824, + "step": 1920 + }, + { + "epoch": 2.7780187997107735, + "grad_norm": 2.4626982788921623, + "learning_rate": 4.624817552208422e-06, + "logits/chosen": -0.1586943119764328, + "logits/rejected": -0.1781804859638214, + "logps/chosen": -0.2940862476825714, + "logps/rejected": -2.0932531356811523, + "loss": 0.3358, + "odds_ratio_loss": 0.12081068754196167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029408622533082962, + "rewards/margins": 0.17991669476032257, + "rewards/rejected": -0.20932531356811523, + "sft_loss": 0.2940862476825714, + "step": 1921 + }, + { + "epoch": 2.779464931308749, + "grad_norm": 3.056963625474084, + "learning_rate": 4.621749614455792e-06, + "logits/chosen": -0.21742503345012665, + "logits/rejected": -0.2462393343448639, + "logps/chosen": -0.4259098172187805, + "logps/rejected": -2.3623046875, + "loss": 0.3804, + "odds_ratio_loss": 0.13163158297538757, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04259098693728447, + "rewards/margins": 0.19363948702812195, + "rewards/rejected": -0.23623046278953552, + "sft_loss": 0.4259098172187805, + "step": 1922 + }, + { + "epoch": 2.7809110629067244, + "grad_norm": 2.591769874435251, + "learning_rate": 4.618681301848004e-06, + "logits/chosen": -0.3165079951286316, + "logits/rejected": -0.11529199779033661, + "logps/chosen": -0.4419441819190979, + "logps/rejected": -2.79586124420166, + "loss": 0.3997, + "odds_ratio_loss": 0.14373686909675598, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04419442266225815, + "rewards/margins": 0.23539170622825623, + "rewards/rejected": -0.2795861065387726, + "sft_loss": 0.4419441819190979, + "step": 1923 + }, + { + "epoch": 2.7823571945047, + "grad_norm": 2.5546096197432835, + "learning_rate": 4.615612616234955e-06, + "logits/chosen": -0.2990122139453888, + "logits/rejected": -0.32261422276496887, + "logps/chosen": -0.32381701469421387, + "logps/rejected": -3.8825607299804688, + "loss": 0.3826, + "odds_ratio_loss": 0.09977390617132187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.032381702214479446, + "rewards/margins": 0.3558743894100189, + "rewards/rejected": -0.3882560729980469, + "sft_loss": 0.32381701469421387, + "step": 1924 + }, + { + "epoch": 2.7838033261026753, + "grad_norm": 2.607250283502564, + "learning_rate": 4.6125435594667664e-06, + "logits/chosen": -0.286873459815979, + "logits/rejected": -0.3211660087108612, + "logps/chosen": -0.37380242347717285, + "logps/rejected": -3.475512981414795, + "loss": 0.3714, + "odds_ratio_loss": 0.1345575600862503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.037380240857601166, + "rewards/margins": 0.31017106771469116, + "rewards/rejected": -0.3475513160228729, + "sft_loss": 0.37380242347717285, + "step": 1925 + }, + { + "epoch": 2.785249457700651, + "grad_norm": 2.3706014238306015, + "learning_rate": 4.609474133393785e-06, + "logits/chosen": -0.3925056457519531, + "logits/rejected": -0.4020850658416748, + "logps/chosen": -0.2961789071559906, + "logps/rejected": -2.771475315093994, + "loss": 0.346, + "odds_ratio_loss": 0.10398849099874496, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02961789071559906, + "rewards/margins": 0.24752962589263916, + "rewards/rejected": -0.2771475315093994, + "sft_loss": 0.2961789071559906, + "step": 1926 + }, + { + "epoch": 2.7866955892986263, + "grad_norm": 3.0033007962008327, + "learning_rate": 4.606404339866578e-06, + "logits/chosen": -0.2951948046684265, + "logits/rejected": -0.23653864860534668, + "logps/chosen": -0.43498289585113525, + "logps/rejected": -3.2631759643554688, + "loss": 0.3759, + "odds_ratio_loss": 0.14501672983169556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.043498288840055466, + "rewards/margins": 0.28281930088996887, + "rewards/rejected": -0.32631760835647583, + "sft_loss": 0.43498289585113525, + "step": 1927 + }, + { + "epoch": 2.7881417208966015, + "grad_norm": 2.454319936149338, + "learning_rate": 4.603334180735937e-06, + "logits/chosen": -0.3240278661251068, + "logits/rejected": -0.2735302448272705, + "logps/chosen": -0.31935903429985046, + "logps/rejected": -3.252980947494507, + "loss": 0.3543, + "odds_ratio_loss": 0.07885086536407471, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031935904175043106, + "rewards/margins": 0.29336220026016235, + "rewards/rejected": -0.32529810070991516, + "sft_loss": 0.31935903429985046, + "step": 1928 + }, + { + "epoch": 2.789587852494577, + "grad_norm": 2.3281215603843237, + "learning_rate": 4.6002636578528694e-06, + "logits/chosen": -0.18546296656131744, + "logits/rejected": -0.17621514201164246, + "logps/chosen": -0.2209104746580124, + "logps/rejected": -4.707164287567139, + "loss": 0.2778, + "odds_ratio_loss": 0.054455384612083435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022091049700975418, + "rewards/margins": 0.448625385761261, + "rewards/rejected": -0.4707164168357849, + "sft_loss": 0.2209104746580124, + "step": 1929 + }, + { + "epoch": 2.7910339840925524, + "grad_norm": 2.251641017951371, + "learning_rate": 4.5971927730686086e-06, + "logits/chosen": -0.29331666231155396, + "logits/rejected": -0.29489755630493164, + "logps/chosen": -0.4534047245979309, + "logps/rejected": -1.960153341293335, + "loss": 0.4003, + "odds_ratio_loss": 0.15419454872608185, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04534047096967697, + "rewards/margins": 0.15067486464977264, + "rewards/rejected": -0.19601532816886902, + "sft_loss": 0.4534047245979309, + "step": 1930 + }, + { + "epoch": 2.7924801156905277, + "grad_norm": 2.304782801749755, + "learning_rate": 4.594121528234601e-06, + "logits/chosen": -0.1861201524734497, + "logits/rejected": -0.22230559587478638, + "logps/chosen": -0.39050740003585815, + "logps/rejected": -4.507702350616455, + "loss": 0.4072, + "odds_ratio_loss": 0.13908474147319794, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039050742983818054, + "rewards/margins": 0.41171950101852417, + "rewards/rejected": -0.45077022910118103, + "sft_loss": 0.39050740003585815, + "step": 1931 + }, + { + "epoch": 2.7939262472885034, + "grad_norm": 2.5018670474835845, + "learning_rate": 4.59104992520251e-06, + "logits/chosen": -0.24248866736888885, + "logits/rejected": -0.23236139118671417, + "logps/chosen": -0.446467787027359, + "logps/rejected": -2.6032891273498535, + "loss": 0.38, + "odds_ratio_loss": 0.20997995138168335, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04464678466320038, + "rewards/margins": 0.21568216383457184, + "rewards/rejected": -0.2603289484977722, + "sft_loss": 0.446467787027359, + "step": 1932 + }, + { + "epoch": 2.7953723788864786, + "grad_norm": 2.272702135116343, + "learning_rate": 4.5879779658242185e-06, + "logits/chosen": -0.22327488660812378, + "logits/rejected": -0.2359078824520111, + "logps/chosen": -0.27088087797164917, + "logps/rejected": -2.6554477214813232, + "loss": 0.3654, + "odds_ratio_loss": 0.08074113726615906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027088087052106857, + "rewards/margins": 0.23845668137073517, + "rewards/rejected": -0.2655448019504547, + "sft_loss": 0.27088087797164917, + "step": 1933 + }, + { + "epoch": 2.796818510484454, + "grad_norm": 2.714031627693277, + "learning_rate": 4.584905651951821e-06, + "logits/chosen": -0.31978341937065125, + "logits/rejected": -0.19837269186973572, + "logps/chosen": -0.3051181733608246, + "logps/rejected": -3.7367353439331055, + "loss": 0.4014, + "odds_ratio_loss": 0.09203245490789413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03051181510090828, + "rewards/margins": 0.3431617021560669, + "rewards/rejected": -0.3736734986305237, + "sft_loss": 0.3051181733608246, + "step": 1934 + }, + { + "epoch": 2.7982646420824295, + "grad_norm": 2.8248128614823362, + "learning_rate": 4.581832985437628e-06, + "logits/chosen": -0.2673256993293762, + "logits/rejected": -0.33064621686935425, + "logps/chosen": -0.3906742334365845, + "logps/rejected": -2.7726762294769287, + "loss": 0.3861, + "odds_ratio_loss": 0.20273485779762268, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03906742483377457, + "rewards/margins": 0.23820018768310547, + "rewards/rejected": -0.2772676348686218, + "sft_loss": 0.3906742334365845, + "step": 1935 + }, + { + "epoch": 2.7997107736804048, + "grad_norm": 2.2782935918289287, + "learning_rate": 4.578759968134162e-06, + "logits/chosen": -0.4593513607978821, + "logits/rejected": -0.26922720670700073, + "logps/chosen": -0.4158691167831421, + "logps/rejected": -2.6285738945007324, + "loss": 0.3801, + "odds_ratio_loss": 0.19207270443439484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04158691689372063, + "rewards/margins": 0.22127047181129456, + "rewards/rejected": -0.2628573775291443, + "sft_loss": 0.4158691167831421, + "step": 1936 + }, + { + "epoch": 2.8011569052783805, + "grad_norm": 7.689527810087393, + "learning_rate": 4.575686601894154e-06, + "logits/chosen": -0.3853088617324829, + "logits/rejected": -0.40306854248046875, + "logps/chosen": -0.38764315843582153, + "logps/rejected": -3.561250925064087, + "loss": 0.4396, + "odds_ratio_loss": 0.12997904419898987, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03876432031393051, + "rewards/margins": 0.3173607885837555, + "rewards/rejected": -0.3561251163482666, + "sft_loss": 0.38764315843582153, + "step": 1937 + }, + { + "epoch": 2.8026030368763557, + "grad_norm": 2.4015205603833865, + "learning_rate": 4.572612888570551e-06, + "logits/chosen": -0.15974129736423492, + "logits/rejected": -0.201629638671875, + "logps/chosen": -0.2965061664581299, + "logps/rejected": -2.6370511054992676, + "loss": 0.3518, + "odds_ratio_loss": 0.14970335364341736, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029650619253516197, + "rewards/margins": 0.23405447602272034, + "rewards/rejected": -0.2637051045894623, + "sft_loss": 0.2965061664581299, + "step": 1938 + }, + { + "epoch": 2.8040491684743314, + "grad_norm": 2.3197713473946773, + "learning_rate": 4.569538830016504e-06, + "logits/chosen": -0.19120481610298157, + "logits/rejected": -0.16116847097873688, + "logps/chosen": -0.3215819001197815, + "logps/rejected": -4.226866245269775, + "loss": 0.3769, + "odds_ratio_loss": 0.09225130826234818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03215819224715233, + "rewards/margins": 0.39052847027778625, + "rewards/rejected": -0.4226866364479065, + "sft_loss": 0.3215819001197815, + "step": 1939 + }, + { + "epoch": 2.8054953000723066, + "grad_norm": 2.4727718660935407, + "learning_rate": 4.566464428085375e-06, + "logits/chosen": -0.1913873255252838, + "logits/rejected": -0.27844709157943726, + "logps/chosen": -0.20122195780277252, + "logps/rejected": -3.5175328254699707, + "loss": 0.3739, + "odds_ratio_loss": 0.08544261008501053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020122196525335312, + "rewards/margins": 0.331631064414978, + "rewards/rejected": -0.351753294467926, + "sft_loss": 0.20122195780277252, + "step": 1940 + }, + { + "epoch": 2.806941431670282, + "grad_norm": 2.946690687261266, + "learning_rate": 4.563389684630733e-06, + "logits/chosen": -0.130567729473114, + "logits/rejected": -0.1708630472421646, + "logps/chosen": -0.3934086561203003, + "logps/rejected": -4.302565097808838, + "loss": 0.3624, + "odds_ratio_loss": 0.14866124093532562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03934086486697197, + "rewards/margins": 0.3909156620502472, + "rewards/rejected": -0.43025651574134827, + "sft_loss": 0.3934086561203003, + "step": 1941 + }, + { + "epoch": 2.8083875632682576, + "grad_norm": 2.189219374296923, + "learning_rate": 4.560314601506352e-06, + "logits/chosen": -0.09481626749038696, + "logits/rejected": -0.11865237355232239, + "logps/chosen": -0.19691374897956848, + "logps/rejected": -3.1772546768188477, + "loss": 0.2849, + "odds_ratio_loss": 0.09188981354236603, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019691377878189087, + "rewards/margins": 0.29803407192230225, + "rewards/rejected": -0.3177254796028137, + "sft_loss": 0.19691374897956848, + "step": 1942 + }, + { + "epoch": 2.809833694866233, + "grad_norm": 2.558939249575432, + "learning_rate": 4.557239180566211e-06, + "logits/chosen": -0.24929408729076385, + "logits/rejected": -0.20789062976837158, + "logps/chosen": -0.39418619871139526, + "logps/rejected": -3.524662494659424, + "loss": 0.3461, + "odds_ratio_loss": 0.20861908793449402, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.039418622851371765, + "rewards/margins": 0.3130476176738739, + "rewards/rejected": -0.3524662256240845, + "sft_loss": 0.39418619871139526, + "step": 1943 + }, + { + "epoch": 2.811279826464208, + "grad_norm": 3.5546240440051413, + "learning_rate": 4.554163423664492e-06, + "logits/chosen": -0.2653443217277527, + "logits/rejected": -0.19230744242668152, + "logps/chosen": -0.3097788393497467, + "logps/rejected": -2.285008192062378, + "loss": 0.3426, + "odds_ratio_loss": 0.15280936658382416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03097788617014885, + "rewards/margins": 0.19752293825149536, + "rewards/rejected": -0.2285008281469345, + "sft_loss": 0.3097788393497467, + "step": 1944 + }, + { + "epoch": 2.8127259580621837, + "grad_norm": 2.464371112246266, + "learning_rate": 4.551087332655581e-06, + "logits/chosen": -0.19536343216896057, + "logits/rejected": -0.21240335702896118, + "logps/chosen": -0.3807547688484192, + "logps/rejected": -4.58089542388916, + "loss": 0.3613, + "odds_ratio_loss": 0.12009456753730774, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03807547688484192, + "rewards/margins": 0.42001405358314514, + "rewards/rejected": -0.4580894708633423, + "sft_loss": 0.3807547688484192, + "step": 1945 + }, + { + "epoch": 2.814172089660159, + "grad_norm": 2.8583032297639694, + "learning_rate": 4.548010909394065e-06, + "logits/chosen": -0.25077760219573975, + "logits/rejected": -0.22733455896377563, + "logps/chosen": -0.2656141519546509, + "logps/rejected": -2.585810422897339, + "loss": 0.3708, + "odds_ratio_loss": 0.07707735151052475, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026561414822936058, + "rewards/margins": 0.23201963305473328, + "rewards/rejected": -0.2585810422897339, + "sft_loss": 0.2656141519546509, + "step": 1946 + }, + { + "epoch": 2.815618221258134, + "grad_norm": 2.1958212700336777, + "learning_rate": 4.5449341557347314e-06, + "logits/chosen": -0.24419622123241425, + "logits/rejected": -0.32629725337028503, + "logps/chosen": -0.38885653018951416, + "logps/rejected": -2.0550642013549805, + "loss": 0.4423, + "odds_ratio_loss": 0.1665182113647461, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.038885653018951416, + "rewards/margins": 0.16662079095840454, + "rewards/rejected": -0.20550641417503357, + "sft_loss": 0.38885653018951416, + "step": 1947 + }, + { + "epoch": 2.81706435285611, + "grad_norm": 2.827337130609505, + "learning_rate": 4.541857073532565e-06, + "logits/chosen": -0.29342615604400635, + "logits/rejected": -0.37808701395988464, + "logps/chosen": -0.4292759597301483, + "logps/rejected": -3.3842148780822754, + "loss": 0.4184, + "odds_ratio_loss": 0.16178587079048157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04292759299278259, + "rewards/margins": 0.2954939007759094, + "rewards/rejected": -0.338421493768692, + "sft_loss": 0.4292759597301483, + "step": 1948 + }, + { + "epoch": 2.8185104844540856, + "grad_norm": 2.9319747599147346, + "learning_rate": 4.538779664642751e-06, + "logits/chosen": -0.1982938051223755, + "logits/rejected": -0.2894214987754822, + "logps/chosen": -0.4327597916126251, + "logps/rejected": -4.025196552276611, + "loss": 0.38, + "odds_ratio_loss": 0.1208379864692688, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04327598214149475, + "rewards/margins": 0.3592436909675598, + "rewards/rejected": -0.40251967310905457, + "sft_loss": 0.4327597916126251, + "step": 1949 + }, + { + "epoch": 2.819956616052061, + "grad_norm": 2.781771227196679, + "learning_rate": 4.535701930920669e-06, + "logits/chosen": -0.250921368598938, + "logits/rejected": -0.21995939314365387, + "logps/chosen": -0.4275512099266052, + "logps/rejected": -2.362046003341675, + "loss": 0.3795, + "odds_ratio_loss": 0.11597280949354172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.042755126953125, + "rewards/margins": 0.1934494823217392, + "rewards/rejected": -0.2362046092748642, + "sft_loss": 0.4275512099266052, + "step": 1950 + }, + { + "epoch": 2.821402747650036, + "grad_norm": 2.3237978364107295, + "learning_rate": 4.532623874221901e-06, + "logits/chosen": -0.15754200518131256, + "logits/rejected": -0.06704466044902802, + "logps/chosen": -0.33436861634254456, + "logps/rejected": -4.438542366027832, + "loss": 0.3531, + "odds_ratio_loss": 0.09081018716096878, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033436864614486694, + "rewards/margins": 0.4104173481464386, + "rewards/rejected": -0.4438541829586029, + "sft_loss": 0.33436861634254456, + "step": 1951 + }, + { + "epoch": 2.8228488792480118, + "grad_norm": 3.35312868677405, + "learning_rate": 4.529545496402214e-06, + "logits/chosen": -0.2972349226474762, + "logits/rejected": -0.29672643542289734, + "logps/chosen": -0.3766123652458191, + "logps/rejected": -4.205931186676025, + "loss": 0.3747, + "odds_ratio_loss": 0.12844733893871307, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03766123950481415, + "rewards/margins": 0.3829319477081299, + "rewards/rejected": -0.42059317231178284, + "sft_loss": 0.3766123652458191, + "step": 1952 + }, + { + "epoch": 2.824295010845987, + "grad_norm": 2.0202609151492563, + "learning_rate": 4.526466799317574e-06, + "logits/chosen": -0.1269502341747284, + "logits/rejected": -0.19187532365322113, + "logps/chosen": -0.5017333030700684, + "logps/rejected": -3.294124126434326, + "loss": 0.4123, + "odds_ratio_loss": 0.19588340818881989, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0501733273267746, + "rewards/margins": 0.27923905849456787, + "rewards/rejected": -0.32941240072250366, + "sft_loss": 0.5017333030700684, + "step": 1953 + }, + { + "epoch": 2.8257411424439622, + "grad_norm": 2.1424438067504608, + "learning_rate": 4.5233877848241405e-06, + "logits/chosen": -0.20550042390823364, + "logits/rejected": -0.217835932970047, + "logps/chosen": -0.4110490679740906, + "logps/rejected": -5.2300262451171875, + "loss": 0.3385, + "odds_ratio_loss": 0.1557879000902176, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04110490530729294, + "rewards/margins": 0.48189777135849, + "rewards/rejected": -0.5230026841163635, + "sft_loss": 0.4110490679740906, + "step": 1954 + }, + { + "epoch": 2.827187274041938, + "grad_norm": 2.2898308985512434, + "learning_rate": 4.5203084547782625e-06, + "logits/chosen": -0.1175132691860199, + "logits/rejected": -0.1555338203907013, + "logps/chosen": -0.3281111717224121, + "logps/rejected": -2.776350736618042, + "loss": 0.3175, + "odds_ratio_loss": 0.1153721809387207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03281112015247345, + "rewards/margins": 0.24482394754886627, + "rewards/rejected": -0.2776350677013397, + "sft_loss": 0.3281111717224121, + "step": 1955 + }, + { + "epoch": 2.828633405639913, + "grad_norm": 2.556433357322931, + "learning_rate": 4.517228811036479e-06, + "logits/chosen": -0.12393073737621307, + "logits/rejected": -0.058607179671525955, + "logps/chosen": -0.3850470781326294, + "logps/rejected": -2.437091827392578, + "loss": 0.3945, + "odds_ratio_loss": 0.11526143550872803, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.038504708558321, + "rewards/margins": 0.20520448684692383, + "rewards/rejected": -0.24370920658111572, + "sft_loss": 0.3850470781326294, + "step": 1956 + }, + { + "epoch": 2.8300795372378884, + "grad_norm": 2.3441089513748934, + "learning_rate": 4.514148855455519e-06, + "logits/chosen": -0.2920979857444763, + "logits/rejected": -0.21535375714302063, + "logps/chosen": -0.42927250266075134, + "logps/rejected": -4.101245403289795, + "loss": 0.4083, + "odds_ratio_loss": 0.0865693911910057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.042927250266075134, + "rewards/margins": 0.36719733476638794, + "rewards/rejected": -0.4101245403289795, + "sft_loss": 0.42927250266075134, + "step": 1957 + }, + { + "epoch": 2.831525668835864, + "grad_norm": 2.1830249246574, + "learning_rate": 4.511068589892299e-06, + "logits/chosen": -0.2623273730278015, + "logits/rejected": -0.28325706720352173, + "logps/chosen": -0.3608659505844116, + "logps/rejected": -4.1569623947143555, + "loss": 0.3382, + "odds_ratio_loss": 0.08394452929496765, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.036086589097976685, + "rewards/margins": 0.3796096444129944, + "rewards/rejected": -0.4156962037086487, + "sft_loss": 0.3608659505844116, + "step": 1958 + }, + { + "epoch": 2.8329718004338393, + "grad_norm": 2.4965701484780682, + "learning_rate": 4.507988016203924e-06, + "logits/chosen": -0.12955503165721893, + "logits/rejected": -0.13261455297470093, + "logps/chosen": -0.45575547218322754, + "logps/rejected": -5.616315841674805, + "loss": 0.4245, + "odds_ratio_loss": 0.15984290838241577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.045575544238090515, + "rewards/margins": 0.5160560011863708, + "rewards/rejected": -0.5616315603256226, + "sft_loss": 0.45575547218322754, + "step": 1959 + }, + { + "epoch": 2.834417932031815, + "grad_norm": 2.609261696479621, + "learning_rate": 4.50490713624768e-06, + "logits/chosen": -0.278617262840271, + "logits/rejected": -0.3227754831314087, + "logps/chosen": -0.44750720262527466, + "logps/rejected": -3.244765043258667, + "loss": 0.354, + "odds_ratio_loss": 0.13986484706401825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.044750723987817764, + "rewards/margins": 0.2797257900238037, + "rewards/rejected": -0.32447654008865356, + "sft_loss": 0.44750720262527466, + "step": 1960 + }, + { + "epoch": 2.8358640636297903, + "grad_norm": 2.4790920191828034, + "learning_rate": 4.501825951881044e-06, + "logits/chosen": -0.2545323967933655, + "logits/rejected": -0.23980608582496643, + "logps/chosen": -0.21073514223098755, + "logps/rejected": -3.4034600257873535, + "loss": 0.3826, + "odds_ratio_loss": 0.13137222826480865, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021073516458272934, + "rewards/margins": 0.319272518157959, + "rewards/rejected": -0.3403460383415222, + "sft_loss": 0.21073514223098755, + "step": 1961 + }, + { + "epoch": 2.837310195227766, + "grad_norm": 2.460263782780314, + "learning_rate": 4.498744464961673e-06, + "logits/chosen": -0.14830397069454193, + "logits/rejected": -0.17653128504753113, + "logps/chosen": -0.37625688314437866, + "logps/rejected": -2.8310248851776123, + "loss": 0.3778, + "odds_ratio_loss": 0.2329844832420349, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.037625689059495926, + "rewards/margins": 0.24547676742076874, + "rewards/rejected": -0.28310248255729675, + "sft_loss": 0.37625688314437866, + "step": 1962 + }, + { + "epoch": 2.838756326825741, + "grad_norm": 2.41693580770391, + "learning_rate": 4.495662677347406e-06, + "logits/chosen": -0.2042870670557022, + "logits/rejected": -0.2227935642004013, + "logps/chosen": -0.30183953046798706, + "logps/rejected": -3.360196590423584, + "loss": 0.3708, + "odds_ratio_loss": 0.11629791557788849, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030183954164385796, + "rewards/margins": 0.30583569407463074, + "rewards/rejected": -0.3360196352005005, + "sft_loss": 0.30183953046798706, + "step": 1963 + }, + { + "epoch": 2.8402024584237164, + "grad_norm": 2.2424319365202483, + "learning_rate": 4.492580590896266e-06, + "logits/chosen": -0.29322850704193115, + "logits/rejected": -0.2505452334880829, + "logps/chosen": -0.3406594395637512, + "logps/rejected": -3.0602898597717285, + "loss": 0.3607, + "odds_ratio_loss": 0.14059670269489288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03406594693660736, + "rewards/margins": 0.27196305990219116, + "rewards/rejected": -0.30602899193763733, + "sft_loss": 0.3406594395637512, + "step": 1964 + }, + { + "epoch": 2.841648590021692, + "grad_norm": 2.034324925578539, + "learning_rate": 4.489498207466452e-06, + "logits/chosen": -0.23154626786708832, + "logits/rejected": -0.22844228148460388, + "logps/chosen": -0.35256677865982056, + "logps/rejected": -3.0019397735595703, + "loss": 0.2994, + "odds_ratio_loss": 0.14165949821472168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.035256676375865936, + "rewards/margins": 0.26493731141090393, + "rewards/rejected": -0.30019402503967285, + "sft_loss": 0.35256677865982056, + "step": 1965 + }, + { + "epoch": 2.8430947216196674, + "grad_norm": 2.297602410152931, + "learning_rate": 4.486415528916345e-06, + "logits/chosen": -0.26251840591430664, + "logits/rejected": -0.21951915323734283, + "logps/chosen": -0.4026089906692505, + "logps/rejected": -2.375399589538574, + "loss": 0.3456, + "odds_ratio_loss": 0.12557853758335114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04026089981198311, + "rewards/margins": 0.19727906584739685, + "rewards/rejected": -0.23753996193408966, + "sft_loss": 0.4026089906692505, + "step": 1966 + }, + { + "epoch": 2.8445408532176426, + "grad_norm": 2.3981821546026594, + "learning_rate": 4.483332557104506e-06, + "logits/chosen": -0.31481003761291504, + "logits/rejected": -0.23861244320869446, + "logps/chosen": -0.45590323209762573, + "logps/rejected": -4.1064300537109375, + "loss": 0.4072, + "odds_ratio_loss": 0.1462942659854889, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04559032618999481, + "rewards/margins": 0.3650527596473694, + "rewards/rejected": -0.4106430411338806, + "sft_loss": 0.45590323209762573, + "step": 1967 + }, + { + "epoch": 2.8459869848156183, + "grad_norm": 2.5671049781697786, + "learning_rate": 4.4802492938896665e-06, + "logits/chosen": -0.28605127334594727, + "logits/rejected": -0.3137693405151367, + "logps/chosen": -0.44223883748054504, + "logps/rejected": -3.326413631439209, + "loss": 0.3795, + "odds_ratio_loss": 0.149356871843338, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.044223885983228683, + "rewards/margins": 0.2884174585342407, + "rewards/rejected": -0.3326413333415985, + "sft_loss": 0.44223883748054504, + "step": 1968 + }, + { + "epoch": 2.8474331164135935, + "grad_norm": 2.2327104314001893, + "learning_rate": 4.477165741130739e-06, + "logits/chosen": -0.25932520627975464, + "logits/rejected": -0.2992691397666931, + "logps/chosen": -0.4744134843349457, + "logps/rejected": -2.5101125240325928, + "loss": 0.4011, + "odds_ratio_loss": 0.1707616001367569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.047441352158784866, + "rewards/margins": 0.20356988906860352, + "rewards/rejected": -0.2510112524032593, + "sft_loss": 0.4744134843349457, + "step": 1969 + }, + { + "epoch": 2.8488792480115688, + "grad_norm": 2.293971064562503, + "learning_rate": 4.474081900686811e-06, + "logits/chosen": -0.11275100708007812, + "logits/rejected": -0.20851099491119385, + "logps/chosen": -0.3515588641166687, + "logps/rejected": -4.357938766479492, + "loss": 0.3836, + "odds_ratio_loss": 0.10340322554111481, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03515588864684105, + "rewards/margins": 0.40063801407814026, + "rewards/rejected": -0.4357939064502716, + "sft_loss": 0.3515588641166687, + "step": 1970 + }, + { + "epoch": 2.8503253796095445, + "grad_norm": 2.3802443419941293, + "learning_rate": 4.470997774417138e-06, + "logits/chosen": -0.23158510029315948, + "logits/rejected": -0.12711213529109955, + "logps/chosen": -0.18399205803871155, + "logps/rejected": -4.86518669128418, + "loss": 0.3133, + "odds_ratio_loss": 0.026787567883729935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018399206921458244, + "rewards/margins": 0.46811944246292114, + "rewards/rejected": -0.4865187108516693, + "sft_loss": 0.18399205803871155, + "step": 1971 + }, + { + "epoch": 2.85177151120752, + "grad_norm": 4.32614696988975, + "learning_rate": 4.467913364181152e-06, + "logits/chosen": -0.1700606644153595, + "logits/rejected": -0.4262159466743469, + "logps/chosen": -0.3667178153991699, + "logps/rejected": -4.321340560913086, + "loss": 0.3362, + "odds_ratio_loss": 0.15199513733386993, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03667178750038147, + "rewards/margins": 0.395462304353714, + "rewards/rejected": -0.43213409185409546, + "sft_loss": 0.3667178153991699, + "step": 1972 + }, + { + "epoch": 2.8532176428054954, + "grad_norm": 2.4945795457234623, + "learning_rate": 4.464828671838456e-06, + "logits/chosen": -0.20741111040115356, + "logits/rejected": -0.28869232535362244, + "logps/chosen": -0.4533330798149109, + "logps/rejected": -3.0039780139923096, + "loss": 0.3851, + "odds_ratio_loss": 0.14457382261753082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04533331096172333, + "rewards/margins": 0.2550644874572754, + "rewards/rejected": -0.3003978133201599, + "sft_loss": 0.4533330798149109, + "step": 1973 + }, + { + "epoch": 2.8546637744034706, + "grad_norm": 2.2392165620325555, + "learning_rate": 4.4617436992488255e-06, + "logits/chosen": -0.21774300932884216, + "logits/rejected": -0.22718608379364014, + "logps/chosen": -0.26567813754081726, + "logps/rejected": -3.7159323692321777, + "loss": 0.3558, + "odds_ratio_loss": 0.11599615216255188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026567813009023666, + "rewards/margins": 0.3450254201889038, + "rewards/rejected": -0.3715932369232178, + "sft_loss": 0.26567813754081726, + "step": 1974 + }, + { + "epoch": 2.8561099060014463, + "grad_norm": 2.468051654089775, + "learning_rate": 4.458658448272196e-06, + "logits/chosen": -0.3251468241214752, + "logits/rejected": -0.34824731945991516, + "logps/chosen": -0.3763624429702759, + "logps/rejected": -2.987534523010254, + "loss": 0.3765, + "odds_ratio_loss": 0.14948882162570953, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03763624280691147, + "rewards/margins": 0.26111719012260437, + "rewards/rejected": -0.29875344038009644, + "sft_loss": 0.3763624429702759, + "step": 1975 + }, + { + "epoch": 2.8575560375994216, + "grad_norm": 2.1974442142697272, + "learning_rate": 4.455572920768681e-06, + "logits/chosen": -0.2270897775888443, + "logits/rejected": -0.25597840547561646, + "logps/chosen": -0.4614500105381012, + "logps/rejected": -3.404634714126587, + "loss": 0.37, + "odds_ratio_loss": 0.19110485911369324, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.046144999563694, + "rewards/margins": 0.29431846737861633, + "rewards/rejected": -0.34046345949172974, + "sft_loss": 0.4614500105381012, + "step": 1976 + }, + { + "epoch": 2.859002169197397, + "grad_norm": 2.0140127976514846, + "learning_rate": 4.452487118598554e-06, + "logits/chosen": -0.08335888385772705, + "logits/rejected": -0.18578308820724487, + "logps/chosen": -0.3250659108161926, + "logps/rejected": -4.700540542602539, + "loss": 0.3012, + "odds_ratio_loss": 0.12990859150886536, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03250659257173538, + "rewards/margins": 0.4375474452972412, + "rewards/rejected": -0.470054030418396, + "sft_loss": 0.3250659108161926, + "step": 1977 + }, + { + "epoch": 2.8604483007953725, + "grad_norm": 2.4445686295120987, + "learning_rate": 4.44940104362226e-06, + "logits/chosen": -0.14989988505840302, + "logits/rejected": -0.26366785168647766, + "logps/chosen": -0.352664053440094, + "logps/rejected": -2.437861680984497, + "loss": 0.3218, + "odds_ratio_loss": 0.1567659229040146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03526640683412552, + "rewards/margins": 0.20851975679397583, + "rewards/rejected": -0.24378615617752075, + "sft_loss": 0.352664053440094, + "step": 1978 + }, + { + "epoch": 2.8618944323933477, + "grad_norm": 2.2321339276585483, + "learning_rate": 4.446314697700402e-06, + "logits/chosen": -0.2847731113433838, + "logits/rejected": -0.1818363070487976, + "logps/chosen": -0.2849189341068268, + "logps/rejected": -3.662630558013916, + "loss": 0.315, + "odds_ratio_loss": 0.10266374051570892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02849189192056656, + "rewards/margins": 0.3377711772918701, + "rewards/rejected": -0.3662630617618561, + "sft_loss": 0.2849189341068268, + "step": 1979 + }, + { + "epoch": 2.863340563991323, + "grad_norm": 3.2130933735507603, + "learning_rate": 4.44322808269375e-06, + "logits/chosen": -0.43049612641334534, + "logits/rejected": -0.31409528851509094, + "logps/chosen": -0.3502507507801056, + "logps/rejected": -4.642218112945557, + "loss": 0.3838, + "odds_ratio_loss": 0.06136965751647949, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03502507507801056, + "rewards/margins": 0.4291967749595642, + "rewards/rejected": -0.46422186493873596, + "sft_loss": 0.3502507507801056, + "step": 1980 + }, + { + "epoch": 2.8647866955892987, + "grad_norm": 2.6696002700885826, + "learning_rate": 4.440141200463237e-06, + "logits/chosen": -0.20311379432678223, + "logits/rejected": -0.19404862821102142, + "logps/chosen": -0.3664669394493103, + "logps/rejected": -4.6123809814453125, + "loss": 0.342, + "odds_ratio_loss": 0.18475694954395294, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03664669767022133, + "rewards/margins": 0.42459142208099365, + "rewards/rejected": -0.4612380862236023, + "sft_loss": 0.3664669394493103, + "step": 1981 + }, + { + "epoch": 2.866232827187274, + "grad_norm": 2.313763366163634, + "learning_rate": 4.437054052869955e-06, + "logits/chosen": -0.14165878295898438, + "logits/rejected": -0.0675276443362236, + "logps/chosen": -0.4247075319290161, + "logps/rejected": -3.4205408096313477, + "loss": 0.4258, + "odds_ratio_loss": 0.1645819991827011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04247075691819191, + "rewards/margins": 0.2995833158493042, + "rewards/rejected": -0.3420540690422058, + "sft_loss": 0.4247075319290161, + "step": 1982 + }, + { + "epoch": 2.8676789587852496, + "grad_norm": 2.4102794434344044, + "learning_rate": 4.433966641775155e-06, + "logits/chosen": -0.037582166492938995, + "logits/rejected": -0.06151975318789482, + "logps/chosen": -0.2677247226238251, + "logps/rejected": -2.7316319942474365, + "loss": 0.3653, + "odds_ratio_loss": 0.1058277040719986, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026772471144795418, + "rewards/margins": 0.24639073014259338, + "rewards/rejected": -0.27316319942474365, + "sft_loss": 0.2677247226238251, + "step": 1983 + }, + { + "epoch": 2.869125090383225, + "grad_norm": 2.0562322116516647, + "learning_rate": 4.430878969040252e-06, + "logits/chosen": -0.16897395253181458, + "logits/rejected": -0.08548015356063843, + "logps/chosen": -0.531076192855835, + "logps/rejected": -2.6939971446990967, + "loss": 0.3929, + "odds_ratio_loss": 0.24118700623512268, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0531076155602932, + "rewards/margins": 0.21629208326339722, + "rewards/rejected": -0.2693997025489807, + "sft_loss": 0.531076192855835, + "step": 1984 + }, + { + "epoch": 2.8705712219812005, + "grad_norm": 2.4932561485656004, + "learning_rate": 4.427791036526813e-06, + "logits/chosen": -0.35910582542419434, + "logits/rejected": -0.25231948494911194, + "logps/chosen": -0.41696059703826904, + "logps/rejected": -3.8195204734802246, + "loss": 0.3503, + "odds_ratio_loss": 0.12001129239797592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.041696060448884964, + "rewards/margins": 0.3402559757232666, + "rewards/rejected": -0.3819520175457001, + "sft_loss": 0.41696059703826904, + "step": 1985 + }, + { + "epoch": 2.8720173535791758, + "grad_norm": 2.328030440907696, + "learning_rate": 4.4247028460965665e-06, + "logits/chosen": -0.11354707181453705, + "logits/rejected": -0.058158066123723984, + "logps/chosen": -0.2665402293205261, + "logps/rejected": -4.639363765716553, + "loss": 0.3374, + "odds_ratio_loss": 0.049688465893268585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026654021814465523, + "rewards/margins": 0.43728238344192505, + "rewards/rejected": -0.46393638849258423, + "sft_loss": 0.2665402293205261, + "step": 1986 + }, + { + "epoch": 2.873463485177151, + "grad_norm": 2.2398543080728786, + "learning_rate": 4.4216143996113905e-06, + "logits/chosen": -0.21972495317459106, + "logits/rejected": -0.18882104754447937, + "logps/chosen": -0.3515039384365082, + "logps/rejected": -4.940927982330322, + "loss": 0.3557, + "odds_ratio_loss": 0.15298905968666077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03515039384365082, + "rewards/margins": 0.4589424729347229, + "rewards/rejected": -0.4940928518772125, + "sft_loss": 0.3515039384365082, + "step": 1987 + }, + { + "epoch": 2.8749096167751267, + "grad_norm": 2.534407245251865, + "learning_rate": 4.418525698933324e-06, + "logits/chosen": -0.23043112456798553, + "logits/rejected": -0.29163533449172974, + "logps/chosen": -0.3577241897583008, + "logps/rejected": -3.6975207328796387, + "loss": 0.3254, + "odds_ratio_loss": 0.16912941634655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0357724204659462, + "rewards/margins": 0.33397969603538513, + "rewards/rejected": -0.36975207924842834, + "sft_loss": 0.3577241897583008, + "step": 1988 + }, + { + "epoch": 2.876355748373102, + "grad_norm": 3.1712296916429317, + "learning_rate": 4.415436745924553e-06, + "logits/chosen": -0.1479547768831253, + "logits/rejected": -0.1481505185365677, + "logps/chosen": -0.40298303961753845, + "logps/rejected": -4.623858451843262, + "loss": 0.3811, + "odds_ratio_loss": 0.11026880890130997, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.040298305451869965, + "rewards/margins": 0.42208752036094666, + "rewards/rejected": -0.4623858332633972, + "sft_loss": 0.40298303961753845, + "step": 1989 + }, + { + "epoch": 2.877801879971077, + "grad_norm": 2.256856567396973, + "learning_rate": 4.412347542447423e-06, + "logits/chosen": -0.2117566615343094, + "logits/rejected": -0.15659171342849731, + "logps/chosen": -0.31408044695854187, + "logps/rejected": -4.856984615325928, + "loss": 0.2812, + "odds_ratio_loss": 0.05605805665254593, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03140804544091225, + "rewards/margins": 0.4542904496192932, + "rewards/rejected": -0.4856984615325928, + "sft_loss": 0.31408044695854187, + "step": 1990 + }, + { + "epoch": 2.879248011569053, + "grad_norm": 2.7358142066590374, + "learning_rate": 4.409258090364424e-06, + "logits/chosen": -0.33230820298194885, + "logits/rejected": -0.2448243498802185, + "logps/chosen": -0.5196665525436401, + "logps/rejected": -3.3180594444274902, + "loss": 0.4021, + "odds_ratio_loss": 0.1978374421596527, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05196665599942207, + "rewards/margins": 0.27983927726745605, + "rewards/rejected": -0.331805944442749, + "sft_loss": 0.5196665525436401, + "step": 1991 + }, + { + "epoch": 2.880694143167028, + "grad_norm": 2.2547489447553715, + "learning_rate": 4.406168391538197e-06, + "logits/chosen": -0.2996472120285034, + "logits/rejected": -0.25789931416511536, + "logps/chosen": -0.28679195046424866, + "logps/rejected": -4.5447258949279785, + "loss": 0.3514, + "odds_ratio_loss": 0.07733619213104248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028679195791482925, + "rewards/margins": 0.4257934093475342, + "rewards/rejected": -0.4544726014137268, + "sft_loss": 0.28679195046424866, + "step": 1992 + }, + { + "epoch": 2.8821402747650033, + "grad_norm": 2.49254805386154, + "learning_rate": 4.403078447831534e-06, + "logits/chosen": -0.2353857308626175, + "logits/rejected": -0.1543744057416916, + "logps/chosen": -0.15707671642303467, + "logps/rejected": -3.2030768394470215, + "loss": 0.2922, + "odds_ratio_loss": 0.05911577120423317, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015707671642303467, + "rewards/margins": 0.3046000301837921, + "rewards/rejected": -0.32030773162841797, + "sft_loss": 0.15707671642303467, + "step": 1993 + }, + { + "epoch": 2.883586406362979, + "grad_norm": 2.2256194334422967, + "learning_rate": 4.399988261107373e-06, + "logits/chosen": -0.13290566205978394, + "logits/rejected": -0.31374818086624146, + "logps/chosen": -0.5070281028747559, + "logps/rejected": -2.8628692626953125, + "loss": 0.3685, + "odds_ratio_loss": 0.29547181725502014, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05070280656218529, + "rewards/margins": 0.23558413982391357, + "rewards/rejected": -0.2862869203090668, + "sft_loss": 0.5070281028747559, + "step": 1994 + }, + { + "epoch": 2.8850325379609547, + "grad_norm": 3.5396995388199994, + "learning_rate": 4.396897833228801e-06, + "logits/chosen": -0.09624442458152771, + "logits/rejected": -0.20986582338809967, + "logps/chosen": -0.3855375349521637, + "logps/rejected": -4.744143486022949, + "loss": 0.3784, + "odds_ratio_loss": 0.0990835428237915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03855375573039055, + "rewards/margins": 0.43586063385009766, + "rewards/rejected": -0.4744143784046173, + "sft_loss": 0.3855375349521637, + "step": 1995 + }, + { + "epoch": 2.88647866955893, + "grad_norm": 2.809101607987937, + "learning_rate": 4.393807166059044e-06, + "logits/chosen": -0.15723095834255219, + "logits/rejected": -0.20489570498466492, + "logps/chosen": -0.5089755058288574, + "logps/rejected": -3.931450843811035, + "loss": 0.407, + "odds_ratio_loss": 0.22489072382450104, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05089755356311798, + "rewards/margins": 0.34224751591682434, + "rewards/rejected": -0.39314502477645874, + "sft_loss": 0.5089755058288574, + "step": 1996 + }, + { + "epoch": 2.887924801156905, + "grad_norm": 2.5416958031833077, + "learning_rate": 4.390716261461484e-06, + "logits/chosen": -0.25893235206604004, + "logits/rejected": -0.2654384970664978, + "logps/chosen": -0.4957669675350189, + "logps/rejected": -3.1314098834991455, + "loss": 0.4241, + "odds_ratio_loss": 0.17412987351417542, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04957669600844383, + "rewards/margins": 0.26356425881385803, + "rewards/rejected": -0.31314098834991455, + "sft_loss": 0.4957669675350189, + "step": 1997 + }, + { + "epoch": 2.889370932754881, + "grad_norm": 2.293405340339323, + "learning_rate": 4.387625121299632e-06, + "logits/chosen": -0.23933672904968262, + "logits/rejected": -0.2176375389099121, + "logps/chosen": -0.3258305788040161, + "logps/rejected": -3.0861976146698, + "loss": 0.3925, + "odds_ratio_loss": 0.14498911798000336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03258305788040161, + "rewards/margins": 0.27603673934936523, + "rewards/rejected": -0.30861979722976685, + "sft_loss": 0.3258305788040161, + "step": 1998 + }, + { + "epoch": 2.890817064352856, + "grad_norm": 2.1941997131433344, + "learning_rate": 4.384533747437151e-06, + "logits/chosen": -0.33271104097366333, + "logits/rejected": -0.28577131032943726, + "logps/chosen": -0.3192511796951294, + "logps/rejected": -3.704035758972168, + "loss": 0.3826, + "odds_ratio_loss": 0.11826330423355103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03192511945962906, + "rewards/margins": 0.3384784460067749, + "rewards/rejected": -0.370403528213501, + "sft_loss": 0.3192511796951294, + "step": 1999 + }, + { + "epoch": 2.8922631959508314, + "grad_norm": 2.451640159463382, + "learning_rate": 4.381442141737842e-06, + "logits/chosen": -0.1317928582429886, + "logits/rejected": -0.19109252095222473, + "logps/chosen": -0.44239649176597595, + "logps/rejected": -2.6792032718658447, + "loss": 0.3352, + "odds_ratio_loss": 0.19351676106452942, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.044239647686481476, + "rewards/margins": 0.22368068993091583, + "rewards/rejected": -0.2679203450679779, + "sft_loss": 0.44239649176597595, + "step": 2000 + }, + { + "epoch": 2.893709327548807, + "grad_norm": 2.9987909450785004, + "learning_rate": 4.378350306065647e-06, + "logits/chosen": -0.11029291152954102, + "logits/rejected": -0.20811080932617188, + "logps/chosen": -0.5102984309196472, + "logps/rejected": -2.461052894592285, + "loss": 0.4728, + "odds_ratio_loss": 0.26014572381973267, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05102984607219696, + "rewards/margins": 0.19507546722888947, + "rewards/rejected": -0.24610531330108643, + "sft_loss": 0.5102984309196472, + "step": 2001 + }, + { + "epoch": 2.8951554591467823, + "grad_norm": 4.577215598616385, + "learning_rate": 4.375258242284641e-06, + "logits/chosen": -0.125568687915802, + "logits/rejected": -0.23617637157440186, + "logps/chosen": -0.1757880002260208, + "logps/rejected": -5.425671100616455, + "loss": 0.3219, + "odds_ratio_loss": 0.075681671500206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01757879927754402, + "rewards/margins": 0.5249882936477661, + "rewards/rejected": -0.5425670742988586, + "sft_loss": 0.1757880002260208, + "step": 2002 + }, + { + "epoch": 2.8966015907447575, + "grad_norm": 2.5535034041144864, + "learning_rate": 4.372165952259043e-06, + "logits/chosen": -0.25127291679382324, + "logits/rejected": -0.20186248421669006, + "logps/chosen": -0.40638288855552673, + "logps/rejected": -1.6983206272125244, + "loss": 0.4103, + "odds_ratio_loss": 0.17356377840042114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04063829034566879, + "rewards/margins": 0.12919378280639648, + "rewards/rejected": -0.16983206570148468, + "sft_loss": 0.40638288855552673, + "step": 2003 + }, + { + "epoch": 2.8980477223427332, + "grad_norm": 2.415128934348208, + "learning_rate": 4.369073437853208e-06, + "logits/chosen": -0.20070333778858185, + "logits/rejected": -0.18373045325279236, + "logps/chosen": -0.3903118968009949, + "logps/rejected": -5.177107810974121, + "loss": 0.4006, + "odds_ratio_loss": 0.12080082297325134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03903118893504143, + "rewards/margins": 0.4786795973777771, + "rewards/rejected": -0.51771080493927, + "sft_loss": 0.3903118968009949, + "step": 2004 + }, + { + "epoch": 2.8994938539407085, + "grad_norm": 2.5665948351613945, + "learning_rate": 4.365980700931622e-06, + "logits/chosen": -0.21441921591758728, + "logits/rejected": -0.16341614723205566, + "logps/chosen": -0.3254525661468506, + "logps/rejected": -4.25070333480835, + "loss": 0.3888, + "odds_ratio_loss": 0.13244378566741943, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03254526108503342, + "rewards/margins": 0.3925250768661499, + "rewards/rejected": -0.4250703454017639, + "sft_loss": 0.3254525661468506, + "step": 2005 + }, + { + "epoch": 2.900939985538684, + "grad_norm": 2.41284437035138, + "learning_rate": 4.3628877433589085e-06, + "logits/chosen": -0.216677725315094, + "logits/rejected": -0.24935810267925262, + "logps/chosen": -0.30690398812294006, + "logps/rejected": -4.660876750946045, + "loss": 0.3046, + "odds_ratio_loss": 0.11162017285823822, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030690398067235947, + "rewards/margins": 0.43539732694625854, + "rewards/rejected": -0.4660876989364624, + "sft_loss": 0.30690398812294006, + "step": 2006 + }, + { + "epoch": 2.9023861171366594, + "grad_norm": 2.335670194118243, + "learning_rate": 4.359794566999822e-06, + "logits/chosen": -0.10944445431232452, + "logits/rejected": -0.1079663336277008, + "logps/chosen": -0.4069558382034302, + "logps/rejected": -3.064016342163086, + "loss": 0.3672, + "odds_ratio_loss": 0.17633295059204102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04069558531045914, + "rewards/margins": 0.26570606231689453, + "rewards/rejected": -0.30640166997909546, + "sft_loss": 0.4069558382034302, + "step": 2007 + }, + { + "epoch": 2.903832248734635, + "grad_norm": 2.4178856154509836, + "learning_rate": 4.356701173719252e-06, + "logits/chosen": -0.1341048777103424, + "logits/rejected": -0.20276595652103424, + "logps/chosen": -0.3767666220664978, + "logps/rejected": -2.81579852104187, + "loss": 0.3617, + "odds_ratio_loss": 0.18513742089271545, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03767666220664978, + "rewards/margins": 0.24390317499637604, + "rewards/rejected": -0.281579852104187, + "sft_loss": 0.3767666220664978, + "step": 2008 + }, + { + "epoch": 2.9052783803326103, + "grad_norm": 2.132182042456953, + "learning_rate": 4.3536075653822155e-06, + "logits/chosen": -0.1696842908859253, + "logits/rejected": -0.2770502269268036, + "logps/chosen": -0.38094091415405273, + "logps/rejected": -4.085338115692139, + "loss": 0.3671, + "odds_ratio_loss": 0.13694609701633453, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038094088435173035, + "rewards/margins": 0.370439738035202, + "rewards/rejected": -0.40853381156921387, + "sft_loss": 0.38094091415405273, + "step": 2009 + }, + { + "epoch": 2.9067245119305856, + "grad_norm": 2.4441830024202638, + "learning_rate": 4.3505137438538605e-06, + "logits/chosen": -0.30766594409942627, + "logits/rejected": -0.17128098011016846, + "logps/chosen": -0.31321054697036743, + "logps/rejected": -3.57883620262146, + "loss": 0.3863, + "odds_ratio_loss": 0.09254439175128937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03132105618715286, + "rewards/margins": 0.3265625834465027, + "rewards/rejected": -0.35788363218307495, + "sft_loss": 0.31321054697036743, + "step": 2010 + }, + { + "epoch": 2.9081706435285612, + "grad_norm": 2.474111533149485, + "learning_rate": 4.347419710999464e-06, + "logits/chosen": -0.198749840259552, + "logits/rejected": -0.24889856576919556, + "logps/chosen": -0.4312863349914551, + "logps/rejected": -3.4733762741088867, + "loss": 0.3641, + "odds_ratio_loss": 0.15984997153282166, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04312863573431969, + "rewards/margins": 0.3042089641094208, + "rewards/rejected": -0.34733760356903076, + "sft_loss": 0.4312863349914551, + "step": 2011 + }, + { + "epoch": 2.9096167751265365, + "grad_norm": 2.2140379184527874, + "learning_rate": 4.34432546868443e-06, + "logits/chosen": -0.0032480377703905106, + "logits/rejected": -0.05943544581532478, + "logps/chosen": -0.28691381216049194, + "logps/rejected": -2.6978836059570312, + "loss": 0.3977, + "odds_ratio_loss": 0.11956118047237396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028691379353404045, + "rewards/margins": 0.24109697341918945, + "rewards/rejected": -0.26978832483291626, + "sft_loss": 0.28691381216049194, + "step": 2012 + }, + { + "epoch": 2.9110629067245117, + "grad_norm": 2.1933346188671705, + "learning_rate": 4.3412310187742895e-06, + "logits/chosen": -0.21547159552574158, + "logits/rejected": -0.20167696475982666, + "logps/chosen": -0.3793547451496124, + "logps/rejected": -3.8487207889556885, + "loss": 0.3633, + "odds_ratio_loss": 0.11790119856595993, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03793547675013542, + "rewards/margins": 0.3469366133213043, + "rewards/rejected": -0.38487207889556885, + "sft_loss": 0.3793547451496124, + "step": 2013 + }, + { + "epoch": 2.9125090383224874, + "grad_norm": 2.2000607686897884, + "learning_rate": 4.338136363134696e-06, + "logits/chosen": -0.16653400659561157, + "logits/rejected": -0.22715330123901367, + "logps/chosen": -0.3354107737541199, + "logps/rejected": -4.489915370941162, + "loss": 0.3521, + "odds_ratio_loss": 0.09342381358146667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03354107588529587, + "rewards/margins": 0.41545045375823975, + "rewards/rejected": -0.4489915370941162, + "sft_loss": 0.3354107737541199, + "step": 2014 + }, + { + "epoch": 2.9139551699204627, + "grad_norm": 2.345249585264906, + "learning_rate": 4.3350415036314295e-06, + "logits/chosen": -0.1863386034965515, + "logits/rejected": -0.17719031870365143, + "logps/chosen": -0.35153529047966003, + "logps/rejected": -3.003571033477783, + "loss": 0.2938, + "odds_ratio_loss": 0.20526739954948425, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03515353053808212, + "rewards/margins": 0.265203595161438, + "rewards/rejected": -0.3003571033477783, + "sft_loss": 0.35153529047966003, + "step": 2015 + }, + { + "epoch": 2.915401301518438, + "grad_norm": 2.664410560581058, + "learning_rate": 4.331946442130393e-06, + "logits/chosen": -0.32930123805999756, + "logits/rejected": -0.31893402338027954, + "logps/chosen": -0.4697510004043579, + "logps/rejected": -2.2911057472229004, + "loss": 0.4361, + "odds_ratio_loss": 0.14694637060165405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04697509855031967, + "rewards/margins": 0.18213549256324768, + "rewards/rejected": -0.22911058366298676, + "sft_loss": 0.4697510004043579, + "step": 2016 + }, + { + "epoch": 2.9168474331164136, + "grad_norm": 2.0455275341242096, + "learning_rate": 4.32885118049761e-06, + "logits/chosen": -0.24276413023471832, + "logits/rejected": -0.3042674660682678, + "logps/chosen": -0.31963613629341125, + "logps/rejected": -4.9066643714904785, + "loss": 0.3045, + "odds_ratio_loss": 0.1002592146396637, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03196360915899277, + "rewards/margins": 0.45870283246040344, + "rewards/rejected": -0.4906664490699768, + "sft_loss": 0.31963613629341125, + "step": 2017 + }, + { + "epoch": 2.9182935647143893, + "grad_norm": 2.4816803752814014, + "learning_rate": 4.325755720599226e-06, + "logits/chosen": -0.17681774497032166, + "logits/rejected": -0.13152150809764862, + "logps/chosen": -0.39280036091804504, + "logps/rejected": -2.7280354499816895, + "loss": 0.3974, + "odds_ratio_loss": 0.11691177636384964, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03928003087639809, + "rewards/margins": 0.23352351784706116, + "rewards/rejected": -0.27280354499816895, + "sft_loss": 0.39280036091804504, + "step": 2018 + }, + { + "epoch": 2.9197396963123645, + "grad_norm": 2.4828814390035374, + "learning_rate": 4.322660064301504e-06, + "logits/chosen": -0.20829474925994873, + "logits/rejected": -0.2244415581226349, + "logps/chosen": -0.38568395376205444, + "logps/rejected": -3.2982382774353027, + "loss": 0.3738, + "odds_ratio_loss": 0.11840784549713135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0385683998465538, + "rewards/margins": 0.2912554442882538, + "rewards/rejected": -0.3298238515853882, + "sft_loss": 0.38568395376205444, + "step": 2019 + }, + { + "epoch": 2.9211858279103398, + "grad_norm": 2.6918576722836085, + "learning_rate": 4.319564213470828e-06, + "logits/chosen": -0.20029829442501068, + "logits/rejected": -0.15000326931476593, + "logps/chosen": -0.5417320132255554, + "logps/rejected": -4.520184516906738, + "loss": 0.3793, + "odds_ratio_loss": 0.19173075258731842, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05417320504784584, + "rewards/margins": 0.3978452682495117, + "rewards/rejected": -0.45201849937438965, + "sft_loss": 0.5417320132255554, + "step": 2020 + }, + { + "epoch": 2.9226319595083154, + "grad_norm": 2.6470788518613233, + "learning_rate": 4.316468169973698e-06, + "logits/chosen": -0.2127470076084137, + "logits/rejected": -0.30946892499923706, + "logps/chosen": -0.3537566363811493, + "logps/rejected": -4.601029396057129, + "loss": 0.4096, + "odds_ratio_loss": 0.06098828837275505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03537566214799881, + "rewards/margins": 0.42472726106643677, + "rewards/rejected": -0.460102915763855, + "sft_loss": 0.3537566363811493, + "step": 2021 + }, + { + "epoch": 2.9240780911062907, + "grad_norm": 2.31032069822132, + "learning_rate": 4.31337193567673e-06, + "logits/chosen": -0.27767282724380493, + "logits/rejected": -0.2333839237689972, + "logps/chosen": -0.4112637937068939, + "logps/rejected": -5.107657432556152, + "loss": 0.3246, + "odds_ratio_loss": 0.13032792508602142, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04112637788057327, + "rewards/margins": 0.4696393609046936, + "rewards/rejected": -0.5107657313346863, + "sft_loss": 0.4112637937068939, + "step": 2022 + }, + { + "epoch": 2.925524222704266, + "grad_norm": 2.155257212541574, + "learning_rate": 4.3102755124466525e-06, + "logits/chosen": -0.2570434510707855, + "logits/rejected": -0.20935188233852386, + "logps/chosen": -0.40464794635772705, + "logps/rejected": -2.422607183456421, + "loss": 0.3659, + "odds_ratio_loss": 0.14167605340480804, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.040464796125888824, + "rewards/margins": 0.20179593563079834, + "rewards/rejected": -0.24226072430610657, + "sft_loss": 0.40464794635772705, + "step": 2023 + }, + { + "epoch": 2.9269703543022416, + "grad_norm": 2.2816190632498343, + "learning_rate": 4.307178902150315e-06, + "logits/chosen": -0.18600818514823914, + "logits/rejected": -0.24983981251716614, + "logps/chosen": -0.3823423385620117, + "logps/rejected": -2.81748104095459, + "loss": 0.405, + "odds_ratio_loss": 0.1572282761335373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03823423013091087, + "rewards/margins": 0.24351388216018677, + "rewards/rejected": -0.28174811601638794, + "sft_loss": 0.3823423385620117, + "step": 2024 + }, + { + "epoch": 2.928416485900217, + "grad_norm": 2.2882575706198125, + "learning_rate": 4.3040821066546736e-06, + "logits/chosen": -0.22499872744083405, + "logits/rejected": -0.17465078830718994, + "logps/chosen": -0.38680657744407654, + "logps/rejected": -3.9832472801208496, + "loss": 0.3367, + "odds_ratio_loss": 0.10981922596693039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038680657744407654, + "rewards/margins": 0.3596440851688385, + "rewards/rejected": -0.39832472801208496, + "sft_loss": 0.38680657744407654, + "step": 2025 + }, + { + "epoch": 2.929862617498192, + "grad_norm": 2.3630936993834335, + "learning_rate": 4.300985127826796e-06, + "logits/chosen": -0.3212575912475586, + "logits/rejected": -0.3458347022533417, + "logps/chosen": -0.5177043676376343, + "logps/rejected": -3.096360206604004, + "loss": 0.4595, + "odds_ratio_loss": 0.16100654006004333, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05177043378353119, + "rewards/margins": 0.2578656077384949, + "rewards/rejected": -0.30963602662086487, + "sft_loss": 0.5177043676376343, + "step": 2026 + }, + { + "epoch": 2.931308749096168, + "grad_norm": 2.3846314203711674, + "learning_rate": 4.297887967533865e-06, + "logits/chosen": -0.17958812415599823, + "logits/rejected": -0.10923945903778076, + "logps/chosen": -0.3733343780040741, + "logps/rejected": -4.5489301681518555, + "loss": 0.3881, + "odds_ratio_loss": 0.18937966227531433, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03733343631029129, + "rewards/margins": 0.4175596237182617, + "rewards/rejected": -0.4548930525779724, + "sft_loss": 0.3733343780040741, + "step": 2027 + }, + { + "epoch": 2.932754880694143, + "grad_norm": 2.586612143421851, + "learning_rate": 4.294790627643169e-06, + "logits/chosen": -0.3360060453414917, + "logits/rejected": -0.3124268651008606, + "logps/chosen": -0.2640987038612366, + "logps/rejected": -3.922750234603882, + "loss": 0.4057, + "odds_ratio_loss": 0.06727544218301773, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026409868150949478, + "rewards/margins": 0.36586514115333557, + "rewards/rejected": -0.39227503538131714, + "sft_loss": 0.2640987038612366, + "step": 2028 + }, + { + "epoch": 2.9342010122921187, + "grad_norm": 3.000278375564883, + "learning_rate": 4.2916931100221056e-06, + "logits/chosen": -0.16415224969387054, + "logits/rejected": -0.2963314652442932, + "logps/chosen": -0.4116109013557434, + "logps/rejected": -2.8866724967956543, + "loss": 0.4228, + "odds_ratio_loss": 0.20921087265014648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04116109386086464, + "rewards/margins": 0.24750614166259766, + "rewards/rejected": -0.288667231798172, + "sft_loss": 0.4116109013557434, + "step": 2029 + }, + { + "epoch": 2.935647143890094, + "grad_norm": 2.5339856332023936, + "learning_rate": 4.288595416538179e-06, + "logits/chosen": -0.233162060379982, + "logits/rejected": -0.23949375748634338, + "logps/chosen": -0.3766535222530365, + "logps/rejected": -4.669894695281982, + "loss": 0.4059, + "odds_ratio_loss": 0.08383668214082718, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03766535222530365, + "rewards/margins": 0.4293241500854492, + "rewards/rejected": -0.46698951721191406, + "sft_loss": 0.3766535222530365, + "step": 2030 + }, + { + "epoch": 2.9370932754880696, + "grad_norm": 2.5983661776860796, + "learning_rate": 4.285497549059001e-06, + "logits/chosen": -0.16616466641426086, + "logits/rejected": -0.13526281714439392, + "logps/chosen": -0.3628811240196228, + "logps/rejected": -3.515805721282959, + "loss": 0.3736, + "odds_ratio_loss": 0.07886926084756851, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03628811240196228, + "rewards/margins": 0.31529247760772705, + "rewards/rejected": -0.35158056020736694, + "sft_loss": 0.3628811240196228, + "step": 2031 + }, + { + "epoch": 2.938539407086045, + "grad_norm": 1.9948751384440309, + "learning_rate": 4.282399509452288e-06, + "logits/chosen": -0.29387766122817993, + "logits/rejected": -0.3654487729072571, + "logps/chosen": -0.32119542360305786, + "logps/rejected": -3.8875999450683594, + "loss": 0.3627, + "odds_ratio_loss": 0.07366838306188583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.032119542360305786, + "rewards/margins": 0.35664045810699463, + "rewards/rejected": -0.388759970664978, + "sft_loss": 0.32119542360305786, + "step": 2032 + }, + { + "epoch": 2.93998553868402, + "grad_norm": 2.0374482986686786, + "learning_rate": 4.279301299585859e-06, + "logits/chosen": -0.23302549123764038, + "logits/rejected": -0.16700975596904755, + "logps/chosen": -0.2750675678253174, + "logps/rejected": -2.0055830478668213, + "loss": 0.3007, + "odds_ratio_loss": 0.11454141139984131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027506759390234947, + "rewards/margins": 0.17305153608322144, + "rewards/rejected": -0.20055830478668213, + "sft_loss": 0.2750675678253174, + "step": 2033 + }, + { + "epoch": 2.941431670281996, + "grad_norm": 2.848613244485053, + "learning_rate": 4.276202921327636e-06, + "logits/chosen": -0.11311466991901398, + "logits/rejected": -0.18168656527996063, + "logps/chosen": -0.3654848337173462, + "logps/rejected": -2.3824963569641113, + "loss": 0.4009, + "odds_ratio_loss": 0.19991439580917358, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03654848411679268, + "rewards/margins": 0.20170114934444427, + "rewards/rejected": -0.23824964463710785, + "sft_loss": 0.3654848337173462, + "step": 2034 + }, + { + "epoch": 2.942877801879971, + "grad_norm": 2.173231264643902, + "learning_rate": 4.273104376545643e-06, + "logits/chosen": -0.307564914226532, + "logits/rejected": -0.2892274558544159, + "logps/chosen": -0.3027850389480591, + "logps/rejected": -2.450563669204712, + "loss": 0.3599, + "odds_ratio_loss": 0.16147339344024658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030278503894805908, + "rewards/margins": 0.214777871966362, + "rewards/rejected": -0.2450563758611679, + "sft_loss": 0.3027850389480591, + "step": 2035 + }, + { + "epoch": 2.9443239334779463, + "grad_norm": 2.1288421307945478, + "learning_rate": 4.2700056671080044e-06, + "logits/chosen": -0.15985527634620667, + "logits/rejected": -0.30066192150115967, + "logps/chosen": -0.4228432774543762, + "logps/rejected": -3.4859423637390137, + "loss": 0.3645, + "odds_ratio_loss": 0.12999072670936584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04228432849049568, + "rewards/margins": 0.30630987882614136, + "rewards/rejected": -0.34859421849250793, + "sft_loss": 0.4228432774543762, + "step": 2036 + }, + { + "epoch": 2.945770065075922, + "grad_norm": 2.4343642295209422, + "learning_rate": 4.2669067948829425e-06, + "logits/chosen": -0.25953909754753113, + "logits/rejected": -0.1952180564403534, + "logps/chosen": -0.41905689239501953, + "logps/rejected": -5.190460681915283, + "loss": 0.3077, + "odds_ratio_loss": 0.2337718904018402, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04190569370985031, + "rewards/margins": 0.4771403968334198, + "rewards/rejected": -0.5190460681915283, + "sft_loss": 0.41905689239501953, + "step": 2037 + }, + { + "epoch": 2.9472161966738972, + "grad_norm": 2.44183904840763, + "learning_rate": 4.263807761738781e-06, + "logits/chosen": -0.254044771194458, + "logits/rejected": -0.20028023421764374, + "logps/chosen": -0.24801340699195862, + "logps/rejected": -4.495428085327148, + "loss": 0.3435, + "odds_ratio_loss": 0.04579523205757141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024801339954137802, + "rewards/margins": 0.4247414469718933, + "rewards/rejected": -0.4495427906513214, + "sft_loss": 0.24801340699195862, + "step": 2038 + }, + { + "epoch": 2.9486623282718725, + "grad_norm": 3.1947725674504537, + "learning_rate": 4.260708569543937e-06, + "logits/chosen": -0.19664132595062256, + "logits/rejected": -0.29793834686279297, + "logps/chosen": -0.4847376346588135, + "logps/rejected": -3.5855674743652344, + "loss": 0.4131, + "odds_ratio_loss": 0.1429980844259262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04847376421093941, + "rewards/margins": 0.31008297204971313, + "rewards/rejected": -0.35855674743652344, + "sft_loss": 0.4847376346588135, + "step": 2039 + }, + { + "epoch": 2.950108459869848, + "grad_norm": 2.3990419415007818, + "learning_rate": 4.257609220166927e-06, + "logits/chosen": -0.18066376447677612, + "logits/rejected": -0.18795566260814667, + "logps/chosen": -0.3433865010738373, + "logps/rejected": -2.787734270095825, + "loss": 0.3489, + "odds_ratio_loss": 0.13024930655956268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03433865308761597, + "rewards/margins": 0.24443475902080536, + "rewards/rejected": -0.27877339720726013, + "sft_loss": 0.3433865010738373, + "step": 2040 + }, + { + "epoch": 2.951554591467824, + "grad_norm": 2.7949894636338395, + "learning_rate": 4.254509715476356e-06, + "logits/chosen": -0.1268310844898224, + "logits/rejected": -0.2035345733165741, + "logps/chosen": -0.38025563955307007, + "logps/rejected": -3.1927952766418457, + "loss": 0.3854, + "odds_ratio_loss": 0.11911129951477051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038025565445423126, + "rewards/margins": 0.28125396370887756, + "rewards/rejected": -0.3192794919013977, + "sft_loss": 0.38025563955307007, + "step": 2041 + }, + { + "epoch": 2.953000723065799, + "grad_norm": 2.822207275782586, + "learning_rate": 4.251410057340932e-06, + "logits/chosen": -0.2712603211402893, + "logits/rejected": -0.32916659116744995, + "logps/chosen": -0.39672163128852844, + "logps/rejected": -4.876379013061523, + "loss": 0.3427, + "odds_ratio_loss": 0.06600239872932434, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039672162383794785, + "rewards/margins": 0.44796574115753174, + "rewards/rejected": -0.4876379072666168, + "sft_loss": 0.39672163128852844, + "step": 2042 + }, + { + "epoch": 2.9544468546637743, + "grad_norm": 2.205048962845581, + "learning_rate": 4.248310247629446e-06, + "logits/chosen": -0.2773968577384949, + "logits/rejected": -0.20153555274009705, + "logps/chosen": -0.18046045303344727, + "logps/rejected": -4.676190376281738, + "loss": 0.3392, + "odds_ratio_loss": 0.06551147252321243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018046045675873756, + "rewards/margins": 0.4495730400085449, + "rewards/rejected": -0.46761903166770935, + "sft_loss": 0.18046045303344727, + "step": 2043 + }, + { + "epoch": 2.95589298626175, + "grad_norm": 2.3273549681064325, + "learning_rate": 4.24521028821079e-06, + "logits/chosen": -0.16675907373428345, + "logits/rejected": -0.09869304299354553, + "logps/chosen": -0.2557486593723297, + "logps/rejected": -4.221222877502441, + "loss": 0.2917, + "odds_ratio_loss": 0.05778995156288147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02557486668229103, + "rewards/margins": 0.39654743671417236, + "rewards/rejected": -0.4221222698688507, + "sft_loss": 0.2557486593723297, + "step": 2044 + }, + { + "epoch": 2.9573391178597253, + "grad_norm": 9.342734141369453, + "learning_rate": 4.242110180953935e-06, + "logits/chosen": -0.16678950190544128, + "logits/rejected": -0.07353587448596954, + "logps/chosen": -0.35120508074760437, + "logps/rejected": -2.6658384799957275, + "loss": 0.384, + "odds_ratio_loss": 0.16317623853683472, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.035120509564876556, + "rewards/margins": 0.23146334290504456, + "rewards/rejected": -0.2665838599205017, + "sft_loss": 0.35120508074760437, + "step": 2045 + }, + { + "epoch": 2.9587852494577005, + "grad_norm": 1.98977459889491, + "learning_rate": 4.239009927727952e-06, + "logits/chosen": -0.18185609579086304, + "logits/rejected": -0.1600634753704071, + "logps/chosen": -0.22608599066734314, + "logps/rejected": -5.732004642486572, + "loss": 0.3355, + "odds_ratio_loss": 0.08565568923950195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022608600556850433, + "rewards/margins": 0.5505918860435486, + "rewards/rejected": -0.5732004642486572, + "sft_loss": 0.22608599066734314, + "step": 2046 + }, + { + "epoch": 2.960231381055676, + "grad_norm": 2.6070638936508534, + "learning_rate": 4.235909530401992e-06, + "logits/chosen": -0.1590268611907959, + "logits/rejected": -0.1363297998905182, + "logps/chosen": -0.3237609267234802, + "logps/rejected": -2.4293899536132812, + "loss": 0.3868, + "odds_ratio_loss": 0.11542137712240219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03237609565258026, + "rewards/margins": 0.21056291460990906, + "rewards/rejected": -0.24293901026248932, + "sft_loss": 0.3237609267234802, + "step": 2047 + }, + { + "epoch": 2.9616775126536514, + "grad_norm": 2.629598255334325, + "learning_rate": 4.232808990845298e-06, + "logits/chosen": -0.1289118379354477, + "logits/rejected": -0.19492867588996887, + "logps/chosen": -0.22533206641674042, + "logps/rejected": -3.8823606967926025, + "loss": 0.3449, + "odds_ratio_loss": 0.06637558341026306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022533204406499863, + "rewards/margins": 0.36570286750793457, + "rewards/rejected": -0.38823604583740234, + "sft_loss": 0.22533206641674042, + "step": 2048 + }, + { + "epoch": 2.9631236442516267, + "grad_norm": 2.149957958947979, + "learning_rate": 4.229708310927196e-06, + "logits/chosen": -0.16187317669391632, + "logits/rejected": -0.31687965989112854, + "logps/chosen": -0.41082707047462463, + "logps/rejected": -4.500897407531738, + "loss": 0.4035, + "odds_ratio_loss": 0.17749832570552826, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.041082706302404404, + "rewards/margins": 0.4090070128440857, + "rewards/rejected": -0.4500897526741028, + "sft_loss": 0.41082707047462463, + "step": 2049 + }, + { + "epoch": 2.9645697758496024, + "grad_norm": 2.570665705617286, + "learning_rate": 4.2266074925170975e-06, + "logits/chosen": -0.26426705718040466, + "logits/rejected": -0.18332192301750183, + "logps/chosen": -0.33762437105178833, + "logps/rejected": -3.996952533721924, + "loss": 0.3992, + "odds_ratio_loss": 0.09728610515594482, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03376243636012077, + "rewards/margins": 0.365932822227478, + "rewards/rejected": -0.3996952474117279, + "sft_loss": 0.33762437105178833, + "step": 2050 + }, + { + "epoch": 2.9660159074475776, + "grad_norm": 2.17098598945688, + "learning_rate": 4.223506537484499e-06, + "logits/chosen": -0.1906561255455017, + "logits/rejected": -0.12105484306812286, + "logps/chosen": -0.3024832606315613, + "logps/rejected": -1.6752251386642456, + "loss": 0.3359, + "odds_ratio_loss": 0.13633744418621063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030248325318098068, + "rewards/margins": 0.13727419078350067, + "rewards/rejected": -0.16752250492572784, + "sft_loss": 0.3024832606315613, + "step": 2051 + }, + { + "epoch": 2.9674620390455533, + "grad_norm": 2.2566433623049336, + "learning_rate": 4.220405447698976e-06, + "logits/chosen": -0.10293443500995636, + "logits/rejected": -0.14721611142158508, + "logps/chosen": -0.33052927255630493, + "logps/rejected": -3.822780132293701, + "loss": 0.3424, + "odds_ratio_loss": 0.09570840746164322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03305293247103691, + "rewards/margins": 0.34922507405281067, + "rewards/rejected": -0.3822779953479767, + "sft_loss": 0.33052927255630493, + "step": 2052 + }, + { + "epoch": 2.9689081706435285, + "grad_norm": 2.3375168447838615, + "learning_rate": 4.217304225030187e-06, + "logits/chosen": -0.19826990365982056, + "logits/rejected": -0.19559445977210999, + "logps/chosen": -0.2711048722267151, + "logps/rejected": -5.2059326171875, + "loss": 0.3442, + "odds_ratio_loss": 0.09908849745988846, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027110489085316658, + "rewards/margins": 0.4934827387332916, + "rewards/rejected": -0.5205932259559631, + "sft_loss": 0.2711048722267151, + "step": 2053 + }, + { + "epoch": 2.970354302241504, + "grad_norm": 2.1135846620709655, + "learning_rate": 4.214202871347873e-06, + "logits/chosen": -0.11324124038219452, + "logits/rejected": -0.1152898520231247, + "logps/chosen": -0.3063899874687195, + "logps/rejected": -2.742185354232788, + "loss": 0.3318, + "odds_ratio_loss": 0.18975186347961426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030639000236988068, + "rewards/margins": 0.24357952177524567, + "rewards/rejected": -0.27421852946281433, + "sft_loss": 0.3063899874687195, + "step": 2054 + }, + { + "epoch": 2.9718004338394794, + "grad_norm": 2.649346066064892, + "learning_rate": 4.211101388521849e-06, + "logits/chosen": -0.20979903638362885, + "logits/rejected": -0.24693188071250916, + "logps/chosen": -0.2766479551792145, + "logps/rejected": -3.680441379547119, + "loss": 0.3301, + "odds_ratio_loss": 0.059294842183589935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027664795517921448, + "rewards/margins": 0.34037935733795166, + "rewards/rejected": -0.3680441379547119, + "sft_loss": 0.2766479551792145, + "step": 2055 + }, + { + "epoch": 2.9732465654374547, + "grad_norm": 2.1537269933729584, + "learning_rate": 4.207999778422013e-06, + "logits/chosen": -0.19660750031471252, + "logits/rejected": -0.1521485149860382, + "logps/chosen": -0.3724362850189209, + "logps/rejected": -2.149691343307495, + "loss": 0.366, + "odds_ratio_loss": 0.10635429620742798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03724363446235657, + "rewards/margins": 0.17772549390792847, + "rewards/rejected": -0.21496912837028503, + "sft_loss": 0.3724362850189209, + "step": 2056 + }, + { + "epoch": 2.9746926970354304, + "grad_norm": 3.7741465300804333, + "learning_rate": 4.204898042918334e-06, + "logits/chosen": -0.1858070194721222, + "logits/rejected": -0.15695783495903015, + "logps/chosen": -0.29552727937698364, + "logps/rejected": -3.873058319091797, + "loss": 0.3326, + "odds_ratio_loss": 0.1055610179901123, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029552727937698364, + "rewards/margins": 0.35775309801101685, + "rewards/rejected": -0.3873058557510376, + "sft_loss": 0.29552727937698364, + "step": 2057 + }, + { + "epoch": 2.9761388286334056, + "grad_norm": 2.0496999134672187, + "learning_rate": 4.201796183880863e-06, + "logits/chosen": -0.1244896799325943, + "logits/rejected": -0.21163567900657654, + "logps/chosen": -0.4614596962928772, + "logps/rejected": -3.733137845993042, + "loss": 0.3574, + "odds_ratio_loss": 0.17765578627586365, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0461459681391716, + "rewards/margins": 0.3271678388118744, + "rewards/rejected": -0.3733137845993042, + "sft_loss": 0.4614596962928772, + "step": 2058 + }, + { + "epoch": 2.977584960231381, + "grad_norm": 2.359398391670168, + "learning_rate": 4.1986942031797205e-06, + "logits/chosen": -0.11397155374288559, + "logits/rejected": -0.11231968551874161, + "logps/chosen": -0.28144025802612305, + "logps/rejected": -2.4738903045654297, + "loss": 0.3272, + "odds_ratio_loss": 0.12439928948879242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028144024312496185, + "rewards/margins": 0.21924501657485962, + "rewards/rejected": -0.2473890334367752, + "sft_loss": 0.28144025802612305, + "step": 2059 + }, + { + "epoch": 2.9790310918293565, + "grad_norm": 2.387322205273284, + "learning_rate": 4.1955921026851044e-06, + "logits/chosen": -0.20542100071907043, + "logits/rejected": -0.14749495685100555, + "logps/chosen": -0.4900125563144684, + "logps/rejected": -2.852177619934082, + "loss": 0.3538, + "odds_ratio_loss": 0.15802116692066193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04900125786662102, + "rewards/margins": 0.23621651530265808, + "rewards/rejected": -0.2852177619934082, + "sft_loss": 0.4900125563144684, + "step": 2060 + }, + { + "epoch": 2.980477223427332, + "grad_norm": 2.570217232894552, + "learning_rate": 4.19248988426728e-06, + "logits/chosen": -0.22486189007759094, + "logits/rejected": -0.16402465105056763, + "logps/chosen": -0.2641967535018921, + "logps/rejected": -3.638213872909546, + "loss": 0.3508, + "odds_ratio_loss": 0.06803110986948013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02641967684030533, + "rewards/margins": 0.33740171790122986, + "rewards/rejected": -0.3638213574886322, + "sft_loss": 0.2641967535018921, + "step": 2061 + }, + { + "epoch": 2.981923355025307, + "grad_norm": 2.2112546704563183, + "learning_rate": 4.189387549796587e-06, + "logits/chosen": -0.0817125216126442, + "logits/rejected": -0.12825915217399597, + "logps/chosen": -0.38398486375808716, + "logps/rejected": -3.2384722232818604, + "loss": 0.3811, + "odds_ratio_loss": 0.17186611890792847, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.038398489356040955, + "rewards/margins": 0.28544872999191284, + "rewards/rejected": -0.323847234249115, + "sft_loss": 0.38398486375808716, + "step": 2062 + }, + { + "epoch": 2.9833694866232827, + "grad_norm": 2.8149652071190667, + "learning_rate": 4.186285101143435e-06, + "logits/chosen": -0.25716516375541687, + "logits/rejected": -0.21721546351909637, + "logps/chosen": -0.42149853706359863, + "logps/rejected": -2.934656858444214, + "loss": 0.3683, + "odds_ratio_loss": 0.1831168532371521, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.042149852961301804, + "rewards/margins": 0.2513158321380615, + "rewards/rejected": -0.29346567392349243, + "sft_loss": 0.42149853706359863, + "step": 2063 + }, + { + "epoch": 2.9848156182212584, + "grad_norm": 2.4516629385806734, + "learning_rate": 4.183182540178301e-06, + "logits/chosen": -0.3914540112018585, + "logits/rejected": -0.30311813950538635, + "logps/chosen": -0.25005167722702026, + "logps/rejected": -3.276885747909546, + "loss": 0.3934, + "odds_ratio_loss": 0.07259271293878555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025005165487527847, + "rewards/margins": 0.30268341302871704, + "rewards/rejected": -0.3276885747909546, + "sft_loss": 0.25005167722702026, + "step": 2064 + }, + { + "epoch": 2.9862617498192336, + "grad_norm": 2.5350627953826477, + "learning_rate": 4.180079868771733e-06, + "logits/chosen": -0.1459140181541443, + "logits/rejected": -0.21189197897911072, + "logps/chosen": -0.3703756332397461, + "logps/rejected": -2.440211296081543, + "loss": 0.3872, + "odds_ratio_loss": 0.15449750423431396, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03703756630420685, + "rewards/margins": 0.20698359608650208, + "rewards/rejected": -0.24402114748954773, + "sft_loss": 0.3703756332397461, + "step": 2065 + }, + { + "epoch": 2.987707881417209, + "grad_norm": 2.185668703924067, + "learning_rate": 4.176977088794341e-06, + "logits/chosen": -0.30598506331443787, + "logits/rejected": -0.2191198468208313, + "logps/chosen": -0.40353310108184814, + "logps/rejected": -2.7489523887634277, + "loss": 0.4252, + "odds_ratio_loss": 0.14380109310150146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04035331308841705, + "rewards/margins": 0.2345419079065323, + "rewards/rejected": -0.27489522099494934, + "sft_loss": 0.40353310108184814, + "step": 2066 + }, + { + "epoch": 2.9891540130151846, + "grad_norm": 2.9826272049580513, + "learning_rate": 4.173874202116803e-06, + "logits/chosen": -0.37615519762039185, + "logits/rejected": -0.38028109073638916, + "logps/chosen": -0.4126752018928528, + "logps/rejected": -2.373816967010498, + "loss": 0.3951, + "odds_ratio_loss": 0.17959663271903992, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.041267525404691696, + "rewards/margins": 0.1961141675710678, + "rewards/rejected": -0.2373816967010498, + "sft_loss": 0.4126752018928528, + "step": 2067 + }, + { + "epoch": 2.99060014461316, + "grad_norm": 2.341677277000319, + "learning_rate": 4.170771210609861e-06, + "logits/chosen": -0.2782983183860779, + "logits/rejected": -0.193710595369339, + "logps/chosen": -0.44476866722106934, + "logps/rejected": -2.913386821746826, + "loss": 0.3934, + "odds_ratio_loss": 0.14662766456604004, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.044476866722106934, + "rewards/margins": 0.24686183035373688, + "rewards/rejected": -0.2913386821746826, + "sft_loss": 0.44476866722106934, + "step": 2068 + }, + { + "epoch": 2.992046276211135, + "grad_norm": 2.129466110457359, + "learning_rate": 4.167668116144319e-06, + "logits/chosen": -0.09359033405780792, + "logits/rejected": -0.26171213388442993, + "logps/chosen": -0.3228522539138794, + "logps/rejected": -2.8332600593566895, + "loss": 0.3746, + "odds_ratio_loss": 0.11024969816207886, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03228522837162018, + "rewards/margins": 0.25104081630706787, + "rewards/rejected": -0.28332602977752686, + "sft_loss": 0.3228522539138794, + "step": 2069 + }, + { + "epoch": 2.9934924078091107, + "grad_norm": 2.18740864051717, + "learning_rate": 4.164564920591047e-06, + "logits/chosen": -0.08463630080223083, + "logits/rejected": -0.15710577368736267, + "logps/chosen": -0.3528903126716614, + "logps/rejected": -4.306077003479004, + "loss": 0.3843, + "odds_ratio_loss": 0.15712261199951172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.035289034247398376, + "rewards/margins": 0.39531874656677246, + "rewards/rejected": -0.43060773611068726, + "sft_loss": 0.3528903126716614, + "step": 2070 + }, + { + "epoch": 2.994938539407086, + "grad_norm": 2.988838631607076, + "learning_rate": 4.16146162582097e-06, + "logits/chosen": -0.21836420893669128, + "logits/rejected": -0.1393698751926422, + "logps/chosen": -0.26207345724105835, + "logps/rejected": -4.11907434463501, + "loss": 0.3516, + "odds_ratio_loss": 0.125029519200325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026207346469163895, + "rewards/margins": 0.3857001066207886, + "rewards/rejected": -0.411907434463501, + "sft_loss": 0.26207345724105835, + "step": 2071 + }, + { + "epoch": 2.9963846710050612, + "grad_norm": 2.5582808538128026, + "learning_rate": 4.158358233705078e-06, + "logits/chosen": -0.16028903424739838, + "logits/rejected": -0.13912662863731384, + "logps/chosen": -0.3448236286640167, + "logps/rejected": -2.6296613216400146, + "loss": 0.4081, + "odds_ratio_loss": 0.19457879662513733, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03448236733675003, + "rewards/margins": 0.22848376631736755, + "rewards/rejected": -0.2629661560058594, + "sft_loss": 0.3448236286640167, + "step": 2072 + }, + { + "epoch": 2.997830802603037, + "grad_norm": 3.5422999299640074, + "learning_rate": 4.155254746114417e-06, + "logits/chosen": -0.28058838844299316, + "logits/rejected": -0.27185720205307007, + "logps/chosen": -0.4271693825721741, + "logps/rejected": -2.3834800720214844, + "loss": 0.4268, + "odds_ratio_loss": 0.12995006144046783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04271693527698517, + "rewards/margins": 0.19563104212284088, + "rewards/rejected": -0.23834799230098724, + "sft_loss": 0.4271693825721741, + "step": 2073 + }, + { + "epoch": 2.999276934201012, + "grad_norm": 2.339601181160581, + "learning_rate": 4.152151164920091e-06, + "logits/chosen": -0.2662460505962372, + "logits/rejected": -0.24672895669937134, + "logps/chosen": -0.25342679023742676, + "logps/rejected": -4.663143634796143, + "loss": 0.3761, + "odds_ratio_loss": 0.05503750592470169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025342680513858795, + "rewards/margins": 0.4409716725349426, + "rewards/rejected": -0.4663143754005432, + "sft_loss": 0.25342679023742676, + "step": 2074 + }, + { + "epoch": 3.000723065798988, + "grad_norm": 2.4154432516967472, + "learning_rate": 4.149047491993262e-06, + "logits/chosen": -0.3026701509952545, + "logits/rejected": -0.3210662305355072, + "logps/chosen": -0.21438798308372498, + "logps/rejected": -4.023871898651123, + "loss": 0.2418, + "odds_ratio_loss": 0.06626778841018677, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021438799798488617, + "rewards/margins": 0.38094842433929443, + "rewards/rejected": -0.40238723158836365, + "sft_loss": 0.21438798308372498, + "step": 2075 + }, + { + "epoch": 3.002169197396963, + "grad_norm": 4.450487484527005, + "learning_rate": 4.145943729205145e-06, + "logits/chosen": -0.28832417726516724, + "logits/rejected": -0.24835100769996643, + "logps/chosen": -0.32508355379104614, + "logps/rejected": -2.2887487411499023, + "loss": 0.2607, + "odds_ratio_loss": 0.09311854094266891, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03250835835933685, + "rewards/margins": 0.19636651873588562, + "rewards/rejected": -0.22887487709522247, + "sft_loss": 0.32508355379104614, + "step": 2076 + }, + { + "epoch": 3.0036153289949383, + "grad_norm": 2.782558761277381, + "learning_rate": 4.142839878427008e-06, + "logits/chosen": -0.28335806727409363, + "logits/rejected": -0.2334311306476593, + "logps/chosen": -0.18162278831005096, + "logps/rejected": -2.974604606628418, + "loss": 0.2096, + "odds_ratio_loss": 0.06165589019656181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018162280321121216, + "rewards/margins": 0.27929815649986267, + "rewards/rejected": -0.2974604368209839, + "sft_loss": 0.18162278831005096, + "step": 2077 + }, + { + "epoch": 3.005061460592914, + "grad_norm": 2.239035652889864, + "learning_rate": 4.1397359415301784e-06, + "logits/chosen": -0.3854452669620514, + "logits/rejected": -0.3029145896434784, + "logps/chosen": -0.2096811830997467, + "logps/rejected": -4.488315582275391, + "loss": 0.1834, + "odds_ratio_loss": 0.053315550088882446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0209681186825037, + "rewards/margins": 0.4278634786605835, + "rewards/rejected": -0.44883155822753906, + "sft_loss": 0.2096811830997467, + "step": 2078 + }, + { + "epoch": 3.0065075921908893, + "grad_norm": 2.2164457668179978, + "learning_rate": 4.1366319203860286e-06, + "logits/chosen": -0.610029399394989, + "logits/rejected": -0.41548243165016174, + "logps/chosen": -0.1624118685722351, + "logps/rejected": -2.981743574142456, + "loss": 0.1688, + "odds_ratio_loss": 0.04206952825188637, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01624118722975254, + "rewards/margins": 0.28193315863609314, + "rewards/rejected": -0.29817435145378113, + "sft_loss": 0.1624118685722351, + "step": 2079 + }, + { + "epoch": 3.007953723788865, + "grad_norm": 2.4066543302384833, + "learning_rate": 4.133527816865985e-06, + "logits/chosen": -0.6042571663856506, + "logits/rejected": -0.37458497285842896, + "logps/chosen": -0.16903214156627655, + "logps/rejected": -3.6454310417175293, + "loss": 0.2058, + "odds_ratio_loss": 0.020001225173473358, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016903216019272804, + "rewards/margins": 0.34763991832733154, + "rewards/rejected": -0.3645431399345398, + "sft_loss": 0.16903214156627655, + "step": 2080 + }, + { + "epoch": 3.00939985538684, + "grad_norm": 2.312606834909949, + "learning_rate": 4.130423632841524e-06, + "logits/chosen": -0.628654956817627, + "logits/rejected": -0.531693160533905, + "logps/chosen": -0.14114609360694885, + "logps/rejected": -3.86907958984375, + "loss": 0.2121, + "odds_ratio_loss": 0.02958705648779869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014114608988165855, + "rewards/margins": 0.37279337644577026, + "rewards/rejected": -0.3869079649448395, + "sft_loss": 0.14114609360694885, + "step": 2081 + }, + { + "epoch": 3.0108459869848154, + "grad_norm": 2.267565799953285, + "learning_rate": 4.127319370184169e-06, + "logits/chosen": -0.6523191928863525, + "logits/rejected": -0.3899100720882416, + "logps/chosen": -0.09384972602128983, + "logps/rejected": -3.3656177520751953, + "loss": 0.1418, + "odds_ratio_loss": 0.024596964940428734, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009384972974658012, + "rewards/margins": 0.3271768093109131, + "rewards/rejected": -0.33656179904937744, + "sft_loss": 0.09384972602128983, + "step": 2082 + }, + { + "epoch": 3.012292118582791, + "grad_norm": 3.145990187795483, + "learning_rate": 4.124215030765491e-06, + "logits/chosen": -1.0443816184997559, + "logits/rejected": -0.7100991010665894, + "logps/chosen": -0.16050492227077484, + "logps/rejected": -3.2707293033599854, + "loss": 0.1905, + "odds_ratio_loss": 0.03136509656906128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016050491482019424, + "rewards/margins": 0.31102246046066284, + "rewards/rejected": -0.32707294821739197, + "sft_loss": 0.16050492227077484, + "step": 2083 + }, + { + "epoch": 3.0137382501807664, + "grad_norm": 4.579359916220718, + "learning_rate": 4.121110616457108e-06, + "logits/chosen": -0.5784963369369507, + "logits/rejected": -0.47556227445602417, + "logps/chosen": -0.2588617205619812, + "logps/rejected": -3.156449794769287, + "loss": 0.2127, + "odds_ratio_loss": 0.05530041828751564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0258861742913723, + "rewards/margins": 0.2897588014602661, + "rewards/rejected": -0.3156449794769287, + "sft_loss": 0.2588617205619812, + "step": 2084 + }, + { + "epoch": 3.015184381778742, + "grad_norm": 2.7301283495678605, + "learning_rate": 4.118006129130684e-06, + "logits/chosen": -0.6547715663909912, + "logits/rejected": -0.40283170342445374, + "logps/chosen": -0.1553865224123001, + "logps/rejected": -4.589504718780518, + "loss": 0.1887, + "odds_ratio_loss": 0.028455141931772232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015538652427494526, + "rewards/margins": 0.44341176748275757, + "rewards/rejected": -0.4589504599571228, + "sft_loss": 0.1553865224123001, + "step": 2085 + }, + { + "epoch": 3.0166305133767173, + "grad_norm": 2.181836161479129, + "learning_rate": 4.114901570657925e-06, + "logits/chosen": -0.3926331698894501, + "logits/rejected": -0.35337767004966736, + "logps/chosen": -0.1808508038520813, + "logps/rejected": -4.288332939147949, + "loss": 0.2029, + "odds_ratio_loss": 0.057720035314559937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01808508113026619, + "rewards/margins": 0.4107481837272644, + "rewards/rejected": -0.4288333058357239, + "sft_loss": 0.1808508038520813, + "step": 2086 + }, + { + "epoch": 3.0180766449746925, + "grad_norm": 2.479142035501784, + "learning_rate": 4.111796942910581e-06, + "logits/chosen": -0.5397032499313354, + "logits/rejected": -0.5243762731552124, + "logps/chosen": -0.15296411514282227, + "logps/rejected": -3.3156747817993164, + "loss": 0.1995, + "odds_ratio_loss": 0.052010953426361084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01529641356319189, + "rewards/margins": 0.316271036863327, + "rewards/rejected": -0.3315674662590027, + "sft_loss": 0.15296411514282227, + "step": 2087 + }, + { + "epoch": 3.019522776572668, + "grad_norm": 1.8418463482304441, + "learning_rate": 4.108692247760445e-06, + "logits/chosen": -0.42499226331710815, + "logits/rejected": -0.3503390848636627, + "logps/chosen": -0.1449154168367386, + "logps/rejected": -5.186098575592041, + "loss": 0.1672, + "odds_ratio_loss": 0.03054206445813179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014491541311144829, + "rewards/margins": 0.5041183233261108, + "rewards/rejected": -0.5186098217964172, + "sft_loss": 0.1449154168367386, + "step": 2088 + }, + { + "epoch": 3.0209689081706435, + "grad_norm": 2.29448458956409, + "learning_rate": 4.105587487079345e-06, + "logits/chosen": -0.6999224424362183, + "logits/rejected": -0.48990195989608765, + "logps/chosen": -0.13518781960010529, + "logps/rejected": -2.764622211456299, + "loss": 0.1565, + "odds_ratio_loss": 0.042032863944768906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013518782332539558, + "rewards/margins": 0.26294347643852234, + "rewards/rejected": -0.27646225690841675, + "sft_loss": 0.13518781960010529, + "step": 2089 + }, + { + "epoch": 3.022415039768619, + "grad_norm": 1.8885209324093233, + "learning_rate": 4.1024826627391575e-06, + "logits/chosen": -0.5850200653076172, + "logits/rejected": -0.41026294231414795, + "logps/chosen": -0.2129233032464981, + "logps/rejected": -3.2739462852478027, + "loss": 0.1821, + "odds_ratio_loss": 0.05676015093922615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02129232883453369, + "rewards/margins": 0.30610227584838867, + "rewards/rejected": -0.32739460468292236, + "sft_loss": 0.2129233032464981, + "step": 2090 + }, + { + "epoch": 3.0238611713665944, + "grad_norm": 2.618426799746971, + "learning_rate": 4.0993777766117915e-06, + "logits/chosen": -0.6758778095245361, + "logits/rejected": -0.47175687551498413, + "logps/chosen": -0.27825915813446045, + "logps/rejected": -4.689516067504883, + "loss": 0.1887, + "odds_ratio_loss": 0.041840679943561554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027825918048620224, + "rewards/margins": 0.4411257207393646, + "rewards/rejected": -0.46895164251327515, + "sft_loss": 0.27825915813446045, + "step": 2091 + }, + { + "epoch": 3.0253073029645696, + "grad_norm": 2.0671758104441094, + "learning_rate": 4.0962728305691926e-06, + "logits/chosen": -0.495349258184433, + "logits/rejected": -0.41494473814964294, + "logps/chosen": -0.1947062909603119, + "logps/rejected": -3.9549150466918945, + "loss": 0.1946, + "odds_ratio_loss": 0.05212587118148804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01947062835097313, + "rewards/margins": 0.3760209083557129, + "rewards/rejected": -0.3954915404319763, + "sft_loss": 0.1947062909603119, + "step": 2092 + }, + { + "epoch": 3.0267534345625453, + "grad_norm": 2.043595727301972, + "learning_rate": 4.093167826483347e-06, + "logits/chosen": -0.4349784851074219, + "logits/rejected": -0.39165636897087097, + "logps/chosen": -0.16431699693202972, + "logps/rejected": -3.6643869876861572, + "loss": 0.225, + "odds_ratio_loss": 0.035537637770175934, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016431700438261032, + "rewards/margins": 0.350007027387619, + "rewards/rejected": -0.36643874645233154, + "sft_loss": 0.16431699693202972, + "step": 2093 + }, + { + "epoch": 3.0281995661605206, + "grad_norm": 2.232927910314228, + "learning_rate": 4.090062766226271e-06, + "logits/chosen": -0.7041189670562744, + "logits/rejected": -0.47187769412994385, + "logps/chosen": -0.1355535387992859, + "logps/rejected": -5.294040203094482, + "loss": 0.2344, + "odds_ratio_loss": 0.026054540649056435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013555353507399559, + "rewards/margins": 0.515848696231842, + "rewards/rejected": -0.5294040441513062, + "sft_loss": 0.1355535387992859, + "step": 2094 + }, + { + "epoch": 3.0296456977584962, + "grad_norm": 2.5169435782219667, + "learning_rate": 4.086957651670018e-06, + "logits/chosen": -0.6608462333679199, + "logits/rejected": -0.48246246576309204, + "logps/chosen": -0.29138410091400146, + "logps/rejected": -2.3921921253204346, + "loss": 0.2552, + "odds_ratio_loss": 0.06351328641176224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029138410463929176, + "rewards/margins": 0.2100808173418045, + "rewards/rejected": -0.23921923339366913, + "sft_loss": 0.29138410091400146, + "step": 2095 + }, + { + "epoch": 3.0310918293564715, + "grad_norm": 1.9132864556802796, + "learning_rate": 4.0838524846866735e-06, + "logits/chosen": -0.7670720219612122, + "logits/rejected": -0.7422049641609192, + "logps/chosen": -0.18818186223506927, + "logps/rejected": -3.1613006591796875, + "loss": 0.1341, + "odds_ratio_loss": 0.04930785298347473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018818188458681107, + "rewards/margins": 0.2973118722438812, + "rewards/rejected": -0.31613004207611084, + "sft_loss": 0.18818186223506927, + "step": 2096 + }, + { + "epoch": 3.0325379609544467, + "grad_norm": 1.9483355773154174, + "learning_rate": 4.080747267148353e-06, + "logits/chosen": -0.4968271553516388, + "logits/rejected": -0.4267679452896118, + "logps/chosen": -0.15192314982414246, + "logps/rejected": -3.1880059242248535, + "loss": 0.1494, + "odds_ratio_loss": 0.05188725143671036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01519231591373682, + "rewards/margins": 0.3036082684993744, + "rewards/rejected": -0.31880059838294983, + "sft_loss": 0.15192314982414246, + "step": 2097 + }, + { + "epoch": 3.0339840925524224, + "grad_norm": 2.402577443846496, + "learning_rate": 4.077642000927205e-06, + "logits/chosen": -0.6681464314460754, + "logits/rejected": -0.41508573293685913, + "logps/chosen": -0.1843072772026062, + "logps/rejected": -5.525094985961914, + "loss": 0.1644, + "odds_ratio_loss": 0.02610270492732525, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01843072660267353, + "rewards/margins": 0.5340787768363953, + "rewards/rejected": -0.5525094270706177, + "sft_loss": 0.1843072772026062, + "step": 2098 + }, + { + "epoch": 3.0354302241503976, + "grad_norm": 2.4018786992472054, + "learning_rate": 4.074536687895405e-06, + "logits/chosen": -0.7136832475662231, + "logits/rejected": -0.47490206360816956, + "logps/chosen": -0.15297845005989075, + "logps/rejected": -4.706912994384766, + "loss": 0.2074, + "odds_ratio_loss": 0.02270958572626114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015297845005989075, + "rewards/margins": 0.4553934931755066, + "rewards/rejected": -0.47069135308265686, + "sft_loss": 0.15297845005989075, + "step": 2099 + }, + { + "epoch": 3.036876355748373, + "grad_norm": 2.043474240086887, + "learning_rate": 4.0714313299251575e-06, + "logits/chosen": -0.7795498967170715, + "logits/rejected": -0.5213183164596558, + "logps/chosen": -0.11158294975757599, + "logps/rejected": -4.195063591003418, + "loss": 0.1316, + "odds_ratio_loss": 0.019902389496564865, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011158295907080173, + "rewards/margins": 0.40834811329841614, + "rewards/rejected": -0.41950640082359314, + "sft_loss": 0.11158294975757599, + "step": 2100 + }, + { + "epoch": 3.0383224873463486, + "grad_norm": 2.1009794089099647, + "learning_rate": 4.0683259288886965e-06, + "logits/chosen": -0.7349028587341309, + "logits/rejected": -0.5432249307632446, + "logps/chosen": -0.10936813056468964, + "logps/rejected": -5.250774383544922, + "loss": 0.175, + "odds_ratio_loss": 0.019732775166630745, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010936813428997993, + "rewards/margins": 0.5141406059265137, + "rewards/rejected": -0.5250774025917053, + "sft_loss": 0.10936813056468964, + "step": 2101 + }, + { + "epoch": 3.039768618944324, + "grad_norm": 1.9958955503465796, + "learning_rate": 4.065220486658277e-06, + "logits/chosen": -0.5262503623962402, + "logits/rejected": -0.4228152632713318, + "logps/chosen": -0.16352926194667816, + "logps/rejected": -3.2567734718322754, + "loss": 0.1686, + "odds_ratio_loss": 0.037220489233732224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016352925449609756, + "rewards/margins": 0.3093244433403015, + "rewards/rejected": -0.32567736506462097, + "sft_loss": 0.16352926194667816, + "step": 2102 + }, + { + "epoch": 3.0412147505422995, + "grad_norm": 1.8636021031520353, + "learning_rate": 4.062115005106184e-06, + "logits/chosen": -0.42543336749076843, + "logits/rejected": -0.4913654923439026, + "logps/chosen": -0.30701306462287903, + "logps/rejected": -2.3156869411468506, + "loss": 0.2009, + "odds_ratio_loss": 0.07792215794324875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03070130944252014, + "rewards/margins": 0.20086738467216492, + "rewards/rejected": -0.23156869411468506, + "sft_loss": 0.30701306462287903, + "step": 2103 + }, + { + "epoch": 3.0426608821402747, + "grad_norm": 2.429431192098418, + "learning_rate": 4.059009486104723e-06, + "logits/chosen": -0.578881025314331, + "logits/rejected": -0.4205232262611389, + "logps/chosen": -0.13701388239860535, + "logps/rejected": -4.750458717346191, + "loss": 0.1819, + "odds_ratio_loss": 0.01851857826113701, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01370138768106699, + "rewards/margins": 0.46134448051452637, + "rewards/rejected": -0.4750458896160126, + "sft_loss": 0.13701388239860535, + "step": 2104 + }, + { + "epoch": 3.04410701373825, + "grad_norm": 2.293787093452759, + "learning_rate": 4.055903931526223e-06, + "logits/chosen": -0.7719517350196838, + "logits/rejected": -0.6306342482566833, + "logps/chosen": -0.18231219053268433, + "logps/rejected": -3.1785566806793213, + "loss": 0.201, + "odds_ratio_loss": 0.04030875861644745, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018231220543384552, + "rewards/margins": 0.2996244430541992, + "rewards/rejected": -0.3178556561470032, + "sft_loss": 0.18231219053268433, + "step": 2105 + }, + { + "epoch": 3.0455531453362257, + "grad_norm": 2.5028298804086235, + "learning_rate": 4.052798343243036e-06, + "logits/chosen": -0.7744817733764648, + "logits/rejected": -0.5614781379699707, + "logps/chosen": -0.22691814601421356, + "logps/rejected": -4.627237796783447, + "loss": 0.2317, + "odds_ratio_loss": 0.05098215490579605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022691816091537476, + "rewards/margins": 0.4400319755077362, + "rewards/rejected": -0.4627237915992737, + "sft_loss": 0.22691814601421356, + "step": 2106 + }, + { + "epoch": 3.046999276934201, + "grad_norm": 2.295477572904047, + "learning_rate": 4.04969272312753e-06, + "logits/chosen": -0.8552494049072266, + "logits/rejected": -0.6359434723854065, + "logps/chosen": -0.20083674788475037, + "logps/rejected": -3.457988977432251, + "loss": 0.2102, + "odds_ratio_loss": 0.029120953753590584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020083673298358917, + "rewards/margins": 0.32571524381637573, + "rewards/rejected": -0.34579890966415405, + "sft_loss": 0.20083674788475037, + "step": 2107 + }, + { + "epoch": 3.0484454085321766, + "grad_norm": 2.4749090053073233, + "learning_rate": 4.0465870730520954e-06, + "logits/chosen": -0.7466475963592529, + "logits/rejected": -0.5847068428993225, + "logps/chosen": -0.05967186018824577, + "logps/rejected": -3.782038688659668, + "loss": 0.1441, + "odds_ratio_loss": 0.009426586329936981, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005967185832560062, + "rewards/margins": 0.3722366690635681, + "rewards/rejected": -0.3782038688659668, + "sft_loss": 0.05967186018824577, + "step": 2108 + }, + { + "epoch": 3.049891540130152, + "grad_norm": 2.0123379540693334, + "learning_rate": 4.043481394889142e-06, + "logits/chosen": -0.678367555141449, + "logits/rejected": -0.6387525200843811, + "logps/chosen": -0.2518230974674225, + "logps/rejected": -3.079854726791382, + "loss": 0.2228, + "odds_ratio_loss": 0.05264444649219513, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025182312354445457, + "rewards/margins": 0.28280317783355713, + "rewards/rejected": -0.30798548460006714, + "sft_loss": 0.2518230974674225, + "step": 2109 + }, + { + "epoch": 3.051337671728127, + "grad_norm": 2.5649086388818767, + "learning_rate": 4.040375690511094e-06, + "logits/chosen": -0.733020544052124, + "logits/rejected": -0.4613838493824005, + "logps/chosen": -0.10593655705451965, + "logps/rejected": -5.208601951599121, + "loss": 0.1497, + "odds_ratio_loss": 0.018377486616373062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010593656450510025, + "rewards/margins": 0.5102665424346924, + "rewards/rejected": -0.5208601951599121, + "sft_loss": 0.10593655705451965, + "step": 2110 + }, + { + "epoch": 3.0527838033261028, + "grad_norm": 1.996597938866012, + "learning_rate": 4.03726996179039e-06, + "logits/chosen": -0.7812892198562622, + "logits/rejected": -0.6449815034866333, + "logps/chosen": -0.26513880491256714, + "logps/rejected": -3.381945848464966, + "loss": 0.1614, + "odds_ratio_loss": 0.049505963921546936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026513883844017982, + "rewards/margins": 0.31168070435523987, + "rewards/rejected": -0.3381945788860321, + "sft_loss": 0.26513880491256714, + "step": 2111 + }, + { + "epoch": 3.054229934924078, + "grad_norm": 1.8174594922591507, + "learning_rate": 4.034164210599488e-06, + "logits/chosen": -0.5324065685272217, + "logits/rejected": -0.4299301505088806, + "logps/chosen": -0.11089219897985458, + "logps/rejected": -5.629248142242432, + "loss": 0.1388, + "odds_ratio_loss": 0.01814519427716732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011089220643043518, + "rewards/margins": 0.5518355965614319, + "rewards/rejected": -0.5629248023033142, + "sft_loss": 0.11089219897985458, + "step": 2112 + }, + { + "epoch": 3.0556760665220537, + "grad_norm": 2.122834905775403, + "learning_rate": 4.031058438810857e-06, + "logits/chosen": -0.7841843366622925, + "logits/rejected": -0.5668798685073853, + "logps/chosen": -0.1282779723405838, + "logps/rejected": -3.247554302215576, + "loss": 0.1335, + "odds_ratio_loss": 0.03160501644015312, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0128277987241745, + "rewards/margins": 0.3119276165962219, + "rewards/rejected": -0.3247554302215576, + "sft_loss": 0.1282779723405838, + "step": 2113 + }, + { + "epoch": 3.057122198120029, + "grad_norm": 2.405269498818451, + "learning_rate": 4.027952648296978e-06, + "logits/chosen": -0.6955947875976562, + "logits/rejected": -0.5394856929779053, + "logps/chosen": -0.16242378950119019, + "logps/rejected": -5.063565731048584, + "loss": 0.2003, + "odds_ratio_loss": 0.027161482721567154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01624237932264805, + "rewards/margins": 0.4901142120361328, + "rewards/rejected": -0.5063565373420715, + "sft_loss": 0.16242378950119019, + "step": 2114 + }, + { + "epoch": 3.058568329718004, + "grad_norm": 1.9522892318832643, + "learning_rate": 4.0248468409303425e-06, + "logits/chosen": -0.7737332582473755, + "logits/rejected": -0.4954363703727722, + "logps/chosen": -0.1599772721529007, + "logps/rejected": -3.514385223388672, + "loss": 0.1724, + "odds_ratio_loss": 0.022483911365270615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01599772833287716, + "rewards/margins": 0.33544081449508667, + "rewards/rejected": -0.3514385223388672, + "sft_loss": 0.1599772721529007, + "step": 2115 + }, + { + "epoch": 3.06001446131598, + "grad_norm": 2.153304126645383, + "learning_rate": 4.0217410185834536e-06, + "logits/chosen": -0.7023472785949707, + "logits/rejected": -0.533017635345459, + "logps/chosen": -0.16424649953842163, + "logps/rejected": -5.224195957183838, + "loss": 0.1543, + "odds_ratio_loss": 0.043630871921777725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016424652189016342, + "rewards/margins": 0.5059949159622192, + "rewards/rejected": -0.5224196314811707, + "sft_loss": 0.16424649953842163, + "step": 2116 + }, + { + "epoch": 3.061460592913955, + "grad_norm": 2.0513351448836104, + "learning_rate": 4.018635183128823e-06, + "logits/chosen": -0.6236501932144165, + "logits/rejected": -0.5361102819442749, + "logps/chosen": -0.20569291710853577, + "logps/rejected": -2.541423797607422, + "loss": 0.1819, + "odds_ratio_loss": 0.05294905602931976, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020569290965795517, + "rewards/margins": 0.23357310891151428, + "rewards/rejected": -0.2541424036026001, + "sft_loss": 0.20569291710853577, + "step": 2117 + }, + { + "epoch": 3.062906724511931, + "grad_norm": 2.1448700342273597, + "learning_rate": 4.015529336438973e-06, + "logits/chosen": -0.7315176129341125, + "logits/rejected": -0.540245532989502, + "logps/chosen": -0.14228178560733795, + "logps/rejected": -4.650500774383545, + "loss": 0.1486, + "odds_ratio_loss": 0.026424670591950417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014228180050849915, + "rewards/margins": 0.4508219063282013, + "rewards/rejected": -0.4650501012802124, + "sft_loss": 0.14228178560733795, + "step": 2118 + }, + { + "epoch": 3.064352856109906, + "grad_norm": 2.341561331050371, + "learning_rate": 4.012423480386426e-06, + "logits/chosen": -0.5403401851654053, + "logits/rejected": -0.4582579731941223, + "logps/chosen": -0.14413967728614807, + "logps/rejected": -4.726695537567139, + "loss": 0.1921, + "odds_ratio_loss": 0.02182953804731369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014413966797292233, + "rewards/margins": 0.4582555890083313, + "rewards/rejected": -0.4726695418357849, + "sft_loss": 0.14413967728614807, + "step": 2119 + }, + { + "epoch": 3.0657989877078813, + "grad_norm": 2.386821389201234, + "learning_rate": 4.009317616843718e-06, + "logits/chosen": -0.44133293628692627, + "logits/rejected": -0.4722680449485779, + "logps/chosen": -0.22620916366577148, + "logps/rejected": -2.8983755111694336, + "loss": 0.1771, + "odds_ratio_loss": 0.05633849278092384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02262091636657715, + "rewards/margins": 0.26721662282943726, + "rewards/rejected": -0.2898375391960144, + "sft_loss": 0.22620916366577148, + "step": 2120 + }, + { + "epoch": 3.067245119305857, + "grad_norm": 1.7305002096138642, + "learning_rate": 4.006211747683384e-06, + "logits/chosen": -0.6039251685142517, + "logits/rejected": -0.5303505063056946, + "logps/chosen": -0.13674521446228027, + "logps/rejected": -4.530514240264893, + "loss": 0.1803, + "odds_ratio_loss": 0.035152267664670944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013674520887434483, + "rewards/margins": 0.43937692046165466, + "rewards/rejected": -0.45305144786834717, + "sft_loss": 0.13674521446228027, + "step": 2121 + }, + { + "epoch": 3.068691250903832, + "grad_norm": 2.1291486853323405, + "learning_rate": 4.003105874777963e-06, + "logits/chosen": -0.7058529853820801, + "logits/rejected": -0.5664353966712952, + "logps/chosen": -0.09501180052757263, + "logps/rejected": -3.9581353664398193, + "loss": 0.1877, + "odds_ratio_loss": 0.022701166570186615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009501179680228233, + "rewards/margins": 0.386312335729599, + "rewards/rejected": -0.395813524723053, + "sft_loss": 0.09501180052757263, + "step": 2122 + }, + { + "epoch": 3.0701373825018075, + "grad_norm": 2.7097953576820624, + "learning_rate": 4e-06, + "logits/chosen": -0.7147266864776611, + "logits/rejected": -0.4768485426902771, + "logps/chosen": -0.1762448400259018, + "logps/rejected": -4.018145561218262, + "loss": 0.2214, + "odds_ratio_loss": 0.027249373495578766, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01762448623776436, + "rewards/margins": 0.38419008255004883, + "rewards/rejected": -0.4018145799636841, + "sft_loss": 0.1762448400259018, + "step": 2123 + }, + { + "epoch": 3.071583514099783, + "grad_norm": 1.8998144556018384, + "learning_rate": 3.996894125222036e-06, + "logits/chosen": -0.5158587694168091, + "logits/rejected": -0.4066459536552429, + "logps/chosen": -0.16753427684307098, + "logps/rejected": -4.010112762451172, + "loss": 0.1828, + "odds_ratio_loss": 0.04549555480480194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016753429546952248, + "rewards/margins": 0.38425785303115845, + "rewards/rejected": -0.40101125836372375, + "sft_loss": 0.16753427684307098, + "step": 2124 + }, + { + "epoch": 3.0730296456977584, + "grad_norm": 2.1176577092878133, + "learning_rate": 3.993788252316617e-06, + "logits/chosen": -0.6375981569290161, + "logits/rejected": -0.47454291582107544, + "logps/chosen": -0.1505729854106903, + "logps/rejected": -8.243040084838867, + "loss": 0.203, + "odds_ratio_loss": 0.012890107929706573, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01505729928612709, + "rewards/margins": 0.809246838092804, + "rewards/rejected": -0.8243041634559631, + "sft_loss": 0.1505729854106903, + "step": 2125 + }, + { + "epoch": 3.074475777295734, + "grad_norm": 2.222962148586192, + "learning_rate": 3.990682383156282e-06, + "logits/chosen": -0.620424211025238, + "logits/rejected": -0.4490058720111847, + "logps/chosen": -0.22270803153514862, + "logps/rejected": -4.201064586639404, + "loss": 0.2145, + "odds_ratio_loss": 0.05165166035294533, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022270802408456802, + "rewards/margins": 0.3978356719017029, + "rewards/rejected": -0.4201064705848694, + "sft_loss": 0.22270803153514862, + "step": 2126 + }, + { + "epoch": 3.0759219088937093, + "grad_norm": 2.4804554039420577, + "learning_rate": 3.987576519613574e-06, + "logits/chosen": -0.4530171751976013, + "logits/rejected": -0.412492036819458, + "logps/chosen": -0.2665051519870758, + "logps/rejected": -3.0253868103027344, + "loss": 0.2164, + "odds_ratio_loss": 0.061206281185150146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02665051445364952, + "rewards/margins": 0.2758881747722626, + "rewards/rejected": -0.3025386929512024, + "sft_loss": 0.2665051519870758, + "step": 2127 + }, + { + "epoch": 3.0773680404916846, + "grad_norm": 1.7493140822868425, + "learning_rate": 3.984470663561027e-06, + "logits/chosen": -0.6655663847923279, + "logits/rejected": -0.7546771764755249, + "logps/chosen": -0.2823176681995392, + "logps/rejected": -3.803299903869629, + "loss": 0.1604, + "odds_ratio_loss": 0.06763014197349548, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02823176607489586, + "rewards/margins": 0.3520982563495636, + "rewards/rejected": -0.38033002614974976, + "sft_loss": 0.2823176681995392, + "step": 2128 + }, + { + "epoch": 3.0788141720896602, + "grad_norm": 2.5403797596003166, + "learning_rate": 3.981364816871177e-06, + "logits/chosen": -0.7357568740844727, + "logits/rejected": -0.583640456199646, + "logps/chosen": -0.17165610194206238, + "logps/rejected": -3.8525633811950684, + "loss": 0.1617, + "odds_ratio_loss": 0.024731557816267014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017165610566735268, + "rewards/margins": 0.36809074878692627, + "rewards/rejected": -0.3852563500404358, + "sft_loss": 0.17165610194206238, + "step": 2129 + }, + { + "epoch": 3.0802603036876355, + "grad_norm": 2.054847911596039, + "learning_rate": 3.978258981416547e-06, + "logits/chosen": -0.7385162711143494, + "logits/rejected": -0.6317088603973389, + "logps/chosen": -0.14730429649353027, + "logps/rejected": -2.464529037475586, + "loss": 0.1315, + "odds_ratio_loss": 0.03632424771785736, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014730430208146572, + "rewards/margins": 0.23172245919704437, + "rewards/rejected": -0.24645289778709412, + "sft_loss": 0.14730429649353027, + "step": 2130 + }, + { + "epoch": 3.081706435285611, + "grad_norm": 2.177460049188998, + "learning_rate": 3.975153159069659e-06, + "logits/chosen": -0.4692709147930145, + "logits/rejected": -0.4788125157356262, + "logps/chosen": -0.13799342513084412, + "logps/rejected": -3.8743791580200195, + "loss": 0.1705, + "odds_ratio_loss": 0.03219764679670334, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013799343258142471, + "rewards/margins": 0.3736385703086853, + "rewards/rejected": -0.3874379098415375, + "sft_loss": 0.13799342513084412, + "step": 2131 + }, + { + "epoch": 3.0831525668835864, + "grad_norm": 2.2308389604931995, + "learning_rate": 3.972047351703023e-06, + "logits/chosen": -0.5637999773025513, + "logits/rejected": -0.46241044998168945, + "logps/chosen": -0.28073132038116455, + "logps/rejected": -5.222455978393555, + "loss": 0.1847, + "odds_ratio_loss": 0.04090370982885361, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028073132038116455, + "rewards/margins": 0.49417245388031006, + "rewards/rejected": -0.5222455859184265, + "sft_loss": 0.28073132038116455, + "step": 2132 + }, + { + "epoch": 3.0845986984815617, + "grad_norm": 2.065829840448526, + "learning_rate": 3.968941561189144e-06, + "logits/chosen": -0.6958283185958862, + "logits/rejected": -0.4899245500564575, + "logps/chosen": -0.1636473834514618, + "logps/rejected": -3.7653756141662598, + "loss": 0.2069, + "odds_ratio_loss": 0.033067747950553894, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01636473834514618, + "rewards/margins": 0.3601728081703186, + "rewards/rejected": -0.3765375316143036, + "sft_loss": 0.1636473834514618, + "step": 2133 + }, + { + "epoch": 3.0860448300795373, + "grad_norm": 1.876329075053545, + "learning_rate": 3.965835789400511e-06, + "logits/chosen": -0.563957691192627, + "logits/rejected": -0.5317133665084839, + "logps/chosen": -0.12031100690364838, + "logps/rejected": -3.236659526824951, + "loss": 0.1561, + "odds_ratio_loss": 0.027744002640247345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012031100690364838, + "rewards/margins": 0.3116348683834076, + "rewards/rejected": -0.323665976524353, + "sft_loss": 0.12031100690364838, + "step": 2134 + }, + { + "epoch": 3.0874909616775126, + "grad_norm": 2.0304470488543616, + "learning_rate": 3.96273003820961e-06, + "logits/chosen": -0.7444326877593994, + "logits/rejected": -0.5694934129714966, + "logps/chosen": -0.22145669162273407, + "logps/rejected": -3.7066285610198975, + "loss": 0.1865, + "odds_ratio_loss": 0.028277050703763962, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022145669907331467, + "rewards/margins": 0.34851717948913574, + "rewards/rejected": -0.3706628680229187, + "sft_loss": 0.22145669162273407, + "step": 2135 + }, + { + "epoch": 3.0889370932754883, + "grad_norm": 1.8981778171798918, + "learning_rate": 3.959624309488907e-06, + "logits/chosen": -0.614467978477478, + "logits/rejected": -0.3776948153972626, + "logps/chosen": -0.11771661043167114, + "logps/rejected": -6.75969123840332, + "loss": 0.1404, + "odds_ratio_loss": 0.01951497234404087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01177166122943163, + "rewards/margins": 0.6641974449157715, + "rewards/rejected": -0.675969123840332, + "sft_loss": 0.11771661043167114, + "step": 2136 + }, + { + "epoch": 3.0903832248734635, + "grad_norm": 2.1400342160750054, + "learning_rate": 3.956518605110858e-06, + "logits/chosen": -0.8642411828041077, + "logits/rejected": -0.6858229637145996, + "logps/chosen": -0.2012554109096527, + "logps/rejected": -2.901653528213501, + "loss": 0.1956, + "odds_ratio_loss": 0.03699169307947159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02012554183602333, + "rewards/margins": 0.27003979682922363, + "rewards/rejected": -0.29016533493995667, + "sft_loss": 0.2012554109096527, + "step": 2137 + }, + { + "epoch": 3.0918293564714388, + "grad_norm": 2.0394103728196638, + "learning_rate": 3.953412926947904e-06, + "logits/chosen": -0.5077266097068787, + "logits/rejected": -0.42272666096687317, + "logps/chosen": -0.10724125802516937, + "logps/rejected": -7.563508033752441, + "loss": 0.1693, + "odds_ratio_loss": 0.020378630608320236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010724126361310482, + "rewards/margins": 0.74562668800354, + "rewards/rejected": -0.7563507556915283, + "sft_loss": 0.10724125802516937, + "step": 2138 + }, + { + "epoch": 3.0932754880694144, + "grad_norm": 2.233590697823899, + "learning_rate": 3.95030727687247e-06, + "logits/chosen": -0.6356875896453857, + "logits/rejected": -0.4583815932273865, + "logps/chosen": -0.13725396990776062, + "logps/rejected": -3.551173686981201, + "loss": 0.1757, + "odds_ratio_loss": 0.030577151104807854, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013725396245718002, + "rewards/margins": 0.34139198064804077, + "rewards/rejected": -0.3551173806190491, + "sft_loss": 0.13725396990776062, + "step": 2139 + }, + { + "epoch": 3.0947216196673897, + "grad_norm": 1.998594502803306, + "learning_rate": 3.947201656756965e-06, + "logits/chosen": -0.5484251976013184, + "logits/rejected": -0.43344441056251526, + "logps/chosen": -0.20057189464569092, + "logps/rejected": -3.6383776664733887, + "loss": 0.1883, + "odds_ratio_loss": 0.04176495224237442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02005719020962715, + "rewards/margins": 0.3437805771827698, + "rewards/rejected": -0.3638377785682678, + "sft_loss": 0.20057189464569092, + "step": 2140 + }, + { + "epoch": 3.096167751265365, + "grad_norm": 2.0891032642185365, + "learning_rate": 3.944096068473776e-06, + "logits/chosen": -0.48677608370780945, + "logits/rejected": -0.405622273683548, + "logps/chosen": -0.3364259600639343, + "logps/rejected": -4.87321662902832, + "loss": 0.1989, + "odds_ratio_loss": 0.0456356480717659, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03364259749650955, + "rewards/margins": 0.4536791145801544, + "rewards/rejected": -0.487321674823761, + "sft_loss": 0.3364259600639343, + "step": 2141 + }, + { + "epoch": 3.0976138828633406, + "grad_norm": 2.3986916761530397, + "learning_rate": 3.940990513895277e-06, + "logits/chosen": -0.49167507886886597, + "logits/rejected": -0.2597613036632538, + "logps/chosen": -0.14950230717658997, + "logps/rejected": -4.943644046783447, + "loss": 0.1609, + "odds_ratio_loss": 0.02151937410235405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014950230717658997, + "rewards/margins": 0.47941410541534424, + "rewards/rejected": -0.4943643808364868, + "sft_loss": 0.14950230717658997, + "step": 2142 + }, + { + "epoch": 3.099060014461316, + "grad_norm": 2.3771417631729204, + "learning_rate": 3.937884994893815e-06, + "logits/chosen": -0.6202237606048584, + "logits/rejected": -0.34670570492744446, + "logps/chosen": -0.16555556654930115, + "logps/rejected": -3.282248020172119, + "loss": 0.1799, + "odds_ratio_loss": 0.0847080647945404, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.016555557027459145, + "rewards/margins": 0.3116692304611206, + "rewards/rejected": -0.328224778175354, + "sft_loss": 0.16555556654930115, + "step": 2143 + }, + { + "epoch": 3.1005061460592915, + "grad_norm": 2.044758701059538, + "learning_rate": 3.934779513341723e-06, + "logits/chosen": -0.5131956338882446, + "logits/rejected": -0.3095894753932953, + "logps/chosen": -0.2654573917388916, + "logps/rejected": -4.961881160736084, + "loss": 0.2241, + "odds_ratio_loss": 0.04515118524432182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02654574252665043, + "rewards/margins": 0.46964243054389954, + "rewards/rejected": -0.49618813395500183, + "sft_loss": 0.2654573917388916, + "step": 2144 + }, + { + "epoch": 3.1019522776572668, + "grad_norm": 1.7210001373799835, + "learning_rate": 3.931674071111304e-06, + "logits/chosen": -0.45441409945487976, + "logits/rejected": -0.40361326932907104, + "logps/chosen": -0.23514500260353088, + "logps/rejected": -3.1339406967163086, + "loss": 0.1595, + "odds_ratio_loss": 0.06016864255070686, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02351449988782406, + "rewards/margins": 0.28987962007522583, + "rewards/rejected": -0.31339409947395325, + "sft_loss": 0.23514500260353088, + "step": 2145 + }, + { + "epoch": 3.103398409255242, + "grad_norm": 2.024843444988982, + "learning_rate": 3.928568670074843e-06, + "logits/chosen": -0.4629301428794861, + "logits/rejected": -0.34551891684532166, + "logps/chosen": -0.13455036282539368, + "logps/rejected": -2.923142194747925, + "loss": 0.1627, + "odds_ratio_loss": 0.022695183753967285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013455037027597427, + "rewards/margins": 0.2788591980934143, + "rewards/rejected": -0.29231423139572144, + "sft_loss": 0.13455036282539368, + "step": 2146 + }, + { + "epoch": 3.1048445408532177, + "grad_norm": 2.2333254041840034, + "learning_rate": 3.925463312104596e-06, + "logits/chosen": -0.7243767380714417, + "logits/rejected": -0.3842761516571045, + "logps/chosen": -0.21467825770378113, + "logps/rejected": -4.240947723388672, + "loss": 0.2452, + "odds_ratio_loss": 0.04264179617166519, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021467823535203934, + "rewards/margins": 0.4026269316673279, + "rewards/rejected": -0.4240947365760803, + "sft_loss": 0.21467825770378113, + "step": 2147 + }, + { + "epoch": 3.106290672451193, + "grad_norm": 2.188334438515691, + "learning_rate": 3.922357999072796e-06, + "logits/chosen": -0.8195107579231262, + "logits/rejected": -0.521625816822052, + "logps/chosen": -0.24980032444000244, + "logps/rejected": -3.1828725337982178, + "loss": 0.2113, + "odds_ratio_loss": 0.021983999758958817, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024980032816529274, + "rewards/margins": 0.29330721497535706, + "rewards/rejected": -0.3182872533798218, + "sft_loss": 0.24980032444000244, + "step": 2148 + }, + { + "epoch": 3.1077368040491686, + "grad_norm": 2.3063303796163637, + "learning_rate": 3.9192527328516475e-06, + "logits/chosen": -0.4675880968570709, + "logits/rejected": -0.32635053992271423, + "logps/chosen": -0.28106504678726196, + "logps/rejected": -3.9863810539245605, + "loss": 0.2463, + "odds_ratio_loss": 0.06380581855773926, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028106503188610077, + "rewards/margins": 0.3705316185951233, + "rewards/rejected": -0.39863812923431396, + "sft_loss": 0.28106504678726196, + "step": 2149 + }, + { + "epoch": 3.109182935647144, + "grad_norm": 3.585902690061833, + "learning_rate": 3.916147515313326e-06, + "logits/chosen": -0.7061706781387329, + "logits/rejected": -0.5766094326972961, + "logps/chosen": -0.2508338689804077, + "logps/rejected": -3.741215229034424, + "loss": 0.2621, + "odds_ratio_loss": 0.05887717008590698, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025083385407924652, + "rewards/margins": 0.34903812408447266, + "rewards/rejected": -0.3741214871406555, + "sft_loss": 0.2508338689804077, + "step": 2150 + }, + { + "epoch": 3.110629067245119, + "grad_norm": 2.042487934584991, + "learning_rate": 3.9130423483299815e-06, + "logits/chosen": -0.5965969562530518, + "logits/rejected": -0.5227322578430176, + "logps/chosen": -0.1644129902124405, + "logps/rejected": -3.138468027114868, + "loss": 0.1707, + "odds_ratio_loss": 0.04252278432250023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01644129864871502, + "rewards/margins": 0.29740551114082336, + "rewards/rejected": -0.31384679675102234, + "sft_loss": 0.1644129902124405, + "step": 2151 + }, + { + "epoch": 3.112075198843095, + "grad_norm": 2.1188301952171513, + "learning_rate": 3.9099372337737285e-06, + "logits/chosen": -0.6012832522392273, + "logits/rejected": -0.49223455786705017, + "logps/chosen": -0.17560946941375732, + "logps/rejected": -4.076479434967041, + "loss": 0.1919, + "odds_ratio_loss": 0.03781703859567642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017560947686433792, + "rewards/margins": 0.39008697867393494, + "rewards/rejected": -0.407647967338562, + "sft_loss": 0.17560946941375732, + "step": 2152 + }, + { + "epoch": 3.11352133044107, + "grad_norm": 2.6657369394288564, + "learning_rate": 3.906832173516653e-06, + "logits/chosen": -0.4916571378707886, + "logits/rejected": -0.41273894906044006, + "logps/chosen": -0.07565654069185257, + "logps/rejected": -5.079089641571045, + "loss": 0.1556, + "odds_ratio_loss": 0.011491803452372551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007565653882920742, + "rewards/margins": 0.5003433227539062, + "rewards/rejected": -0.5079089403152466, + "sft_loss": 0.07565654069185257, + "step": 2153 + }, + { + "epoch": 3.1149674620390457, + "grad_norm": 2.238773018686905, + "learning_rate": 3.903727169430806e-06, + "logits/chosen": -0.5660860538482666, + "logits/rejected": -0.35988539457321167, + "logps/chosen": -0.11718136072158813, + "logps/rejected": -3.5155582427978516, + "loss": 0.1476, + "odds_ratio_loss": 0.03395921364426613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011718135327100754, + "rewards/margins": 0.3398377001285553, + "rewards/rejected": -0.35155582427978516, + "sft_loss": 0.11718136072158813, + "step": 2154 + }, + { + "epoch": 3.116413593637021, + "grad_norm": 2.006913186733354, + "learning_rate": 3.900622223388209e-06, + "logits/chosen": -0.5774864554405212, + "logits/rejected": -0.4433661997318268, + "logps/chosen": -0.14877638220787048, + "logps/rejected": -5.834985256195068, + "loss": 0.2236, + "odds_ratio_loss": 0.0120998565107584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014877637848258018, + "rewards/margins": 0.5686209201812744, + "rewards/rejected": -0.583498477935791, + "sft_loss": 0.14877638220787048, + "step": 2155 + }, + { + "epoch": 3.117859725234996, + "grad_norm": 4.473257009520163, + "learning_rate": 3.897517337260842e-06, + "logits/chosen": -0.580504834651947, + "logits/rejected": -0.3841407299041748, + "logps/chosen": -0.2404867708683014, + "logps/rejected": -3.8186872005462646, + "loss": 0.2396, + "odds_ratio_loss": 0.03407532721757889, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02404867857694626, + "rewards/margins": 0.3578200340270996, + "rewards/rejected": -0.38186874985694885, + "sft_loss": 0.2404867708683014, + "step": 2156 + }, + { + "epoch": 3.119305856832972, + "grad_norm": 2.1972664434284366, + "learning_rate": 3.894412512920655e-06, + "logits/chosen": -0.575932502746582, + "logits/rejected": -0.593704104423523, + "logps/chosen": -0.20279648900032043, + "logps/rejected": -4.450626373291016, + "loss": 0.1834, + "odds_ratio_loss": 0.06688258051872253, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.020279649645090103, + "rewards/margins": 0.42478299140930176, + "rewards/rejected": -0.44506266713142395, + "sft_loss": 0.20279648900032043, + "step": 2157 + }, + { + "epoch": 3.120751988430947, + "grad_norm": 2.7066013861559677, + "learning_rate": 3.891307752239556e-06, + "logits/chosen": -0.5452953577041626, + "logits/rejected": -0.4585101008415222, + "logps/chosen": -0.1771378219127655, + "logps/rejected": -2.6734554767608643, + "loss": 0.2138, + "odds_ratio_loss": 0.050962068140506744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01771378144621849, + "rewards/margins": 0.24963179230690002, + "rewards/rejected": -0.2673455774784088, + "sft_loss": 0.1771378219127655, + "step": 2158 + }, + { + "epoch": 3.122198120028923, + "grad_norm": 2.211874916353963, + "learning_rate": 3.8882030570894194e-06, + "logits/chosen": -0.5663500428199768, + "logits/rejected": -0.4593850076198578, + "logps/chosen": -0.09784838557243347, + "logps/rejected": -2.9985287189483643, + "loss": 0.1453, + "odds_ratio_loss": 0.026956774294376373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009784838184714317, + "rewards/margins": 0.29006803035736084, + "rewards/rejected": -0.2998528778553009, + "sft_loss": 0.09784838557243347, + "step": 2159 + }, + { + "epoch": 3.123644251626898, + "grad_norm": 1.8720211085043381, + "learning_rate": 3.8850984293420755e-06, + "logits/chosen": -0.8111594319343567, + "logits/rejected": -0.42723047733306885, + "logps/chosen": -0.1713923215866089, + "logps/rejected": -2.9202356338500977, + "loss": 0.1496, + "odds_ratio_loss": 0.0293788630515337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017139233648777008, + "rewards/margins": 0.27488431334495544, + "rewards/rejected": -0.29202353954315186, + "sft_loss": 0.1713923215866089, + "step": 2160 + }, + { + "epoch": 3.1250903832248733, + "grad_norm": 2.1561396202024423, + "learning_rate": 3.881993870869317e-06, + "logits/chosen": -0.4672960638999939, + "logits/rejected": -0.23019523918628693, + "logps/chosen": -0.2502592206001282, + "logps/rejected": -3.436143398284912, + "loss": 0.2187, + "odds_ratio_loss": 0.04205578565597534, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025025920942425728, + "rewards/margins": 0.31858840584754944, + "rewards/rejected": -0.3436143398284912, + "sft_loss": 0.2502592206001282, + "step": 2161 + }, + { + "epoch": 3.126536514822849, + "grad_norm": 1.7497644379347141, + "learning_rate": 3.878889383542892e-06, + "logits/chosen": -0.5571482181549072, + "logits/rejected": -0.41588735580444336, + "logps/chosen": -0.18334656953811646, + "logps/rejected": -3.0952415466308594, + "loss": 0.1189, + "odds_ratio_loss": 0.03541785106062889, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018334658816456795, + "rewards/margins": 0.2911895215511322, + "rewards/rejected": -0.30952417850494385, + "sft_loss": 0.18334656953811646, + "step": 2162 + }, + { + "epoch": 3.1279826464208242, + "grad_norm": 2.2392104542666784, + "learning_rate": 3.87578496923451e-06, + "logits/chosen": -0.4766693413257599, + "logits/rejected": -0.40024662017822266, + "logps/chosen": -0.2564614415168762, + "logps/rejected": -3.4361166954040527, + "loss": 0.2093, + "odds_ratio_loss": 0.05638567730784416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025646142661571503, + "rewards/margins": 0.3179655075073242, + "rewards/rejected": -0.3436116576194763, + "sft_loss": 0.2564614415168762, + "step": 2163 + }, + { + "epoch": 3.1294287780187995, + "grad_norm": 2.1439404626315546, + "learning_rate": 3.872680629815832e-06, + "logits/chosen": -0.5200873613357544, + "logits/rejected": -0.4677450358867645, + "logps/chosen": -0.2646474242210388, + "logps/rejected": -4.130258560180664, + "loss": 0.1668, + "odds_ratio_loss": 0.07118930667638779, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026464741677045822, + "rewards/margins": 0.3865610957145691, + "rewards/rejected": -0.4130258858203888, + "sft_loss": 0.2646474242210388, + "step": 2164 + }, + { + "epoch": 3.130874909616775, + "grad_norm": 2.3317828336888975, + "learning_rate": 3.869576367158475e-06, + "logits/chosen": -0.4354207515716553, + "logits/rejected": -0.33270853757858276, + "logps/chosen": -0.1295112818479538, + "logps/rejected": -5.896690368652344, + "loss": 0.1458, + "odds_ratio_loss": 0.016541190445423126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01295112818479538, + "rewards/margins": 0.5767179131507874, + "rewards/rejected": -0.5896689891815186, + "sft_loss": 0.1295112818479538, + "step": 2165 + }, + { + "epoch": 3.1323210412147504, + "grad_norm": 2.0183818962715514, + "learning_rate": 3.866472183134015e-06, + "logits/chosen": -0.314644455909729, + "logits/rejected": -0.21877217292785645, + "logps/chosen": -0.2322487086057663, + "logps/rejected": -4.360037326812744, + "loss": 0.1972, + "odds_ratio_loss": 0.0481003001332283, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02322487160563469, + "rewards/margins": 0.4127788841724396, + "rewards/rejected": -0.43600374460220337, + "sft_loss": 0.2322487086057663, + "step": 2166 + }, + { + "epoch": 3.133767172812726, + "grad_norm": 2.219897757568939, + "learning_rate": 3.863368079613971e-06, + "logits/chosen": -0.5187494158744812, + "logits/rejected": -0.37773463129997253, + "logps/chosen": -0.12917321920394897, + "logps/rejected": -3.245424270629883, + "loss": 0.201, + "odds_ratio_loss": 0.03227677941322327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012917323037981987, + "rewards/margins": 0.3116251230239868, + "rewards/rejected": -0.32454243302345276, + "sft_loss": 0.12917321920394897, + "step": 2167 + }, + { + "epoch": 3.1352133044107013, + "grad_norm": 2.1556321676285393, + "learning_rate": 3.860264058469822e-06, + "logits/chosen": -0.6454533934593201, + "logits/rejected": -0.40207356214523315, + "logps/chosen": -0.19393956661224365, + "logps/rejected": -2.3810601234436035, + "loss": 0.1878, + "odds_ratio_loss": 0.057800304144620895, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019393956288695335, + "rewards/margins": 0.21871204674243927, + "rewards/rejected": -0.23810601234436035, + "sft_loss": 0.19393956661224365, + "step": 2168 + }, + { + "epoch": 3.1366594360086766, + "grad_norm": 2.465289843971513, + "learning_rate": 3.8571601215729904e-06, + "logits/chosen": -0.41532689332962036, + "logits/rejected": -0.3971560299396515, + "logps/chosen": -0.14556801319122314, + "logps/rejected": -3.225522994995117, + "loss": 0.1605, + "odds_ratio_loss": 0.03414017707109451, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01455680187791586, + "rewards/margins": 0.3079955279827118, + "rewards/rejected": -0.32255232334136963, + "sft_loss": 0.14556801319122314, + "step": 2169 + }, + { + "epoch": 3.1381055676066523, + "grad_norm": 2.213239859679258, + "learning_rate": 3.854056270794856e-06, + "logits/chosen": -0.7330466508865356, + "logits/rejected": -0.3163624405860901, + "logps/chosen": -0.13772007822990417, + "logps/rejected": -6.967245578765869, + "loss": 0.1757, + "odds_ratio_loss": 0.014283371157944202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013772009871900082, + "rewards/margins": 0.6829525232315063, + "rewards/rejected": -0.6967245936393738, + "sft_loss": 0.13772007822990417, + "step": 2170 + }, + { + "epoch": 3.1395516992046275, + "grad_norm": 2.1787848582083895, + "learning_rate": 3.8509525080067375e-06, + "logits/chosen": -0.6489847898483276, + "logits/rejected": -0.36521393060684204, + "logps/chosen": -0.20706214010715485, + "logps/rejected": -3.5721209049224854, + "loss": 0.2048, + "odds_ratio_loss": 0.017135875299572945, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020706214010715485, + "rewards/margins": 0.3365058898925781, + "rewards/rejected": -0.3572120666503906, + "sft_loss": 0.20706214010715485, + "step": 2171 + }, + { + "epoch": 3.140997830802603, + "grad_norm": 1.98445645583722, + "learning_rate": 3.847848835079909e-06, + "logits/chosen": -0.770344614982605, + "logits/rejected": -0.5676401257514954, + "logps/chosen": -0.182399719953537, + "logps/rejected": -3.97198224067688, + "loss": 0.1593, + "odds_ratio_loss": 0.0314740389585495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01823997311294079, + "rewards/margins": 0.3789582848548889, + "rewards/rejected": -0.39719823002815247, + "sft_loss": 0.182399719953537, + "step": 2172 + }, + { + "epoch": 3.1424439624005784, + "grad_norm": 1.985445789793339, + "learning_rate": 3.8447452538855835e-06, + "logits/chosen": -0.46158552169799805, + "logits/rejected": -0.4430912435054779, + "logps/chosen": -0.2738022804260254, + "logps/rejected": -4.37603235244751, + "loss": 0.2017, + "odds_ratio_loss": 0.10020264238119125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02738022990524769, + "rewards/margins": 0.41022300720214844, + "rewards/rejected": -0.437603235244751, + "sft_loss": 0.2738022804260254, + "step": 2173 + }, + { + "epoch": 3.1438900939985537, + "grad_norm": 2.371867630391579, + "learning_rate": 3.841641766294923e-06, + "logits/chosen": -0.42799311876296997, + "logits/rejected": -0.388339102268219, + "logps/chosen": -0.14372651278972626, + "logps/rejected": -3.0899109840393066, + "loss": 0.1701, + "odds_ratio_loss": 0.03828097879886627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01437265146523714, + "rewards/margins": 0.2946184277534485, + "rewards/rejected": -0.30899107456207275, + "sft_loss": 0.14372651278972626, + "step": 2174 + }, + { + "epoch": 3.1453362255965294, + "grad_norm": 2.0472933581207897, + "learning_rate": 3.83853837417903e-06, + "logits/chosen": -0.6643333435058594, + "logits/rejected": -0.5412406921386719, + "logps/chosen": -0.2567393481731415, + "logps/rejected": -3.749028205871582, + "loss": 0.2037, + "odds_ratio_loss": 0.0705362930893898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02567393332719803, + "rewards/margins": 0.3492288887500763, + "rewards/rejected": -0.3749028444290161, + "sft_loss": 0.2567393481731415, + "step": 2175 + }, + { + "epoch": 3.1467823571945046, + "grad_norm": 2.147926415455977, + "learning_rate": 3.835435079408954e-06, + "logits/chosen": -0.6184093952178955, + "logits/rejected": -0.4594188630580902, + "logps/chosen": -0.24748066067695618, + "logps/rejected": -2.9573397636413574, + "loss": 0.1725, + "odds_ratio_loss": 0.03922613710165024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024748066440224648, + "rewards/margins": 0.2709859311580658, + "rewards/rejected": -0.2957339882850647, + "sft_loss": 0.24748066067695618, + "step": 2176 + }, + { + "epoch": 3.1482284887924803, + "grad_norm": 2.176903500984134, + "learning_rate": 3.8323318838556814e-06, + "logits/chosen": -0.8568693995475769, + "logits/rejected": -0.6487101316452026, + "logps/chosen": -0.1801314353942871, + "logps/rejected": -4.282716274261475, + "loss": 0.1795, + "odds_ratio_loss": 0.044610489159822464, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01801314391195774, + "rewards/margins": 0.4102584719657898, + "rewards/rejected": -0.4282715916633606, + "sft_loss": 0.1801314353942871, + "step": 2177 + }, + { + "epoch": 3.1496746203904555, + "grad_norm": 2.036253753343565, + "learning_rate": 3.829228789390139e-06, + "logits/chosen": -0.558239758014679, + "logits/rejected": -0.45034515857696533, + "logps/chosen": -0.2566141188144684, + "logps/rejected": -3.356292724609375, + "loss": 0.2143, + "odds_ratio_loss": 0.047206830233335495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025661412626504898, + "rewards/margins": 0.30996787548065186, + "rewards/rejected": -0.33562928438186646, + "sft_loss": 0.2566141188144684, + "step": 2178 + }, + { + "epoch": 3.151120751988431, + "grad_norm": 2.1664134746621246, + "learning_rate": 3.826125797883197e-06, + "logits/chosen": -0.43020087480545044, + "logits/rejected": -0.43662816286087036, + "logps/chosen": -0.2651152014732361, + "logps/rejected": -3.1269278526306152, + "loss": 0.2053, + "odds_ratio_loss": 0.06396672129631042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02651152014732361, + "rewards/margins": 0.28618124127388, + "rewards/rejected": -0.312692791223526, + "sft_loss": 0.2651152014732361, + "step": 2179 + }, + { + "epoch": 3.1525668835864065, + "grad_norm": 1.9887411533498236, + "learning_rate": 3.823022911205659e-06, + "logits/chosen": -0.6099244356155396, + "logits/rejected": -0.35586732625961304, + "logps/chosen": -0.11266939342021942, + "logps/rejected": -4.009099006652832, + "loss": 0.2009, + "odds_ratio_loss": 0.02088777907192707, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011266940273344517, + "rewards/margins": 0.38964295387268066, + "rewards/rejected": -0.4009098708629608, + "sft_loss": 0.11266939342021942, + "step": 2180 + }, + { + "epoch": 3.1540130151843817, + "grad_norm": 2.186223351643586, + "learning_rate": 3.819920131228268e-06, + "logits/chosen": -0.6756264567375183, + "logits/rejected": -0.5853908061981201, + "logps/chosen": -0.15891686081886292, + "logps/rejected": -4.220728874206543, + "loss": 0.1641, + "odds_ratio_loss": 0.027598343789577484, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01589168794453144, + "rewards/margins": 0.4061812162399292, + "rewards/rejected": -0.4220728874206543, + "sft_loss": 0.15891686081886292, + "step": 2181 + }, + { + "epoch": 3.1554591467823574, + "grad_norm": 2.2133671954217213, + "learning_rate": 3.816817459821698e-06, + "logits/chosen": -0.46562516689300537, + "logits/rejected": -0.4322385787963867, + "logps/chosen": -0.13615188002586365, + "logps/rejected": -3.9099514484405518, + "loss": 0.1749, + "odds_ratio_loss": 0.03285899758338928, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01361518818885088, + "rewards/margins": 0.3773799538612366, + "rewards/rejected": -0.3909951448440552, + "sft_loss": 0.13615188002586365, + "step": 2182 + }, + { + "epoch": 3.1569052783803326, + "grad_norm": 2.176409976005723, + "learning_rate": 3.8137148988565655e-06, + "logits/chosen": -0.7380187511444092, + "logits/rejected": -0.6416623592376709, + "logps/chosen": -0.27821779251098633, + "logps/rejected": -3.296513557434082, + "loss": 0.2334, + "odds_ratio_loss": 0.02916991338133812, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027821781113743782, + "rewards/margins": 0.30182960629463196, + "rewards/rejected": -0.329651415348053, + "sft_loss": 0.27821779251098633, + "step": 2183 + }, + { + "epoch": 3.158351409978308, + "grad_norm": 2.0522969283962413, + "learning_rate": 3.8106124502034133e-06, + "logits/chosen": -0.8222931027412415, + "logits/rejected": -0.7903980016708374, + "logps/chosen": -0.1292247623205185, + "logps/rejected": -3.884793758392334, + "loss": 0.1517, + "odds_ratio_loss": 0.024996642023324966, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012922476977109909, + "rewards/margins": 0.3755568861961365, + "rewards/rejected": -0.3884793519973755, + "sft_loss": 0.1292247623205185, + "step": 2184 + }, + { + "epoch": 3.1597975415762836, + "grad_norm": 2.261195748283917, + "learning_rate": 3.8075101157327215e-06, + "logits/chosen": -0.5183153748512268, + "logits/rejected": -0.4637899398803711, + "logps/chosen": -0.1162676066160202, + "logps/rejected": -3.6541671752929688, + "loss": 0.1853, + "odds_ratio_loss": 0.023313432931900024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01162676140666008, + "rewards/margins": 0.35378995537757874, + "rewards/rejected": -0.3654167056083679, + "sft_loss": 0.1162676066160202, + "step": 2185 + }, + { + "epoch": 3.161243673174259, + "grad_norm": 2.2397538577382146, + "learning_rate": 3.8044078973148965e-06, + "logits/chosen": -0.6720691919326782, + "logits/rejected": -0.5420901775360107, + "logps/chosen": -0.22410547733306885, + "logps/rejected": -2.626776695251465, + "loss": 0.2293, + "odds_ratio_loss": 0.04131586477160454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022410545498132706, + "rewards/margins": 0.2402670979499817, + "rewards/rejected": -0.2626776695251465, + "sft_loss": 0.22410547733306885, + "step": 2186 + }, + { + "epoch": 3.162689804772234, + "grad_norm": 1.9970829869530207, + "learning_rate": 3.8013057968202796e-06, + "logits/chosen": -0.7500214576721191, + "logits/rejected": -0.617428183555603, + "logps/chosen": -0.17513474822044373, + "logps/rejected": -3.5836827754974365, + "loss": 0.2036, + "odds_ratio_loss": 0.04269365221261978, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017513476312160492, + "rewards/margins": 0.34085479378700256, + "rewards/rejected": -0.35836827754974365, + "sft_loss": 0.17513474822044373, + "step": 2187 + }, + { + "epoch": 3.1641359363702097, + "grad_norm": 1.9591010980917978, + "learning_rate": 3.7982038161191375e-06, + "logits/chosen": -0.6849007606506348, + "logits/rejected": -0.5874512195587158, + "logps/chosen": -0.17351099848747253, + "logps/rejected": -3.3261361122131348, + "loss": 0.1952, + "odds_ratio_loss": 0.05133747309446335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017351102083921432, + "rewards/margins": 0.3152625262737274, + "rewards/rejected": -0.33261361718177795, + "sft_loss": 0.17351099848747253, + "step": 2188 + }, + { + "epoch": 3.165582067968185, + "grad_norm": 2.1623811095435883, + "learning_rate": 3.7951019570816664e-06, + "logits/chosen": -0.6917041540145874, + "logits/rejected": -0.6428658962249756, + "logps/chosen": -0.1369011104106903, + "logps/rejected": -4.486429691314697, + "loss": 0.1815, + "odds_ratio_loss": 0.02341514080762863, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013690110296010971, + "rewards/margins": 0.43495285511016846, + "rewards/rejected": -0.4486429989337921, + "sft_loss": 0.1369011104106903, + "step": 2189 + }, + { + "epoch": 3.1670281995661607, + "grad_norm": 2.203850860590357, + "learning_rate": 3.7920002215779875e-06, + "logits/chosen": -0.7099331617355347, + "logits/rejected": -0.63243567943573, + "logps/chosen": -0.1877567023038864, + "logps/rejected": -3.5332705974578857, + "loss": 0.1762, + "odds_ratio_loss": 0.04335636645555496, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01877567172050476, + "rewards/margins": 0.3345514237880707, + "rewards/rejected": -0.3533271253108978, + "sft_loss": 0.1877567023038864, + "step": 2190 + }, + { + "epoch": 3.168474331164136, + "grad_norm": 2.0629683680334083, + "learning_rate": 3.78889861147815e-06, + "logits/chosen": -0.5730685591697693, + "logits/rejected": -0.31942451000213623, + "logps/chosen": -0.11757153272628784, + "logps/rejected": -4.100497722625732, + "loss": 0.206, + "odds_ratio_loss": 0.009712357074022293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011757154017686844, + "rewards/margins": 0.398292601108551, + "rewards/rejected": -0.41004979610443115, + "sft_loss": 0.11757153272628784, + "step": 2191 + }, + { + "epoch": 3.169920462762111, + "grad_norm": 6.148173254503388, + "learning_rate": 3.7857971286521273e-06, + "logits/chosen": -0.6861517429351807, + "logits/rejected": -0.5459161996841431, + "logps/chosen": -0.1798088550567627, + "logps/rejected": -4.008049964904785, + "loss": 0.2586, + "odds_ratio_loss": 0.03384054824709892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01798088476061821, + "rewards/margins": 0.3828241229057312, + "rewards/rejected": -0.4008050262928009, + "sft_loss": 0.1798088550567627, + "step": 2192 + }, + { + "epoch": 3.171366594360087, + "grad_norm": 1.981916691678493, + "learning_rate": 3.782695774969811e-06, + "logits/chosen": -0.47907698154449463, + "logits/rejected": -0.45603060722351074, + "logps/chosen": -0.2008480578660965, + "logps/rejected": -2.513578414916992, + "loss": 0.191, + "odds_ratio_loss": 0.07965853065252304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0200848076492548, + "rewards/margins": 0.23127305507659912, + "rewards/rejected": -0.2513578534126282, + "sft_loss": 0.2008480578660965, + "step": 2193 + }, + { + "epoch": 3.172812725958062, + "grad_norm": 2.209839340924135, + "learning_rate": 3.7795945523010236e-06, + "logits/chosen": -0.4900778532028198, + "logits/rejected": -0.45295250415802, + "logps/chosen": -0.09929634630680084, + "logps/rejected": -4.984482765197754, + "loss": 0.1897, + "odds_ratio_loss": 0.025094415992498398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009929634630680084, + "rewards/margins": 0.48851868510246277, + "rewards/rejected": -0.49844831228256226, + "sft_loss": 0.09929634630680084, + "step": 2194 + }, + { + "epoch": 3.1742588575560378, + "grad_norm": 2.304743853675743, + "learning_rate": 3.776493462515501e-06, + "logits/chosen": -0.6225374341011047, + "logits/rejected": -0.44118446111679077, + "logps/chosen": -0.171739399433136, + "logps/rejected": -3.954901695251465, + "loss": 0.1846, + "odds_ratio_loss": 0.03998706489801407, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01717394031584263, + "rewards/margins": 0.3783162534236908, + "rewards/rejected": -0.3954901695251465, + "sft_loss": 0.171739399433136, + "step": 2195 + }, + { + "epoch": 3.175704989154013, + "grad_norm": 1.7701915623752982, + "learning_rate": 3.7733925074829026e-06, + "logits/chosen": -0.7270638942718506, + "logits/rejected": -0.5552908182144165, + "logps/chosen": -0.18401703238487244, + "logps/rejected": -3.406104564666748, + "loss": 0.2344, + "odds_ratio_loss": 0.031469061970710754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018401702865958214, + "rewards/margins": 0.3222087323665619, + "rewards/rejected": -0.34061044454574585, + "sft_loss": 0.18401703238487244, + "step": 2196 + }, + { + "epoch": 3.1771511207519882, + "grad_norm": 2.059233864845376, + "learning_rate": 3.7702916890728037e-06, + "logits/chosen": -0.5447147488594055, + "logits/rejected": -0.45716631412506104, + "logps/chosen": -0.18555516004562378, + "logps/rejected": -4.597956657409668, + "loss": 0.1853, + "odds_ratio_loss": 0.026829298585653305, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01855551451444626, + "rewards/margins": 0.44124019145965576, + "rewards/rejected": -0.45979568362236023, + "sft_loss": 0.18555516004562378, + "step": 2197 + }, + { + "epoch": 3.178597252349964, + "grad_norm": 1.807317879130678, + "learning_rate": 3.767191009154703e-06, + "logits/chosen": -0.5492076873779297, + "logits/rejected": -0.32834699749946594, + "logps/chosen": -0.24237895011901855, + "logps/rejected": -4.1069793701171875, + "loss": 0.1525, + "odds_ratio_loss": 0.025217795744538307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024237895384430885, + "rewards/margins": 0.3864600956439972, + "rewards/rejected": -0.41069793701171875, + "sft_loss": 0.24237895011901855, + "step": 2198 + }, + { + "epoch": 3.180043383947939, + "grad_norm": 1.9245969065946, + "learning_rate": 3.764090469598009e-06, + "logits/chosen": -0.6140974164009094, + "logits/rejected": -0.5183427333831787, + "logps/chosen": -0.12481319904327393, + "logps/rejected": -4.698176383972168, + "loss": 0.1857, + "odds_ratio_loss": 0.03451571986079216, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012481319718062878, + "rewards/margins": 0.45733633637428284, + "rewards/rejected": -0.4698176681995392, + "sft_loss": 0.12481319904327393, + "step": 2199 + }, + { + "epoch": 3.181489515545915, + "grad_norm": 2.191252165583729, + "learning_rate": 3.76099007227205e-06, + "logits/chosen": -0.791283130645752, + "logits/rejected": -0.5891169309616089, + "logps/chosen": -0.11535245925188065, + "logps/rejected": -3.5455451011657715, + "loss": 0.1683, + "odds_ratio_loss": 0.01490130927413702, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01153524499386549, + "rewards/margins": 0.3430192470550537, + "rewards/rejected": -0.35455450415611267, + "sft_loss": 0.11535245925188065, + "step": 2200 + }, + { + "epoch": 3.18293564714389, + "grad_norm": 2.268876201951002, + "learning_rate": 3.757889819046065e-06, + "logits/chosen": -0.6616383790969849, + "logits/rejected": -0.5038686990737915, + "logps/chosen": -0.21815867722034454, + "logps/rejected": -3.9103035926818848, + "loss": 0.1883, + "odds_ratio_loss": 0.03401227667927742, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021815868094563484, + "rewards/margins": 0.3692144751548767, + "rewards/rejected": -0.39103034138679504, + "sft_loss": 0.21815867722034454, + "step": 2201 + }, + { + "epoch": 3.1843817787418653, + "grad_norm": 2.4282555689676, + "learning_rate": 3.754789711789212e-06, + "logits/chosen": -0.5337816476821899, + "logits/rejected": -0.46093809604644775, + "logps/chosen": -0.1983119249343872, + "logps/rejected": -4.407778739929199, + "loss": 0.245, + "odds_ratio_loss": 0.02094094827771187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01983119361102581, + "rewards/margins": 0.4209466576576233, + "rewards/rejected": -0.44077789783477783, + "sft_loss": 0.1983119249343872, + "step": 2202 + }, + { + "epoch": 3.185827910339841, + "grad_norm": 2.326999903416427, + "learning_rate": 3.7516897523705537e-06, + "logits/chosen": -0.5480571985244751, + "logits/rejected": -0.4346959590911865, + "logps/chosen": -0.14086556434631348, + "logps/rejected": -2.17596435546875, + "loss": 0.1529, + "odds_ratio_loss": 0.029783004894852638, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014086555689573288, + "rewards/margins": 0.2035098671913147, + "rewards/rejected": -0.2175964117050171, + "sft_loss": 0.14086556434631348, + "step": 2203 + }, + { + "epoch": 3.1872740419378163, + "grad_norm": 2.949948093264566, + "learning_rate": 3.7485899426590676e-06, + "logits/chosen": -0.604040265083313, + "logits/rejected": -0.5441180467605591, + "logps/chosen": -0.2571631669998169, + "logps/rejected": -4.050754070281982, + "loss": 0.2018, + "odds_ratio_loss": 0.07504919171333313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025716319680213928, + "rewards/margins": 0.37935909628868103, + "rewards/rejected": -0.4050753712654114, + "sft_loss": 0.2571631669998169, + "step": 2204 + }, + { + "epoch": 3.188720173535792, + "grad_norm": 2.1085036743628938, + "learning_rate": 3.7454902845236433e-06, + "logits/chosen": -0.8290140628814697, + "logits/rejected": -0.529233992099762, + "logps/chosen": -0.16382518410682678, + "logps/rejected": -4.446649551391602, + "loss": 0.1828, + "odds_ratio_loss": 0.025827059522271156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016382519155740738, + "rewards/margins": 0.4282824993133545, + "rewards/rejected": -0.44466501474380493, + "sft_loss": 0.16382518410682678, + "step": 2205 + }, + { + "epoch": 3.190166305133767, + "grad_norm": 5.203311282071658, + "learning_rate": 3.7423907798330735e-06, + "logits/chosen": -0.5849148631095886, + "logits/rejected": -0.44293212890625, + "logps/chosen": -0.2151346355676651, + "logps/rejected": -4.317756652832031, + "loss": 0.1996, + "odds_ratio_loss": 0.04586172476410866, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02151346392929554, + "rewards/margins": 0.41026225686073303, + "rewards/rejected": -0.43177568912506104, + "sft_loss": 0.2151346355676651, + "step": 2206 + }, + { + "epoch": 3.1916124367317424, + "grad_norm": 2.3996402747625014, + "learning_rate": 3.739291430456063e-06, + "logits/chosen": -0.5927102565765381, + "logits/rejected": -0.48668432235717773, + "logps/chosen": -0.16127264499664307, + "logps/rejected": -4.136725902557373, + "loss": 0.1826, + "odds_ratio_loss": 0.050145432353019714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016127265989780426, + "rewards/margins": 0.39754533767700195, + "rewards/rejected": -0.4136725664138794, + "sft_loss": 0.16127264499664307, + "step": 2207 + }, + { + "epoch": 3.193058568329718, + "grad_norm": 2.331118390165532, + "learning_rate": 3.736192238261218e-06, + "logits/chosen": -0.4932352602481842, + "logits/rejected": -0.30938223004341125, + "logps/chosen": -0.11432990431785583, + "logps/rejected": -3.635563850402832, + "loss": 0.2564, + "odds_ratio_loss": 0.034632958471775055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011432991363108158, + "rewards/margins": 0.3521234095096588, + "rewards/rejected": -0.3635564148426056, + "sft_loss": 0.11432990431785583, + "step": 2208 + }, + { + "epoch": 3.1945046999276934, + "grad_norm": 2.0566819085700123, + "learning_rate": 3.733093205117057e-06, + "logits/chosen": -0.5601751208305359, + "logits/rejected": -0.3214784562587738, + "logps/chosen": -0.11301355063915253, + "logps/rejected": -5.065844535827637, + "loss": 0.1695, + "odds_ratio_loss": 0.02004491351544857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011301355436444283, + "rewards/margins": 0.4952830970287323, + "rewards/rejected": -0.5065844655036926, + "sft_loss": 0.11301355063915253, + "step": 2209 + }, + { + "epoch": 3.1959508315256686, + "grad_norm": 1.9736197886022742, + "learning_rate": 3.7299943328919956e-06, + "logits/chosen": -0.5280442833900452, + "logits/rejected": -0.4607413411140442, + "logps/chosen": -0.18449607491493225, + "logps/rejected": -3.659106969833374, + "loss": 0.1599, + "odds_ratio_loss": 0.03631690517067909, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018449608236551285, + "rewards/margins": 0.34746110439300537, + "rewards/rejected": -0.36591070890426636, + "sft_loss": 0.18449607491493225, + "step": 2210 + }, + { + "epoch": 3.1973969631236443, + "grad_norm": 3.0301528491892826, + "learning_rate": 3.726895623454358e-06, + "logits/chosen": -0.5963754057884216, + "logits/rejected": -0.6541630625724792, + "logps/chosen": -0.40213772654533386, + "logps/rejected": -2.991434335708618, + "loss": 0.2434, + "odds_ratio_loss": 0.10465790331363678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04021377116441727, + "rewards/margins": 0.25892966985702515, + "rewards/rejected": -0.2991434335708618, + "sft_loss": 0.40213772654533386, + "step": 2211 + }, + { + "epoch": 3.1988430947216195, + "grad_norm": 1.8336318495290496, + "learning_rate": 3.7237970786723638e-06, + "logits/chosen": -0.6186486482620239, + "logits/rejected": -0.44536644220352173, + "logps/chosen": -0.26954495906829834, + "logps/rejected": -4.847094535827637, + "loss": 0.1987, + "odds_ratio_loss": 0.06486377865076065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026954498142004013, + "rewards/margins": 0.4577549695968628, + "rewards/rejected": -0.4847094416618347, + "sft_loss": 0.26954495906829834, + "step": 2212 + }, + { + "epoch": 3.2002892263195952, + "grad_norm": 1.9924435818068609, + "learning_rate": 3.7206987004141417e-06, + "logits/chosen": -0.6402283906936646, + "logits/rejected": -0.5902421474456787, + "logps/chosen": -0.22126686573028564, + "logps/rejected": -3.0365238189697266, + "loss": 0.1705, + "odds_ratio_loss": 0.04855208098888397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022126685827970505, + "rewards/margins": 0.28152570128440857, + "rewards/rejected": -0.3036523759365082, + "sft_loss": 0.22126686573028564, + "step": 2213 + }, + { + "epoch": 3.2017353579175705, + "grad_norm": 2.3205251480643208, + "learning_rate": 3.717600490547712e-06, + "logits/chosen": -0.5460687875747681, + "logits/rejected": -0.42219042778015137, + "logps/chosen": -0.24014821648597717, + "logps/rejected": -2.918666124343872, + "loss": 0.1981, + "odds_ratio_loss": 0.0672679990530014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024014823138713837, + "rewards/margins": 0.2678517699241638, + "rewards/rejected": -0.29186660051345825, + "sft_loss": 0.24014821648597717, + "step": 2214 + }, + { + "epoch": 3.2031814895155457, + "grad_norm": 2.0440762308575042, + "learning_rate": 3.7145024509409994e-06, + "logits/chosen": -0.7297794818878174, + "logits/rejected": -0.5262342691421509, + "logps/chosen": -0.21348436176776886, + "logps/rejected": -2.7271389961242676, + "loss": 0.2102, + "odds_ratio_loss": 0.04441622644662857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021348439157009125, + "rewards/margins": 0.2513654828071594, + "rewards/rejected": -0.27271392941474915, + "sft_loss": 0.21348436176776886, + "step": 2215 + }, + { + "epoch": 3.2046276211135214, + "grad_norm": 2.7250778588447107, + "learning_rate": 3.711404583461821e-06, + "logits/chosen": -0.5237282514572144, + "logits/rejected": -0.45042431354522705, + "logps/chosen": -0.14554253220558167, + "logps/rejected": -3.1719679832458496, + "loss": 0.2359, + "odds_ratio_loss": 0.054552722722291946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014554252848029137, + "rewards/margins": 0.3026425540447235, + "rewards/rejected": -0.3171968460083008, + "sft_loss": 0.14554253220558167, + "step": 2216 + }, + { + "epoch": 3.2060737527114966, + "grad_norm": 2.002981078083241, + "learning_rate": 3.7083068899778936e-06, + "logits/chosen": -0.5125364661216736, + "logits/rejected": -0.5378755927085876, + "logps/chosen": -0.1441064178943634, + "logps/rejected": -3.8481955528259277, + "loss": 0.1459, + "odds_ratio_loss": 0.0454072542488575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01441064290702343, + "rewards/margins": 0.37040889263153076, + "rewards/rejected": -0.38481953740119934, + "sft_loss": 0.1441064178943634, + "step": 2217 + }, + { + "epoch": 3.2075198843094723, + "grad_norm": 1.9684062919816647, + "learning_rate": 3.705209372356831e-06, + "logits/chosen": -0.462546706199646, + "logits/rejected": -0.371360719203949, + "logps/chosen": -0.14098519086837769, + "logps/rejected": -5.05507755279541, + "loss": 0.1494, + "odds_ratio_loss": 0.022105194628238678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014098519459366798, + "rewards/margins": 0.49140921235084534, + "rewards/rejected": -0.50550776720047, + "sft_loss": 0.14098519086837769, + "step": 2218 + }, + { + "epoch": 3.2089660159074476, + "grad_norm": 1.8324522959136393, + "learning_rate": 3.702112032466134e-06, + "logits/chosen": -0.5853912234306335, + "logits/rejected": -0.5114383101463318, + "logps/chosen": -0.15347421169281006, + "logps/rejected": -3.8486804962158203, + "loss": 0.144, + "odds_ratio_loss": 0.03078891895711422, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01534742210060358, + "rewards/margins": 0.3695206642150879, + "rewards/rejected": -0.3848680555820465, + "sft_loss": 0.15347421169281006, + "step": 2219 + }, + { + "epoch": 3.210412147505423, + "grad_norm": 2.908343261824301, + "learning_rate": 3.6990148721732037e-06, + "logits/chosen": -0.6292286515235901, + "logits/rejected": -0.6121467351913452, + "logps/chosen": -0.12733693420886993, + "logps/rejected": -4.009809970855713, + "loss": 0.1963, + "odds_ratio_loss": 0.03087882697582245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012733692303299904, + "rewards/margins": 0.3882473409175873, + "rewards/rejected": -0.40098103880882263, + "sft_loss": 0.12733693420886993, + "step": 2220 + }, + { + "epoch": 3.2118582791033985, + "grad_norm": 1.9318114387309266, + "learning_rate": 3.695917893345326e-06, + "logits/chosen": -0.4239646792411804, + "logits/rejected": -0.35339295864105225, + "logps/chosen": -0.10661177337169647, + "logps/rejected": -4.832416534423828, + "loss": 0.1664, + "odds_ratio_loss": 0.026308678090572357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010661177337169647, + "rewards/margins": 0.4725804626941681, + "rewards/rejected": -0.48324161767959595, + "sft_loss": 0.10661177337169647, + "step": 2221 + }, + { + "epoch": 3.2133044107013737, + "grad_norm": 1.9552969597340941, + "learning_rate": 3.6928210978496844e-06, + "logits/chosen": -0.5122228264808655, + "logits/rejected": -0.4361344575881958, + "logps/chosen": -0.2315160632133484, + "logps/rejected": -2.3202056884765625, + "loss": 0.2312, + "odds_ratio_loss": 0.07692821323871613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02315160632133484, + "rewards/margins": 0.20886898040771484, + "rewards/rejected": -0.2320205718278885, + "sft_loss": 0.2315160632133484, + "step": 2222 + }, + { + "epoch": 3.2147505422993494, + "grad_norm": 3.9027921143976867, + "learning_rate": 3.6897244875533463e-06, + "logits/chosen": -0.4394450783729553, + "logits/rejected": -0.4192020893096924, + "logps/chosen": -0.25058937072753906, + "logps/rejected": -5.037656784057617, + "loss": 0.1914, + "odds_ratio_loss": 0.04292678460478783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025058934465050697, + "rewards/margins": 0.4787067770957947, + "rewards/rejected": -0.5037657022476196, + "sft_loss": 0.25058937072753906, + "step": 2223 + }, + { + "epoch": 3.2161966738973247, + "grad_norm": 1.9675271422773393, + "learning_rate": 3.686628064323271e-06, + "logits/chosen": -0.4829399585723877, + "logits/rejected": -0.46176087856292725, + "logps/chosen": -0.22450929880142212, + "logps/rejected": -4.800108432769775, + "loss": 0.2024, + "odds_ratio_loss": 0.05624048411846161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02245093137025833, + "rewards/margins": 0.4575599431991577, + "rewards/rejected": -0.48001086711883545, + "sft_loss": 0.22450929880142212, + "step": 2224 + }, + { + "epoch": 3.2176428054953, + "grad_norm": 2.1076011418204033, + "learning_rate": 3.6835318300263012e-06, + "logits/chosen": -0.4614330232143402, + "logits/rejected": -0.4945675730705261, + "logps/chosen": -0.2269093543291092, + "logps/rejected": -3.4398250579833984, + "loss": 0.2459, + "odds_ratio_loss": 0.0666380450129509, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02269093692302704, + "rewards/margins": 0.32129159569740295, + "rewards/rejected": -0.3439825475215912, + "sft_loss": 0.2269093543291092, + "step": 2225 + }, + { + "epoch": 3.2190889370932756, + "grad_norm": 1.9003759460409313, + "learning_rate": 3.6804357865291715e-06, + "logits/chosen": -0.38930198550224304, + "logits/rejected": -0.32057294249534607, + "logps/chosen": -0.15316364169120789, + "logps/rejected": -4.212235927581787, + "loss": 0.167, + "odds_ratio_loss": 0.017919572070240974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015316363424062729, + "rewards/margins": 0.40590721368789673, + "rewards/rejected": -0.42122358083724976, + "sft_loss": 0.15316364169120789, + "step": 2226 + }, + { + "epoch": 3.220535068691251, + "grad_norm": 1.9152259661536026, + "learning_rate": 3.677339935698495e-06, + "logits/chosen": -0.45317479968070984, + "logits/rejected": -0.4471889138221741, + "logps/chosen": -0.19737450778484344, + "logps/rejected": -3.9483349323272705, + "loss": 0.1805, + "odds_ratio_loss": 0.0657714456319809, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019737452268600464, + "rewards/margins": 0.37509608268737793, + "rewards/rejected": -0.394833505153656, + "sft_loss": 0.19737450778484344, + "step": 2227 + }, + { + "epoch": 3.2219812002892265, + "grad_norm": 2.2018903514064276, + "learning_rate": 3.6742442794007746e-06, + "logits/chosen": -0.47369611263275146, + "logits/rejected": -0.38501498103141785, + "logps/chosen": -0.24870990216732025, + "logps/rejected": -2.4713776111602783, + "loss": 0.2353, + "odds_ratio_loss": 0.0954592227935791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024870987981557846, + "rewards/margins": 0.2222667783498764, + "rewards/rejected": -0.24713777005672455, + "sft_loss": 0.24870990216732025, + "step": 2228 + }, + { + "epoch": 3.2234273318872018, + "grad_norm": 2.4456927538611613, + "learning_rate": 3.6711488195023893e-06, + "logits/chosen": -0.455442875623703, + "logits/rejected": -0.37904882431030273, + "logps/chosen": -0.231689915060997, + "logps/rejected": -3.092597246170044, + "loss": 0.1983, + "odds_ratio_loss": 0.06161237508058548, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02316899225115776, + "rewards/margins": 0.28609076142311096, + "rewards/rejected": -0.3092597424983978, + "sft_loss": 0.231689915060997, + "step": 2229 + }, + { + "epoch": 3.224873463485177, + "grad_norm": 2.2262037548504745, + "learning_rate": 3.6680535578696073e-06, + "logits/chosen": -0.5955873727798462, + "logits/rejected": -0.48187780380249023, + "logps/chosen": -0.2466651201248169, + "logps/rejected": -3.732814311981201, + "loss": 0.237, + "odds_ratio_loss": 0.0446498841047287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02466651052236557, + "rewards/margins": 0.3486149311065674, + "rewards/rejected": -0.37328144907951355, + "sft_loss": 0.2466651201248169, + "step": 2230 + }, + { + "epoch": 3.2263195950831527, + "grad_norm": 2.1717346267474147, + "learning_rate": 3.6649584963685706e-06, + "logits/chosen": -0.6487745642662048, + "logits/rejected": -0.554900050163269, + "logps/chosen": -0.2109365165233612, + "logps/rejected": -2.185053586959839, + "loss": 0.2322, + "odds_ratio_loss": 0.048502735793590546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02109365165233612, + "rewards/margins": 0.1974116861820221, + "rewards/rejected": -0.2185053676366806, + "sft_loss": 0.2109365165233612, + "step": 2231 + }, + { + "epoch": 3.227765726681128, + "grad_norm": 2.2488395772011742, + "learning_rate": 3.6618636368653033e-06, + "logits/chosen": -0.46767958998680115, + "logits/rejected": -0.32060617208480835, + "logps/chosen": -0.11053460091352463, + "logps/rejected": -6.905789375305176, + "loss": 0.1755, + "odds_ratio_loss": 0.02767387591302395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011053459718823433, + "rewards/margins": 0.6795254945755005, + "rewards/rejected": -0.6905789375305176, + "sft_loss": 0.11053460091352463, + "step": 2232 + }, + { + "epoch": 3.229211858279103, + "grad_norm": 2.4396261057570015, + "learning_rate": 3.6587689812257106e-06, + "logits/chosen": -0.45963820815086365, + "logits/rejected": -0.4651893675327301, + "logps/chosen": -0.10936955362558365, + "logps/rejected": -5.02745246887207, + "loss": 0.1596, + "odds_ratio_loss": 0.010914292186498642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010936954990029335, + "rewards/margins": 0.49180832505226135, + "rewards/rejected": -0.5027452707290649, + "sft_loss": 0.10936955362558365, + "step": 2233 + }, + { + "epoch": 3.230657989877079, + "grad_norm": 2.280698528298338, + "learning_rate": 3.655674531315569e-06, + "logits/chosen": -0.5203260183334351, + "logits/rejected": -0.35767683386802673, + "logps/chosen": -0.22178664803504944, + "logps/rejected": -2.980849027633667, + "loss": 0.2226, + "odds_ratio_loss": 0.045914776623249054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022178664803504944, + "rewards/margins": 0.2759062647819519, + "rewards/rejected": -0.29808491468429565, + "sft_loss": 0.22178664803504944, + "step": 2234 + }, + { + "epoch": 3.232104121475054, + "grad_norm": 1.7926615791207154, + "learning_rate": 3.6525802890005357e-06, + "logits/chosen": -0.36176133155822754, + "logits/rejected": -0.32122957706451416, + "logps/chosen": -0.0988263189792633, + "logps/rejected": -5.585865497589111, + "loss": 0.1614, + "odds_ratio_loss": 0.0176323801279068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00988263264298439, + "rewards/margins": 0.5487039089202881, + "rewards/rejected": -0.5585865378379822, + "sft_loss": 0.0988263189792633, + "step": 2235 + }, + { + "epoch": 3.23355025307303, + "grad_norm": 1.93765750188442, + "learning_rate": 3.6494862561461387e-06, + "logits/chosen": -0.5547440648078918, + "logits/rejected": -0.4816645681858063, + "logps/chosen": -0.21579933166503906, + "logps/rejected": -5.2509565353393555, + "loss": 0.1766, + "odds_ratio_loss": 0.028800413012504578, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021579932421445847, + "rewards/margins": 0.5035157203674316, + "rewards/rejected": -0.5250956416130066, + "sft_loss": 0.21579933166503906, + "step": 2236 + }, + { + "epoch": 3.234996384671005, + "grad_norm": 2.1995050438159076, + "learning_rate": 3.646392434617785e-06, + "logits/chosen": -0.6402944326400757, + "logits/rejected": -0.421334445476532, + "logps/chosen": -0.19929829239845276, + "logps/rejected": -5.820633888244629, + "loss": 0.1767, + "odds_ratio_loss": 0.048856209963560104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019929828122258186, + "rewards/margins": 0.5621335506439209, + "rewards/rejected": -0.5820633172988892, + "sft_loss": 0.19929829239845276, + "step": 2237 + }, + { + "epoch": 3.2364425162689803, + "grad_norm": 1.8119486632998885, + "learning_rate": 3.6432988262807483e-06, + "logits/chosen": -0.5280413627624512, + "logits/rejected": -0.42112961411476135, + "logps/chosen": -0.15436744689941406, + "logps/rejected": -5.035494327545166, + "loss": 0.1644, + "odds_ratio_loss": 0.030892375856637955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015436742454767227, + "rewards/margins": 0.4881126880645752, + "rewards/rejected": -0.5035493969917297, + "sft_loss": 0.15436744689941406, + "step": 2238 + }, + { + "epoch": 3.237888647866956, + "grad_norm": 2.2278105118916995, + "learning_rate": 3.6402054330001787e-06, + "logits/chosen": -0.5598574876785278, + "logits/rejected": -0.29619458317756653, + "logps/chosen": -0.14671733975410461, + "logps/rejected": -6.336015224456787, + "loss": 0.1365, + "odds_ratio_loss": 0.02809251844882965, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014671734534204006, + "rewards/margins": 0.6189297437667847, + "rewards/rejected": -0.6336015462875366, + "sft_loss": 0.14671733975410461, + "step": 2239 + }, + { + "epoch": 3.239334779464931, + "grad_norm": 3.86220961291117, + "learning_rate": 3.637112256641092e-06, + "logits/chosen": -0.5911388993263245, + "logits/rejected": -0.3398038148880005, + "logps/chosen": -0.16455015540122986, + "logps/rejected": -5.836204528808594, + "loss": 0.246, + "odds_ratio_loss": 0.0325884185731411, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016455017030239105, + "rewards/margins": 0.5671654343605042, + "rewards/rejected": -0.5836204290390015, + "sft_loss": 0.16455015540122986, + "step": 2240 + }, + { + "epoch": 3.240780911062907, + "grad_norm": 1.9752946959873765, + "learning_rate": 3.6340192990683785e-06, + "logits/chosen": -0.5412940979003906, + "logits/rejected": -0.40820765495300293, + "logps/chosen": -0.14699213206768036, + "logps/rejected": -4.479633331298828, + "loss": 0.174, + "odds_ratio_loss": 0.023793520405888557, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01469921413809061, + "rewards/margins": 0.4332641363143921, + "rewards/rejected": -0.44796332716941833, + "sft_loss": 0.14699213206768036, + "step": 2241 + }, + { + "epoch": 3.242227042660882, + "grad_norm": 2.2022818942213864, + "learning_rate": 3.6309265621467923e-06, + "logits/chosen": -0.4983745217323303, + "logits/rejected": -0.5105041265487671, + "logps/chosen": -0.28136447072029114, + "logps/rejected": -3.8148107528686523, + "loss": 0.2074, + "odds_ratio_loss": 0.05450871214270592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028136447072029114, + "rewards/margins": 0.3533446788787842, + "rewards/rejected": -0.3814811110496521, + "sft_loss": 0.28136447072029114, + "step": 2242 + }, + { + "epoch": 3.2436731742588574, + "grad_norm": 2.3767532424730455, + "learning_rate": 3.627834047740957e-06, + "logits/chosen": -0.5548741221427917, + "logits/rejected": -0.44200655817985535, + "logps/chosen": -0.16056154668331146, + "logps/rejected": -3.1678218841552734, + "loss": 0.1749, + "odds_ratio_loss": 0.023494180291891098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016056153923273087, + "rewards/margins": 0.300726056098938, + "rewards/rejected": -0.3167821764945984, + "sft_loss": 0.16056154668331146, + "step": 2243 + }, + { + "epoch": 3.245119305856833, + "grad_norm": 1.8650655116730133, + "learning_rate": 3.624741757715359e-06, + "logits/chosen": -0.6051979660987854, + "logits/rejected": -0.4340522587299347, + "logps/chosen": -0.19842886924743652, + "logps/rejected": -3.8932723999023438, + "loss": 0.1699, + "odds_ratio_loss": 0.035827018320560455, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019842887297272682, + "rewards/margins": 0.3694843649864197, + "rewards/rejected": -0.3893272876739502, + "sft_loss": 0.19842886924743652, + "step": 2244 + }, + { + "epoch": 3.2465654374548083, + "grad_norm": 1.8596281529466636, + "learning_rate": 3.621649693934353e-06, + "logits/chosen": -0.5226513147354126, + "logits/rejected": -0.49524396657943726, + "logps/chosen": -0.14361608028411865, + "logps/rejected": -4.346930980682373, + "loss": 0.177, + "odds_ratio_loss": 0.02752107009291649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014361606910824776, + "rewards/margins": 0.4203314781188965, + "rewards/rejected": -0.4346930682659149, + "sft_loss": 0.14361608028411865, + "step": 2245 + }, + { + "epoch": 3.248011569052784, + "grad_norm": 2.174693078196969, + "learning_rate": 3.6185578582621573e-06, + "logits/chosen": -0.5399285554885864, + "logits/rejected": -0.4264034628868103, + "logps/chosen": -0.2987480163574219, + "logps/rejected": -4.435233116149902, + "loss": 0.1745, + "odds_ratio_loss": 0.05470084026455879, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02987479791045189, + "rewards/margins": 0.41364848613739014, + "rewards/rejected": -0.4435232877731323, + "sft_loss": 0.2987480163574219, + "step": 2246 + }, + { + "epoch": 3.2494577006507592, + "grad_norm": 1.9731539471015809, + "learning_rate": 3.6154662525628474e-06, + "logits/chosen": -0.6945093274116516, + "logits/rejected": -0.5601460337638855, + "logps/chosen": -0.15286661684513092, + "logps/rejected": -3.5629913806915283, + "loss": 0.1972, + "odds_ratio_loss": 0.05991650000214577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015286662615835667, + "rewards/margins": 0.34101247787475586, + "rewards/rejected": -0.35629913210868835, + "sft_loss": 0.15286661684513092, + "step": 2247 + }, + { + "epoch": 3.2509038322487345, + "grad_norm": 1.9180427426151503, + "learning_rate": 3.612374878700368e-06, + "logits/chosen": -0.4100998640060425, + "logits/rejected": -0.5385414958000183, + "logps/chosen": -0.22914831340312958, + "logps/rejected": -4.232925891876221, + "loss": 0.1886, + "odds_ratio_loss": 0.07522237300872803, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022914830595254898, + "rewards/margins": 0.4003777801990509, + "rewards/rejected": -0.4232926070690155, + "sft_loss": 0.22914831340312958, + "step": 2248 + }, + { + "epoch": 3.25234996384671, + "grad_norm": 2.1307859276198164, + "learning_rate": 3.609283738538517e-06, + "logits/chosen": -0.673978328704834, + "logits/rejected": -0.42554956674575806, + "logps/chosen": -0.24228356778621674, + "logps/rejected": -5.043076992034912, + "loss": 0.2614, + "odds_ratio_loss": 0.04325840622186661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024228356778621674, + "rewards/margins": 0.4800793528556824, + "rewards/rejected": -0.5043076872825623, + "sft_loss": 0.24228356778621674, + "step": 2249 + }, + { + "epoch": 3.2537960954446854, + "grad_norm": 2.377612293864264, + "learning_rate": 3.606192833940956e-06, + "logits/chosen": -0.5075133442878723, + "logits/rejected": -0.3776240646839142, + "logps/chosen": -0.139189213514328, + "logps/rejected": -1.9026098251342773, + "loss": 0.1694, + "odds_ratio_loss": 0.06290970742702484, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01391892321407795, + "rewards/margins": 0.17634207010269165, + "rewards/rejected": -0.19026097655296326, + "sft_loss": 0.139189213514328, + "step": 2250 + }, + { + "epoch": 3.255242227042661, + "grad_norm": 1.94278364933995, + "learning_rate": 3.6031021667712e-06, + "logits/chosen": -0.5642733573913574, + "logits/rejected": -0.38058435916900635, + "logps/chosen": -0.15122462809085846, + "logps/rejected": -5.234777450561523, + "loss": 0.183, + "odds_ratio_loss": 0.027925534173846245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015122463926672935, + "rewards/margins": 0.5083552598953247, + "rewards/rejected": -0.5234777927398682, + "sft_loss": 0.15122462809085846, + "step": 2251 + }, + { + "epoch": 3.2566883586406363, + "grad_norm": 1.7092281341499098, + "learning_rate": 3.600011738892628e-06, + "logits/chosen": -0.4542877674102783, + "logits/rejected": -0.4579513370990753, + "logps/chosen": -0.08828302472829819, + "logps/rejected": -4.268548488616943, + "loss": 0.1504, + "odds_ratio_loss": 0.020018287003040314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008828302845358849, + "rewards/margins": 0.41802653670310974, + "rewards/rejected": -0.4268548786640167, + "sft_loss": 0.08828302472829819, + "step": 2252 + }, + { + "epoch": 3.2581344902386116, + "grad_norm": 2.4301885734804634, + "learning_rate": 3.5969215521684673e-06, + "logits/chosen": -0.5045682191848755, + "logits/rejected": -0.3198281228542328, + "logps/chosen": -0.17714548110961914, + "logps/rejected": -2.6474297046661377, + "loss": 0.1836, + "odds_ratio_loss": 0.044076722115278244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017714550718665123, + "rewards/margins": 0.2470283955335617, + "rewards/rejected": -0.26474297046661377, + "sft_loss": 0.17714548110961914, + "step": 2253 + }, + { + "epoch": 3.2595806218365873, + "grad_norm": 2.274847989564652, + "learning_rate": 3.593831608461805e-06, + "logits/chosen": -0.7203308939933777, + "logits/rejected": -0.5131949782371521, + "logps/chosen": -0.17507284879684448, + "logps/rejected": -5.116349220275879, + "loss": 0.201, + "odds_ratio_loss": 0.03700259327888489, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017507284879684448, + "rewards/margins": 0.49412769079208374, + "rewards/rejected": -0.5116349458694458, + "sft_loss": 0.17507284879684448, + "step": 2254 + }, + { + "epoch": 3.2610267534345625, + "grad_norm": 2.2182355679207766, + "learning_rate": 3.5907419096355768e-06, + "logits/chosen": -0.40628182888031006, + "logits/rejected": -0.2812096178531647, + "logps/chosen": -0.15741974115371704, + "logps/rejected": -4.353770732879639, + "loss": 0.1389, + "odds_ratio_loss": 0.03497250750660896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015741974115371704, + "rewards/margins": 0.4196351170539856, + "rewards/rejected": -0.4353770911693573, + "sft_loss": 0.15741974115371704, + "step": 2255 + }, + { + "epoch": 3.2624728850325377, + "grad_norm": 2.6818134401064424, + "learning_rate": 3.5876524575525774e-06, + "logits/chosen": -0.6847485303878784, + "logits/rejected": -0.6329970955848694, + "logps/chosen": -0.1577267199754715, + "logps/rejected": -3.3478763103485107, + "loss": 0.222, + "odds_ratio_loss": 0.05068032443523407, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01577267237007618, + "rewards/margins": 0.3190149664878845, + "rewards/rejected": -0.33478766679763794, + "sft_loss": 0.1577267199754715, + "step": 2256 + }, + { + "epoch": 3.2639190166305134, + "grad_norm": 2.3388227044279497, + "learning_rate": 3.584563254075446e-06, + "logits/chosen": -0.476225346326828, + "logits/rejected": -0.3441811501979828, + "logps/chosen": -0.17915165424346924, + "logps/rejected": -3.2945075035095215, + "loss": 0.1806, + "odds_ratio_loss": 0.05011521279811859, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017915166914463043, + "rewards/margins": 0.3115355968475342, + "rewards/rejected": -0.329450786113739, + "sft_loss": 0.17915165424346924, + "step": 2257 + }, + { + "epoch": 3.2653651482284887, + "grad_norm": 1.9674045184480031, + "learning_rate": 3.5814743010666757e-06, + "logits/chosen": -0.5762167572975159, + "logits/rejected": -0.3875581622123718, + "logps/chosen": -0.1739146113395691, + "logps/rejected": -3.7504653930664062, + "loss": 0.159, + "odds_ratio_loss": 0.044472791254520416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01739146001636982, + "rewards/margins": 0.3576550781726837, + "rewards/rejected": -0.3750465214252472, + "sft_loss": 0.1739146113395691, + "step": 2258 + }, + { + "epoch": 3.2668112798264644, + "grad_norm": 2.8547990030832735, + "learning_rate": 3.578385600388609e-06, + "logits/chosen": -0.4150635600090027, + "logits/rejected": -0.3737567067146301, + "logps/chosen": -0.19305887818336487, + "logps/rejected": -5.184538841247559, + "loss": 0.1916, + "odds_ratio_loss": 0.02490687370300293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019305888563394547, + "rewards/margins": 0.49914807081222534, + "rewards/rejected": -0.5184538960456848, + "sft_loss": 0.19305887818336487, + "step": 2259 + }, + { + "epoch": 3.2682574114244396, + "grad_norm": 1.9041525202532068, + "learning_rate": 3.575297153903434e-06, + "logits/chosen": -0.6946017742156982, + "logits/rejected": -0.5355392694473267, + "logps/chosen": -0.13331067562103271, + "logps/rejected": -5.4609198570251465, + "loss": 0.1833, + "odds_ratio_loss": 0.014529845677316189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013331068679690361, + "rewards/margins": 0.5327609181404114, + "rewards/rejected": -0.5460919737815857, + "sft_loss": 0.13331067562103271, + "step": 2260 + }, + { + "epoch": 3.2697035430224153, + "grad_norm": 1.914055513543421, + "learning_rate": 3.5722089634731868e-06, + "logits/chosen": -0.4900306463241577, + "logits/rejected": -0.3406396210193634, + "logps/chosen": -0.11849421262741089, + "logps/rejected": -5.536858081817627, + "loss": 0.195, + "odds_ratio_loss": 0.007503737695515156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011849422007799149, + "rewards/margins": 0.5418363809585571, + "rewards/rejected": -0.5536858439445496, + "sft_loss": 0.11849421262741089, + "step": 2261 + }, + { + "epoch": 3.2711496746203905, + "grad_norm": 2.1438021427198737, + "learning_rate": 3.5691210309597473e-06, + "logits/chosen": -0.6778137683868408, + "logits/rejected": -0.5302555561065674, + "logps/chosen": -0.10795378684997559, + "logps/rejected": -4.853133201599121, + "loss": 0.1451, + "odds_ratio_loss": 0.02070033550262451, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010795379988849163, + "rewards/margins": 0.47451794147491455, + "rewards/rejected": -0.485313355922699, + "sft_loss": 0.10795378684997559, + "step": 2262 + }, + { + "epoch": 3.2725958062183658, + "grad_norm": 1.9860009136951866, + "learning_rate": 3.566033358224845e-06, + "logits/chosen": -0.799131453037262, + "logits/rejected": -0.49861404299736023, + "logps/chosen": -0.12334741652011871, + "logps/rejected": -4.193481922149658, + "loss": 0.1469, + "odds_ratio_loss": 0.028686096891760826, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012334741652011871, + "rewards/margins": 0.4070134162902832, + "rewards/rejected": -0.41934818029403687, + "sft_loss": 0.12334741652011871, + "step": 2263 + }, + { + "epoch": 3.2740419378163415, + "grad_norm": 1.7714968208013866, + "learning_rate": 3.5629459471300455e-06, + "logits/chosen": -0.5801678895950317, + "logits/rejected": -0.38160601258277893, + "logps/chosen": -0.12812554836273193, + "logps/rejected": -5.513360977172852, + "loss": 0.1586, + "odds_ratio_loss": 0.022318800911307335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012812554836273193, + "rewards/margins": 0.5385234951972961, + "rewards/rejected": -0.5513360500335693, + "sft_loss": 0.12812554836273193, + "step": 2264 + }, + { + "epoch": 3.2754880694143167, + "grad_norm": 1.781393491270812, + "learning_rate": 3.5598587995367645e-06, + "logits/chosen": -0.48313915729522705, + "logits/rejected": -0.34745123982429504, + "logps/chosen": -0.12040194869041443, + "logps/rejected": -2.534296989440918, + "loss": 0.1496, + "odds_ratio_loss": 0.041830502450466156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012040195055305958, + "rewards/margins": 0.24138952791690826, + "rewards/rejected": -0.25342971086502075, + "sft_loss": 0.12040194869041443, + "step": 2265 + }, + { + "epoch": 3.276934201012292, + "grad_norm": 2.249398935208769, + "learning_rate": 3.5567719173062503e-06, + "logits/chosen": -0.7603170871734619, + "logits/rejected": -0.4521884322166443, + "logps/chosen": -0.11175563186407089, + "logps/rejected": -4.254090785980225, + "loss": 0.1363, + "odds_ratio_loss": 0.010375426150858402, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011175563558936119, + "rewards/margins": 0.4142335057258606, + "rewards/rejected": -0.42540907859802246, + "sft_loss": 0.11175563186407089, + "step": 2266 + }, + { + "epoch": 3.2783803326102676, + "grad_norm": 1.9991795543356456, + "learning_rate": 3.553685302299599e-06, + "logits/chosen": -0.3749878406524658, + "logits/rejected": -0.27797868847846985, + "logps/chosen": -0.10831344127655029, + "logps/rejected": -4.4174909591674805, + "loss": 0.1216, + "odds_ratio_loss": 0.02435128577053547, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010831343941390514, + "rewards/margins": 0.43091776967048645, + "rewards/rejected": -0.4417491555213928, + "sft_loss": 0.10831344127655029, + "step": 2267 + }, + { + "epoch": 3.279826464208243, + "grad_norm": 2.1564064092160726, + "learning_rate": 3.5505989563777402e-06, + "logits/chosen": -0.5478772521018982, + "logits/rejected": -0.35927248001098633, + "logps/chosen": -0.12505735456943512, + "logps/rejected": -4.873144149780273, + "loss": 0.1495, + "odds_ratio_loss": 0.01490817777812481, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012505735270678997, + "rewards/margins": 0.4748087227344513, + "rewards/rejected": -0.48731446266174316, + "sft_loss": 0.12505735456943512, + "step": 2268 + }, + { + "epoch": 3.2812725958062186, + "grad_norm": 1.8682733125562931, + "learning_rate": 3.5475128814014457e-06, + "logits/chosen": -0.6678739190101624, + "logits/rejected": -0.5783143043518066, + "logps/chosen": -0.32308146357536316, + "logps/rejected": -3.722277879714966, + "loss": 0.1831, + "odds_ratio_loss": 0.08787070214748383, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.032308146357536316, + "rewards/margins": 0.3399196267127991, + "rewards/rejected": -0.3722277879714966, + "sft_loss": 0.32308146357536316, + "step": 2269 + }, + { + "epoch": 3.282718727404194, + "grad_norm": 2.2558187195382517, + "learning_rate": 3.5444270792313196e-06, + "logits/chosen": -0.6178706884384155, + "logits/rejected": -0.4912964105606079, + "logps/chosen": -0.18187105655670166, + "logps/rejected": -6.225649833679199, + "loss": 0.1925, + "odds_ratio_loss": 0.014290587976574898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018187105655670166, + "rewards/margins": 0.6043779253959656, + "rewards/rejected": -0.6225650310516357, + "sft_loss": 0.18187105655670166, + "step": 2270 + }, + { + "epoch": 3.284164859002169, + "grad_norm": 2.0678723535377905, + "learning_rate": 3.5413415517278033e-06, + "logits/chosen": -0.5242627859115601, + "logits/rejected": -0.46460655331611633, + "logps/chosen": -0.26317670941352844, + "logps/rejected": -4.386211395263672, + "loss": 0.2566, + "odds_ratio_loss": 0.05664074793457985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026317669078707695, + "rewards/margins": 0.41230350732803345, + "rewards/rejected": -0.4386211633682251, + "sft_loss": 0.26317670941352844, + "step": 2271 + }, + { + "epoch": 3.2856109906001447, + "grad_norm": 2.296493485099005, + "learning_rate": 3.5382563007511754e-06, + "logits/chosen": -0.601801872253418, + "logits/rejected": -0.3562720715999603, + "logps/chosen": -0.25675979256629944, + "logps/rejected": -3.735896110534668, + "loss": 0.2085, + "odds_ratio_loss": 0.04126711189746857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025675982236862183, + "rewards/margins": 0.3479136526584625, + "rewards/rejected": -0.3735896348953247, + "sft_loss": 0.25675979256629944, + "step": 2272 + }, + { + "epoch": 3.28705712219812, + "grad_norm": 2.251694032374471, + "learning_rate": 3.535171328161542e-06, + "logits/chosen": -0.5955110788345337, + "logits/rejected": -0.4887436032295227, + "logps/chosen": -0.3352237939834595, + "logps/rejected": -4.323999404907227, + "loss": 0.2193, + "odds_ratio_loss": 0.017490914091467857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03352237865328789, + "rewards/margins": 0.3988775610923767, + "rewards/rejected": -0.4323999583721161, + "sft_loss": 0.3352237939834595, + "step": 2273 + }, + { + "epoch": 3.2885032537960956, + "grad_norm": 2.075285424913544, + "learning_rate": 3.532086635818848e-06, + "logits/chosen": -0.42095768451690674, + "logits/rejected": -0.31674113869667053, + "logps/chosen": -0.2579844892024994, + "logps/rejected": -4.457534313201904, + "loss": 0.2311, + "odds_ratio_loss": 0.040766119956970215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025798451155424118, + "rewards/margins": 0.4199550449848175, + "rewards/rejected": -0.4457534849643707, + "sft_loss": 0.2579844892024994, + "step": 2274 + }, + { + "epoch": 3.289949385394071, + "grad_norm": 2.193805076834671, + "learning_rate": 3.529002225582862e-06, + "logits/chosen": -0.6587631106376648, + "logits/rejected": -0.4716310501098633, + "logps/chosen": -0.1755092442035675, + "logps/rejected": -3.7191784381866455, + "loss": 0.1745, + "odds_ratio_loss": 0.04562509059906006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01755092665553093, + "rewards/margins": 0.3543669283390045, + "rewards/rejected": -0.37191784381866455, + "sft_loss": 0.1755092442035675, + "step": 2275 + }, + { + "epoch": 3.291395516992046, + "grad_norm": 2.0765831775562518, + "learning_rate": 3.5259180993131893e-06, + "logits/chosen": -0.5486062169075012, + "logits/rejected": -0.45100995898246765, + "logps/chosen": -0.17713306844234467, + "logps/rejected": -2.9269280433654785, + "loss": 0.2267, + "odds_ratio_loss": 0.04936151206493378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017713308334350586, + "rewards/margins": 0.27497947216033936, + "rewards/rejected": -0.29269278049468994, + "sft_loss": 0.17713306844234467, + "step": 2276 + }, + { + "epoch": 3.292841648590022, + "grad_norm": 2.1171590132243105, + "learning_rate": 3.5228342588692603e-06, + "logits/chosen": -0.36625778675079346, + "logits/rejected": -0.26877880096435547, + "logps/chosen": -0.22854745388031006, + "logps/rejected": -4.398021221160889, + "loss": 0.2072, + "odds_ratio_loss": 0.021160855889320374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022854745388031006, + "rewards/margins": 0.4169473350048065, + "rewards/rejected": -0.4398021101951599, + "sft_loss": 0.22854745388031006, + "step": 2277 + }, + { + "epoch": 3.294287780187997, + "grad_norm": 3.1829007081069176, + "learning_rate": 3.519750706110334e-06, + "logits/chosen": -0.5905520915985107, + "logits/rejected": -0.4183228611946106, + "logps/chosen": -0.12734369933605194, + "logps/rejected": -4.160774230957031, + "loss": 0.1687, + "odds_ratio_loss": 0.03372897580265999, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012734370306134224, + "rewards/margins": 0.4033430218696594, + "rewards/rejected": -0.4160774350166321, + "sft_loss": 0.12734369933605194, + "step": 2278 + }, + { + "epoch": 3.2957339117859723, + "grad_norm": 2.1113324149581594, + "learning_rate": 3.516667442895494e-06, + "logits/chosen": -0.6072694659233093, + "logits/rejected": -0.3589474558830261, + "logps/chosen": -0.279439240694046, + "logps/rejected": -3.625382423400879, + "loss": 0.2092, + "odds_ratio_loss": 0.06730879098176956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027943922206759453, + "rewards/margins": 0.3345942795276642, + "rewards/rejected": -0.36253821849823, + "sft_loss": 0.279439240694046, + "step": 2279 + }, + { + "epoch": 3.297180043383948, + "grad_norm": 3.3151954739010767, + "learning_rate": 3.5135844710836545e-06, + "logits/chosen": -0.40881648659706116, + "logits/rejected": -0.3644575774669647, + "logps/chosen": -0.3337644338607788, + "logps/rejected": -5.119471549987793, + "loss": 0.2165, + "odds_ratio_loss": 0.09850729256868362, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03337644785642624, + "rewards/margins": 0.478570818901062, + "rewards/rejected": -0.5119472742080688, + "sft_loss": 0.3337644338607788, + "step": 2280 + }, + { + "epoch": 3.2986261749819232, + "grad_norm": 2.007323942505847, + "learning_rate": 3.510501792533548e-06, + "logits/chosen": -0.6884647607803345, + "logits/rejected": -0.6225319504737854, + "logps/chosen": -0.23053047060966492, + "logps/rejected": -3.772772789001465, + "loss": 0.1856, + "odds_ratio_loss": 0.058163873851299286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02305304817855358, + "rewards/margins": 0.35422423481941223, + "rewards/rejected": -0.37727731466293335, + "sft_loss": 0.23053047060966492, + "step": 2281 + }, + { + "epoch": 3.300072306579899, + "grad_norm": 2.1743690077166784, + "learning_rate": 3.5074194091037354e-06, + "logits/chosen": -0.9183458089828491, + "logits/rejected": -0.6526041626930237, + "logps/chosen": -0.22743211686611176, + "logps/rejected": -2.442390203475952, + "loss": 0.2149, + "odds_ratio_loss": 0.05146101117134094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022743212059140205, + "rewards/margins": 0.2214958220720291, + "rewards/rejected": -0.24423903226852417, + "sft_loss": 0.22743211686611176, + "step": 2282 + }, + { + "epoch": 3.301518438177874, + "grad_norm": 2.643722453059619, + "learning_rate": 3.5043373226525933e-06, + "logits/chosen": -0.6142512559890747, + "logits/rejected": -0.4808671474456787, + "logps/chosen": -0.1182275265455246, + "logps/rejected": -4.5574140548706055, + "loss": 0.1724, + "odds_ratio_loss": 0.021381031721830368, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011822751723229885, + "rewards/margins": 0.4439186751842499, + "rewards/rejected": -0.4557414650917053, + "sft_loss": 0.1182275265455246, + "step": 2283 + }, + { + "epoch": 3.30296456977585, + "grad_norm": 2.7578652269026644, + "learning_rate": 3.5012555350383265e-06, + "logits/chosen": -0.46577584743499756, + "logits/rejected": -0.3899204134941101, + "logps/chosen": -0.1679561287164688, + "logps/rejected": -3.679994583129883, + "loss": 0.1715, + "odds_ratio_loss": 0.012146007269620895, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016795611009001732, + "rewards/margins": 0.3512038588523865, + "rewards/rejected": -0.36799943447113037, + "sft_loss": 0.1679561287164688, + "step": 2284 + }, + { + "epoch": 3.304410701373825, + "grad_norm": 1.983775119024542, + "learning_rate": 3.4981740481189557e-06, + "logits/chosen": -0.6946292519569397, + "logits/rejected": -0.606383204460144, + "logps/chosen": -0.24009007215499878, + "logps/rejected": -2.6545491218566895, + "loss": 0.1572, + "odds_ratio_loss": 0.039150357246398926, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02400900609791279, + "rewards/margins": 0.2414458990097046, + "rewards/rejected": -0.26545488834381104, + "sft_loss": 0.24009007215499878, + "step": 2285 + }, + { + "epoch": 3.3058568329718003, + "grad_norm": 2.1786049400838388, + "learning_rate": 3.495092863752319e-06, + "logits/chosen": -0.6590663194656372, + "logits/rejected": -0.5998579263687134, + "logps/chosen": -0.1764366626739502, + "logps/rejected": -3.5466344356536865, + "loss": 0.2583, + "odds_ratio_loss": 0.03589484095573425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01764366589486599, + "rewards/margins": 0.33701977133750916, + "rewards/rejected": -0.3546634614467621, + "sft_loss": 0.1764366626739502, + "step": 2286 + }, + { + "epoch": 3.307302964569776, + "grad_norm": 2.1572867888890936, + "learning_rate": 3.4920119837960764e-06, + "logits/chosen": -0.6842783093452454, + "logits/rejected": -0.6382707357406616, + "logps/chosen": -0.09783312678337097, + "logps/rejected": -5.192519187927246, + "loss": 0.1635, + "odds_ratio_loss": 0.010324829258024693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009783312678337097, + "rewards/margins": 0.5094686150550842, + "rewards/rejected": -0.5192519426345825, + "sft_loss": 0.09783312678337097, + "step": 2287 + }, + { + "epoch": 3.3087490961677513, + "grad_norm": 2.5485270310682706, + "learning_rate": 3.4889314101077006e-06, + "logits/chosen": -0.7338652610778809, + "logits/rejected": -0.5132858157157898, + "logps/chosen": -0.18935944139957428, + "logps/rejected": -3.8639068603515625, + "loss": 0.2015, + "odds_ratio_loss": 0.028721699491143227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018935944885015488, + "rewards/margins": 0.36745476722717285, + "rewards/rejected": -0.38639071583747864, + "sft_loss": 0.18935944139957428, + "step": 2288 + }, + { + "epoch": 3.3101952277657265, + "grad_norm": 2.185540071670579, + "learning_rate": 3.4858511445444814e-06, + "logits/chosen": -0.5446025729179382, + "logits/rejected": -0.48353853821754456, + "logps/chosen": -0.18259915709495544, + "logps/rejected": -2.570195198059082, + "loss": 0.1704, + "odds_ratio_loss": 0.05168168246746063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018259914591908455, + "rewards/margins": 0.2387596219778061, + "rewards/rejected": -0.2570195496082306, + "sft_loss": 0.18259915709495544, + "step": 2289 + }, + { + "epoch": 3.311641359363702, + "grad_norm": 2.1793350901491615, + "learning_rate": 3.4827711889635207e-06, + "logits/chosen": -0.6570934057235718, + "logits/rejected": -0.5295805335044861, + "logps/chosen": -0.12861356139183044, + "logps/rejected": -5.604830265045166, + "loss": 0.1796, + "odds_ratio_loss": 0.017030876129865646, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012861356139183044, + "rewards/margins": 0.5476217269897461, + "rewards/rejected": -0.5604830980300903, + "sft_loss": 0.12861356139183044, + "step": 2290 + }, + { + "epoch": 3.3130874909616774, + "grad_norm": 2.0587136754455586, + "learning_rate": 3.4796915452217376e-06, + "logits/chosen": -0.5374751687049866, + "logits/rejected": -0.5378148555755615, + "logps/chosen": -0.25962379574775696, + "logps/rejected": -2.395240068435669, + "loss": 0.2276, + "odds_ratio_loss": 0.06650760024785995, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025962380692362785, + "rewards/margins": 0.21356163918972015, + "rewards/rejected": -0.2395240217447281, + "sft_loss": 0.25962379574775696, + "step": 2291 + }, + { + "epoch": 3.314533622559653, + "grad_norm": 1.944851752443858, + "learning_rate": 3.4766122151758595e-06, + "logits/chosen": -0.5675870180130005, + "logits/rejected": -0.3958700895309448, + "logps/chosen": -0.25842157006263733, + "logps/rejected": -4.663134574890137, + "loss": 0.2255, + "odds_ratio_loss": 0.04698663204908371, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025842156261205673, + "rewards/margins": 0.4404713213443756, + "rewards/rejected": -0.46631351113319397, + "sft_loss": 0.25842157006263733, + "step": 2292 + }, + { + "epoch": 3.3159797541576284, + "grad_norm": 2.7882638415952177, + "learning_rate": 3.473533200682427e-06, + "logits/chosen": -0.4775904715061188, + "logits/rejected": -0.5372059345245361, + "logps/chosen": -0.15239141881465912, + "logps/rejected": -2.723569869995117, + "loss": 0.2259, + "odds_ratio_loss": 0.04135645925998688, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015239142812788486, + "rewards/margins": 0.2571178376674652, + "rewards/rejected": -0.2723569869995117, + "sft_loss": 0.15239141881465912, + "step": 2293 + }, + { + "epoch": 3.3174258857556036, + "grad_norm": 2.3481373712095963, + "learning_rate": 3.4704545035977866e-06, + "logits/chosen": -0.7102236151695251, + "logits/rejected": -0.5002355575561523, + "logps/chosen": -0.1865234225988388, + "logps/rejected": -2.455569267272949, + "loss": 0.2064, + "odds_ratio_loss": 0.05270793288946152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01865234225988388, + "rewards/margins": 0.22690460085868835, + "rewards/rejected": -0.24555695056915283, + "sft_loss": 0.1865234225988388, + "step": 2294 + }, + { + "epoch": 3.3188720173535793, + "grad_norm": 2.0824459255780283, + "learning_rate": 3.4673761257781e-06, + "logits/chosen": -0.6822003126144409, + "logits/rejected": -0.4748939871788025, + "logps/chosen": -0.13890133798122406, + "logps/rejected": -6.211273670196533, + "loss": 0.205, + "odds_ratio_loss": 0.009621636010706425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013890134170651436, + "rewards/margins": 0.6072372794151306, + "rewards/rejected": -0.6211273670196533, + "sft_loss": 0.13890133798122406, + "step": 2295 + }, + { + "epoch": 3.3203181489515545, + "grad_norm": 2.061127162028327, + "learning_rate": 3.46429806907933e-06, + "logits/chosen": -0.7757879495620728, + "logits/rejected": -0.5989354848861694, + "logps/chosen": -0.1916319727897644, + "logps/rejected": -3.734245777130127, + "loss": 0.1461, + "odds_ratio_loss": 0.027212627232074738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01916319690644741, + "rewards/margins": 0.3542613983154297, + "rewards/rejected": -0.37342458963394165, + "sft_loss": 0.1916319727897644, + "step": 2296 + }, + { + "epoch": 3.32176428054953, + "grad_norm": 2.114351797322761, + "learning_rate": 3.4612203353572503e-06, + "logits/chosen": -0.6353336572647095, + "logits/rejected": -0.5170816779136658, + "logps/chosen": -0.1459745466709137, + "logps/rejected": -4.982494831085205, + "loss": 0.2044, + "odds_ratio_loss": 0.020803630352020264, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014597454108297825, + "rewards/margins": 0.4836519956588745, + "rewards/rejected": -0.49824947118759155, + "sft_loss": 0.1459745466709137, + "step": 2297 + }, + { + "epoch": 3.3232104121475055, + "grad_norm": 1.946211322014238, + "learning_rate": 3.458142926467435e-06, + "logits/chosen": -0.3869672119617462, + "logits/rejected": -0.4443845748901367, + "logps/chosen": -0.12994037568569183, + "logps/rejected": -5.26711893081665, + "loss": 0.1448, + "odds_ratio_loss": 0.0468423031270504, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012994037009775639, + "rewards/margins": 0.5137178897857666, + "rewards/rejected": -0.5267119407653809, + "sft_loss": 0.12994037568569183, + "step": 2298 + }, + { + "epoch": 3.3246565437454807, + "grad_norm": 1.9993034368183027, + "learning_rate": 3.4550658442652686e-06, + "logits/chosen": -0.5554428100585938, + "logits/rejected": -0.40344059467315674, + "logps/chosen": -0.1770596206188202, + "logps/rejected": -5.824186325073242, + "loss": 0.2275, + "odds_ratio_loss": 0.009020314551889896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01770596206188202, + "rewards/margins": 0.564712643623352, + "rewards/rejected": -0.5824186205863953, + "sft_loss": 0.1770596206188202, + "step": 2299 + }, + { + "epoch": 3.3261026753434564, + "grad_norm": 1.9294061959051394, + "learning_rate": 3.4519890906059354e-06, + "logits/chosen": -0.6760809421539307, + "logits/rejected": -0.5845804214477539, + "logps/chosen": -0.14634880423545837, + "logps/rejected": -3.8098344802856445, + "loss": 0.1445, + "odds_ratio_loss": 0.03921994939446449, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014634879305958748, + "rewards/margins": 0.3663485646247864, + "rewards/rejected": -0.3809834420681, + "sft_loss": 0.14634880423545837, + "step": 2300 + }, + { + "epoch": 3.3275488069414316, + "grad_norm": 7.102938265333184, + "learning_rate": 3.448912667344418e-06, + "logits/chosen": -0.7923925518989563, + "logits/rejected": -0.5611757040023804, + "logps/chosen": -0.190931037068367, + "logps/rejected": -3.764523506164551, + "loss": 0.1962, + "odds_ratio_loss": 0.035255275666713715, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0190931037068367, + "rewards/margins": 0.35735929012298584, + "rewards/rejected": -0.37645238637924194, + "sft_loss": 0.190931037068367, + "step": 2301 + }, + { + "epoch": 3.328994938539407, + "grad_norm": 1.9369947329468233, + "learning_rate": 3.445836576335508e-06, + "logits/chosen": -0.6175248622894287, + "logits/rejected": -0.5267677903175354, + "logps/chosen": -0.2175239473581314, + "logps/rejected": -3.4381308555603027, + "loss": 0.1882, + "odds_ratio_loss": 0.041964493691921234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02175239287316799, + "rewards/margins": 0.3220607042312622, + "rewards/rejected": -0.34381306171417236, + "sft_loss": 0.2175239473581314, + "step": 2302 + }, + { + "epoch": 3.3304410701373826, + "grad_norm": 2.1746368999366212, + "learning_rate": 3.4427608194337895e-06, + "logits/chosen": -0.8088559508323669, + "logits/rejected": -0.731338381767273, + "logps/chosen": -0.23601864278316498, + "logps/rejected": -2.960899829864502, + "loss": 0.1881, + "odds_ratio_loss": 0.05574827641248703, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023601865395903587, + "rewards/margins": 0.2724881172180176, + "rewards/rejected": -0.2960899770259857, + "sft_loss": 0.23601864278316498, + "step": 2303 + }, + { + "epoch": 3.331887201735358, + "grad_norm": 2.3661530179952908, + "learning_rate": 3.4396853984936487e-06, + "logits/chosen": -0.4649229049682617, + "logits/rejected": -0.3578875660896301, + "logps/chosen": -0.09250445663928986, + "logps/rejected": -3.7127480506896973, + "loss": 0.1542, + "odds_ratio_loss": 0.017382729798555374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00925044622272253, + "rewards/margins": 0.36202433705329895, + "rewards/rejected": -0.37127479910850525, + "sft_loss": 0.09250445663928986, + "step": 2304 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 3.0663201292058044, + "learning_rate": 3.4366103153692667e-06, + "logits/chosen": -0.5506351590156555, + "logits/rejected": -0.4261362850666046, + "logps/chosen": -0.35145315527915955, + "logps/rejected": -2.7159597873687744, + "loss": 0.244, + "odds_ratio_loss": 0.06705104559659958, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03514531999826431, + "rewards/margins": 0.2364506721496582, + "rewards/rejected": -0.27159595489501953, + "sft_loss": 0.35145315527915955, + "step": 2305 + }, + { + "epoch": 3.3347794649313087, + "grad_norm": 2.187765405868488, + "learning_rate": 3.433535571914625e-06, + "logits/chosen": -0.6699624061584473, + "logits/rejected": -0.516162633895874, + "logps/chosen": -0.11950968205928802, + "logps/rejected": -4.081395626068115, + "loss": 0.186, + "odds_ratio_loss": 0.01648012362420559, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011950968764722347, + "rewards/margins": 0.3961886167526245, + "rewards/rejected": -0.40813958644866943, + "sft_loss": 0.11950968205928802, + "step": 2306 + }, + { + "epoch": 3.3362255965292844, + "grad_norm": 3.9448298831896684, + "learning_rate": 3.430461169983497e-06, + "logits/chosen": -0.5674360394477844, + "logits/rejected": -0.48793601989746094, + "logps/chosen": -0.17865288257598877, + "logps/rejected": -4.214674949645996, + "loss": 0.1906, + "odds_ratio_loss": 0.07441666722297668, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.017865289002656937, + "rewards/margins": 0.4036021828651428, + "rewards/rejected": -0.42146748304367065, + "sft_loss": 0.17865288257598877, + "step": 2307 + }, + { + "epoch": 3.3376717281272597, + "grad_norm": 1.970348244123892, + "learning_rate": 3.4273871114294503e-06, + "logits/chosen": -0.7424345016479492, + "logits/rejected": -0.7071555852890015, + "logps/chosen": -0.13843649625778198, + "logps/rejected": -4.173454284667969, + "loss": 0.1585, + "odds_ratio_loss": 0.021304195746779442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013843650929629803, + "rewards/margins": 0.40350183844566345, + "rewards/rejected": -0.41734546422958374, + "sft_loss": 0.13843649625778198, + "step": 2308 + }, + { + "epoch": 3.339117859725235, + "grad_norm": 1.9857192130170105, + "learning_rate": 3.4243133981058457e-06, + "logits/chosen": -0.5343315005302429, + "logits/rejected": -0.3975837826728821, + "logps/chosen": -0.11095140129327774, + "logps/rejected": -4.798469543457031, + "loss": 0.1805, + "odds_ratio_loss": 0.024698395282030106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011095140129327774, + "rewards/margins": 0.46875184774398804, + "rewards/rejected": -0.4798470139503479, + "sft_loss": 0.11095140129327774, + "step": 2309 + }, + { + "epoch": 3.3405639913232106, + "grad_norm": 1.9290741417311212, + "learning_rate": 3.421240031865839e-06, + "logits/chosen": -0.5730533599853516, + "logits/rejected": -0.5497214794158936, + "logps/chosen": -0.11085940897464752, + "logps/rejected": -3.2291131019592285, + "loss": 0.1509, + "odds_ratio_loss": 0.031010687351226807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011085940524935722, + "rewards/margins": 0.31182539463043213, + "rewards/rejected": -0.3229113221168518, + "sft_loss": 0.11085940897464752, + "step": 2310 + }, + { + "epoch": 3.342010122921186, + "grad_norm": 2.2141056969800883, + "learning_rate": 3.418167014562372e-06, + "logits/chosen": -0.6565252542495728, + "logits/rejected": -0.6016082763671875, + "logps/chosen": -0.12213000655174255, + "logps/rejected": -2.5358057022094727, + "loss": 0.2017, + "odds_ratio_loss": 0.03299042582511902, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012213001027703285, + "rewards/margins": 0.24136759340763092, + "rewards/rejected": -0.25358060002326965, + "sft_loss": 0.12213000655174255, + "step": 2311 + }, + { + "epoch": 3.343456254519161, + "grad_norm": 3.54538224416602, + "learning_rate": 3.415094348048178e-06, + "logits/chosen": -0.6507242918014526, + "logits/rejected": -0.5217114090919495, + "logps/chosen": -0.25277209281921387, + "logps/rejected": -3.5952529907226562, + "loss": 0.2842, + "odds_ratio_loss": 0.048409249633550644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025277208536863327, + "rewards/margins": 0.3342480957508087, + "rewards/rejected": -0.35952532291412354, + "sft_loss": 0.25277209281921387, + "step": 2312 + }, + { + "epoch": 3.3449023861171367, + "grad_norm": 2.029733725748979, + "learning_rate": 3.4120220341757816e-06, + "logits/chosen": -0.6231029033660889, + "logits/rejected": -0.5002346634864807, + "logps/chosen": -0.13461542129516602, + "logps/rejected": -3.9896812438964844, + "loss": 0.1734, + "odds_ratio_loss": 0.025905992835760117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013461543247103691, + "rewards/margins": 0.3855065703392029, + "rewards/rejected": -0.3989681005477905, + "sft_loss": 0.13461542129516602, + "step": 2313 + }, + { + "epoch": 3.346348517715112, + "grad_norm": 2.0223739152146694, + "learning_rate": 3.408950074797489e-06, + "logits/chosen": -0.47878187894821167, + "logits/rejected": -0.321734219789505, + "logps/chosen": -0.11608566343784332, + "logps/rejected": -4.263401031494141, + "loss": 0.1635, + "odds_ratio_loss": 0.028983620926737785, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011608565226197243, + "rewards/margins": 0.41473156213760376, + "rewards/rejected": -0.42634010314941406, + "sft_loss": 0.11608566343784332, + "step": 2314 + }, + { + "epoch": 3.3477946493130877, + "grad_norm": 1.9365576676145997, + "learning_rate": 3.4058784717653995e-06, + "logits/chosen": -0.4275784492492676, + "logits/rejected": -0.36998388171195984, + "logps/chosen": -0.1506321281194687, + "logps/rejected": -3.957401752471924, + "loss": 0.1715, + "odds_ratio_loss": 0.023368481546640396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015063212253153324, + "rewards/margins": 0.3806769549846649, + "rewards/rejected": -0.3957401514053345, + "sft_loss": 0.1506321281194687, + "step": 2315 + }, + { + "epoch": 3.349240780911063, + "grad_norm": 2.305557489402209, + "learning_rate": 3.402807226931391e-06, + "logits/chosen": -0.3678243160247803, + "logits/rejected": -0.3632332682609558, + "logps/chosen": -0.1722518503665924, + "logps/rejected": -4.876021862030029, + "loss": 0.1641, + "odds_ratio_loss": 0.0586848147213459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01722518540918827, + "rewards/margins": 0.4703770577907562, + "rewards/rejected": -0.48760223388671875, + "sft_loss": 0.1722518503665924, + "step": 2316 + }, + { + "epoch": 3.350686912509038, + "grad_norm": 1.809648668266014, + "learning_rate": 3.39973634214713e-06, + "logits/chosen": -0.4578765332698822, + "logits/rejected": -0.307986319065094, + "logps/chosen": -0.16912201046943665, + "logps/rejected": -2.635956048965454, + "loss": 0.2058, + "odds_ratio_loss": 0.04555736109614372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016912199556827545, + "rewards/margins": 0.24668340384960175, + "rewards/rejected": -0.2635956108570099, + "sft_loss": 0.16912201046943665, + "step": 2317 + }, + { + "epoch": 3.352133044107014, + "grad_norm": 2.0091293833397312, + "learning_rate": 3.396665819264063e-06, + "logits/chosen": -0.564257800579071, + "logits/rejected": -0.5668538808822632, + "logps/chosen": -0.23128202557563782, + "logps/rejected": -3.673203229904175, + "loss": 0.1731, + "odds_ratio_loss": 0.07565176486968994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023128200322389603, + "rewards/margins": 0.34419214725494385, + "rewards/rejected": -0.36732035875320435, + "sft_loss": 0.23128202557563782, + "step": 2318 + }, + { + "epoch": 3.353579175704989, + "grad_norm": 1.9041789941699363, + "learning_rate": 3.393595660133422e-06, + "logits/chosen": -0.6914946436882019, + "logits/rejected": -0.45303675532341003, + "logps/chosen": -0.15402580797672272, + "logps/rejected": -3.851901054382324, + "loss": 0.1957, + "odds_ratio_loss": 0.0196257084608078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015402580611407757, + "rewards/margins": 0.3697875142097473, + "rewards/rejected": -0.38519006967544556, + "sft_loss": 0.15402580797672272, + "step": 2319 + }, + { + "epoch": 3.3550253073029648, + "grad_norm": 2.4464738329993594, + "learning_rate": 3.390525866606215e-06, + "logits/chosen": -0.5936788320541382, + "logits/rejected": -0.3911159634590149, + "logps/chosen": -0.16877484321594238, + "logps/rejected": -5.249103546142578, + "loss": 0.185, + "odds_ratio_loss": 0.02867019921541214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01687748357653618, + "rewards/margins": 0.5080328583717346, + "rewards/rejected": -0.5249103307723999, + "sft_loss": 0.16877484321594238, + "step": 2320 + }, + { + "epoch": 3.35647143890094, + "grad_norm": 2.306906410554413, + "learning_rate": 3.3874564405332345e-06, + "logits/chosen": -0.42751482129096985, + "logits/rejected": -0.3295228183269501, + "logps/chosen": -0.108555868268013, + "logps/rejected": -3.7481698989868164, + "loss": 0.1913, + "odds_ratio_loss": 0.015943093225359917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010855588130652905, + "rewards/margins": 0.363961398601532, + "rewards/rejected": -0.37481701374053955, + "sft_loss": 0.108555868268013, + "step": 2321 + }, + { + "epoch": 3.3579175704989153, + "grad_norm": 2.511012207408778, + "learning_rate": 3.3843873837650446e-06, + "logits/chosen": -0.5912790298461914, + "logits/rejected": -0.43329575657844543, + "logps/chosen": -0.15693248808383942, + "logps/rejected": -6.197193145751953, + "loss": 0.2058, + "odds_ratio_loss": 0.0240620244294405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01569324918091297, + "rewards/margins": 0.6040260791778564, + "rewards/rejected": -0.6197193264961243, + "sft_loss": 0.15693248808383942, + "step": 2322 + }, + { + "epoch": 3.359363702096891, + "grad_norm": 2.3284028366780136, + "learning_rate": 3.3813186981519962e-06, + "logits/chosen": -0.5598545074462891, + "logits/rejected": -0.2795000672340393, + "logps/chosen": -0.2694113254547119, + "logps/rejected": -5.610904693603516, + "loss": 0.1817, + "odds_ratio_loss": 0.05200380086898804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02694113366305828, + "rewards/margins": 0.5341493487358093, + "rewards/rejected": -0.5610904693603516, + "sft_loss": 0.2694113254547119, + "step": 2323 + }, + { + "epoch": 3.360809833694866, + "grad_norm": 2.197930880971812, + "learning_rate": 3.378250385544208e-06, + "logits/chosen": -0.5665892958641052, + "logits/rejected": -0.42673903703689575, + "logps/chosen": -0.1817660629749298, + "logps/rejected": -4.3229498863220215, + "loss": 0.1871, + "odds_ratio_loss": 0.03276637941598892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0181766077876091, + "rewards/margins": 0.41411834955215454, + "rewards/rejected": -0.43229496479034424, + "sft_loss": 0.1817660629749298, + "step": 2324 + }, + { + "epoch": 3.3622559652928414, + "grad_norm": 2.067932219260601, + "learning_rate": 3.375182447791577e-06, + "logits/chosen": -0.6327534914016724, + "logits/rejected": -0.36072319746017456, + "logps/chosen": -0.16353721916675568, + "logps/rejected": -3.5113089084625244, + "loss": 0.1897, + "odds_ratio_loss": 0.027788694947957993, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016353722661733627, + "rewards/margins": 0.33477717638015747, + "rewards/rejected": -0.351130872964859, + "sft_loss": 0.16353721916675568, + "step": 2325 + }, + { + "epoch": 3.363702096890817, + "grad_norm": 2.0455089321641178, + "learning_rate": 3.3721148867437774e-06, + "logits/chosen": -0.5868119597434998, + "logits/rejected": -0.396449476480484, + "logps/chosen": -0.12406010925769806, + "logps/rejected": -5.176513195037842, + "loss": 0.1933, + "odds_ratio_loss": 0.026875488460063934, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012406010180711746, + "rewards/margins": 0.5052453875541687, + "rewards/rejected": -0.5176513195037842, + "sft_loss": 0.12406010925769806, + "step": 2326 + }, + { + "epoch": 3.3651482284887924, + "grad_norm": 1.8271184123517485, + "learning_rate": 3.3690477042502496e-06, + "logits/chosen": -0.5884243249893188, + "logits/rejected": -0.48027777671813965, + "logps/chosen": -0.27801549434661865, + "logps/rejected": -4.671180725097656, + "loss": 0.1827, + "odds_ratio_loss": 0.05004490539431572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027801549062132835, + "rewards/margins": 0.4393165707588196, + "rewards/rejected": -0.46711811423301697, + "sft_loss": 0.27801549434661865, + "step": 2327 + }, + { + "epoch": 3.366594360086768, + "grad_norm": 1.8966435232463192, + "learning_rate": 3.365980902160212e-06, + "logits/chosen": -0.40547722578048706, + "logits/rejected": -0.36879974603652954, + "logps/chosen": -0.21847718954086304, + "logps/rejected": -4.9782304763793945, + "loss": 0.197, + "odds_ratio_loss": 0.0362633541226387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021847719326615334, + "rewards/margins": 0.4759753346443176, + "rewards/rejected": -0.4978230595588684, + "sft_loss": 0.21847718954086304, + "step": 2328 + }, + { + "epoch": 3.3680404916847433, + "grad_norm": 2.0848613162126814, + "learning_rate": 3.3629144823226482e-06, + "logits/chosen": -0.7243247032165527, + "logits/rejected": -0.5537648797035217, + "logps/chosen": -0.2133502960205078, + "logps/rejected": -3.376030921936035, + "loss": 0.1792, + "odds_ratio_loss": 0.041394349187612534, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021335028111934662, + "rewards/margins": 0.31626808643341064, + "rewards/rejected": -0.3376030921936035, + "sft_loss": 0.2133502960205078, + "step": 2329 + }, + { + "epoch": 3.369486623282719, + "grad_norm": 1.9065026429617424, + "learning_rate": 3.3598484465863172e-06, + "logits/chosen": -0.4827233552932739, + "logits/rejected": -0.3291730284690857, + "logps/chosen": -0.13992281258106232, + "logps/rejected": -4.810104846954346, + "loss": 0.1576, + "odds_ratio_loss": 0.028171217069029808, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013992281630635262, + "rewards/margins": 0.4670182168483734, + "rewards/rejected": -0.4810104966163635, + "sft_loss": 0.13992281258106232, + "step": 2330 + }, + { + "epoch": 3.370932754880694, + "grad_norm": 2.093063883047962, + "learning_rate": 3.356782796799741e-06, + "logits/chosen": -0.7263155579566956, + "logits/rejected": -0.5995932817459106, + "logps/chosen": -0.2737896740436554, + "logps/rejected": -2.341442823410034, + "loss": 0.1972, + "odds_ratio_loss": 0.07859160006046295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02737896703183651, + "rewards/margins": 0.2067653238773346, + "rewards/rejected": -0.23414428532123566, + "sft_loss": 0.2737896740436554, + "step": 2331 + }, + { + "epoch": 3.3723788864786695, + "grad_norm": 2.280755776822081, + "learning_rate": 3.3537175348112132e-06, + "logits/chosen": -0.5803489089012146, + "logits/rejected": -0.4628605246543884, + "logps/chosen": -0.2696508467197418, + "logps/rejected": -4.5527024269104, + "loss": 0.217, + "odds_ratio_loss": 0.03504379093647003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026965085417032242, + "rewards/margins": 0.42830514907836914, + "rewards/rejected": -0.4552702307701111, + "sft_loss": 0.2696508467197418, + "step": 2332 + }, + { + "epoch": 3.373825018076645, + "grad_norm": 2.4387381727232342, + "learning_rate": 3.350652662468789e-06, + "logits/chosen": -0.5568262338638306, + "logits/rejected": -0.5146419405937195, + "logps/chosen": -0.27246224880218506, + "logps/rejected": -4.7472405433654785, + "loss": 0.2329, + "odds_ratio_loss": 0.06966537982225418, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027246225625276566, + "rewards/margins": 0.4474778175354004, + "rewards/rejected": -0.47472405433654785, + "sft_loss": 0.27246224880218506, + "step": 2333 + }, + { + "epoch": 3.3752711496746204, + "grad_norm": 2.089432631192305, + "learning_rate": 3.347588181620295e-06, + "logits/chosen": -0.6027547121047974, + "logits/rejected": -0.45868149399757385, + "logps/chosen": -0.22351467609405518, + "logps/rejected": -3.78956937789917, + "loss": 0.2044, + "odds_ratio_loss": 0.028093697503209114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022351469844579697, + "rewards/margins": 0.3566054701805115, + "rewards/rejected": -0.37895697355270386, + "sft_loss": 0.22351467609405518, + "step": 2334 + }, + { + "epoch": 3.3767172812725956, + "grad_norm": 2.017534606069084, + "learning_rate": 3.344524094113315e-06, + "logits/chosen": -0.5132440328598022, + "logits/rejected": -0.4382811188697815, + "logps/chosen": -0.24050694704055786, + "logps/rejected": -4.633388042449951, + "loss": 0.2197, + "odds_ratio_loss": 0.06342487037181854, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024050697684288025, + "rewards/margins": 0.4392881393432617, + "rewards/rejected": -0.46333885192871094, + "sft_loss": 0.24050694704055786, + "step": 2335 + }, + { + "epoch": 3.3781634128705713, + "grad_norm": 2.5427888945503003, + "learning_rate": 3.3414604017952012e-06, + "logits/chosen": -0.43957871198654175, + "logits/rejected": -0.32247668504714966, + "logps/chosen": -0.16919280588626862, + "logps/rejected": -4.3719401359558105, + "loss": 0.2299, + "odds_ratio_loss": 0.015139477327466011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01691928133368492, + "rewards/margins": 0.4202747344970703, + "rewards/rejected": -0.43719398975372314, + "sft_loss": 0.16919280588626862, + "step": 2336 + }, + { + "epoch": 3.3796095444685466, + "grad_norm": 1.907668530921736, + "learning_rate": 3.338397106513062e-06, + "logits/chosen": -0.5221062898635864, + "logits/rejected": -0.44908884167671204, + "logps/chosen": -0.12111540138721466, + "logps/rejected": -5.3049821853637695, + "loss": 0.1675, + "odds_ratio_loss": 0.013860628008842468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012111540883779526, + "rewards/margins": 0.5183866620063782, + "rewards/rejected": -0.530498206615448, + "sft_loss": 0.12111540138721466, + "step": 2337 + }, + { + "epoch": 3.3810556760665222, + "grad_norm": 2.7346827804749414, + "learning_rate": 3.3353342101137716e-06, + "logits/chosen": -0.7345597743988037, + "logits/rejected": -0.5234204530715942, + "logps/chosen": -0.24222394824028015, + "logps/rejected": -3.3377528190612793, + "loss": 0.2078, + "odds_ratio_loss": 0.034483764320611954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024222396314144135, + "rewards/margins": 0.3095529079437256, + "rewards/rejected": -0.3337753117084503, + "sft_loss": 0.24222394824028015, + "step": 2338 + }, + { + "epoch": 3.3825018076644975, + "grad_norm": 1.917048235526739, + "learning_rate": 3.3322717144439625e-06, + "logits/chosen": -0.46222031116485596, + "logits/rejected": -0.27375099062919617, + "logps/chosen": -0.1882801055908203, + "logps/rejected": -2.4854445457458496, + "loss": 0.2039, + "odds_ratio_loss": 0.03459760174155235, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018828010186553, + "rewards/margins": 0.2297164499759674, + "rewards/rejected": -0.24854443967342377, + "sft_loss": 0.1882801055908203, + "step": 2339 + }, + { + "epoch": 3.3839479392624727, + "grad_norm": 2.0568650037186837, + "learning_rate": 3.329209621350022e-06, + "logits/chosen": -0.40930983424186707, + "logits/rejected": -0.33890777826309204, + "logps/chosen": -0.12745355069637299, + "logps/rejected": -3.447296619415283, + "loss": 0.1956, + "odds_ratio_loss": 0.023260656744241714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012745355255901814, + "rewards/margins": 0.33198434114456177, + "rewards/rejected": -0.3447296917438507, + "sft_loss": 0.12745355069637299, + "step": 2340 + }, + { + "epoch": 3.3853940708604484, + "grad_norm": 1.8812355165375803, + "learning_rate": 3.326147932678101e-06, + "logits/chosen": -0.5835216045379639, + "logits/rejected": -0.3774033486843109, + "logps/chosen": -0.1862981766462326, + "logps/rejected": -2.6372690200805664, + "loss": 0.1454, + "odds_ratio_loss": 0.04647889733314514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01862981729209423, + "rewards/margins": 0.2450971007347107, + "rewards/rejected": -0.2637269198894501, + "sft_loss": 0.1862981766462326, + "step": 2341 + }, + { + "epoch": 3.3868402024584237, + "grad_norm": 2.0368914180952524, + "learning_rate": 3.3230866502741003e-06, + "logits/chosen": -0.5929782390594482, + "logits/rejected": -0.4657699167728424, + "logps/chosen": -0.15571850538253784, + "logps/rejected": -4.687864303588867, + "loss": 0.1233, + "odds_ratio_loss": 0.03532061725854874, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01557185035198927, + "rewards/margins": 0.453214555978775, + "rewards/rejected": -0.4687863886356354, + "sft_loss": 0.15571850538253784, + "step": 2342 + }, + { + "epoch": 3.3882863340563993, + "grad_norm": 2.5322463156632584, + "learning_rate": 3.3200257759836797e-06, + "logits/chosen": -0.5486747026443481, + "logits/rejected": -0.4624249041080475, + "logps/chosen": -0.15167993307113647, + "logps/rejected": -4.17061185836792, + "loss": 0.2099, + "odds_ratio_loss": 0.028253663331270218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015167992562055588, + "rewards/margins": 0.4018931984901428, + "rewards/rejected": -0.4170612096786499, + "sft_loss": 0.15167993307113647, + "step": 2343 + }, + { + "epoch": 3.3897324656543746, + "grad_norm": 3.862720927006293, + "learning_rate": 3.3169653116522495e-06, + "logits/chosen": -0.585822343826294, + "logits/rejected": -0.43602919578552246, + "logps/chosen": -0.18509134650230408, + "logps/rejected": -2.327944755554199, + "loss": 0.2604, + "odds_ratio_loss": 0.04921817034482956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018509134650230408, + "rewards/margins": 0.21428535878658295, + "rewards/rejected": -0.23279447853565216, + "sft_loss": 0.18509134650230408, + "step": 2344 + }, + { + "epoch": 3.39117859725235, + "grad_norm": 1.950000962651423, + "learning_rate": 3.3139052591249787e-06, + "logits/chosen": -0.6751699447631836, + "logits/rejected": -0.5654871463775635, + "logps/chosen": -0.15269403159618378, + "logps/rejected": -3.7506747245788574, + "loss": 0.1931, + "odds_ratio_loss": 0.016780618578195572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015269402414560318, + "rewards/margins": 0.35979804396629333, + "rewards/rejected": -0.37506741285324097, + "sft_loss": 0.15269403159618378, + "step": 2345 + }, + { + "epoch": 3.3926247288503255, + "grad_norm": 2.4756529994631578, + "learning_rate": 3.310845620246782e-06, + "logits/chosen": -0.5545775294303894, + "logits/rejected": -0.5234758257865906, + "logps/chosen": -0.3893795907497406, + "logps/rejected": -3.45979642868042, + "loss": 0.2417, + "odds_ratio_loss": 0.06643575429916382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03893795982003212, + "rewards/margins": 0.3070417046546936, + "rewards/rejected": -0.34597963094711304, + "sft_loss": 0.3893795907497406, + "step": 2346 + }, + { + "epoch": 3.3940708604483008, + "grad_norm": 1.9452862350833446, + "learning_rate": 3.307786396862328e-06, + "logits/chosen": -0.514398992061615, + "logits/rejected": -0.418234258890152, + "logps/chosen": -0.1784936636686325, + "logps/rejected": -3.257981777191162, + "loss": 0.169, + "odds_ratio_loss": 0.029191169887781143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01784936711192131, + "rewards/margins": 0.3079487979412079, + "rewards/rejected": -0.3257981538772583, + "sft_loss": 0.1784936636686325, + "step": 2347 + }, + { + "epoch": 3.395516992046276, + "grad_norm": 1.8849207427683208, + "learning_rate": 3.3047275908160313e-06, + "logits/chosen": -0.6669695377349854, + "logits/rejected": -0.5456492304801941, + "logps/chosen": -0.14028765261173248, + "logps/rejected": -3.805546522140503, + "loss": 0.1403, + "odds_ratio_loss": 0.0347730778157711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014028767123818398, + "rewards/margins": 0.36652588844299316, + "rewards/rejected": -0.3805546462535858, + "sft_loss": 0.14028765261173248, + "step": 2348 + }, + { + "epoch": 3.3969631236442517, + "grad_norm": 3.2445046573108662, + "learning_rate": 3.301669203952062e-06, + "logits/chosen": -0.6601239442825317, + "logits/rejected": -0.5460609793663025, + "logps/chosen": -0.33426791429519653, + "logps/rejected": -3.9542243480682373, + "loss": 0.2175, + "odds_ratio_loss": 0.059068068861961365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03342679515480995, + "rewards/margins": 0.361995667219162, + "rewards/rejected": -0.39542245864868164, + "sft_loss": 0.33426791429519653, + "step": 2349 + }, + { + "epoch": 3.398409255242227, + "grad_norm": 2.158850405411536, + "learning_rate": 3.298611238114329e-06, + "logits/chosen": -0.6491446495056152, + "logits/rejected": -0.5393118858337402, + "logps/chosen": -0.1076105535030365, + "logps/rejected": -4.072206497192383, + "loss": 0.1556, + "odds_ratio_loss": 0.056976694613695145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01076105609536171, + "rewards/margins": 0.3964596390724182, + "rewards/rejected": -0.4072206914424896, + "sft_loss": 0.1076105535030365, + "step": 2350 + }, + { + "epoch": 3.3998553868402026, + "grad_norm": 1.9828791161868011, + "learning_rate": 3.2955536951464928e-06, + "logits/chosen": -0.5805346965789795, + "logits/rejected": -0.43692758679389954, + "logps/chosen": -0.24223265051841736, + "logps/rejected": -3.075357437133789, + "loss": 0.1726, + "odds_ratio_loss": 0.08130443841218948, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024223264306783676, + "rewards/margins": 0.28331252932548523, + "rewards/rejected": -0.3075357675552368, + "sft_loss": 0.24223265051841736, + "step": 2351 + }, + { + "epoch": 3.401301518438178, + "grad_norm": 2.6132585882729185, + "learning_rate": 3.2924965768919584e-06, + "logits/chosen": -0.6605554819107056, + "logits/rejected": -0.5415310263633728, + "logps/chosen": -0.20664909482002258, + "logps/rejected": -4.503979682922363, + "loss": 0.1837, + "odds_ratio_loss": 0.026226144284009933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020664909854531288, + "rewards/margins": 0.4297330677509308, + "rewards/rejected": -0.4503979980945587, + "sft_loss": 0.20664909482002258, + "step": 2352 + }, + { + "epoch": 3.4027476500361535, + "grad_norm": 2.0914371446634217, + "learning_rate": 3.2894398851938722e-06, + "logits/chosen": -0.5870836973190308, + "logits/rejected": -0.5486207008361816, + "logps/chosen": -0.3236868679523468, + "logps/rejected": -3.2036993503570557, + "loss": 0.2116, + "odds_ratio_loss": 0.10235699266195297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03236868977546692, + "rewards/margins": 0.28800126910209656, + "rewards/rejected": -0.3203699588775635, + "sft_loss": 0.3236868679523468, + "step": 2353 + }, + { + "epoch": 3.404193781634129, + "grad_norm": 1.9042342118993496, + "learning_rate": 3.2863836218951264e-06, + "logits/chosen": -0.5434327721595764, + "logits/rejected": -0.6129238605499268, + "logps/chosen": -0.15127164125442505, + "logps/rejected": -3.548457145690918, + "loss": 0.1374, + "odds_ratio_loss": 0.0960196778178215, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.01512716431170702, + "rewards/margins": 0.3397185206413269, + "rewards/rejected": -0.35484570264816284, + "sft_loss": 0.15127164125442505, + "step": 2354 + }, + { + "epoch": 3.405639913232104, + "grad_norm": 2.0168858297493384, + "learning_rate": 3.283327788838351e-06, + "logits/chosen": -0.46761107444763184, + "logits/rejected": -0.40209662914276123, + "logps/chosen": -0.16615912318229675, + "logps/rejected": -4.629472732543945, + "loss": 0.1965, + "odds_ratio_loss": 0.07964546978473663, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.016615912318229675, + "rewards/margins": 0.44633132219314575, + "rewards/rejected": -0.462947279214859, + "sft_loss": 0.16615912318229675, + "step": 2355 + }, + { + "epoch": 3.4070860448300797, + "grad_norm": 1.9900945386518782, + "learning_rate": 3.2802723878659227e-06, + "logits/chosen": -0.7723032236099243, + "logits/rejected": -0.5101234912872314, + "logps/chosen": -0.18992879986763, + "logps/rejected": -3.797363758087158, + "loss": 0.162, + "odds_ratio_loss": 0.04275386407971382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01899288222193718, + "rewards/margins": 0.3607434928417206, + "rewards/rejected": -0.3797363340854645, + "sft_loss": 0.18992879986763, + "step": 2356 + }, + { + "epoch": 3.408532176428055, + "grad_norm": 2.0183495710777297, + "learning_rate": 3.2772174208199506e-06, + "logits/chosen": -0.6566188335418701, + "logits/rejected": -0.3303099572658539, + "logps/chosen": -0.11480744928121567, + "logps/rejected": -6.169617652893066, + "loss": 0.1865, + "odds_ratio_loss": 0.017061393707990646, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011480744928121567, + "rewards/margins": 0.6054810285568237, + "rewards/rejected": -0.6169617772102356, + "sft_loss": 0.11480744928121567, + "step": 2357 + }, + { + "epoch": 3.40997830802603, + "grad_norm": 1.8891380224756906, + "learning_rate": 3.27416288954229e-06, + "logits/chosen": -0.44351673126220703, + "logits/rejected": -0.43019381165504456, + "logps/chosen": -0.10614179819822311, + "logps/rejected": -6.608994007110596, + "loss": 0.1517, + "odds_ratio_loss": 0.028982926160097122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010614179074764252, + "rewards/margins": 0.6502852439880371, + "rewards/rejected": -0.6608994603157043, + "sft_loss": 0.10614179819822311, + "step": 2358 + }, + { + "epoch": 3.411424439624006, + "grad_norm": 2.5923012440968547, + "learning_rate": 3.2711087958745244e-06, + "logits/chosen": -0.5696052312850952, + "logits/rejected": -0.41010528802871704, + "logps/chosen": -0.26490670442581177, + "logps/rejected": -4.231900215148926, + "loss": 0.2317, + "odds_ratio_loss": 0.07147705554962158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026490669697523117, + "rewards/margins": 0.39669930934906006, + "rewards/rejected": -0.42318999767303467, + "sft_loss": 0.26490670442581177, + "step": 2359 + }, + { + "epoch": 3.412870571221981, + "grad_norm": 2.0508780602004104, + "learning_rate": 3.2680551416579814e-06, + "logits/chosen": -0.680136501789093, + "logits/rejected": -0.4244930148124695, + "logps/chosen": -0.16198989748954773, + "logps/rejected": -3.4155640602111816, + "loss": 0.1938, + "odds_ratio_loss": 0.025843270123004913, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016198990866541862, + "rewards/margins": 0.32535743713378906, + "rewards/rejected": -0.3415564000606537, + "sft_loss": 0.16198989748954773, + "step": 2360 + }, + { + "epoch": 3.414316702819957, + "grad_norm": 1.887673138028965, + "learning_rate": 3.265001928733718e-06, + "logits/chosen": -0.6401176452636719, + "logits/rejected": -0.5871616005897522, + "logps/chosen": -0.22450858354568481, + "logps/rejected": -3.574300765991211, + "loss": 0.2106, + "odds_ratio_loss": 0.04913847893476486, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02245086058974266, + "rewards/margins": 0.33497920632362366, + "rewards/rejected": -0.357430100440979, + "sft_loss": 0.22450858354568481, + "step": 2361 + }, + { + "epoch": 3.415762834417932, + "grad_norm": 2.082031818490767, + "learning_rate": 3.2619491589425315e-06, + "logits/chosen": -0.6551008820533752, + "logits/rejected": -0.37433722615242004, + "logps/chosen": -0.19321411848068237, + "logps/rejected": -3.2051291465759277, + "loss": 0.2201, + "odds_ratio_loss": 0.06106024235486984, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019321411848068237, + "rewards/margins": 0.301191508769989, + "rewards/rejected": -0.32051289081573486, + "sft_loss": 0.19321411848068237, + "step": 2362 + }, + { + "epoch": 3.4172089660159073, + "grad_norm": 2.178896734407967, + "learning_rate": 3.2588968341249446e-06, + "logits/chosen": -0.49421876668930054, + "logits/rejected": -0.4174592196941376, + "logps/chosen": -0.15940704941749573, + "logps/rejected": -4.831748008728027, + "loss": 0.2088, + "odds_ratio_loss": 0.02657831273972988, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015940703451633453, + "rewards/margins": 0.4672340750694275, + "rewards/rejected": -0.48317480087280273, + "sft_loss": 0.15940704941749573, + "step": 2363 + }, + { + "epoch": 3.418655097613883, + "grad_norm": 2.3614175718808053, + "learning_rate": 3.2558449561212175e-06, + "logits/chosen": -0.7048352360725403, + "logits/rejected": -0.3603106439113617, + "logps/chosen": -0.20679709315299988, + "logps/rejected": -4.15094518661499, + "loss": 0.2183, + "odds_ratio_loss": 0.0959421694278717, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.020679708570241928, + "rewards/margins": 0.3944148123264313, + "rewards/rejected": -0.4150944948196411, + "sft_loss": 0.20679709315299988, + "step": 2364 + }, + { + "epoch": 3.420101229211858, + "grad_norm": 1.8595835773087739, + "learning_rate": 3.2527935267713358e-06, + "logits/chosen": -0.45306140184402466, + "logits/rejected": -0.45435231924057007, + "logps/chosen": -0.1741219162940979, + "logps/rejected": -2.359769821166992, + "loss": 0.1421, + "odds_ratio_loss": 0.04474630579352379, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01741219311952591, + "rewards/margins": 0.21856479346752167, + "rewards/rejected": -0.23597699403762817, + "sft_loss": 0.1741219162940979, + "step": 2365 + }, + { + "epoch": 3.421547360809834, + "grad_norm": 2.0437400502249163, + "learning_rate": 3.249742547915021e-06, + "logits/chosen": -0.6979148983955383, + "logits/rejected": -0.5617083311080933, + "logps/chosen": -0.15010838210582733, + "logps/rejected": -4.167182445526123, + "loss": 0.2088, + "odds_ratio_loss": 0.030087631195783615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015010838396847248, + "rewards/margins": 0.40170741081237793, + "rewards/rejected": -0.4167182743549347, + "sft_loss": 0.15010838210582733, + "step": 2366 + }, + { + "epoch": 3.422993492407809, + "grad_norm": 2.2100265623341286, + "learning_rate": 3.246692021391719e-06, + "logits/chosen": -0.5132609605789185, + "logits/rejected": -0.3893182575702667, + "logps/chosen": -0.2116222381591797, + "logps/rejected": -7.529613494873047, + "loss": 0.2881, + "odds_ratio_loss": 0.03504924103617668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02116222307085991, + "rewards/margins": 0.7317991256713867, + "rewards/rejected": -0.7529613375663757, + "sft_loss": 0.2116222381591797, + "step": 2367 + }, + { + "epoch": 3.4244396240057844, + "grad_norm": 1.9627649713389184, + "learning_rate": 3.2436419490406014e-06, + "logits/chosen": -0.5135637521743774, + "logits/rejected": -0.4365648925304413, + "logps/chosen": -0.18647979199886322, + "logps/rejected": -4.965529441833496, + "loss": 0.165, + "odds_ratio_loss": 0.05243527889251709, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01864797994494438, + "rewards/margins": 0.4779050052165985, + "rewards/rejected": -0.496552973985672, + "sft_loss": 0.18647979199886322, + "step": 2368 + }, + { + "epoch": 3.42588575560376, + "grad_norm": 1.9653134748886898, + "learning_rate": 3.2405923327005713e-06, + "logits/chosen": -0.6600465774536133, + "logits/rejected": -0.44938400387763977, + "logps/chosen": -0.14724966883659363, + "logps/rejected": -3.737379550933838, + "loss": 0.1758, + "odds_ratio_loss": 0.029483633115887642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014724968001246452, + "rewards/margins": 0.35901302099227905, + "rewards/rejected": -0.37373796105384827, + "sft_loss": 0.14724966883659363, + "step": 2369 + }, + { + "epoch": 3.4273318872017353, + "grad_norm": 2.2160538606815017, + "learning_rate": 3.237543174210251e-06, + "logits/chosen": -0.8269349336624146, + "logits/rejected": -0.6046656370162964, + "logps/chosen": -0.09066790342330933, + "logps/rejected": -5.1507792472839355, + "loss": 0.1569, + "odds_ratio_loss": 0.013543096370995045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009066790342330933, + "rewards/margins": 0.5060111880302429, + "rewards/rejected": -0.5150779485702515, + "sft_loss": 0.09066790342330933, + "step": 2370 + }, + { + "epoch": 3.4287780187997106, + "grad_norm": 1.899550802406212, + "learning_rate": 3.234494475407992e-06, + "logits/chosen": -0.6040807962417603, + "logits/rejected": -0.6489887237548828, + "logps/chosen": -0.12181614339351654, + "logps/rejected": -3.8885912895202637, + "loss": 0.1457, + "odds_ratio_loss": 0.024596858769655228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012181615456938744, + "rewards/margins": 0.3766775131225586, + "rewards/rejected": -0.3888591527938843, + "sft_loss": 0.12181614339351654, + "step": 2371 + }, + { + "epoch": 3.4302241503976862, + "grad_norm": 2.099727938709925, + "learning_rate": 3.231446238131863e-06, + "logits/chosen": -0.6808019876480103, + "logits/rejected": -0.5160682797431946, + "logps/chosen": -0.1048995703458786, + "logps/rejected": -5.250744819641113, + "loss": 0.1281, + "odds_ratio_loss": 0.008181117475032806, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01048995740711689, + "rewards/margins": 0.514584481716156, + "rewards/rejected": -0.5250744223594666, + "sft_loss": 0.1048995703458786, + "step": 2372 + }, + { + "epoch": 3.4316702819956615, + "grad_norm": 2.706733983494289, + "learning_rate": 3.2283984642196613e-06, + "logits/chosen": -0.6812127828598022, + "logits/rejected": -0.5712729692459106, + "logps/chosen": -0.11651255190372467, + "logps/rejected": -4.5421271324157715, + "loss": 0.1814, + "odds_ratio_loss": 0.023081321269273758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011651256121695042, + "rewards/margins": 0.44256147742271423, + "rewards/rejected": -0.4542126953601837, + "sft_loss": 0.11651255190372467, + "step": 2373 + }, + { + "epoch": 3.433116413593637, + "grad_norm": 5.394973473846446, + "learning_rate": 3.225351155508898e-06, + "logits/chosen": -0.6133010387420654, + "logits/rejected": -0.45091527700424194, + "logps/chosen": -0.22377179563045502, + "logps/rejected": -3.0302748680114746, + "loss": 0.1914, + "odds_ratio_loss": 0.0691990777850151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02237718179821968, + "rewards/margins": 0.2806503176689148, + "rewards/rejected": -0.303027480840683, + "sft_loss": 0.22377179563045502, + "step": 2374 + }, + { + "epoch": 3.4345625451916124, + "grad_norm": 2.1457624970133327, + "learning_rate": 3.222304313836809e-06, + "logits/chosen": -0.556597888469696, + "logits/rejected": -0.49058791995048523, + "logps/chosen": -0.2874477505683899, + "logps/rejected": -4.049435615539551, + "loss": 0.2413, + "odds_ratio_loss": 0.06138356029987335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02874477580189705, + "rewards/margins": 0.37619873881340027, + "rewards/rejected": -0.4049435555934906, + "sft_loss": 0.2874477505683899, + "step": 2375 + }, + { + "epoch": 3.436008676789588, + "grad_norm": 1.9218950138125275, + "learning_rate": 3.219257941040344e-06, + "logits/chosen": -0.5005204677581787, + "logits/rejected": -0.28589102625846863, + "logps/chosen": -0.17385639250278473, + "logps/rejected": -4.714556694030762, + "loss": 0.1819, + "odds_ratio_loss": 0.03751099109649658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017385639250278473, + "rewards/margins": 0.4540700316429138, + "rewards/rejected": -0.4714556634426117, + "sft_loss": 0.17385639250278473, + "step": 2376 + }, + { + "epoch": 3.4374548083875633, + "grad_norm": 1.8161697265415382, + "learning_rate": 3.216212038956176e-06, + "logits/chosen": -0.5983232855796814, + "logits/rejected": -0.5303330421447754, + "logps/chosen": -0.16004885733127594, + "logps/rejected": -3.1815803050994873, + "loss": 0.154, + "odds_ratio_loss": 0.04018660634756088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016004884615540504, + "rewards/margins": 0.30215317010879517, + "rewards/rejected": -0.31815803050994873, + "sft_loss": 0.16004885733127594, + "step": 2377 + }, + { + "epoch": 3.4389009399855386, + "grad_norm": 2.316256513557429, + "learning_rate": 3.2131666094206877e-06, + "logits/chosen": -0.6681080460548401, + "logits/rejected": -0.4476345479488373, + "logps/chosen": -0.1448899507522583, + "logps/rejected": -3.2004542350769043, + "loss": 0.2104, + "odds_ratio_loss": 0.038483526557683945, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01448899321258068, + "rewards/margins": 0.30555644631385803, + "rewards/rejected": -0.32004544138908386, + "sft_loss": 0.1448899507522583, + "step": 2378 + }, + { + "epoch": 3.4403470715835143, + "grad_norm": 2.075563631432093, + "learning_rate": 3.2101216542699807e-06, + "logits/chosen": -0.5723260641098022, + "logits/rejected": -0.46136629581451416, + "logps/chosen": -0.21146410703659058, + "logps/rejected": -2.679683208465576, + "loss": 0.2148, + "odds_ratio_loss": 0.048095718026161194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021146409213542938, + "rewards/margins": 0.24682191014289856, + "rewards/rejected": -0.2679683268070221, + "sft_loss": 0.21146410703659058, + "step": 2379 + }, + { + "epoch": 3.4417932031814895, + "grad_norm": 2.2145156618535804, + "learning_rate": 3.207077175339871e-06, + "logits/chosen": -0.327717661857605, + "logits/rejected": -0.3595542311668396, + "logps/chosen": -0.1934828907251358, + "logps/rejected": -4.13493537902832, + "loss": 0.1963, + "odds_ratio_loss": 0.04170215129852295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01934828981757164, + "rewards/margins": 0.39414525032043457, + "rewards/rejected": -0.4134935438632965, + "sft_loss": 0.1934828907251358, + "step": 2380 + }, + { + "epoch": 3.4432393347794648, + "grad_norm": 2.227105578312563, + "learning_rate": 3.204033174465886e-06, + "logits/chosen": -0.5712621808052063, + "logits/rejected": -0.5687679648399353, + "logps/chosen": -0.20861420035362244, + "logps/rejected": -3.32293701171875, + "loss": 0.2707, + "odds_ratio_loss": 0.044927455484867096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020861420780420303, + "rewards/margins": 0.3114323019981384, + "rewards/rejected": -0.33229371905326843, + "sft_loss": 0.20861420035362244, + "step": 2381 + }, + { + "epoch": 3.4446854663774404, + "grad_norm": 2.090363545406971, + "learning_rate": 3.2009896534832645e-06, + "logits/chosen": -0.6760439872741699, + "logits/rejected": -0.4026757478713989, + "logps/chosen": -0.1740947961807251, + "logps/rejected": -5.411316871643066, + "loss": 0.1854, + "odds_ratio_loss": 0.029533682391047478, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01740947738289833, + "rewards/margins": 0.523722231388092, + "rewards/rejected": -0.5411317348480225, + "sft_loss": 0.1740947961807251, + "step": 2382 + }, + { + "epoch": 3.4461315979754157, + "grad_norm": 2.029695406988353, + "learning_rate": 3.1979466142269555e-06, + "logits/chosen": -0.6552931666374207, + "logits/rejected": -0.5883285403251648, + "logps/chosen": -0.17759792506694794, + "logps/rejected": -2.6016619205474854, + "loss": 0.1752, + "odds_ratio_loss": 0.04593576118350029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017759792506694794, + "rewards/margins": 0.24240639805793762, + "rewards/rejected": -0.260166198015213, + "sft_loss": 0.17759792506694794, + "step": 2383 + }, + { + "epoch": 3.4475777295733914, + "grad_norm": 1.9069898883594973, + "learning_rate": 3.194904058531621e-06, + "logits/chosen": -0.6044560670852661, + "logits/rejected": -0.3972908854484558, + "logps/chosen": -0.12178179621696472, + "logps/rejected": -4.539610862731934, + "loss": 0.1617, + "odds_ratio_loss": 0.01849059760570526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012178179807960987, + "rewards/margins": 0.4417828321456909, + "rewards/rejected": -0.453961044549942, + "sft_loss": 0.12178179621696472, + "step": 2384 + }, + { + "epoch": 3.4490238611713666, + "grad_norm": 2.249715426855833, + "learning_rate": 3.191861988231627e-06, + "logits/chosen": -0.6379084587097168, + "logits/rejected": -0.44113999605178833, + "logps/chosen": -0.10592366009950638, + "logps/rejected": -2.7317471504211426, + "loss": 0.1578, + "odds_ratio_loss": 0.05817069858312607, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.010592365637421608, + "rewards/margins": 0.26258236169815063, + "rewards/rejected": -0.2731747329235077, + "sft_loss": 0.10592366009950638, + "step": 2385 + }, + { + "epoch": 3.450469992769342, + "grad_norm": 2.263875089312316, + "learning_rate": 3.1888204051610524e-06, + "logits/chosen": -0.6763030886650085, + "logits/rejected": -0.482852578163147, + "logps/chosen": -0.11899633705615997, + "logps/rejected": -2.220947265625, + "loss": 0.1291, + "odds_ratio_loss": 0.032654620707035065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011899634264409542, + "rewards/margins": 0.21019507944583893, + "rewards/rejected": -0.22209471464157104, + "sft_loss": 0.11899633705615997, + "step": 2386 + }, + { + "epoch": 3.4519161243673175, + "grad_norm": 2.2015403837200953, + "learning_rate": 3.185779311153674e-06, + "logits/chosen": -0.6529958248138428, + "logits/rejected": -0.5249685645103455, + "logps/chosen": -0.10539045929908752, + "logps/rejected": -3.266904354095459, + "loss": 0.167, + "odds_ratio_loss": 0.023367218673229218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010539045557379723, + "rewards/margins": 0.3161514401435852, + "rewards/rejected": -0.3266904950141907, + "sft_loss": 0.10539045929908752, + "step": 2387 + }, + { + "epoch": 3.453362255965293, + "grad_norm": 2.1645835829709745, + "learning_rate": 3.1827387080429834e-06, + "logits/chosen": -0.5144082307815552, + "logits/rejected": -0.5050871968269348, + "logps/chosen": -0.1488667130470276, + "logps/rejected": -3.8072664737701416, + "loss": 0.171, + "odds_ratio_loss": 0.03980445861816406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014886670745909214, + "rewards/margins": 0.36583998799324036, + "rewards/rejected": -0.3807266354560852, + "sft_loss": 0.1488667130470276, + "step": 2388 + }, + { + "epoch": 3.4548083875632685, + "grad_norm": 2.092793971447257, + "learning_rate": 3.179698597662168e-06, + "logits/chosen": -0.6013069748878479, + "logits/rejected": -0.5046336054801941, + "logps/chosen": -0.19006365537643433, + "logps/rejected": -3.2266502380371094, + "loss": 0.1904, + "odds_ratio_loss": 0.03395771235227585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019006364047527313, + "rewards/margins": 0.303658664226532, + "rewards/rejected": -0.3226650655269623, + "sft_loss": 0.19006365537643433, + "step": 2389 + }, + { + "epoch": 3.4562545191612437, + "grad_norm": 2.248101499587932, + "learning_rate": 3.176658981844125e-06, + "logits/chosen": -0.6274727582931519, + "logits/rejected": -0.6035176515579224, + "logps/chosen": -0.16629016399383545, + "logps/rejected": -3.1788456439971924, + "loss": 0.1437, + "odds_ratio_loss": 0.036419257521629333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016629017889499664, + "rewards/margins": 0.30125558376312256, + "rewards/rejected": -0.3178845942020416, + "sft_loss": 0.16629016399383545, + "step": 2390 + }, + { + "epoch": 3.457700650759219, + "grad_norm": 2.5398307774402333, + "learning_rate": 3.173619862421446e-06, + "logits/chosen": -0.6537259817123413, + "logits/rejected": -0.4555646777153015, + "logps/chosen": -0.13223238289356232, + "logps/rejected": -5.075956344604492, + "loss": 0.1914, + "odds_ratio_loss": 0.02143080160021782, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013223239220678806, + "rewards/margins": 0.4943723678588867, + "rewards/rejected": -0.5075955986976624, + "sft_loss": 0.13223238289356232, + "step": 2391 + }, + { + "epoch": 3.4591467823571946, + "grad_norm": 1.7598998790393996, + "learning_rate": 3.170581241226431e-06, + "logits/chosen": -0.5485961437225342, + "logits/rejected": -0.46813613176345825, + "logps/chosen": -0.08493223786354065, + "logps/rejected": -3.2580509185791016, + "loss": 0.156, + "odds_ratio_loss": 0.01832718588411808, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008493224158883095, + "rewards/margins": 0.3173118829727173, + "rewards/rejected": -0.32580509781837463, + "sft_loss": 0.08493223786354065, + "step": 2392 + }, + { + "epoch": 3.46059291395517, + "grad_norm": 2.1489156210409828, + "learning_rate": 3.167543120091075e-06, + "logits/chosen": -0.6511685848236084, + "logits/rejected": -0.5171000957489014, + "logps/chosen": -0.2797868847846985, + "logps/rejected": -3.9867255687713623, + "loss": 0.2727, + "odds_ratio_loss": 0.0753171294927597, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0279786866158247, + "rewards/margins": 0.3706938624382019, + "rewards/rejected": -0.39867255091667175, + "sft_loss": 0.2797868847846985, + "step": 2393 + }, + { + "epoch": 3.462039045553145, + "grad_norm": 3.179471585649838, + "learning_rate": 3.1645055008470715e-06, + "logits/chosen": -0.6657485365867615, + "logits/rejected": -0.5495181083679199, + "logps/chosen": -0.16760435700416565, + "logps/rejected": -3.56003475189209, + "loss": 0.1801, + "odds_ratio_loss": 0.024452198296785355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016760436818003654, + "rewards/margins": 0.3392430543899536, + "rewards/rejected": -0.3560035228729248, + "sft_loss": 0.16760435700416565, + "step": 2394 + }, + { + "epoch": 3.463485177151121, + "grad_norm": 1.8815520518710367, + "learning_rate": 3.161468385325814e-06, + "logits/chosen": -0.5054148435592651, + "logits/rejected": -0.47322767972946167, + "logps/chosen": -0.11986206471920013, + "logps/rejected": -3.1187729835510254, + "loss": 0.1612, + "odds_ratio_loss": 0.02132941596210003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011986206285655499, + "rewards/margins": 0.2998911142349243, + "rewards/rejected": -0.3118773102760315, + "sft_loss": 0.11986206471920013, + "step": 2395 + }, + { + "epoch": 3.464931308749096, + "grad_norm": 2.122953789567727, + "learning_rate": 3.1584317753583897e-06, + "logits/chosen": -0.45948153734207153, + "logits/rejected": -0.4089983105659485, + "logps/chosen": -0.15057674050331116, + "logps/rejected": -3.30064058303833, + "loss": 0.1828, + "odds_ratio_loss": 0.038241539150476456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015057675540447235, + "rewards/margins": 0.3150063753128052, + "rewards/rejected": -0.330064058303833, + "sft_loss": 0.15057674050331116, + "step": 2396 + }, + { + "epoch": 3.4663774403470717, + "grad_norm": 2.609325702886058, + "learning_rate": 3.155395672775583e-06, + "logits/chosen": -0.6183381080627441, + "logits/rejected": -0.5282341241836548, + "logps/chosen": -0.2384868562221527, + "logps/rejected": -4.012352466583252, + "loss": 0.1958, + "odds_ratio_loss": 0.06628884375095367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02384868450462818, + "rewards/margins": 0.37738654017448425, + "rewards/rejected": -0.4012352526187897, + "sft_loss": 0.2384868562221527, + "step": 2397 + }, + { + "epoch": 3.467823571945047, + "grad_norm": 1.963380190689956, + "learning_rate": 3.1523600794078695e-06, + "logits/chosen": -0.5454755425453186, + "logits/rejected": -0.46037548780441284, + "logps/chosen": -0.102660171687603, + "logps/rejected": -2.684774160385132, + "loss": 0.1444, + "odds_ratio_loss": 0.021067647263407707, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0102660171687603, + "rewards/margins": 0.2582114040851593, + "rewards/rejected": -0.2684774398803711, + "sft_loss": 0.102660171687603, + "step": 2398 + }, + { + "epoch": 3.469269703543022, + "grad_norm": 2.2003218735262213, + "learning_rate": 3.149324997085422e-06, + "logits/chosen": -0.5759695768356323, + "logits/rejected": -0.5235334634780884, + "logps/chosen": -0.1454126387834549, + "logps/rejected": -3.1648459434509277, + "loss": 0.1878, + "odds_ratio_loss": 0.03307610750198364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0145412627607584, + "rewards/margins": 0.3019433915615082, + "rewards/rejected": -0.31648463010787964, + "sft_loss": 0.1454126387834549, + "step": 2399 + }, + { + "epoch": 3.470715835140998, + "grad_norm": 2.221086291687407, + "learning_rate": 3.1462904276381016e-06, + "logits/chosen": -0.6801586151123047, + "logits/rejected": -0.5571650266647339, + "logps/chosen": -0.2424691617488861, + "logps/rejected": -3.4627394676208496, + "loss": 0.1726, + "odds_ratio_loss": 0.06682531535625458, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.02424691990017891, + "rewards/margins": 0.3220270276069641, + "rewards/rejected": -0.3462739586830139, + "sft_loss": 0.2424691617488861, + "step": 2400 + }, + { + "epoch": 3.472161966738973, + "grad_norm": 2.2423475367785213, + "learning_rate": 3.1432563728954627e-06, + "logits/chosen": -0.7703630924224854, + "logits/rejected": -0.5540469288825989, + "logps/chosen": -0.18336373567581177, + "logps/rejected": -3.5720396041870117, + "loss": 0.1913, + "odds_ratio_loss": 0.015588531270623207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018336374312639236, + "rewards/margins": 0.33886754512786865, + "rewards/rejected": -0.35720396041870117, + "sft_loss": 0.18336373567581177, + "step": 2401 + }, + { + "epoch": 3.473608098336949, + "grad_norm": 2.075655681568371, + "learning_rate": 3.1402228346867464e-06, + "logits/chosen": -0.6194726824760437, + "logits/rejected": -0.4660893380641937, + "logps/chosen": -0.16618576645851135, + "logps/rejected": -3.4085822105407715, + "loss": 0.1808, + "odds_ratio_loss": 0.03004610911011696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016618575900793076, + "rewards/margins": 0.3242396116256714, + "rewards/rejected": -0.34085819125175476, + "sft_loss": 0.16618576645851135, + "step": 2402 + }, + { + "epoch": 3.475054229934924, + "grad_norm": 2.176656937387457, + "learning_rate": 3.1371898148408864e-06, + "logits/chosen": -0.6529266238212585, + "logits/rejected": -0.46994972229003906, + "logps/chosen": -0.23654180765151978, + "logps/rejected": -4.497025012969971, + "loss": 0.1887, + "odds_ratio_loss": 0.031046872958540916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023654181510210037, + "rewards/margins": 0.4260483384132385, + "rewards/rejected": -0.44970253109931946, + "sft_loss": 0.23654180765151978, + "step": 2403 + }, + { + "epoch": 3.4765003615328993, + "grad_norm": 1.9864857166981567, + "learning_rate": 3.1341573151864996e-06, + "logits/chosen": -0.6524367332458496, + "logits/rejected": -0.390330970287323, + "logps/chosen": -0.18505634367465973, + "logps/rejected": -4.1696391105651855, + "loss": 0.1636, + "odds_ratio_loss": 0.02654869109392166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018505636602640152, + "rewards/margins": 0.3984583020210266, + "rewards/rejected": -0.4169639050960541, + "sft_loss": 0.18505634367465973, + "step": 2404 + }, + { + "epoch": 3.477946493130875, + "grad_norm": 2.024665467230913, + "learning_rate": 3.131125337551891e-06, + "logits/chosen": -0.7065335512161255, + "logits/rejected": -0.5359545946121216, + "logps/chosen": -0.14678552746772766, + "logps/rejected": -3.0860984325408936, + "loss": 0.2154, + "odds_ratio_loss": 0.026049984619021416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014678554609417915, + "rewards/margins": 0.2939313054084778, + "rewards/rejected": -0.30860984325408936, + "sft_loss": 0.14678552746772766, + "step": 2405 + }, + { + "epoch": 3.4793926247288502, + "grad_norm": 2.0090458837282785, + "learning_rate": 3.1280938837650547e-06, + "logits/chosen": -0.7596422433853149, + "logits/rejected": -0.526679515838623, + "logps/chosen": -0.13010278344154358, + "logps/rejected": -4.164193153381348, + "loss": 0.2014, + "odds_ratio_loss": 0.026350578293204308, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013010278344154358, + "rewards/margins": 0.40340906381607056, + "rewards/rejected": -0.4164193272590637, + "sft_loss": 0.13010278344154358, + "step": 2406 + }, + { + "epoch": 3.480838756326826, + "grad_norm": 2.372630511041672, + "learning_rate": 3.125062955653661e-06, + "logits/chosen": -0.5935577750205994, + "logits/rejected": -0.5829852223396301, + "logps/chosen": -0.18501299619674683, + "logps/rejected": -4.798090934753418, + "loss": 0.2498, + "odds_ratio_loss": 0.04796503111720085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018501300364732742, + "rewards/margins": 0.4613078236579895, + "rewards/rejected": -0.47980910539627075, + "sft_loss": 0.18501299619674683, + "step": 2407 + }, + { + "epoch": 3.482284887924801, + "grad_norm": 1.973003502635966, + "learning_rate": 3.122032555045072e-06, + "logits/chosen": -0.5334011316299438, + "logits/rejected": -0.46345335245132446, + "logps/chosen": -0.14981776475906372, + "logps/rejected": -3.4753215312957764, + "loss": 0.1573, + "odds_ratio_loss": 0.04779369384050369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014981777407228947, + "rewards/margins": 0.33255037665367126, + "rewards/rejected": -0.34753215312957764, + "sft_loss": 0.14981776475906372, + "step": 2408 + }, + { + "epoch": 3.4837310195227764, + "grad_norm": 2.084221610637667, + "learning_rate": 3.119002683766325e-06, + "logits/chosen": -0.5933706760406494, + "logits/rejected": -0.4296287000179291, + "logps/chosen": -0.12299705296754837, + "logps/rejected": -6.163819313049316, + "loss": 0.1744, + "odds_ratio_loss": 0.02097168192267418, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012299705296754837, + "rewards/margins": 0.6040822267532349, + "rewards/rejected": -0.6163819432258606, + "sft_loss": 0.12299705296754837, + "step": 2409 + }, + { + "epoch": 3.485177151120752, + "grad_norm": 2.2991481985822664, + "learning_rate": 3.1159733436441413e-06, + "logits/chosen": -0.6943192481994629, + "logits/rejected": -0.3896532654762268, + "logps/chosen": -0.2796631157398224, + "logps/rejected": -3.7519736289978027, + "loss": 0.1965, + "odds_ratio_loss": 0.04865211993455887, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02796631120145321, + "rewards/margins": 0.34723103046417236, + "rewards/rejected": -0.3751973509788513, + "sft_loss": 0.2796631157398224, + "step": 2410 + }, + { + "epoch": 3.4866232827187273, + "grad_norm": 2.363386916775901, + "learning_rate": 3.11294453650492e-06, + "logits/chosen": -0.5733893513679504, + "logits/rejected": -0.5263524055480957, + "logps/chosen": -0.2019123136997223, + "logps/rejected": -3.117097854614258, + "loss": 0.2121, + "odds_ratio_loss": 0.029736708849668503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02019122987985611, + "rewards/margins": 0.29151856899261475, + "rewards/rejected": -0.31170982122421265, + "sft_loss": 0.2019123136997223, + "step": 2411 + }, + { + "epoch": 3.488069414316703, + "grad_norm": 2.1846083513594934, + "learning_rate": 3.1099162641747427e-06, + "logits/chosen": -0.5293310284614563, + "logits/rejected": -0.3837476372718811, + "logps/chosen": -0.10314946621656418, + "logps/rejected": -5.216273307800293, + "loss": 0.1856, + "odds_ratio_loss": 0.052306585013866425, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.010314945131540298, + "rewards/margins": 0.5113124251365662, + "rewards/rejected": -0.5216273069381714, + "sft_loss": 0.10314946621656418, + "step": 2412 + }, + { + "epoch": 3.4895155459146783, + "grad_norm": 15.463632104482794, + "learning_rate": 3.1068885284793636e-06, + "logits/chosen": -0.620168149471283, + "logits/rejected": -0.503106951713562, + "logps/chosen": -0.3016592264175415, + "logps/rejected": -3.0068225860595703, + "loss": 0.2447, + "odds_ratio_loss": 0.04610760882496834, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03016592189669609, + "rewards/margins": 0.27051636576652527, + "rewards/rejected": -0.30068230628967285, + "sft_loss": 0.3016592264175415, + "step": 2413 + }, + { + "epoch": 3.4909616775126535, + "grad_norm": 2.085084360099831, + "learning_rate": 3.1038613312442187e-06, + "logits/chosen": -0.6817932724952698, + "logits/rejected": -0.5196681618690491, + "logps/chosen": -0.18397408723831177, + "logps/rejected": -4.176403999328613, + "loss": 0.1861, + "odds_ratio_loss": 0.032095346599817276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018397409468889236, + "rewards/margins": 0.39924299716949463, + "rewards/rejected": -0.4176403880119324, + "sft_loss": 0.18397408723831177, + "step": 2414 + }, + { + "epoch": 3.492407809110629, + "grad_norm": 2.1247525539099206, + "learning_rate": 3.1008346742944124e-06, + "logits/chosen": -0.6427405476570129, + "logits/rejected": -0.4603559076786041, + "logps/chosen": -0.19035601615905762, + "logps/rejected": -3.170347213745117, + "loss": 0.2023, + "odds_ratio_loss": 0.03771872818470001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01903560385107994, + "rewards/margins": 0.2979991137981415, + "rewards/rejected": -0.3170347213745117, + "sft_loss": 0.19035601615905762, + "step": 2415 + }, + { + "epoch": 3.4938539407086044, + "grad_norm": 2.1004484725383876, + "learning_rate": 3.097808559454732e-06, + "logits/chosen": -0.5600918531417847, + "logits/rejected": -0.45386743545532227, + "logps/chosen": -0.16461656987667084, + "logps/rejected": -4.282623767852783, + "loss": 0.2029, + "odds_ratio_loss": 0.035014864057302475, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016461655497550964, + "rewards/margins": 0.41180071234703064, + "rewards/rejected": -0.4282623827457428, + "sft_loss": 0.16461656987667084, + "step": 2416 + }, + { + "epoch": 3.4953000723065797, + "grad_norm": 1.9634213959408762, + "learning_rate": 3.09478298854963e-06, + "logits/chosen": -0.4763942062854767, + "logits/rejected": -0.27630501985549927, + "logps/chosen": -0.14406144618988037, + "logps/rejected": -4.638453006744385, + "loss": 0.1381, + "odds_ratio_loss": 0.03147125244140625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014406142756342888, + "rewards/margins": 0.4494391083717346, + "rewards/rejected": -0.46384525299072266, + "sft_loss": 0.14406144618988037, + "step": 2417 + }, + { + "epoch": 3.4967462039045554, + "grad_norm": 2.0293001482316146, + "learning_rate": 3.0917579634032345e-06, + "logits/chosen": -0.6355884075164795, + "logits/rejected": -0.4838637709617615, + "logps/chosen": -0.31256037950515747, + "logps/rejected": -5.803687572479248, + "loss": 0.2496, + "odds_ratio_loss": 0.10423914343118668, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03125603869557381, + "rewards/margins": 0.5491127371788025, + "rewards/rejected": -0.5803688168525696, + "sft_loss": 0.31256037950515747, + "step": 2418 + }, + { + "epoch": 3.4981923355025306, + "grad_norm": 1.9185361869559496, + "learning_rate": 3.088733485839348e-06, + "logits/chosen": -0.6340678334236145, + "logits/rejected": -0.540869951248169, + "logps/chosen": -0.21716678142547607, + "logps/rejected": -4.758569240570068, + "loss": 0.1763, + "odds_ratio_loss": 0.050362855195999146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021716676652431488, + "rewards/margins": 0.45414021611213684, + "rewards/rejected": -0.4758569002151489, + "sft_loss": 0.21716678142547607, + "step": 2419 + }, + { + "epoch": 3.4996384671005063, + "grad_norm": 2.019327134335387, + "learning_rate": 3.0857095576814357e-06, + "logits/chosen": -0.38161930441856384, + "logits/rejected": -0.34976017475128174, + "logps/chosen": -0.18557745218276978, + "logps/rejected": -6.113600730895996, + "loss": 0.2017, + "odds_ratio_loss": 0.04104998707771301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018557745963335037, + "rewards/margins": 0.5928024053573608, + "rewards/rejected": -0.6113600730895996, + "sft_loss": 0.18557745218276978, + "step": 2420 + }, + { + "epoch": 3.5010845986984815, + "grad_norm": 2.0202611999068663, + "learning_rate": 3.0826861807526366e-06, + "logits/chosen": -0.63789963722229, + "logits/rejected": -0.7829943895339966, + "logps/chosen": -0.2620221674442291, + "logps/rejected": -5.192530632019043, + "loss": 0.2215, + "odds_ratio_loss": 0.06331058591604233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026202216744422913, + "rewards/margins": 0.4930509030818939, + "rewards/rejected": -0.519253134727478, + "sft_loss": 0.2620221674442291, + "step": 2421 + }, + { + "epoch": 3.5025307302964572, + "grad_norm": 2.127595750553593, + "learning_rate": 3.079663356875754e-06, + "logits/chosen": -0.5893542766571045, + "logits/rejected": -0.4019384980201721, + "logps/chosen": -0.16648858785629272, + "logps/rejected": -4.031942844390869, + "loss": 0.2227, + "odds_ratio_loss": 0.04801696911454201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016648858785629272, + "rewards/margins": 0.38654541969299316, + "rewards/rejected": -0.40319427847862244, + "sft_loss": 0.16648858785629272, + "step": 2422 + }, + { + "epoch": 3.5039768618944325, + "grad_norm": 1.996498648451224, + "learning_rate": 3.0766410878732634e-06, + "logits/chosen": -0.47269999980926514, + "logits/rejected": -0.32516586780548096, + "logps/chosen": -0.20525558292865753, + "logps/rejected": -3.3645095825195312, + "loss": 0.1638, + "odds_ratio_loss": 0.021875783801078796, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020525559782981873, + "rewards/margins": 0.31592538952827454, + "rewards/rejected": -0.3364509344100952, + "sft_loss": 0.20525558292865753, + "step": 2423 + }, + { + "epoch": 3.5054229934924077, + "grad_norm": 2.2579256443891875, + "learning_rate": 3.073619375567299e-06, + "logits/chosen": -0.4492790699005127, + "logits/rejected": -0.4535216689109802, + "logps/chosen": -0.16690582036972046, + "logps/rejected": -4.513905048370361, + "loss": 0.1916, + "odds_ratio_loss": 0.04860381782054901, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016690582036972046, + "rewards/margins": 0.4346998929977417, + "rewards/rejected": -0.45139047503471375, + "sft_loss": 0.16690582036972046, + "step": 2424 + }, + { + "epoch": 3.5068691250903834, + "grad_norm": 2.1783585530456357, + "learning_rate": 3.070598221779664e-06, + "logits/chosen": -0.6967248916625977, + "logits/rejected": -0.4836665093898773, + "logps/chosen": -0.20443633198738098, + "logps/rejected": -3.487060785293579, + "loss": 0.2119, + "odds_ratio_loss": 0.029861953109502792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020443633198738098, + "rewards/margins": 0.32826241850852966, + "rewards/rejected": -0.34870606660842896, + "sft_loss": 0.20443633198738098, + "step": 2425 + }, + { + "epoch": 3.5083152566883586, + "grad_norm": 2.463389036122334, + "learning_rate": 3.0675776283318203e-06, + "logits/chosen": -0.5934221744537354, + "logits/rejected": -0.5299607515335083, + "logps/chosen": -0.2866644263267517, + "logps/rejected": -4.078819274902344, + "loss": 0.1839, + "odds_ratio_loss": 0.025208022445440292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02866644226014614, + "rewards/margins": 0.3792154788970947, + "rewards/rejected": -0.4078819155693054, + "sft_loss": 0.2866644263267517, + "step": 2426 + }, + { + "epoch": 3.509761388286334, + "grad_norm": 2.3513293545188634, + "learning_rate": 3.064557597044899e-06, + "logits/chosen": -0.5402525663375854, + "logits/rejected": -0.4047253727912903, + "logps/chosen": -0.27317333221435547, + "logps/rejected": -4.133315086364746, + "loss": 0.2127, + "odds_ratio_loss": 0.05065507814288139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027317333966493607, + "rewards/margins": 0.3860141634941101, + "rewards/rejected": -0.4133315086364746, + "sft_loss": 0.27317333221435547, + "step": 2427 + }, + { + "epoch": 3.5112075198843096, + "grad_norm": 2.629368897025621, + "learning_rate": 3.0615381297396863e-06, + "logits/chosen": -0.6587547063827515, + "logits/rejected": -0.5991069674491882, + "logps/chosen": -0.18863442540168762, + "logps/rejected": -3.0743231773376465, + "loss": 0.1699, + "odds_ratio_loss": 0.033928271383047104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018863443285226822, + "rewards/margins": 0.2885688841342926, + "rewards/rejected": -0.3074323236942291, + "sft_loss": 0.18863442540168762, + "step": 2428 + }, + { + "epoch": 3.512653651482285, + "grad_norm": 1.9268262456854648, + "learning_rate": 3.058519228236631e-06, + "logits/chosen": -0.6309177279472351, + "logits/rejected": -0.5707622766494751, + "logps/chosen": -0.15104332566261292, + "logps/rejected": -2.467463254928589, + "loss": 0.2081, + "odds_ratio_loss": 0.030597880482673645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015104331076145172, + "rewards/margins": 0.2316419780254364, + "rewards/rejected": -0.24674633145332336, + "sft_loss": 0.15104332566261292, + "step": 2429 + }, + { + "epoch": 3.51409978308026, + "grad_norm": 2.187578199208661, + "learning_rate": 3.0555008943558376e-06, + "logits/chosen": -0.43284595012664795, + "logits/rejected": -0.3950757086277008, + "logps/chosen": -0.3491520285606384, + "logps/rejected": -5.344951629638672, + "loss": 0.2358, + "odds_ratio_loss": 0.07832454144954681, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03491520509123802, + "rewards/margins": 0.4995799958705902, + "rewards/rejected": -0.5344952344894409, + "sft_loss": 0.3491520285606384, + "step": 2430 + }, + { + "epoch": 3.5155459146782357, + "grad_norm": 2.0356756659249515, + "learning_rate": 3.052483129917074e-06, + "logits/chosen": -0.527158260345459, + "logits/rejected": -0.459917277097702, + "logps/chosen": -0.1386464685201645, + "logps/rejected": -6.810248374938965, + "loss": 0.1815, + "odds_ratio_loss": 0.02819202095270157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013864649459719658, + "rewards/margins": 0.6671602129936218, + "rewards/rejected": -0.6810248494148254, + "sft_loss": 0.1386464685201645, + "step": 2431 + }, + { + "epoch": 3.516992046276211, + "grad_norm": 1.9468625134569972, + "learning_rate": 3.04946593673976e-06, + "logits/chosen": -0.37922120094299316, + "logits/rejected": -0.36168068647384644, + "logps/chosen": -0.20144081115722656, + "logps/rejected": -4.170196533203125, + "loss": 0.1787, + "odds_ratio_loss": 0.031825270503759384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020144082605838776, + "rewards/margins": 0.3968755900859833, + "rewards/rejected": -0.41701966524124146, + "sft_loss": 0.20144081115722656, + "step": 2432 + }, + { + "epoch": 3.5184381778741867, + "grad_norm": 1.9008651504383323, + "learning_rate": 3.046449316642972e-06, + "logits/chosen": -0.53741455078125, + "logits/rejected": -0.49799400568008423, + "logps/chosen": -0.26500892639160156, + "logps/rejected": -3.6741976737976074, + "loss": 0.2029, + "odds_ratio_loss": 0.09227637946605682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026500891894102097, + "rewards/margins": 0.3409188985824585, + "rewards/rejected": -0.3674197793006897, + "sft_loss": 0.26500892639160156, + "step": 2433 + }, + { + "epoch": 3.519884309472162, + "grad_norm": 1.795668496423871, + "learning_rate": 3.043433271445444e-06, + "logits/chosen": -0.5739824175834656, + "logits/rejected": -0.5970664024353027, + "logps/chosen": -0.06262612342834473, + "logps/rejected": -4.009900093078613, + "loss": 0.1221, + "odds_ratio_loss": 0.02301051840186119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0062626120634377, + "rewards/margins": 0.3947274088859558, + "rewards/rejected": -0.4009900391101837, + "sft_loss": 0.06262612342834473, + "step": 2434 + }, + { + "epoch": 3.5213304410701376, + "grad_norm": 2.3034738221241087, + "learning_rate": 3.0404178029655584e-06, + "logits/chosen": -0.631015419960022, + "logits/rejected": -0.41128766536712646, + "logps/chosen": -0.20884862542152405, + "logps/rejected": -3.4437193870544434, + "loss": 0.1965, + "odds_ratio_loss": 0.034937452524900436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020884864032268524, + "rewards/margins": 0.3234870433807373, + "rewards/rejected": -0.3443719148635864, + "sft_loss": 0.20884862542152405, + "step": 2435 + }, + { + "epoch": 3.522776572668113, + "grad_norm": 1.945593311128753, + "learning_rate": 3.037402913021354e-06, + "logits/chosen": -0.4494878649711609, + "logits/rejected": -0.449461966753006, + "logps/chosen": -0.1753808557987213, + "logps/rejected": -4.066275596618652, + "loss": 0.1666, + "odds_ratio_loss": 0.035513922572135925, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01753808557987213, + "rewards/margins": 0.3890894651412964, + "rewards/rejected": -0.4066275656223297, + "sft_loss": 0.1753808557987213, + "step": 2436 + }, + { + "epoch": 3.524222704266088, + "grad_norm": 2.079948813762282, + "learning_rate": 3.0343886034305167e-06, + "logits/chosen": -0.6514714956283569, + "logits/rejected": -0.5961179733276367, + "logps/chosen": -0.2650774121284485, + "logps/rejected": -5.5663652420043945, + "loss": 0.2188, + "odds_ratio_loss": 0.07149592787027359, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02650773897767067, + "rewards/margins": 0.5301287770271301, + "rewards/rejected": -0.5566365122795105, + "sft_loss": 0.2650774121284485, + "step": 2437 + }, + { + "epoch": 3.5256688358640638, + "grad_norm": 2.1260641283263273, + "learning_rate": 3.0313748760103887e-06, + "logits/chosen": -0.49076318740844727, + "logits/rejected": -0.34688353538513184, + "logps/chosen": -0.12424877285957336, + "logps/rejected": -5.382345676422119, + "loss": 0.1499, + "odds_ratio_loss": 0.030808860436081886, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012424877844750881, + "rewards/margins": 0.5258097052574158, + "rewards/rejected": -0.5382345914840698, + "sft_loss": 0.12424877285957336, + "step": 2438 + }, + { + "epoch": 3.527114967462039, + "grad_norm": 2.371176330762152, + "learning_rate": 3.0283617325779545e-06, + "logits/chosen": -0.4812876880168915, + "logits/rejected": -0.34987127780914307, + "logps/chosen": -0.2775692641735077, + "logps/rejected": -4.12560510635376, + "loss": 0.2054, + "odds_ratio_loss": 0.04205989092588425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02775692753493786, + "rewards/margins": 0.3848035931587219, + "rewards/rejected": -0.41256052255630493, + "sft_loss": 0.2775692641735077, + "step": 2439 + }, + { + "epoch": 3.5285610990600143, + "grad_norm": 2.0813750370837423, + "learning_rate": 3.0253491749498512e-06, + "logits/chosen": -0.4459601938724518, + "logits/rejected": -0.328517884016037, + "logps/chosen": -0.3284389078617096, + "logps/rejected": -3.3992013931274414, + "loss": 0.2288, + "odds_ratio_loss": 0.058917153626680374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03284389153122902, + "rewards/margins": 0.30707627534866333, + "rewards/rejected": -0.33992013335227966, + "sft_loss": 0.3284389078617096, + "step": 2440 + }, + { + "epoch": 3.53000723065799, + "grad_norm": 2.003917998097915, + "learning_rate": 3.0223372049423586e-06, + "logits/chosen": -0.7053133249282837, + "logits/rejected": -0.4800989329814911, + "logps/chosen": -0.14180231094360352, + "logps/rejected": -3.8691177368164062, + "loss": 0.1728, + "odds_ratio_loss": 0.039757855236530304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014180229976773262, + "rewards/margins": 0.3727315664291382, + "rewards/rejected": -0.3869118094444275, + "sft_loss": 0.14180231094360352, + "step": 2441 + }, + { + "epoch": 3.531453362255965, + "grad_norm": 2.0461480696169603, + "learning_rate": 3.0193258243714084e-06, + "logits/chosen": -0.5701842308044434, + "logits/rejected": -0.3964661657810211, + "logps/chosen": -0.25734081864356995, + "logps/rejected": -2.8863861560821533, + "loss": 0.1949, + "odds_ratio_loss": 0.08616319298744202, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.025734081864356995, + "rewards/margins": 0.26290449500083923, + "rewards/rejected": -0.2886385917663574, + "sft_loss": 0.25734081864356995, + "step": 2442 + }, + { + "epoch": 3.532899493853941, + "grad_norm": 2.255625866443697, + "learning_rate": 3.01631503505257e-06, + "logits/chosen": -0.7258257865905762, + "logits/rejected": -0.4492414593696594, + "logps/chosen": -0.21355527639389038, + "logps/rejected": -3.853524684906006, + "loss": 0.2263, + "odds_ratio_loss": 0.030937321484088898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021355528384447098, + "rewards/margins": 0.3639969527721405, + "rewards/rejected": -0.3853524923324585, + "sft_loss": 0.21355527639389038, + "step": 2443 + }, + { + "epoch": 3.534345625451916, + "grad_norm": 2.1382791727428034, + "learning_rate": 3.0133048388010615e-06, + "logits/chosen": -0.48679161071777344, + "logits/rejected": -0.4358132779598236, + "logps/chosen": -0.17743311822414398, + "logps/rejected": -3.7108383178710938, + "loss": 0.1898, + "odds_ratio_loss": 0.029122265055775642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017743311822414398, + "rewards/margins": 0.3533405065536499, + "rewards/rejected": -0.3710837960243225, + "sft_loss": 0.17743311822414398, + "step": 2444 + }, + { + "epoch": 3.535791757049892, + "grad_norm": 1.9527830299379936, + "learning_rate": 3.0102952374317392e-06, + "logits/chosen": -0.6457036137580872, + "logits/rejected": -0.5125142335891724, + "logps/chosen": -0.1727660447359085, + "logps/rejected": -4.623327255249023, + "loss": 0.2268, + "odds_ratio_loss": 0.026115503162145615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01727660559117794, + "rewards/margins": 0.44505611062049866, + "rewards/rejected": -0.46233272552490234, + "sft_loss": 0.1727660447359085, + "step": 2445 + }, + { + "epoch": 3.537237888647867, + "grad_norm": 2.080234692907973, + "learning_rate": 3.007286232759105e-06, + "logits/chosen": -0.6250134110450745, + "logits/rejected": -0.49293088912963867, + "logps/chosen": -0.2399110198020935, + "logps/rejected": -3.053586721420288, + "loss": 0.1964, + "odds_ratio_loss": 0.04962414503097534, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02399110235273838, + "rewards/margins": 0.28136757016181946, + "rewards/rejected": -0.3053586781024933, + "sft_loss": 0.2399110198020935, + "step": 2446 + }, + { + "epoch": 3.5386840202458423, + "grad_norm": 2.317314461800139, + "learning_rate": 3.0042778265972984e-06, + "logits/chosen": -0.47505897283554077, + "logits/rejected": -0.39482036232948303, + "logps/chosen": -0.35046637058258057, + "logps/rejected": -3.926889419555664, + "loss": 0.2291, + "odds_ratio_loss": 0.0780850425362587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03504663333296776, + "rewards/margins": 0.3576422929763794, + "rewards/rejected": -0.39268895983695984, + "sft_loss": 0.35046637058258057, + "step": 2447 + }, + { + "epoch": 3.540130151843818, + "grad_norm": 2.058424958820097, + "learning_rate": 3.0012700207600974e-06, + "logits/chosen": -0.6533050537109375, + "logits/rejected": -0.45995908975601196, + "logps/chosen": -0.1486455202102661, + "logps/rejected": -4.053260803222656, + "loss": 0.2256, + "odds_ratio_loss": 0.022504646331071854, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014864552766084671, + "rewards/margins": 0.3904615640640259, + "rewards/rejected": -0.40532612800598145, + "sft_loss": 0.1486455202102661, + "step": 2448 + }, + { + "epoch": 3.541576283441793, + "grad_norm": 2.267037713125268, + "learning_rate": 2.9982628170609223e-06, + "logits/chosen": -0.6241118907928467, + "logits/rejected": -0.5395658612251282, + "logps/chosen": -0.16177615523338318, + "logps/rejected": -3.594677448272705, + "loss": 0.1644, + "odds_ratio_loss": 0.03948385640978813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016177615150809288, + "rewards/margins": 0.34329015016555786, + "rewards/rejected": -0.3594677448272705, + "sft_loss": 0.16177615523338318, + "step": 2449 + }, + { + "epoch": 3.5430224150397684, + "grad_norm": 2.168659789575291, + "learning_rate": 2.9952562173128248e-06, + "logits/chosen": -0.47931620478630066, + "logits/rejected": -0.3874140977859497, + "logps/chosen": -0.21014590561389923, + "logps/rejected": -5.508059501647949, + "loss": 0.1959, + "odds_ratio_loss": 0.0441533587872982, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021014589816331863, + "rewards/margins": 0.5297913551330566, + "rewards/rejected": -0.550805926322937, + "sft_loss": 0.21014590561389923, + "step": 2450 + }, + { + "epoch": 3.544468546637744, + "grad_norm": 2.0176231969282497, + "learning_rate": 2.9922502233284973e-06, + "logits/chosen": -0.5037535429000854, + "logits/rejected": -0.46424558758735657, + "logps/chosen": -0.30051517486572266, + "logps/rejected": -5.129392147064209, + "loss": 0.2652, + "odds_ratio_loss": 0.08848903328180313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030051520094275475, + "rewards/margins": 0.4828876852989197, + "rewards/rejected": -0.5129392147064209, + "sft_loss": 0.30051517486572266, + "step": 2451 + }, + { + "epoch": 3.5459146782357194, + "grad_norm": 2.3169499110429617, + "learning_rate": 2.989244836920261e-06, + "logits/chosen": -0.33989861607551575, + "logits/rejected": -0.3616371154785156, + "logps/chosen": -0.19239774346351624, + "logps/rejected": -3.9730749130249023, + "loss": 0.2066, + "odds_ratio_loss": 0.06537087261676788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019239773973822594, + "rewards/margins": 0.3780677318572998, + "rewards/rejected": -0.39730751514434814, + "sft_loss": 0.19239774346351624, + "step": 2452 + }, + { + "epoch": 3.5473608098336946, + "grad_norm": 2.357450845950666, + "learning_rate": 2.986240059900079e-06, + "logits/chosen": -0.5030441284179688, + "logits/rejected": -0.43161094188690186, + "logps/chosen": -0.34148740768432617, + "logps/rejected": -4.043811798095703, + "loss": 0.2018, + "odds_ratio_loss": 0.04510558396577835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03414874151349068, + "rewards/margins": 0.3702324628829956, + "rewards/rejected": -0.4043811559677124, + "sft_loss": 0.34148740768432617, + "step": 2453 + }, + { + "epoch": 3.5488069414316703, + "grad_norm": 2.1646070564772795, + "learning_rate": 2.983235894079539e-06, + "logits/chosen": -0.7625619769096375, + "logits/rejected": -0.6689075231552124, + "logps/chosen": -0.19497406482696533, + "logps/rejected": -4.010917663574219, + "loss": 0.1559, + "odds_ratio_loss": 0.09022661298513412, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.019497405737638474, + "rewards/margins": 0.3815944194793701, + "rewards/rejected": -0.4010918140411377, + "sft_loss": 0.19497406482696533, + "step": 2454 + }, + { + "epoch": 3.5502530730296455, + "grad_norm": 2.153672375639417, + "learning_rate": 2.9802323412698666e-06, + "logits/chosen": -0.5097517371177673, + "logits/rejected": -0.5301701426506042, + "logps/chosen": -0.18443165719509125, + "logps/rejected": -2.901488780975342, + "loss": 0.1948, + "odds_ratio_loss": 0.056402601301670074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018443167209625244, + "rewards/margins": 0.2717057466506958, + "rewards/rejected": -0.29014888405799866, + "sft_loss": 0.18443165719509125, + "step": 2455 + }, + { + "epoch": 3.5516992046276212, + "grad_norm": 2.4930393761949268, + "learning_rate": 2.977229403281913e-06, + "logits/chosen": -0.5628433227539062, + "logits/rejected": -0.46921300888061523, + "logps/chosen": -0.2880474030971527, + "logps/rejected": -3.0459322929382324, + "loss": 0.2311, + "odds_ratio_loss": 0.05179532617330551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02880474366247654, + "rewards/margins": 0.2757885158061981, + "rewards/rejected": -0.3045932650566101, + "sft_loss": 0.2880474030971527, + "step": 2456 + }, + { + "epoch": 3.5531453362255965, + "grad_norm": 1.8099839799147603, + "learning_rate": 2.974227081926162e-06, + "logits/chosen": -0.6780756711959839, + "logits/rejected": -0.5614203810691833, + "logps/chosen": -0.22212156653404236, + "logps/rejected": -4.356184959411621, + "loss": 0.1565, + "odds_ratio_loss": 0.056687142699956894, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022212158888578415, + "rewards/margins": 0.4134063422679901, + "rewards/rejected": -0.43561851978302, + "sft_loss": 0.22212156653404236, + "step": 2457 + }, + { + "epoch": 3.554591467823572, + "grad_norm": 2.092408680060082, + "learning_rate": 2.9712253790127223e-06, + "logits/chosen": -0.5250091552734375, + "logits/rejected": -0.37066176533699036, + "logps/chosen": -0.15992625057697296, + "logps/rejected": -2.9919040203094482, + "loss": 0.1848, + "odds_ratio_loss": 0.022480811923742294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015992626547813416, + "rewards/margins": 0.2831977903842926, + "rewards/rejected": -0.2991904020309448, + "sft_loss": 0.15992625057697296, + "step": 2458 + }, + { + "epoch": 3.5560375994215474, + "grad_norm": 2.399018956229861, + "learning_rate": 2.968224296351334e-06, + "logits/chosen": -0.617534875869751, + "logits/rejected": -0.5581540465354919, + "logps/chosen": -0.177803635597229, + "logps/rejected": -3.2628211975097656, + "loss": 0.205, + "odds_ratio_loss": 0.039328742772340775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01778036542236805, + "rewards/margins": 0.3085017800331116, + "rewards/rejected": -0.3262821435928345, + "sft_loss": 0.177803635597229, + "step": 2459 + }, + { + "epoch": 3.5574837310195226, + "grad_norm": 2.1254520024891432, + "learning_rate": 2.965223835751361e-06, + "logits/chosen": -0.7163950204849243, + "logits/rejected": -0.5951937437057495, + "logps/chosen": -0.2112230509519577, + "logps/rejected": -3.38348388671875, + "loss": 0.2085, + "odds_ratio_loss": 0.07596226781606674, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.02112230658531189, + "rewards/margins": 0.3172260820865631, + "rewards/rejected": -0.338348388671875, + "sft_loss": 0.2112230509519577, + "step": 2460 + }, + { + "epoch": 3.5589298626174983, + "grad_norm": 2.0093893471895967, + "learning_rate": 2.9622239990217896e-06, + "logits/chosen": -0.6013926267623901, + "logits/rejected": -0.5151457190513611, + "logps/chosen": -0.15072676539421082, + "logps/rejected": -2.643908739089966, + "loss": 0.1547, + "odds_ratio_loss": 0.022211087867617607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015072677284479141, + "rewards/margins": 0.2493181973695755, + "rewards/rejected": -0.26439088582992554, + "sft_loss": 0.15072676539421082, + "step": 2461 + }, + { + "epoch": 3.5603759942154736, + "grad_norm": 2.2035437107137046, + "learning_rate": 2.9592247879712357e-06, + "logits/chosen": -0.3561607599258423, + "logits/rejected": -0.36101406812667847, + "logps/chosen": -0.2586825489997864, + "logps/rejected": -3.189708709716797, + "loss": 0.2256, + "odds_ratio_loss": 0.0867890790104866, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025868259370326996, + "rewards/margins": 0.2931026220321655, + "rewards/rejected": -0.31897085905075073, + "sft_loss": 0.2586825489997864, + "step": 2462 + }, + { + "epoch": 3.561822125813449, + "grad_norm": 2.054047426793152, + "learning_rate": 2.956226204407933e-06, + "logits/chosen": -0.5213802456855774, + "logits/rejected": -0.47224000096321106, + "logps/chosen": -0.14102500677108765, + "logps/rejected": -4.169181823730469, + "loss": 0.1955, + "odds_ratio_loss": 0.0326719805598259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014102501794695854, + "rewards/margins": 0.40281563997268677, + "rewards/rejected": -0.41691815853118896, + "sft_loss": 0.14102500677108765, + "step": 2463 + }, + { + "epoch": 3.5632682574114245, + "grad_norm": 2.0041196821551353, + "learning_rate": 2.95322825013974e-06, + "logits/chosen": -0.5516745448112488, + "logits/rejected": -0.41874760389328003, + "logps/chosen": -0.1619918942451477, + "logps/rejected": -3.6532042026519775, + "loss": 0.1731, + "odds_ratio_loss": 0.028985023498535156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01619919016957283, + "rewards/margins": 0.34912124276161194, + "rewards/rejected": -0.3653204143047333, + "sft_loss": 0.1619918942451477, + "step": 2464 + }, + { + "epoch": 3.5647143890093997, + "grad_norm": 1.9715716167615243, + "learning_rate": 2.9502309269741314e-06, + "logits/chosen": -0.4302927255630493, + "logits/rejected": -0.3220667541027069, + "logps/chosen": -0.12735146284103394, + "logps/rejected": -3.6478960514068604, + "loss": 0.199, + "odds_ratio_loss": 0.020095938816666603, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012735145166516304, + "rewards/margins": 0.3520544767379761, + "rewards/rejected": -0.3647896349430084, + "sft_loss": 0.12735146284103394, + "step": 2465 + }, + { + "epoch": 3.5661605206073754, + "grad_norm": 2.1032834327668506, + "learning_rate": 2.9472342367182086e-06, + "logits/chosen": -0.5896605253219604, + "logits/rejected": -0.5629295706748962, + "logps/chosen": -0.331041157245636, + "logps/rejected": -3.1009109020233154, + "loss": 0.2487, + "odds_ratio_loss": 0.05582534521818161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03310411795973778, + "rewards/margins": 0.2769869863986969, + "rewards/rejected": -0.3100910782814026, + "sft_loss": 0.331041157245636, + "step": 2466 + }, + { + "epoch": 3.5676066522053507, + "grad_norm": 2.2201609739812924, + "learning_rate": 2.9442381811786846e-06, + "logits/chosen": -0.5560648441314697, + "logits/rejected": -0.46408846974372864, + "logps/chosen": -0.1172012984752655, + "logps/rejected": -3.779165267944336, + "loss": 0.167, + "odds_ratio_loss": 0.026231221854686737, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011720129288733006, + "rewards/margins": 0.3661963939666748, + "rewards/rejected": -0.37791651487350464, + "sft_loss": 0.1172012984752655, + "step": 2467 + }, + { + "epoch": 3.5690527838033264, + "grad_norm": 2.125829017437066, + "learning_rate": 2.9412427621618936e-06, + "logits/chosen": -0.569636344909668, + "logits/rejected": -0.482186883687973, + "logps/chosen": -0.18977811932563782, + "logps/rejected": -6.578741550445557, + "loss": 0.1958, + "odds_ratio_loss": 0.03151582553982735, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0189778134226799, + "rewards/margins": 0.6388963460922241, + "rewards/rejected": -0.6578741669654846, + "sft_loss": 0.18977811932563782, + "step": 2468 + }, + { + "epoch": 3.5704989154013016, + "grad_norm": 2.2044030860237367, + "learning_rate": 2.9382479814737836e-06, + "logits/chosen": -0.41000592708587646, + "logits/rejected": -0.410857617855072, + "logps/chosen": -0.3284101188182831, + "logps/rejected": -3.962172031402588, + "loss": 0.2546, + "odds_ratio_loss": 0.05446305871009827, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03284101188182831, + "rewards/margins": 0.3633762001991272, + "rewards/rejected": -0.3962172269821167, + "sft_loss": 0.3284101188182831, + "step": 2469 + }, + { + "epoch": 3.571945046999277, + "grad_norm": 1.9361635852730106, + "learning_rate": 2.9352538409199213e-06, + "logits/chosen": -0.4168761670589447, + "logits/rejected": -0.44787657260894775, + "logps/chosen": -0.22248664498329163, + "logps/rejected": -3.7818639278411865, + "loss": 0.1994, + "odds_ratio_loss": 0.03490440174937248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02224866673350334, + "rewards/margins": 0.3559377193450928, + "rewards/rejected": -0.3781863749027252, + "sft_loss": 0.22248664498329163, + "step": 2470 + }, + { + "epoch": 3.5733911785972525, + "grad_norm": 2.078403301829724, + "learning_rate": 2.9322603423054826e-06, + "logits/chosen": -0.7230794429779053, + "logits/rejected": -0.5729874968528748, + "logps/chosen": -0.22554948925971985, + "logps/rejected": -3.4284701347351074, + "loss": 0.1535, + "odds_ratio_loss": 0.04831676930189133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022554948925971985, + "rewards/margins": 0.32029205560684204, + "rewards/rejected": -0.34284698963165283, + "sft_loss": 0.22554948925971985, + "step": 2471 + }, + { + "epoch": 3.5748373101952278, + "grad_norm": 1.9481832175668146, + "learning_rate": 2.92926748743526e-06, + "logits/chosen": -0.5367501378059387, + "logits/rejected": -0.5176258683204651, + "logps/chosen": -0.2013862282037735, + "logps/rejected": -3.327510118484497, + "loss": 0.1885, + "odds_ratio_loss": 0.028387073427438736, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02013862319290638, + "rewards/margins": 0.3126124143600464, + "rewards/rejected": -0.3327510356903076, + "sft_loss": 0.2013862282037735, + "step": 2472 + }, + { + "epoch": 3.576283441793203, + "grad_norm": 2.298558661220886, + "learning_rate": 2.9262752781136584e-06, + "logits/chosen": -0.5828816890716553, + "logits/rejected": -0.4730074405670166, + "logps/chosen": -0.14373230934143066, + "logps/rejected": -2.983417510986328, + "loss": 0.1655, + "odds_ratio_loss": 0.020958803594112396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014373233541846275, + "rewards/margins": 0.2839685082435608, + "rewards/rejected": -0.2983417510986328, + "sft_loss": 0.14373230934143066, + "step": 2473 + }, + { + "epoch": 3.5777295733911787, + "grad_norm": 2.2309893965679914, + "learning_rate": 2.92328371614469e-06, + "logits/chosen": -0.4653478264808655, + "logits/rejected": -0.4630056619644165, + "logps/chosen": -0.1631046086549759, + "logps/rejected": -4.455272674560547, + "loss": 0.165, + "odds_ratio_loss": 0.04333974048495293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01631046086549759, + "rewards/margins": 0.42921683192253113, + "rewards/rejected": -0.44552725553512573, + "sft_loss": 0.1631046086549759, + "step": 2474 + }, + { + "epoch": 3.579175704989154, + "grad_norm": 1.9881278080353881, + "learning_rate": 2.9202928033319802e-06, + "logits/chosen": -0.486683189868927, + "logits/rejected": -0.4148131012916565, + "logps/chosen": -0.23954859375953674, + "logps/rejected": -4.019528388977051, + "loss": 0.2333, + "odds_ratio_loss": 0.03935012221336365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023954860866069794, + "rewards/margins": 0.3779979944229126, + "rewards/rejected": -0.4019528329372406, + "sft_loss": 0.23954859375953674, + "step": 2475 + }, + { + "epoch": 3.580621836587129, + "grad_norm": 1.8702974706037494, + "learning_rate": 2.917302541478759e-06, + "logits/chosen": -0.6687430143356323, + "logits/rejected": -0.4045840799808502, + "logps/chosen": -0.10321912914514542, + "logps/rejected": -4.325726509094238, + "loss": 0.1571, + "odds_ratio_loss": 0.013897450640797615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010321913287043571, + "rewards/margins": 0.42225074768066406, + "rewards/rejected": -0.4325726628303528, + "sft_loss": 0.10321912914514542, + "step": 2476 + }, + { + "epoch": 3.582067968185105, + "grad_norm": 2.2813330190561665, + "learning_rate": 2.9143129323878688e-06, + "logits/chosen": -0.36619487404823303, + "logits/rejected": -0.3331270217895508, + "logps/chosen": -0.19207827746868134, + "logps/rejected": -4.304910659790039, + "loss": 0.1878, + "odds_ratio_loss": 0.022445132955908775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019207827746868134, + "rewards/margins": 0.4112831950187683, + "rewards/rejected": -0.4304910898208618, + "sft_loss": 0.19207827746868134, + "step": 2477 + }, + { + "epoch": 3.58351409978308, + "grad_norm": 2.1589894888217187, + "learning_rate": 2.911323977861755e-06, + "logits/chosen": -0.5841609239578247, + "logits/rejected": -0.5109463334083557, + "logps/chosen": -0.11602534353733063, + "logps/rejected": -3.624415874481201, + "loss": 0.1854, + "odds_ratio_loss": 0.032501354813575745, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011602534912526608, + "rewards/margins": 0.3508390784263611, + "rewards/rejected": -0.3624415993690491, + "sft_loss": 0.11602534353733063, + "step": 2478 + }, + { + "epoch": 3.584960231381056, + "grad_norm": 1.8654817230355145, + "learning_rate": 2.9083356797024704e-06, + "logits/chosen": -0.652426540851593, + "logits/rejected": -0.609657347202301, + "logps/chosen": -0.3062911033630371, + "logps/rejected": -3.556048631668091, + "loss": 0.1912, + "odds_ratio_loss": 0.04985497146844864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03062911331653595, + "rewards/margins": 0.32497575879096985, + "rewards/rejected": -0.3556048572063446, + "sft_loss": 0.3062911033630371, + "step": 2479 + }, + { + "epoch": 3.586406362979031, + "grad_norm": 2.3939535042376523, + "learning_rate": 2.9053480397116684e-06, + "logits/chosen": -0.4469086229801178, + "logits/rejected": -0.3033689856529236, + "logps/chosen": -0.14823874831199646, + "logps/rejected": -3.7784619331359863, + "loss": 0.1764, + "odds_ratio_loss": 0.04784733057022095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01482387538999319, + "rewards/margins": 0.3630222976207733, + "rewards/rejected": -0.3778461813926697, + "sft_loss": 0.14823874831199646, + "step": 2480 + }, + { + "epoch": 3.5878524945770067, + "grad_norm": 3.091688427595171, + "learning_rate": 2.902361059690612e-06, + "logits/chosen": -0.5378003716468811, + "logits/rejected": -0.3790894150733948, + "logps/chosen": -0.21176090836524963, + "logps/rejected": -4.0171709060668945, + "loss": 0.2286, + "odds_ratio_loss": 0.05296362191438675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021176090463995934, + "rewards/margins": 0.3805410861968994, + "rewards/rejected": -0.4017171561717987, + "sft_loss": 0.21176090836524963, + "step": 2481 + }, + { + "epoch": 3.589298626174982, + "grad_norm": 1.936186621813689, + "learning_rate": 2.8993747414401597e-06, + "logits/chosen": -0.487335741519928, + "logits/rejected": -0.40547001361846924, + "logps/chosen": -0.30852556228637695, + "logps/rejected": -3.569243907928467, + "loss": 0.1835, + "odds_ratio_loss": 0.05021437630057335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030852556228637695, + "rewards/margins": 0.3260718584060669, + "rewards/rejected": -0.3569244146347046, + "sft_loss": 0.30852556228637695, + "step": 2482 + }, + { + "epoch": 3.590744757772957, + "grad_norm": 1.8409295208559, + "learning_rate": 2.8963890867607757e-06, + "logits/chosen": -0.6406811475753784, + "logits/rejected": -0.5087481737136841, + "logps/chosen": -0.23279646039009094, + "logps/rejected": -3.4700019359588623, + "loss": 0.1946, + "odds_ratio_loss": 0.03444098308682442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023279648274183273, + "rewards/margins": 0.3237205743789673, + "rewards/rejected": -0.3470001816749573, + "sft_loss": 0.23279646039009094, + "step": 2483 + }, + { + "epoch": 3.592190889370933, + "grad_norm": 1.8898837799392554, + "learning_rate": 2.8934040974525206e-06, + "logits/chosen": -0.5216646790504456, + "logits/rejected": -0.5978440046310425, + "logps/chosen": -0.17544230818748474, + "logps/rejected": -4.052137851715088, + "loss": 0.1655, + "odds_ratio_loss": 0.03247702866792679, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017544232308864594, + "rewards/margins": 0.38766956329345703, + "rewards/rejected": -0.4052138030529022, + "sft_loss": 0.17544230818748474, + "step": 2484 + }, + { + "epoch": 3.593637020968908, + "grad_norm": 2.187248762834126, + "learning_rate": 2.890419775315057e-06, + "logits/chosen": -0.6240389347076416, + "logits/rejected": -0.5031851530075073, + "logps/chosen": -0.13512706756591797, + "logps/rejected": -3.0444791316986084, + "loss": 0.2024, + "odds_ratio_loss": 0.0328708216547966, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013512706384062767, + "rewards/margins": 0.2909351885318756, + "rewards/rejected": -0.30444788932800293, + "sft_loss": 0.13512706756591797, + "step": 2485 + }, + { + "epoch": 3.5950831525668834, + "grad_norm": 2.204519299865527, + "learning_rate": 2.887436122147644e-06, + "logits/chosen": -0.4692801535129547, + "logits/rejected": -0.44944295287132263, + "logps/chosen": -0.33308011293411255, + "logps/rejected": -3.2959773540496826, + "loss": 0.2273, + "odds_ratio_loss": 0.08923923969268799, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033308010548353195, + "rewards/margins": 0.29628971219062805, + "rewards/rejected": -0.32959771156311035, + "sft_loss": 0.33308011293411255, + "step": 2486 + }, + { + "epoch": 3.596529284164859, + "grad_norm": 2.228837722865733, + "learning_rate": 2.884453139749135e-06, + "logits/chosen": -0.5567612051963806, + "logits/rejected": -0.4138484597206116, + "logps/chosen": -0.15191403031349182, + "logps/rejected": -5.067747116088867, + "loss": 0.1797, + "odds_ratio_loss": 0.020546402782201767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015191404148936272, + "rewards/margins": 0.49158334732055664, + "rewards/rejected": -0.5067747831344604, + "sft_loss": 0.15191403031349182, + "step": 2487 + }, + { + "epoch": 3.5979754157628343, + "grad_norm": 2.245458739312704, + "learning_rate": 2.881470829917984e-06, + "logits/chosen": -0.5933226346969604, + "logits/rejected": -0.567808985710144, + "logps/chosen": -0.255230188369751, + "logps/rejected": -3.186595916748047, + "loss": 0.2138, + "odds_ratio_loss": 0.07271232455968857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02552301622927189, + "rewards/margins": 0.2931365668773651, + "rewards/rejected": -0.31865960359573364, + "sft_loss": 0.255230188369751, + "step": 2488 + }, + { + "epoch": 3.59942154736081, + "grad_norm": 1.9277082887715968, + "learning_rate": 2.8784891944522356e-06, + "logits/chosen": -0.5661904215812683, + "logits/rejected": -0.4054012596607208, + "logps/chosen": -0.2036726176738739, + "logps/rejected": -4.150453567504883, + "loss": 0.1813, + "odds_ratio_loss": 0.03672172129154205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02036726474761963, + "rewards/margins": 0.3946780562400818, + "rewards/rejected": -0.4150453805923462, + "sft_loss": 0.2036726176738739, + "step": 2489 + }, + { + "epoch": 3.6008676789587852, + "grad_norm": 2.086005099661083, + "learning_rate": 2.875508235149529e-06, + "logits/chosen": -0.3694448471069336, + "logits/rejected": -0.33086222410202026, + "logps/chosen": -0.23983649909496307, + "logps/rejected": -4.911922931671143, + "loss": 0.1822, + "odds_ratio_loss": 0.07498527318239212, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.023983649909496307, + "rewards/margins": 0.4672086834907532, + "rewards/rejected": -0.4911923110485077, + "sft_loss": 0.23983649909496307, + "step": 2490 + }, + { + "epoch": 3.602313810556761, + "grad_norm": 2.3194389065513, + "learning_rate": 2.872527953807094e-06, + "logits/chosen": -0.6120354533195496, + "logits/rejected": -0.3980521261692047, + "logps/chosen": -0.21247637271881104, + "logps/rejected": -3.412698268890381, + "loss": 0.1507, + "odds_ratio_loss": 0.04428691789507866, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021247640252113342, + "rewards/margins": 0.3200221061706543, + "rewards/rejected": -0.34126976132392883, + "sft_loss": 0.21247637271881104, + "step": 2491 + }, + { + "epoch": 3.603759942154736, + "grad_norm": 2.2912130950801246, + "learning_rate": 2.869548352221757e-06, + "logits/chosen": -0.5996018648147583, + "logits/rejected": -0.38249221444129944, + "logps/chosen": -0.14581993222236633, + "logps/rejected": -4.9018731117248535, + "loss": 0.1737, + "odds_ratio_loss": 0.023255767300724983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014581995084881783, + "rewards/margins": 0.4756053388118744, + "rewards/rejected": -0.4901873469352722, + "sft_loss": 0.14581993222236633, + "step": 2492 + }, + { + "epoch": 3.6052060737527114, + "grad_norm": 2.3023457380266303, + "learning_rate": 2.8665694321899275e-06, + "logits/chosen": -0.5616737604141235, + "logits/rejected": -0.36620962619781494, + "logps/chosen": -0.09285837411880493, + "logps/rejected": -4.530645847320557, + "loss": 0.2018, + "odds_ratio_loss": 0.016189442947506905, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009285837411880493, + "rewards/margins": 0.44377875328063965, + "rewards/rejected": -0.45306459069252014, + "sft_loss": 0.09285837411880493, + "step": 2493 + }, + { + "epoch": 3.606652205350687, + "grad_norm": 1.8296179592059094, + "learning_rate": 2.863591195507609e-06, + "logits/chosen": -0.43233737349510193, + "logits/rejected": -0.42928576469421387, + "logps/chosen": -0.1452055722475052, + "logps/rejected": -4.242135524749756, + "loss": 0.1593, + "odds_ratio_loss": 0.04257887601852417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014520557597279549, + "rewards/margins": 0.40969300270080566, + "rewards/rejected": -0.4242135286331177, + "sft_loss": 0.1452055722475052, + "step": 2494 + }, + { + "epoch": 3.6080983369486623, + "grad_norm": 1.8562477051616662, + "learning_rate": 2.86061364397039e-06, + "logits/chosen": -0.468068391084671, + "logits/rejected": -0.40226829051971436, + "logps/chosen": -0.14415565133094788, + "logps/rejected": -3.4453213214874268, + "loss": 0.1654, + "odds_ratio_loss": 0.024850863963365555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014415565878152847, + "rewards/margins": 0.3301165699958801, + "rewards/rejected": -0.34453216195106506, + "sft_loss": 0.14415565133094788, + "step": 2495 + }, + { + "epoch": 3.6095444685466376, + "grad_norm": 2.0000474708923717, + "learning_rate": 2.8576367793734506e-06, + "logits/chosen": -0.297725111246109, + "logits/rejected": -0.36155325174331665, + "logps/chosen": -0.20408833026885986, + "logps/rejected": -3.36604905128479, + "loss": 0.2305, + "odds_ratio_loss": 0.037189140915870667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020408835262060165, + "rewards/margins": 0.316196084022522, + "rewards/rejected": -0.33660489320755005, + "sft_loss": 0.20408833026885986, + "step": 2496 + }, + { + "epoch": 3.6109906001446133, + "grad_norm": 1.9603399607152372, + "learning_rate": 2.8546606035115498e-06, + "logits/chosen": -0.6297708749771118, + "logits/rejected": -0.5421985387802124, + "logps/chosen": -0.3003847301006317, + "logps/rejected": -2.903427839279175, + "loss": 0.2134, + "odds_ratio_loss": 0.09296062588691711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03003847599029541, + "rewards/margins": 0.26030433177948, + "rewards/rejected": -0.2903428077697754, + "sft_loss": 0.3003847301006317, + "step": 2497 + }, + { + "epoch": 3.6124367317425885, + "grad_norm": 1.8908173660949101, + "learning_rate": 2.8516851181790384e-06, + "logits/chosen": -0.4586806297302246, + "logits/rejected": -0.3453059494495392, + "logps/chosen": -0.07760489732027054, + "logps/rejected": -4.848832607269287, + "loss": 0.1691, + "odds_ratio_loss": 0.03154202550649643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007760489825159311, + "rewards/margins": 0.4771227538585663, + "rewards/rejected": -0.48488324880599976, + "sft_loss": 0.07760489732027054, + "step": 2498 + }, + { + "epoch": 3.6138828633405637, + "grad_norm": 1.9129309499246139, + "learning_rate": 2.848710325169844e-06, + "logits/chosen": -0.5357916951179504, + "logits/rejected": -0.47928518056869507, + "logps/chosen": -0.09794984012842178, + "logps/rejected": -5.796481609344482, + "loss": 0.1743, + "odds_ratio_loss": 0.015042275190353394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009794985875487328, + "rewards/margins": 0.5698531866073608, + "rewards/rejected": -0.5796481966972351, + "sft_loss": 0.09794984012842178, + "step": 2499 + }, + { + "epoch": 3.6153289949385394, + "grad_norm": 3.275069979062189, + "learning_rate": 2.8457362262774825e-06, + "logits/chosen": -0.3791963458061218, + "logits/rejected": -0.27261611819267273, + "logps/chosen": -0.1412392556667328, + "logps/rejected": -6.682195663452148, + "loss": 0.1799, + "odds_ratio_loss": 0.036368854343891144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014123925939202309, + "rewards/margins": 0.6540957093238831, + "rewards/rejected": -0.6682195663452148, + "sft_loss": 0.1412392556667328, + "step": 2500 + }, + { + "epoch": 3.6167751265365147, + "grad_norm": 1.9873717801084936, + "learning_rate": 2.8427628232950504e-06, + "logits/chosen": -0.5378691554069519, + "logits/rejected": -0.48928403854370117, + "logps/chosen": -0.1797875463962555, + "logps/rejected": -3.5523314476013184, + "loss": 0.1742, + "odds_ratio_loss": 0.04132210463285446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01797875389456749, + "rewards/margins": 0.3372544050216675, + "rewards/rejected": -0.35523316264152527, + "sft_loss": 0.1797875463962555, + "step": 2501 + }, + { + "epoch": 3.6182212581344904, + "grad_norm": 1.8980601051441697, + "learning_rate": 2.8397901180152223e-06, + "logits/chosen": -0.5706363916397095, + "logits/rejected": -0.407425194978714, + "logps/chosen": -0.15050289034843445, + "logps/rejected": -3.6332499980926514, + "loss": 0.1993, + "odds_ratio_loss": 0.042995937168598175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01505028922110796, + "rewards/margins": 0.34827476739883423, + "rewards/rejected": -0.36332499980926514, + "sft_loss": 0.15050289034843445, + "step": 2502 + }, + { + "epoch": 3.6196673897324656, + "grad_norm": 2.118084164458217, + "learning_rate": 2.8368181122302525e-06, + "logits/chosen": -0.3179455101490021, + "logits/rejected": -0.25522419810295105, + "logps/chosen": -0.1424923986196518, + "logps/rejected": -5.086899280548096, + "loss": 0.1693, + "odds_ratio_loss": 0.03211604803800583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014249240048229694, + "rewards/margins": 0.4944407045841217, + "rewards/rejected": -0.5086899399757385, + "sft_loss": 0.1424923986196518, + "step": 2503 + }, + { + "epoch": 3.6211135213304413, + "grad_norm": 2.6548176127386793, + "learning_rate": 2.833846807731975e-06, + "logits/chosen": -0.4786378741264343, + "logits/rejected": -0.43536630272865295, + "logps/chosen": -0.0729706808924675, + "logps/rejected": -3.080990791320801, + "loss": 0.1645, + "odds_ratio_loss": 0.021972548216581345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00729706883430481, + "rewards/margins": 0.30080199241638184, + "rewards/rejected": -0.30809906125068665, + "sft_loss": 0.0729706808924675, + "step": 2504 + }, + { + "epoch": 3.6225596529284165, + "grad_norm": 2.5135474712614045, + "learning_rate": 2.8308762063118006e-06, + "logits/chosen": -0.4966127574443817, + "logits/rejected": -0.5303236842155457, + "logps/chosen": -0.17419995367527008, + "logps/rejected": -3.1192662715911865, + "loss": 0.1724, + "odds_ratio_loss": 0.04412810876965523, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01741999387741089, + "rewards/margins": 0.29450663924217224, + "rewards/rejected": -0.31192663311958313, + "sft_loss": 0.17419995367527008, + "step": 2505 + }, + { + "epoch": 3.6240057845263918, + "grad_norm": 2.1262943897743747, + "learning_rate": 2.8279063097607156e-06, + "logits/chosen": -0.6062009334564209, + "logits/rejected": -0.5596913695335388, + "logps/chosen": -0.1444343477487564, + "logps/rejected": -2.005614757537842, + "loss": 0.1655, + "odds_ratio_loss": 0.03216197341680527, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014443434774875641, + "rewards/margins": 0.18611805140972137, + "rewards/rejected": -0.20056146383285522, + "sft_loss": 0.1444343477487564, + "step": 2506 + }, + { + "epoch": 3.6254519161243675, + "grad_norm": 2.086089991067271, + "learning_rate": 2.8249371198692827e-06, + "logits/chosen": -0.6401376724243164, + "logits/rejected": -0.5771905183792114, + "logps/chosen": -0.13659507036209106, + "logps/rejected": -4.549966812133789, + "loss": 0.1906, + "odds_ratio_loss": 0.015896422788500786, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013659507036209106, + "rewards/margins": 0.4413371682167053, + "rewards/rejected": -0.45499664545059204, + "sft_loss": 0.13659507036209106, + "step": 2507 + }, + { + "epoch": 3.6268980477223427, + "grad_norm": 2.169001436836437, + "learning_rate": 2.821968638427634e-06, + "logits/chosen": -0.8902711868286133, + "logits/rejected": -0.500067412853241, + "logps/chosen": -0.11250185966491699, + "logps/rejected": -3.8250503540039062, + "loss": 0.162, + "odds_ratio_loss": 0.036406371742486954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011250186711549759, + "rewards/margins": 0.3712548613548279, + "rewards/rejected": -0.38250505924224854, + "sft_loss": 0.11250185966491699, + "step": 2508 + }, + { + "epoch": 3.628344179320318, + "grad_norm": 13.832662348150325, + "learning_rate": 2.8190008672254835e-06, + "logits/chosen": -0.7249534726142883, + "logits/rejected": -0.6150829792022705, + "logps/chosen": -0.13645420968532562, + "logps/rejected": -2.8476240634918213, + "loss": 0.1885, + "odds_ratio_loss": 0.03549912944436073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013645419850945473, + "rewards/margins": 0.2711169719696045, + "rewards/rejected": -0.2847624123096466, + "sft_loss": 0.13645420968532562, + "step": 2509 + }, + { + "epoch": 3.6297903109182936, + "grad_norm": 2.2782044948197258, + "learning_rate": 2.8160338080521074e-06, + "logits/chosen": -0.5967862606048584, + "logits/rejected": -0.6156792640686035, + "logps/chosen": -0.15163278579711914, + "logps/rejected": -3.7114038467407227, + "loss": 0.177, + "odds_ratio_loss": 0.03112233430147171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015163278207182884, + "rewards/margins": 0.3559771180152893, + "rewards/rejected": -0.37114039063453674, + "sft_loss": 0.15163278579711914, + "step": 2510 + }, + { + "epoch": 3.631236442516269, + "grad_norm": 2.1207273904138946, + "learning_rate": 2.8130674626963586e-06, + "logits/chosen": -0.5536920428276062, + "logits/rejected": -0.49420982599258423, + "logps/chosen": -0.18125204741954803, + "logps/rejected": -2.170628070831299, + "loss": 0.1629, + "odds_ratio_loss": 0.03468827158212662, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018125206232070923, + "rewards/margins": 0.19893759489059448, + "rewards/rejected": -0.2170628309249878, + "sft_loss": 0.18125204741954803, + "step": 2511 + }, + { + "epoch": 3.6326825741142446, + "grad_norm": 2.087165380567483, + "learning_rate": 2.8101018329466557e-06, + "logits/chosen": -0.529472827911377, + "logits/rejected": -0.3276049494743347, + "logps/chosen": -0.2205849587917328, + "logps/rejected": -5.405893802642822, + "loss": 0.1696, + "odds_ratio_loss": 0.04581043869256973, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02205849625170231, + "rewards/margins": 0.5185309052467346, + "rewards/rejected": -0.5405893921852112, + "sft_loss": 0.2205849587917328, + "step": 2512 + }, + { + "epoch": 3.63412870571222, + "grad_norm": 4.15655797636779, + "learning_rate": 2.8071369205909904e-06, + "logits/chosen": -0.5813817381858826, + "logits/rejected": -0.5162385702133179, + "logps/chosen": -0.16202031075954437, + "logps/rejected": -4.27128791809082, + "loss": 0.1493, + "odds_ratio_loss": 0.04542159289121628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016202032566070557, + "rewards/margins": 0.4109267294406891, + "rewards/rejected": -0.42712879180908203, + "sft_loss": 0.16202031075954437, + "step": 2513 + }, + { + "epoch": 3.6355748373101955, + "grad_norm": 2.1316330685140072, + "learning_rate": 2.804172727416919e-06, + "logits/chosen": -0.566758394241333, + "logits/rejected": -0.4026070237159729, + "logps/chosen": -0.2781253159046173, + "logps/rejected": -4.5679779052734375, + "loss": 0.2684, + "odds_ratio_loss": 0.0528629794716835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02781253308057785, + "rewards/margins": 0.42898523807525635, + "rewards/rejected": -0.4567977786064148, + "sft_loss": 0.2781253159046173, + "step": 2514 + }, + { + "epoch": 3.6370209689081707, + "grad_norm": 2.2518564789198323, + "learning_rate": 2.801209255211562e-06, + "logits/chosen": -0.5290085077285767, + "logits/rejected": -0.4111337661743164, + "logps/chosen": -0.15597251057624817, + "logps/rejected": -3.711620807647705, + "loss": 0.2034, + "odds_ratio_loss": 0.033812057226896286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015597251243889332, + "rewards/margins": 0.35556483268737793, + "rewards/rejected": -0.3711620569229126, + "sft_loss": 0.15597251057624817, + "step": 2515 + }, + { + "epoch": 3.638467100506146, + "grad_norm": 1.9427894630685738, + "learning_rate": 2.798246505761612e-06, + "logits/chosen": -0.580420732498169, + "logits/rejected": -0.45197558403015137, + "logps/chosen": -0.1896083801984787, + "logps/rejected": -5.463057518005371, + "loss": 0.1589, + "odds_ratio_loss": 0.029428288340568542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01896083913743496, + "rewards/margins": 0.5273449420928955, + "rewards/rejected": -0.546305775642395, + "sft_loss": 0.1896083801984787, + "step": 2516 + }, + { + "epoch": 3.6399132321041217, + "grad_norm": 1.704300930390487, + "learning_rate": 2.7952844808533185e-06, + "logits/chosen": -0.5066642165184021, + "logits/rejected": -0.4148600101470947, + "logps/chosen": -0.13526996970176697, + "logps/rejected": -5.089852333068848, + "loss": 0.1713, + "odds_ratio_loss": 0.030814705416560173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013526996597647667, + "rewards/margins": 0.4954582452774048, + "rewards/rejected": -0.5089852213859558, + "sft_loss": 0.13526996970176697, + "step": 2517 + }, + { + "epoch": 3.641359363702097, + "grad_norm": 1.8499325363659644, + "learning_rate": 2.792323182272499e-06, + "logits/chosen": -0.46863168478012085, + "logits/rejected": -0.4521971642971039, + "logps/chosen": -0.13161855936050415, + "logps/rejected": -3.217064380645752, + "loss": 0.1664, + "odds_ratio_loss": 0.03563810884952545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0131618557497859, + "rewards/margins": 0.3085446059703827, + "rewards/rejected": -0.32170647382736206, + "sft_loss": 0.13161855936050415, + "step": 2518 + }, + { + "epoch": 3.642805495300072, + "grad_norm": 2.025316530793158, + "learning_rate": 2.789362611804529e-06, + "logits/chosen": -0.49984610080718994, + "logits/rejected": -0.515692949295044, + "logps/chosen": -0.22088411450386047, + "logps/rejected": -4.2113823890686035, + "loss": 0.2018, + "odds_ratio_loss": 0.0324944369494915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022088412195444107, + "rewards/margins": 0.3990498185157776, + "rewards/rejected": -0.4211382269859314, + "sft_loss": 0.22088411450386047, + "step": 2519 + }, + { + "epoch": 3.644251626898048, + "grad_norm": 1.976339663757813, + "learning_rate": 2.7864027712343513e-06, + "logits/chosen": -0.45987632870674133, + "logits/rejected": -0.3473128378391266, + "logps/chosen": -0.10946309566497803, + "logps/rejected": -4.030787944793701, + "loss": 0.1518, + "odds_ratio_loss": 0.04007424786686897, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010946309193968773, + "rewards/margins": 0.3921324908733368, + "rewards/rejected": -0.4030787944793701, + "sft_loss": 0.10946309566497803, + "step": 2520 + }, + { + "epoch": 3.645697758496023, + "grad_norm": 2.0676256585095576, + "learning_rate": 2.7834436623464616e-06, + "logits/chosen": -0.6962200403213501, + "logits/rejected": -0.5652716755867004, + "logps/chosen": -0.20560969412326813, + "logps/rejected": -4.473116397857666, + "loss": 0.2235, + "odds_ratio_loss": 0.03834813833236694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020560970529913902, + "rewards/margins": 0.42675071954727173, + "rewards/rejected": -0.447311669588089, + "sft_loss": 0.20560969412326813, + "step": 2521 + }, + { + "epoch": 3.6471438900939983, + "grad_norm": 1.9786619388491158, + "learning_rate": 2.78048528692492e-06, + "logits/chosen": -0.4101904332637787, + "logits/rejected": -0.3749925494194031, + "logps/chosen": -0.2814168632030487, + "logps/rejected": -2.196606159210205, + "loss": 0.1929, + "odds_ratio_loss": 0.06532847881317139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0281416866928339, + "rewards/margins": 0.19151893258094788, + "rewards/rejected": -0.21966060996055603, + "sft_loss": 0.2814168632030487, + "step": 2522 + }, + { + "epoch": 3.648590021691974, + "grad_norm": 1.8923559183457217, + "learning_rate": 2.777527646753339e-06, + "logits/chosen": -0.533107340335846, + "logits/rejected": -0.37154343724250793, + "logps/chosen": -0.18305464088916779, + "logps/rejected": -3.3793654441833496, + "loss": 0.1757, + "odds_ratio_loss": 0.03871382027864456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018305467441678047, + "rewards/margins": 0.31963109970092773, + "rewards/rejected": -0.3379365801811218, + "sft_loss": 0.18305464088916779, + "step": 2523 + }, + { + "epoch": 3.6500361532899492, + "grad_norm": 2.4175380591242486, + "learning_rate": 2.774570743614894e-06, + "logits/chosen": -0.6140806078910828, + "logits/rejected": -0.43565231561660767, + "logps/chosen": -0.2680893838405609, + "logps/rejected": -3.991342306137085, + "loss": 0.2151, + "odds_ratio_loss": 0.14778588712215424, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.02680893987417221, + "rewards/margins": 0.3723253309726715, + "rewards/rejected": -0.39913424849510193, + "sft_loss": 0.2680893838405609, + "step": 2524 + }, + { + "epoch": 3.651482284887925, + "grad_norm": 2.0920122323863817, + "learning_rate": 2.7716145792923114e-06, + "logits/chosen": -0.3725340962409973, + "logits/rejected": -0.392286479473114, + "logps/chosen": -0.19242461025714874, + "logps/rejected": -2.6325159072875977, + "loss": 0.2413, + "odds_ratio_loss": 0.041729189455509186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019242461770772934, + "rewards/margins": 0.2440091371536255, + "rewards/rejected": -0.2632516026496887, + "sft_loss": 0.19242461025714874, + "step": 2525 + }, + { + "epoch": 3.6529284164859, + "grad_norm": 1.7977951192621633, + "learning_rate": 2.7686591555678725e-06, + "logits/chosen": -0.3152431845664978, + "logits/rejected": -0.36898574233055115, + "logps/chosen": -0.09872251749038696, + "logps/rejected": -5.19644021987915, + "loss": 0.1324, + "odds_ratio_loss": 0.01986844837665558, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009872252121567726, + "rewards/margins": 0.5097717642784119, + "rewards/rejected": -0.519644021987915, + "sft_loss": 0.09872251749038696, + "step": 2526 + }, + { + "epoch": 3.654374548083876, + "grad_norm": 2.1765409277692904, + "learning_rate": 2.765704474223417e-06, + "logits/chosen": -0.5735154151916504, + "logits/rejected": -0.43699342012405396, + "logps/chosen": -0.194193035364151, + "logps/rejected": -4.180932521820068, + "loss": 0.2277, + "odds_ratio_loss": 0.043235015124082565, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01941930502653122, + "rewards/margins": 0.3986739218235016, + "rewards/rejected": -0.4180932641029358, + "sft_loss": 0.194193035364151, + "step": 2527 + }, + { + "epoch": 3.655820679681851, + "grad_norm": 1.986485149439459, + "learning_rate": 2.76275053704033e-06, + "logits/chosen": -0.6245859265327454, + "logits/rejected": -0.5641262531280518, + "logps/chosen": -0.1828930377960205, + "logps/rejected": -3.177208185195923, + "loss": 0.1233, + "odds_ratio_loss": 0.0734827071428299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01828930340707302, + "rewards/margins": 0.2994315028190613, + "rewards/rejected": -0.31772083044052124, + "sft_loss": 0.1828930377960205, + "step": 2528 + }, + { + "epoch": 3.6572668112798263, + "grad_norm": 2.2808936138834874, + "learning_rate": 2.759797345799553e-06, + "logits/chosen": -0.5670177936553955, + "logits/rejected": -0.3730791509151459, + "logps/chosen": -0.17481276392936707, + "logps/rejected": -4.819494247436523, + "loss": 0.2586, + "odds_ratio_loss": 0.023719413205981255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017481276765465736, + "rewards/margins": 0.4644681513309479, + "rewards/rejected": -0.48194941878318787, + "sft_loss": 0.17481276392936707, + "step": 2529 + }, + { + "epoch": 3.658712942877802, + "grad_norm": 2.5614184083505975, + "learning_rate": 2.7568449022815737e-06, + "logits/chosen": -0.6097587943077087, + "logits/rejected": -0.42381176352500916, + "logps/chosen": -0.11319475620985031, + "logps/rejected": -5.370057106018066, + "loss": 0.1716, + "odds_ratio_loss": 0.013680643402040005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011319476179778576, + "rewards/margins": 0.5256862640380859, + "rewards/rejected": -0.5370057821273804, + "sft_loss": 0.11319475620985031, + "step": 2530 + }, + { + "epoch": 3.6601590744757773, + "grad_norm": 2.1000222835332836, + "learning_rate": 2.7538932082664337e-06, + "logits/chosen": -0.6866092085838318, + "logits/rejected": -0.6213411092758179, + "logps/chosen": -0.1459658145904541, + "logps/rejected": -2.8110921382904053, + "loss": 0.1863, + "odds_ratio_loss": 0.04226803034543991, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01459658145904541, + "rewards/margins": 0.2665126323699951, + "rewards/rejected": -0.2811092138290405, + "sft_loss": 0.1459658145904541, + "step": 2531 + }, + { + "epoch": 3.6616052060737525, + "grad_norm": 2.349668505867712, + "learning_rate": 2.7509422655337194e-06, + "logits/chosen": -0.748806893825531, + "logits/rejected": -0.5211739540100098, + "logps/chosen": -0.11192212998867035, + "logps/rejected": -5.20810604095459, + "loss": 0.1486, + "odds_ratio_loss": 0.027064472436904907, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011192213743925095, + "rewards/margins": 0.5096184015274048, + "rewards/rejected": -0.520810604095459, + "sft_loss": 0.11192212998867035, + "step": 2532 + }, + { + "epoch": 3.663051337671728, + "grad_norm": 1.9229196234297072, + "learning_rate": 2.747992075862566e-06, + "logits/chosen": -0.5800803899765015, + "logits/rejected": -0.5336440801620483, + "logps/chosen": -0.21885840594768524, + "logps/rejected": -3.05794620513916, + "loss": 0.1674, + "odds_ratio_loss": 0.033811405301094055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021885842084884644, + "rewards/margins": 0.28390878438949585, + "rewards/rejected": -0.3057945966720581, + "sft_loss": 0.21885840594768524, + "step": 2533 + }, + { + "epoch": 3.6644974692697034, + "grad_norm": 1.9210763154156887, + "learning_rate": 2.7450426410316515e-06, + "logits/chosen": -0.6399502754211426, + "logits/rejected": -0.5182642340660095, + "logps/chosen": -0.2076047956943512, + "logps/rejected": -4.193682670593262, + "loss": 0.1769, + "odds_ratio_loss": 0.045763999223709106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02076047845184803, + "rewards/margins": 0.3986077904701233, + "rewards/rejected": -0.41936826705932617, + "sft_loss": 0.2076047956943512, + "step": 2534 + }, + { + "epoch": 3.665943600867679, + "grad_norm": 2.011546723686748, + "learning_rate": 2.7420939628192044e-06, + "logits/chosen": -0.8048521280288696, + "logits/rejected": -0.5235615968704224, + "logps/chosen": -0.17272844910621643, + "logps/rejected": -4.574443817138672, + "loss": 0.1633, + "odds_ratio_loss": 0.0383070707321167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017272844910621643, + "rewards/margins": 0.44017156958580017, + "rewards/rejected": -0.457444429397583, + "sft_loss": 0.17272844910621643, + "step": 2535 + }, + { + "epoch": 3.6673897324656544, + "grad_norm": 1.9680305398978295, + "learning_rate": 2.739146043002991e-06, + "logits/chosen": -0.6656097173690796, + "logits/rejected": -0.5541112422943115, + "logps/chosen": -0.1563243865966797, + "logps/rejected": -3.3513598442077637, + "loss": 0.171, + "odds_ratio_loss": 0.022492770105600357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01563243940472603, + "rewards/margins": 0.3195035457611084, + "rewards/rejected": -0.3351359963417053, + "sft_loss": 0.1563243865966797, + "step": 2536 + }, + { + "epoch": 3.66883586406363, + "grad_norm": 1.9549831394889021, + "learning_rate": 2.736198883360324e-06, + "logits/chosen": -0.5706369876861572, + "logits/rejected": -0.43308964371681213, + "logps/chosen": -0.24978458881378174, + "logps/rejected": -3.1274073123931885, + "loss": 0.2132, + "odds_ratio_loss": 0.05592186748981476, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024978458881378174, + "rewards/margins": 0.28776225447654724, + "rewards/rejected": -0.3127407133579254, + "sft_loss": 0.24978458881378174, + "step": 2537 + }, + { + "epoch": 3.6702819956616053, + "grad_norm": 2.8400570515450774, + "learning_rate": 2.733252485668057e-06, + "logits/chosen": -0.6471999883651733, + "logits/rejected": -0.5394845008850098, + "logps/chosen": -0.189593106508255, + "logps/rejected": -2.7674801349639893, + "loss": 0.2171, + "odds_ratio_loss": 0.03962300345301628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01895931176841259, + "rewards/margins": 0.2577887177467346, + "rewards/rejected": -0.27674800157546997, + "sft_loss": 0.189593106508255, + "step": 2538 + }, + { + "epoch": 3.6717281272595805, + "grad_norm": 2.4733180825080403, + "learning_rate": 2.7303068517025845e-06, + "logits/chosen": -0.5572927594184875, + "logits/rejected": -0.4921252727508545, + "logps/chosen": -0.1227627843618393, + "logps/rejected": -4.144178867340088, + "loss": 0.1404, + "odds_ratio_loss": 0.012044022791087627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01227627694606781, + "rewards/margins": 0.40214163064956665, + "rewards/rejected": -0.41441792249679565, + "sft_loss": 0.1227627843618393, + "step": 2539 + }, + { + "epoch": 3.673174258857556, + "grad_norm": 1.9355292256824457, + "learning_rate": 2.7273619832398405e-06, + "logits/chosen": -0.5252768993377686, + "logits/rejected": -0.4472333788871765, + "logps/chosen": -0.15913286805152893, + "logps/rejected": -4.911133766174316, + "loss": 0.1602, + "odds_ratio_loss": 0.02607126533985138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015913287177681923, + "rewards/margins": 0.4752000570297241, + "rewards/rejected": -0.4911133646965027, + "sft_loss": 0.15913286805152893, + "step": 2540 + }, + { + "epoch": 3.6746203904555315, + "grad_norm": 2.381886666609673, + "learning_rate": 2.724417882055295e-06, + "logits/chosen": -0.5004345774650574, + "logits/rejected": -0.457826167345047, + "logps/chosen": -0.16766560077667236, + "logps/rejected": -2.8187742233276367, + "loss": 0.2033, + "odds_ratio_loss": 0.049446333199739456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016766561195254326, + "rewards/margins": 0.26511088013648987, + "rewards/rejected": -0.28187742829322815, + "sft_loss": 0.16766560077667236, + "step": 2541 + }, + { + "epoch": 3.6760665220535067, + "grad_norm": 2.292433055047217, + "learning_rate": 2.7214745499239613e-06, + "logits/chosen": -0.7124675512313843, + "logits/rejected": -0.5647624135017395, + "logps/chosen": -0.13279548287391663, + "logps/rejected": -2.9666402339935303, + "loss": 0.1804, + "odds_ratio_loss": 0.022926434874534607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013279548846185207, + "rewards/margins": 0.2833844721317291, + "rewards/rejected": -0.2966639995574951, + "sft_loss": 0.13279548287391663, + "step": 2542 + }, + { + "epoch": 3.6775126536514824, + "grad_norm": 2.181098941599247, + "learning_rate": 2.7185319886203825e-06, + "logits/chosen": -0.5184475183486938, + "logits/rejected": -0.35520482063293457, + "logps/chosen": -0.11584815382957458, + "logps/rejected": -3.8595778942108154, + "loss": 0.2275, + "odds_ratio_loss": 0.012421883642673492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011584816500544548, + "rewards/margins": 0.3743729889392853, + "rewards/rejected": -0.3859577775001526, + "sft_loss": 0.11584815382957458, + "step": 2543 + }, + { + "epoch": 3.6789587852494576, + "grad_norm": 1.947842308476255, + "learning_rate": 2.7155901999186407e-06, + "logits/chosen": -0.6304459571838379, + "logits/rejected": -0.3842719793319702, + "logps/chosen": -0.1558356136083603, + "logps/rejected": -4.710226535797119, + "loss": 0.175, + "odds_ratio_loss": 0.029195290058851242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015583561733365059, + "rewards/margins": 0.4554390609264374, + "rewards/rejected": -0.4710226356983185, + "sft_loss": 0.1558356136083603, + "step": 2544 + }, + { + "epoch": 3.680404916847433, + "grad_norm": 2.107354009715289, + "learning_rate": 2.7126491855923497e-06, + "logits/chosen": -0.4662625193595886, + "logits/rejected": -0.3562268316745758, + "logps/chosen": -0.18191561102867126, + "logps/rejected": -5.232119083404541, + "loss": 0.2264, + "odds_ratio_loss": 0.027083907276391983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018191561102867126, + "rewards/margins": 0.5050203800201416, + "rewards/rejected": -0.5232118964195251, + "sft_loss": 0.18191561102867126, + "step": 2545 + }, + { + "epoch": 3.6818510484454086, + "grad_norm": 2.1162458781646594, + "learning_rate": 2.7097089474146608e-06, + "logits/chosen": -0.6267588138580322, + "logits/rejected": -0.47810542583465576, + "logps/chosen": -0.17624418437480927, + "logps/rejected": -2.7139992713928223, + "loss": 0.1855, + "odds_ratio_loss": 0.01949489489197731, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017624419182538986, + "rewards/margins": 0.25377553701400757, + "rewards/rejected": -0.27139994502067566, + "sft_loss": 0.17624418437480927, + "step": 2546 + }, + { + "epoch": 3.683297180043384, + "grad_norm": 1.8981881245598302, + "learning_rate": 2.706769487158251e-06, + "logits/chosen": -0.4763748347759247, + "logits/rejected": -0.4929497241973877, + "logps/chosen": -0.10615742951631546, + "logps/rejected": -2.481844425201416, + "loss": 0.1587, + "odds_ratio_loss": 0.022733446210622787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010615743696689606, + "rewards/margins": 0.2375687062740326, + "rewards/rejected": -0.2481844425201416, + "sft_loss": 0.10615742951631546, + "step": 2547 + }, + { + "epoch": 3.6847433116413595, + "grad_norm": 1.9813324367877208, + "learning_rate": 2.703830806595337e-06, + "logits/chosen": -0.5492920279502869, + "logits/rejected": -0.4389027953147888, + "logps/chosen": -0.20252877473831177, + "logps/rejected": -5.312198638916016, + "loss": 0.1906, + "odds_ratio_loss": 0.031157786026597023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020252875983715057, + "rewards/margins": 0.5109670162200928, + "rewards/rejected": -0.5312198400497437, + "sft_loss": 0.20252877473831177, + "step": 2548 + }, + { + "epoch": 3.6861894432393347, + "grad_norm": 2.584069937077018, + "learning_rate": 2.7008929074976548e-06, + "logits/chosen": -0.43435296416282654, + "logits/rejected": -0.5349177122116089, + "logps/chosen": -0.19991475343704224, + "logps/rejected": -2.423389434814453, + "loss": 0.1821, + "odds_ratio_loss": 0.1047024056315422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.019991476088762283, + "rewards/margins": 0.2223474681377411, + "rewards/rejected": -0.24233895540237427, + "sft_loss": 0.19991475343704224, + "step": 2549 + }, + { + "epoch": 3.6876355748373104, + "grad_norm": 2.117888291413296, + "learning_rate": 2.6979557916364784e-06, + "logits/chosen": -0.48327285051345825, + "logits/rejected": -0.43461471796035767, + "logps/chosen": -0.2285316288471222, + "logps/rejected": -4.459638595581055, + "loss": 0.2034, + "odds_ratio_loss": 0.04889984801411629, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02285316213965416, + "rewards/margins": 0.4231106638908386, + "rewards/rejected": -0.4459638297557831, + "sft_loss": 0.2285316288471222, + "step": 2550 + }, + { + "epoch": 3.6890817064352857, + "grad_norm": 2.312483892866644, + "learning_rate": 2.695019460782603e-06, + "logits/chosen": -0.465861052274704, + "logits/rejected": -0.4098866581916809, + "logps/chosen": -0.2447035163640976, + "logps/rejected": -2.7150635719299316, + "loss": 0.1766, + "odds_ratio_loss": 0.11905589699745178, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.02447035349905491, + "rewards/margins": 0.24703601002693176, + "rewards/rejected": -0.27150633931159973, + "sft_loss": 0.2447035163640976, + "step": 2551 + }, + { + "epoch": 3.690527838033261, + "grad_norm": 1.9446876065784444, + "learning_rate": 2.6920839167063553e-06, + "logits/chosen": -0.6311659812927246, + "logits/rejected": -0.5097159147262573, + "logps/chosen": -0.16333486139774323, + "logps/rejected": -5.158690452575684, + "loss": 0.1929, + "odds_ratio_loss": 0.02804923988878727, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016333486884832382, + "rewards/margins": 0.49953556060791016, + "rewards/rejected": -0.5158690214157104, + "sft_loss": 0.16333486139774323, + "step": 2552 + }, + { + "epoch": 3.6919739696312366, + "grad_norm": 1.9728394412534829, + "learning_rate": 2.689149161177587e-06, + "logits/chosen": -0.5078314542770386, + "logits/rejected": -0.5038515329360962, + "logps/chosen": -0.27855753898620605, + "logps/rejected": -3.2389707565307617, + "loss": 0.2245, + "odds_ratio_loss": 0.06192722171545029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027855753898620605, + "rewards/margins": 0.296041339635849, + "rewards/rejected": -0.3238970935344696, + "sft_loss": 0.27855753898620605, + "step": 2553 + }, + { + "epoch": 3.693420101229212, + "grad_norm": 2.189323575896016, + "learning_rate": 2.6862151959656696e-06, + "logits/chosen": -0.735476016998291, + "logits/rejected": -0.4959479868412018, + "logps/chosen": -0.16790777444839478, + "logps/rejected": -4.156628608703613, + "loss": 0.2023, + "odds_ratio_loss": 0.011840347200632095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016790777444839478, + "rewards/margins": 0.3988720774650574, + "rewards/rejected": -0.41566285490989685, + "sft_loss": 0.16790777444839478, + "step": 2554 + }, + { + "epoch": 3.694866232827187, + "grad_norm": 1.8570832866807923, + "learning_rate": 2.6832820228395054e-06, + "logits/chosen": -0.603441596031189, + "logits/rejected": -0.4443323612213135, + "logps/chosen": -0.15063583850860596, + "logps/rejected": -4.947831630706787, + "loss": 0.1428, + "odds_ratio_loss": 0.022614125162363052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015063582919538021, + "rewards/margins": 0.4797195792198181, + "rewards/rejected": -0.4947831928730011, + "sft_loss": 0.15063583850860596, + "step": 2555 + }, + { + "epoch": 3.6963123644251628, + "grad_norm": 1.9088578967408238, + "learning_rate": 2.6803496435675127e-06, + "logits/chosen": -0.6813327074050903, + "logits/rejected": -0.5897226333618164, + "logps/chosen": -0.23708531260490417, + "logps/rejected": -3.692704200744629, + "loss": 0.1918, + "odds_ratio_loss": 0.04931206628680229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023708531633019447, + "rewards/margins": 0.3455618917942047, + "rewards/rejected": -0.3692704439163208, + "sft_loss": 0.23708531260490417, + "step": 2556 + }, + { + "epoch": 3.697758496023138, + "grad_norm": 2.3605727774829557, + "learning_rate": 2.6774180599176356e-06, + "logits/chosen": -0.7046637535095215, + "logits/rejected": -0.6025267839431763, + "logps/chosen": -0.19786414504051208, + "logps/rejected": -3.777595281600952, + "loss": 0.161, + "odds_ratio_loss": 0.03681756556034088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019786417484283447, + "rewards/margins": 0.3579730987548828, + "rewards/rejected": -0.37775954604148865, + "sft_loss": 0.19786414504051208, + "step": 2557 + }, + { + "epoch": 3.6992046276211137, + "grad_norm": 2.125397589144874, + "learning_rate": 2.674487273657334e-06, + "logits/chosen": -0.3669889569282532, + "logits/rejected": -0.3705841302871704, + "logps/chosen": -0.1971624493598938, + "logps/rejected": -5.799873352050781, + "loss": 0.151, + "odds_ratio_loss": 0.07610459625720978, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.01971624605357647, + "rewards/margins": 0.5602710843086243, + "rewards/rejected": -0.5799873471260071, + "sft_loss": 0.1971624493598938, + "step": 2558 + }, + { + "epoch": 3.700650759219089, + "grad_norm": 1.9753024574913587, + "learning_rate": 2.671557286553594e-06, + "logits/chosen": -0.5993480682373047, + "logits/rejected": -0.564248263835907, + "logps/chosen": -0.29278743267059326, + "logps/rejected": -3.0654852390289307, + "loss": 0.1871, + "odds_ratio_loss": 0.06417261809110641, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029278744012117386, + "rewards/margins": 0.27726981043815613, + "rewards/rejected": -0.306548535823822, + "sft_loss": 0.29278743267059326, + "step": 2559 + }, + { + "epoch": 3.7020968908170646, + "grad_norm": 2.347577217983018, + "learning_rate": 2.6686281003729126e-06, + "logits/chosen": -0.8407468795776367, + "logits/rejected": -0.6318446397781372, + "logps/chosen": -0.1679374873638153, + "logps/rejected": -4.08281135559082, + "loss": 0.1522, + "odds_ratio_loss": 0.029092181473970413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0167937483638525, + "rewards/margins": 0.39148736000061035, + "rewards/rejected": -0.408281147480011, + "sft_loss": 0.1679374873638153, + "step": 2560 + }, + { + "epoch": 3.70354302241504, + "grad_norm": 1.9624703787237274, + "learning_rate": 2.6656997168813085e-06, + "logits/chosen": -0.46395984292030334, + "logits/rejected": -0.49898120760917664, + "logps/chosen": -0.29700911045074463, + "logps/rejected": -5.381232261657715, + "loss": 0.198, + "odds_ratio_loss": 0.07508499175310135, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.029700906947255135, + "rewards/margins": 0.508422315120697, + "rewards/rejected": -0.5381232500076294, + "sft_loss": 0.29700911045074463, + "step": 2561 + }, + { + "epoch": 3.704989154013015, + "grad_norm": 2.0815175213453068, + "learning_rate": 2.662772137844313e-06, + "logits/chosen": -0.5803831815719604, + "logits/rejected": -0.5036250948905945, + "logps/chosen": -0.23945432901382446, + "logps/rejected": -4.763128280639648, + "loss": 0.1932, + "odds_ratio_loss": 0.025707893073558807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023945434018969536, + "rewards/margins": 0.45236736536026, + "rewards/rejected": -0.4763127863407135, + "sft_loss": 0.23945432901382446, + "step": 2562 + }, + { + "epoch": 3.706435285610991, + "grad_norm": 2.1229476330536188, + "learning_rate": 2.659845365026978e-06, + "logits/chosen": -0.615597665309906, + "logits/rejected": -0.3729110658168793, + "logps/chosen": -0.1586671620607376, + "logps/rejected": -6.872448921203613, + "loss": 0.225, + "odds_ratio_loss": 0.0378350093960762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01586671732366085, + "rewards/margins": 0.6713781952857971, + "rewards/rejected": -0.6872448921203613, + "sft_loss": 0.1586671620607376, + "step": 2563 + }, + { + "epoch": 3.707881417208966, + "grad_norm": 2.06415325654969, + "learning_rate": 2.6569194001938625e-06, + "logits/chosen": -0.548956036567688, + "logits/rejected": -0.44726645946502686, + "logps/chosen": -0.1950879991054535, + "logps/rejected": -3.3179187774658203, + "loss": 0.2203, + "odds_ratio_loss": 0.032068926841020584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01950879767537117, + "rewards/margins": 0.31228309869766235, + "rewards/rejected": -0.33179187774658203, + "sft_loss": 0.1950879991054535, + "step": 2564 + }, + { + "epoch": 3.7093275488069413, + "grad_norm": 2.0330127883250677, + "learning_rate": 2.653994245109044e-06, + "logits/chosen": -0.7457367777824402, + "logits/rejected": -0.4856140613555908, + "logps/chosen": -0.1529059112071991, + "logps/rejected": -3.516286611557007, + "loss": 0.1702, + "odds_ratio_loss": 0.017367858439683914, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015290590934455395, + "rewards/margins": 0.3363381028175354, + "rewards/rejected": -0.3516286611557007, + "sft_loss": 0.1529059112071991, + "step": 2565 + }, + { + "epoch": 3.710773680404917, + "grad_norm": 2.4457696068112478, + "learning_rate": 2.651069901536106e-06, + "logits/chosen": -0.7771862745285034, + "logits/rejected": -0.3238375186920166, + "logps/chosen": -0.135412335395813, + "logps/rejected": -5.311068058013916, + "loss": 0.1521, + "odds_ratio_loss": 0.013591814786195755, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01354123279452324, + "rewards/margins": 0.5175656080245972, + "rewards/rejected": -0.5311068296432495, + "sft_loss": 0.135412335395813, + "step": 2566 + }, + { + "epoch": 3.712219812002892, + "grad_norm": 1.7528436649250814, + "learning_rate": 2.64814637123815e-06, + "logits/chosen": -0.696196436882019, + "logits/rejected": -0.45283639430999756, + "logps/chosen": -0.11723405867815018, + "logps/rejected": -5.629380226135254, + "loss": 0.14, + "odds_ratio_loss": 0.019943315535783768, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011723405681550503, + "rewards/margins": 0.5512145757675171, + "rewards/rejected": -0.5629379749298096, + "sft_loss": 0.11723405867815018, + "step": 2567 + }, + { + "epoch": 3.7136659436008674, + "grad_norm": 1.9528661831415446, + "learning_rate": 2.645223655977782e-06, + "logits/chosen": -0.6582965850830078, + "logits/rejected": -0.3814026117324829, + "logps/chosen": -0.07659213244915009, + "logps/rejected": -7.244695663452148, + "loss": 0.1345, + "odds_ratio_loss": 0.008341525681316853, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007659214083105326, + "rewards/margins": 0.716810405254364, + "rewards/rejected": -0.7244695425033569, + "sft_loss": 0.07659213244915009, + "step": 2568 + }, + { + "epoch": 3.715112075198843, + "grad_norm": 1.994869520742531, + "learning_rate": 2.6423017575171153e-06, + "logits/chosen": -0.6350436806678772, + "logits/rejected": -0.6293706893920898, + "logps/chosen": -0.13775679469108582, + "logps/rejected": -5.226606845855713, + "loss": 0.1814, + "odds_ratio_loss": 0.032059669494628906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013775679282844067, + "rewards/margins": 0.5088850259780884, + "rewards/rejected": -0.5226607322692871, + "sft_loss": 0.13775679469108582, + "step": 2569 + }, + { + "epoch": 3.7165582067968184, + "grad_norm": 2.0319895917615303, + "learning_rate": 2.6393806776177777e-06, + "logits/chosen": -0.7844041585922241, + "logits/rejected": -0.5700384378433228, + "logps/chosen": -0.1305319368839264, + "logps/rejected": -3.323162078857422, + "loss": 0.2278, + "odds_ratio_loss": 0.02434907667338848, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01305319368839264, + "rewards/margins": 0.3192630112171173, + "rewards/rejected": -0.33231621980667114, + "sft_loss": 0.1305319368839264, + "step": 2570 + }, + { + "epoch": 3.718004338394794, + "grad_norm": 2.674733943727509, + "learning_rate": 2.6364604180408963e-06, + "logits/chosen": -0.7198705673217773, + "logits/rejected": -0.4059660732746124, + "logps/chosen": -0.13282117247581482, + "logps/rejected": -4.611578464508057, + "loss": 0.1716, + "odds_ratio_loss": 0.016370346769690514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013282117433845997, + "rewards/margins": 0.4478756785392761, + "rewards/rejected": -0.46115779876708984, + "sft_loss": 0.13282117247581482, + "step": 2571 + }, + { + "epoch": 3.7194504699927693, + "grad_norm": 2.274559735184135, + "learning_rate": 2.633540980547108e-06, + "logits/chosen": -0.8815616369247437, + "logits/rejected": -0.6955693960189819, + "logps/chosen": -0.25010132789611816, + "logps/rejected": -3.4041104316711426, + "loss": 0.2208, + "odds_ratio_loss": 0.056482359766960144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025010133162140846, + "rewards/margins": 0.31540095806121826, + "rewards/rejected": -0.34041109681129456, + "sft_loss": 0.25010132789611816, + "step": 2572 + }, + { + "epoch": 3.720896601590745, + "grad_norm": 3.822609490949455, + "learning_rate": 2.63062236689655e-06, + "logits/chosen": -0.5537604093551636, + "logits/rejected": -0.3567535877227783, + "logps/chosen": -0.27658021450042725, + "logps/rejected": -3.7039966583251953, + "loss": 0.2414, + "odds_ratio_loss": 0.04956976696848869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027658019214868546, + "rewards/margins": 0.34274163842201233, + "rewards/rejected": -0.3703996539115906, + "sft_loss": 0.27658021450042725, + "step": 2573 + }, + { + "epoch": 3.72234273318872, + "grad_norm": 2.0402329713850835, + "learning_rate": 2.6277045788488695e-06, + "logits/chosen": -0.5506142973899841, + "logits/rejected": -0.40666961669921875, + "logps/chosen": -0.17523783445358276, + "logps/rejected": -4.3214945793151855, + "loss": 0.1537, + "odds_ratio_loss": 0.025635970756411552, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017523784190416336, + "rewards/margins": 0.4146256744861603, + "rewards/rejected": -0.4321494698524475, + "sft_loss": 0.17523783445358276, + "step": 2574 + }, + { + "epoch": 3.7237888647866955, + "grad_norm": 2.370602746264014, + "learning_rate": 2.624787618163208e-06, + "logits/chosen": -0.5524752140045166, + "logits/rejected": -0.48569953441619873, + "logps/chosen": -0.09435204416513443, + "logps/rejected": -3.0093679428100586, + "loss": 0.201, + "odds_ratio_loss": 0.02923463098704815, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009435204789042473, + "rewards/margins": 0.29150158166885376, + "rewards/rejected": -0.3009367883205414, + "sft_loss": 0.09435204416513443, + "step": 2575 + }, + { + "epoch": 3.725234996384671, + "grad_norm": 1.9717764414884915, + "learning_rate": 2.621871486598214e-06, + "logits/chosen": -0.510238766670227, + "logits/rejected": -0.4656646251678467, + "logps/chosen": -0.10176987946033478, + "logps/rejected": -3.364330768585205, + "loss": 0.1647, + "odds_ratio_loss": 0.06915295869112015, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.010176989249885082, + "rewards/margins": 0.3262560963630676, + "rewards/rejected": -0.3364330530166626, + "sft_loss": 0.10176987946033478, + "step": 2576 + }, + { + "epoch": 3.7266811279826464, + "grad_norm": 3.64987792030796, + "learning_rate": 2.618956185912032e-06, + "logits/chosen": -0.47754430770874023, + "logits/rejected": -0.38157540559768677, + "logps/chosen": -0.23310229182243347, + "logps/rejected": -3.7075998783111572, + "loss": 0.2027, + "odds_ratio_loss": 0.04826747253537178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023310229182243347, + "rewards/margins": 0.34744977951049805, + "rewards/rejected": -0.3707600235939026, + "sft_loss": 0.23310229182243347, + "step": 2577 + }, + { + "epoch": 3.7281272595806216, + "grad_norm": 2.439119648965075, + "learning_rate": 2.616041717862311e-06, + "logits/chosen": -0.6305740475654602, + "logits/rejected": -0.48160648345947266, + "logps/chosen": -0.2966257929801941, + "logps/rejected": -5.1441545486450195, + "loss": 0.2318, + "odds_ratio_loss": 0.06576695293188095, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.02966257929801941, + "rewards/margins": 0.484752893447876, + "rewards/rejected": -0.5144155025482178, + "sft_loss": 0.2966257929801941, + "step": 2578 + }, + { + "epoch": 3.7295733911785973, + "grad_norm": 2.259488348219623, + "learning_rate": 2.613128084206191e-06, + "logits/chosen": -0.4879527688026428, + "logits/rejected": -0.5105048418045044, + "logps/chosen": -0.22474190592765808, + "logps/rejected": -3.7948379516601562, + "loss": 0.1725, + "odds_ratio_loss": 0.07505577802658081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022474190220236778, + "rewards/margins": 0.3570095896720886, + "rewards/rejected": -0.37948381900787354, + "sft_loss": 0.22474190592765808, + "step": 2579 + }, + { + "epoch": 3.7310195227765726, + "grad_norm": 2.0511491756501825, + "learning_rate": 2.6102152867003143e-06, + "logits/chosen": -0.6580703854560852, + "logits/rejected": -0.4834393262863159, + "logps/chosen": -0.1688551902770996, + "logps/rejected": -3.4576525688171387, + "loss": 0.1761, + "odds_ratio_loss": 0.028964854776859283, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01688551902770996, + "rewards/margins": 0.32887977361679077, + "rewards/rejected": -0.34576526284217834, + "sft_loss": 0.1688551902770996, + "step": 2580 + }, + { + "epoch": 3.7324656543745482, + "grad_norm": 2.1539385559635664, + "learning_rate": 2.6073033271008184e-06, + "logits/chosen": -0.5000095963478088, + "logits/rejected": -0.5185361504554749, + "logps/chosen": -0.16783685982227325, + "logps/rejected": -3.7964539527893066, + "loss": 0.2066, + "odds_ratio_loss": 0.02780839428305626, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016783684492111206, + "rewards/margins": 0.36286166310310364, + "rewards/rejected": -0.37964534759521484, + "sft_loss": 0.16783685982227325, + "step": 2581 + }, + { + "epoch": 3.7339117859725235, + "grad_norm": 2.452678357903579, + "learning_rate": 2.604392207163333e-06, + "logits/chosen": -0.7065168023109436, + "logits/rejected": -0.6411018967628479, + "logps/chosen": -0.2850452661514282, + "logps/rejected": -2.233325481414795, + "loss": 0.1963, + "odds_ratio_loss": 0.09458545595407486, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02850452810525894, + "rewards/margins": 0.19482800364494324, + "rewards/rejected": -0.22333255410194397, + "sft_loss": 0.2850452661514282, + "step": 2582 + }, + { + "epoch": 3.735357917570499, + "grad_norm": 3.5577667700650695, + "learning_rate": 2.601481928642985e-06, + "logits/chosen": -0.5432594418525696, + "logits/rejected": -0.4507932662963867, + "logps/chosen": -0.14750340580940247, + "logps/rejected": -5.576773166656494, + "loss": 0.2091, + "odds_ratio_loss": 0.012168655171990395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014750340022146702, + "rewards/margins": 0.5429270267486572, + "rewards/rejected": -0.5576773285865784, + "sft_loss": 0.14750340580940247, + "step": 2583 + }, + { + "epoch": 3.7368040491684744, + "grad_norm": 3.1603618668755367, + "learning_rate": 2.598572493294388e-06, + "logits/chosen": -0.8299934267997742, + "logits/rejected": -0.4090917706489563, + "logps/chosen": -0.15736770629882812, + "logps/rejected": -4.021340370178223, + "loss": 0.18, + "odds_ratio_loss": 0.014862221665680408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015736771747469902, + "rewards/margins": 0.38639724254608154, + "rewards/rejected": -0.4021340012550354, + "sft_loss": 0.15736770629882812, + "step": 2584 + }, + { + "epoch": 3.7382501807664497, + "grad_norm": 2.2400557928515883, + "learning_rate": 2.5956639028716576e-06, + "logits/chosen": -0.4494069814682007, + "logits/rejected": -0.47250938415527344, + "logps/chosen": -0.19860422611236572, + "logps/rejected": -2.777355194091797, + "loss": 0.2257, + "odds_ratio_loss": 0.06139344349503517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01986042410135269, + "rewards/margins": 0.25787508487701416, + "rewards/rejected": -0.27773553133010864, + "sft_loss": 0.19860422611236572, + "step": 2585 + }, + { + "epoch": 3.7396963123644253, + "grad_norm": 1.983706842271957, + "learning_rate": 2.592756159128388e-06, + "logits/chosen": -0.5673388838768005, + "logits/rejected": -0.5991949439048767, + "logps/chosen": -0.11181987822055817, + "logps/rejected": -4.574343204498291, + "loss": 0.1371, + "odds_ratio_loss": 0.022676371037960052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011181988753378391, + "rewards/margins": 0.44625231623649597, + "rewards/rejected": -0.4574342966079712, + "sft_loss": 0.11181987822055817, + "step": 2586 + }, + { + "epoch": 3.7411424439624006, + "grad_norm": 2.137993317413108, + "learning_rate": 2.589849263817673e-06, + "logits/chosen": -0.4015089273452759, + "logits/rejected": -0.35641565918922424, + "logps/chosen": -0.23663023114204407, + "logps/rejected": -5.35009765625, + "loss": 0.2093, + "odds_ratio_loss": 0.022359168156981468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023663025349378586, + "rewards/margins": 0.5113467574119568, + "rewards/rejected": -0.5350098013877869, + "sft_loss": 0.23663023114204407, + "step": 2587 + }, + { + "epoch": 3.742588575560376, + "grad_norm": 1.846983258310368, + "learning_rate": 2.586943218692087e-06, + "logits/chosen": -0.4792490601539612, + "logits/rejected": -0.38760459423065186, + "logps/chosen": -0.1646152138710022, + "logps/rejected": -3.257059097290039, + "loss": 0.1606, + "odds_ratio_loss": 0.023885680362582207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01646151952445507, + "rewards/margins": 0.30924439430236816, + "rewards/rejected": -0.3257059156894684, + "sft_loss": 0.1646152138710022, + "step": 2588 + }, + { + "epoch": 3.7440347071583515, + "grad_norm": 2.2043603682689885, + "learning_rate": 2.5840380255036987e-06, + "logits/chosen": -0.5829678773880005, + "logits/rejected": -0.47153595089912415, + "logps/chosen": -0.23809784650802612, + "logps/rejected": -4.241508483886719, + "loss": 0.2287, + "odds_ratio_loss": 0.05533334240317345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023809785023331642, + "rewards/margins": 0.40034106373786926, + "rewards/rejected": -0.42415088415145874, + "sft_loss": 0.23809784650802612, + "step": 2589 + }, + { + "epoch": 3.7454808387563268, + "grad_norm": 2.374432356246243, + "learning_rate": 2.5811336860040575e-06, + "logits/chosen": -0.6532708406448364, + "logits/rejected": -0.6016970872879028, + "logps/chosen": -0.16082583367824554, + "logps/rejected": -4.622746467590332, + "loss": 0.1728, + "odds_ratio_loss": 0.07337768375873566, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.016082582995295525, + "rewards/margins": 0.44619205594062805, + "rewards/rejected": -0.4622746706008911, + "sft_loss": 0.16082583367824554, + "step": 2590 + }, + { + "epoch": 3.746926970354302, + "grad_norm": 2.4503692546178084, + "learning_rate": 2.5782302019442028e-06, + "logits/chosen": -0.7345556020736694, + "logits/rejected": -0.5952568054199219, + "logps/chosen": -0.2881607413291931, + "logps/rejected": -3.4280734062194824, + "loss": 0.2145, + "odds_ratio_loss": 0.07098816335201263, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02881607599556446, + "rewards/margins": 0.3139912486076355, + "rewards/rejected": -0.3428073227405548, + "sft_loss": 0.2881607413291931, + "step": 2591 + }, + { + "epoch": 3.7483731019522777, + "grad_norm": 2.0016155900455703, + "learning_rate": 2.5753275750746523e-06, + "logits/chosen": -0.5780439376831055, + "logits/rejected": -0.4470243453979492, + "logps/chosen": -0.21503284573554993, + "logps/rejected": -3.0827932357788086, + "loss": 0.1664, + "odds_ratio_loss": 0.060324862599372864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02150328829884529, + "rewards/margins": 0.286776065826416, + "rewards/rejected": -0.3082793653011322, + "sft_loss": 0.21503284573554993, + "step": 2592 + }, + { + "epoch": 3.749819233550253, + "grad_norm": 1.9390808109588353, + "learning_rate": 2.5724258071454134e-06, + "logits/chosen": -0.49952131509780884, + "logits/rejected": -0.4054247736930847, + "logps/chosen": -0.23884204030036926, + "logps/rejected": -4.414041519165039, + "loss": 0.1973, + "odds_ratio_loss": 0.04530846327543259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023884203284978867, + "rewards/margins": 0.4175199866294861, + "rewards/rejected": -0.44140419363975525, + "sft_loss": 0.23884204030036926, + "step": 2593 + }, + { + "epoch": 3.7512653651482286, + "grad_norm": 1.9932521409433872, + "learning_rate": 2.5695248999059732e-06, + "logits/chosen": -0.5046989321708679, + "logits/rejected": -0.54093998670578, + "logps/chosen": -0.19573456048965454, + "logps/rejected": -3.6174211502075195, + "loss": 0.1782, + "odds_ratio_loss": 0.051149386912584305, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019573457539081573, + "rewards/margins": 0.3421686589717865, + "rewards/rejected": -0.3617421090602875, + "sft_loss": 0.19573456048965454, + "step": 2594 + }, + { + "epoch": 3.752711496746204, + "grad_norm": 3.2670889758539667, + "learning_rate": 2.5666248551052987e-06, + "logits/chosen": -0.5050589442253113, + "logits/rejected": -0.4645492434501648, + "logps/chosen": -0.24979722499847412, + "logps/rejected": -3.0747222900390625, + "loss": 0.1936, + "odds_ratio_loss": 0.05264318734407425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024979721754789352, + "rewards/margins": 0.2824925184249878, + "rewards/rejected": -0.30747222900390625, + "sft_loss": 0.24979722499847412, + "step": 2595 + }, + { + "epoch": 3.7541576283441795, + "grad_norm": 2.0890743012945867, + "learning_rate": 2.563725674491837e-06, + "logits/chosen": -0.5555975437164307, + "logits/rejected": -0.40666472911834717, + "logps/chosen": -0.1283828616142273, + "logps/rejected": -3.9973580837249756, + "loss": 0.1481, + "odds_ratio_loss": 0.027073953300714493, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012838287279009819, + "rewards/margins": 0.3868975341320038, + "rewards/rejected": -0.39973583817481995, + "sft_loss": 0.1283828616142273, + "step": 2596 + }, + { + "epoch": 3.755603759942155, + "grad_norm": 2.0155989414546616, + "learning_rate": 2.5608273598135145e-06, + "logits/chosen": -0.6884058117866516, + "logits/rejected": -0.6368868350982666, + "logps/chosen": -0.25396209955215454, + "logps/rejected": -2.0925629138946533, + "loss": 0.2361, + "odds_ratio_loss": 0.07850989699363708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025396209210157394, + "rewards/margins": 0.18386009335517883, + "rewards/rejected": -0.20925629138946533, + "sft_loss": 0.25396209955215454, + "step": 2597 + }, + { + "epoch": 3.75704989154013, + "grad_norm": 1.9793552722188195, + "learning_rate": 2.557929912817738e-06, + "logits/chosen": -0.49515867233276367, + "logits/rejected": -0.24635782837867737, + "logps/chosen": -0.20986327528953552, + "logps/rejected": -4.535175323486328, + "loss": 0.1979, + "odds_ratio_loss": 0.04149693623185158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020986327901482582, + "rewards/margins": 0.43253129720687866, + "rewards/rejected": -0.4535176157951355, + "sft_loss": 0.20986327528953552, + "step": 2598 + }, + { + "epoch": 3.7584960231381057, + "grad_norm": 1.8678317296039257, + "learning_rate": 2.5550333352513884e-06, + "logits/chosen": -0.7255061268806458, + "logits/rejected": -0.794285237789154, + "logps/chosen": -0.1850017011165619, + "logps/rejected": -3.257514238357544, + "loss": 0.1506, + "odds_ratio_loss": 0.0496574267745018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01850016973912716, + "rewards/margins": 0.3072512745857239, + "rewards/rejected": -0.3257514238357544, + "sft_loss": 0.1850017011165619, + "step": 2599 + }, + { + "epoch": 3.759942154736081, + "grad_norm": 2.025715101180212, + "learning_rate": 2.552137628860822e-06, + "logits/chosen": -0.5476210117340088, + "logits/rejected": -0.4572288990020752, + "logps/chosen": -0.19937428832054138, + "logps/rejected": -4.361888885498047, + "loss": 0.2122, + "odds_ratio_loss": 0.03671419993042946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019937429577112198, + "rewards/margins": 0.41625142097473145, + "rewards/rejected": -0.43618887662887573, + "sft_loss": 0.19937428832054138, + "step": 2600 + }, + { + "epoch": 3.761388286334056, + "grad_norm": 1.8713940948947654, + "learning_rate": 2.549242795391871e-06, + "logits/chosen": -0.5071431398391724, + "logits/rejected": -0.30187955498695374, + "logps/chosen": -0.05415284261107445, + "logps/rejected": -6.518517971038818, + "loss": 0.1636, + "odds_ratio_loss": 0.005559473764151335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00541528407484293, + "rewards/margins": 0.6464364528656006, + "rewards/rejected": -0.6518517732620239, + "sft_loss": 0.05415284261107445, + "step": 2601 + }, + { + "epoch": 3.762834417932032, + "grad_norm": 3.2637222304935616, + "learning_rate": 2.5463488365898426e-06, + "logits/chosen": -0.5941678285598755, + "logits/rejected": -0.43715280294418335, + "logps/chosen": -0.20915353298187256, + "logps/rejected": -3.662285804748535, + "loss": 0.2696, + "odds_ratio_loss": 0.03305754438042641, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020915353670716286, + "rewards/margins": 0.3453132212162018, + "rewards/rejected": -0.36622855067253113, + "sft_loss": 0.20915353298187256, + "step": 2602 + }, + { + "epoch": 3.764280549530007, + "grad_norm": 2.007573145262109, + "learning_rate": 2.543455754199514e-06, + "logits/chosen": -0.5387523770332336, + "logits/rejected": -0.4196021854877472, + "logps/chosen": -0.1982327699661255, + "logps/rejected": -4.7550201416015625, + "loss": 0.1634, + "odds_ratio_loss": 0.03372848033905029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01982327736914158, + "rewards/margins": 0.4556787312030792, + "rewards/rejected": -0.47550201416015625, + "sft_loss": 0.1982327699661255, + "step": 2603 + }, + { + "epoch": 3.765726681127983, + "grad_norm": 1.9541545437185883, + "learning_rate": 2.540563549965137e-06, + "logits/chosen": -0.5714979767799377, + "logits/rejected": -0.43261682987213135, + "logps/chosen": -0.19826523959636688, + "logps/rejected": -4.462432861328125, + "loss": 0.2072, + "odds_ratio_loss": 0.06122538074851036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019826525822281837, + "rewards/margins": 0.4264167845249176, + "rewards/rejected": -0.4462433159351349, + "sft_loss": 0.19826523959636688, + "step": 2604 + }, + { + "epoch": 3.767172812725958, + "grad_norm": 1.9328215931836676, + "learning_rate": 2.5376722256304295e-06, + "logits/chosen": -0.5500741004943848, + "logits/rejected": -0.4248887002468109, + "logps/chosen": -0.23179960250854492, + "logps/rejected": -3.2190053462982178, + "loss": 0.1968, + "odds_ratio_loss": 0.07697392255067825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02317996136844158, + "rewards/margins": 0.2987205982208252, + "rewards/rejected": -0.32190054655075073, + "sft_loss": 0.23179960250854492, + "step": 2605 + }, + { + "epoch": 3.7686189443239337, + "grad_norm": 2.203707685539388, + "learning_rate": 2.5347817829385846e-06, + "logits/chosen": -0.3630380630493164, + "logits/rejected": -0.3876681327819824, + "logps/chosen": -0.16202141344547272, + "logps/rejected": -3.187636613845825, + "loss": 0.1938, + "odds_ratio_loss": 0.0400000736117363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01620214246213436, + "rewards/margins": 0.30256152153015137, + "rewards/rejected": -0.3187636733055115, + "sft_loss": 0.16202141344547272, + "step": 2606 + }, + { + "epoch": 3.770065075921909, + "grad_norm": 2.04049312263035, + "learning_rate": 2.5318922236322602e-06, + "logits/chosen": -0.5945833921432495, + "logits/rejected": -0.4727177917957306, + "logps/chosen": -0.26018601655960083, + "logps/rejected": -5.333613395690918, + "loss": 0.1814, + "odds_ratio_loss": 0.0550345852971077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026018597185611725, + "rewards/margins": 0.5073426961898804, + "rewards/rejected": -0.5333613157272339, + "sft_loss": 0.26018601655960083, + "step": 2607 + }, + { + "epoch": 3.7715112075198842, + "grad_norm": 2.270452537470267, + "learning_rate": 2.5290035494535805e-06, + "logits/chosen": -0.4364027678966522, + "logits/rejected": -0.444566547870636, + "logps/chosen": -0.15006223320960999, + "logps/rejected": -3.9100732803344727, + "loss": 0.1582, + "odds_ratio_loss": 0.06002519652247429, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015006224624812603, + "rewards/margins": 0.3760010898113251, + "rewards/rejected": -0.39100736379623413, + "sft_loss": 0.15006223320960999, + "step": 2608 + }, + { + "epoch": 3.77295733911786, + "grad_norm": 2.292197095198817, + "learning_rate": 2.5261157621441413e-06, + "logits/chosen": -0.6184677481651306, + "logits/rejected": -0.6291539072990417, + "logps/chosen": -0.33772191405296326, + "logps/rejected": -3.0420265197753906, + "loss": 0.24, + "odds_ratio_loss": 0.052166953682899475, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033772192895412445, + "rewards/margins": 0.27043047547340393, + "rewards/rejected": -0.3042026460170746, + "sft_loss": 0.33772191405296326, + "step": 2609 + }, + { + "epoch": 3.774403470715835, + "grad_norm": 2.099941808987336, + "learning_rate": 2.523228863444997e-06, + "logits/chosen": -0.6739153861999512, + "logits/rejected": -0.4860963225364685, + "logps/chosen": -0.16617199778556824, + "logps/rejected": -6.441198348999023, + "loss": 0.1615, + "odds_ratio_loss": 0.03473407030105591, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016617199406027794, + "rewards/margins": 0.6275026798248291, + "rewards/rejected": -0.6441198587417603, + "sft_loss": 0.16617199778556824, + "step": 2610 + }, + { + "epoch": 3.7758496023138104, + "grad_norm": 1.982794854155467, + "learning_rate": 2.5203428550966722e-06, + "logits/chosen": -0.5175485014915466, + "logits/rejected": -0.48581239581108093, + "logps/chosen": -0.22742906212806702, + "logps/rejected": -3.3520054817199707, + "loss": 0.2163, + "odds_ratio_loss": 0.05771571770310402, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02274290844798088, + "rewards/margins": 0.3124576807022095, + "rewards/rejected": -0.33520054817199707, + "sft_loss": 0.22742906212806702, + "step": 2611 + }, + { + "epoch": 3.777295733911786, + "grad_norm": 2.47202283857541, + "learning_rate": 2.517457738839149e-06, + "logits/chosen": -0.7268478870391846, + "logits/rejected": -0.6017354726791382, + "logps/chosen": -0.32217803597450256, + "logps/rejected": -2.822502613067627, + "loss": 0.2546, + "odds_ratio_loss": 0.078008271753788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.032217804342508316, + "rewards/margins": 0.2500324845314026, + "rewards/rejected": -0.2822502851486206, + "sft_loss": 0.32217803597450256, + "step": 2612 + }, + { + "epoch": 3.7787418655097613, + "grad_norm": 1.9410029740314656, + "learning_rate": 2.5145735164118788e-06, + "logits/chosen": -0.6474170088768005, + "logits/rejected": -0.48739415407180786, + "logps/chosen": -0.08966898918151855, + "logps/rejected": -6.39056396484375, + "loss": 0.1556, + "odds_ratio_loss": 0.01307761948555708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008966898545622826, + "rewards/margins": 0.6300894618034363, + "rewards/rejected": -0.639056384563446, + "sft_loss": 0.08966898918151855, + "step": 2613 + }, + { + "epoch": 3.7801879971077366, + "grad_norm": 2.0303896879223085, + "learning_rate": 2.511690189553767e-06, + "logits/chosen": -0.5155009031295776, + "logits/rejected": -0.47514843940734863, + "logps/chosen": -0.2928372025489807, + "logps/rejected": -5.307400703430176, + "loss": 0.2232, + "odds_ratio_loss": 0.056762486696243286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029283719137310982, + "rewards/margins": 0.5014563202857971, + "rewards/rejected": -0.5307400822639465, + "sft_loss": 0.2928372025489807, + "step": 2614 + }, + { + "epoch": 3.7816341287057122, + "grad_norm": 2.168404520852216, + "learning_rate": 2.5088077600031834e-06, + "logits/chosen": -0.7363658547401428, + "logits/rejected": -0.6525477170944214, + "logps/chosen": -0.12692435085773468, + "logps/rejected": -4.636532306671143, + "loss": 0.1842, + "odds_ratio_loss": 0.01741865649819374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012692435644567013, + "rewards/margins": 0.45096075534820557, + "rewards/rejected": -0.46365320682525635, + "sft_loss": 0.12692435085773468, + "step": 2615 + }, + { + "epoch": 3.7830802603036875, + "grad_norm": 2.0652992396748875, + "learning_rate": 2.5059262294979535e-06, + "logits/chosen": -0.5252244472503662, + "logits/rejected": -0.4214191436767578, + "logps/chosen": -0.24405062198638916, + "logps/rejected": -4.097309112548828, + "loss": 0.2465, + "odds_ratio_loss": 0.04359886795282364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024405062198638916, + "rewards/margins": 0.3853258788585663, + "rewards/rejected": -0.4097309112548828, + "sft_loss": 0.24405062198638916, + "step": 2616 + }, + { + "epoch": 3.784526391901663, + "grad_norm": 1.9517111344022926, + "learning_rate": 2.5030455997753663e-06, + "logits/chosen": -0.6113194227218628, + "logits/rejected": -0.6022163033485413, + "logps/chosen": -0.31509339809417725, + "logps/rejected": -2.91943359375, + "loss": 0.265, + "odds_ratio_loss": 0.059509724378585815, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031509339809417725, + "rewards/margins": 0.26043403148651123, + "rewards/rejected": -0.29194337129592896, + "sft_loss": 0.31509339809417725, + "step": 2617 + }, + { + "epoch": 3.7859725234996384, + "grad_norm": 2.092277943733906, + "learning_rate": 2.500165872572161e-06, + "logits/chosen": -0.6757873296737671, + "logits/rejected": -0.5339130759239197, + "logps/chosen": -0.189954936504364, + "logps/rejected": -4.904602527618408, + "loss": 0.2004, + "odds_ratio_loss": 0.05145931616425514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01899549551308155, + "rewards/margins": 0.47146478295326233, + "rewards/rejected": -0.49046024680137634, + "sft_loss": 0.189954936504364, + "step": 2618 + }, + { + "epoch": 3.787418655097614, + "grad_norm": 2.152313378101085, + "learning_rate": 2.4972870496245366e-06, + "logits/chosen": -0.6356850266456604, + "logits/rejected": -0.20262449979782104, + "logps/chosen": -0.25436562299728394, + "logps/rejected": -4.084608554840088, + "loss": 0.2114, + "odds_ratio_loss": 0.158283069729805, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.025436561554670334, + "rewards/margins": 0.38302430510520935, + "rewards/rejected": -0.4084608554840088, + "sft_loss": 0.25436562299728394, + "step": 2619 + }, + { + "epoch": 3.7888647866955893, + "grad_norm": 1.8461811123197192, + "learning_rate": 2.4944091326681484e-06, + "logits/chosen": -0.5899071097373962, + "logits/rejected": -0.6064687967300415, + "logps/chosen": -0.13420352339744568, + "logps/rejected": -5.148534297943115, + "loss": 0.1548, + "odds_ratio_loss": 0.033398158848285675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013420352712273598, + "rewards/margins": 0.5014330148696899, + "rewards/rejected": -0.5148534178733826, + "sft_loss": 0.13420352339744568, + "step": 2620 + }, + { + "epoch": 3.7903109182935646, + "grad_norm": 2.5677808116805085, + "learning_rate": 2.4915321234381e-06, + "logits/chosen": -0.6439390182495117, + "logits/rejected": -0.6536998748779297, + "logps/chosen": -0.21709826588630676, + "logps/rejected": -2.651989698410034, + "loss": 0.2035, + "odds_ratio_loss": 0.05655750632286072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021709825843572617, + "rewards/margins": 0.24348914623260498, + "rewards/rejected": -0.2651989758014679, + "sft_loss": 0.21709826588630676, + "step": 2621 + }, + { + "epoch": 3.7917570498915403, + "grad_norm": 1.7240970042044195, + "learning_rate": 2.4886560236689542e-06, + "logits/chosen": -0.4303063154220581, + "logits/rejected": -0.2958157956600189, + "logps/chosen": -0.10966433584690094, + "logps/rejected": -5.254568099975586, + "loss": 0.1329, + "odds_ratio_loss": 0.017621489241719246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010966433212161064, + "rewards/margins": 0.5144904255867004, + "rewards/rejected": -0.5254568457603455, + "sft_loss": 0.10966433584690094, + "step": 2622 + }, + { + "epoch": 3.7932031814895155, + "grad_norm": 2.2505913691166635, + "learning_rate": 2.4857808350947186e-06, + "logits/chosen": -0.5765487551689148, + "logits/rejected": -0.26596999168395996, + "logps/chosen": -0.19715330004692078, + "logps/rejected": -3.5105814933776855, + "loss": 0.1702, + "odds_ratio_loss": 0.039164457470178604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019715333357453346, + "rewards/margins": 0.33134281635284424, + "rewards/rejected": -0.35105812549591064, + "sft_loss": 0.19715330004692078, + "step": 2623 + }, + { + "epoch": 3.7946493130874908, + "grad_norm": 1.9007264625255884, + "learning_rate": 2.4829065594488586e-06, + "logits/chosen": -0.6615445613861084, + "logits/rejected": -0.485071063041687, + "logps/chosen": -0.16579408943653107, + "logps/rejected": -5.319438457489014, + "loss": 0.1492, + "odds_ratio_loss": 0.04737501218914986, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016579408198595047, + "rewards/margins": 0.515364408493042, + "rewards/rejected": -0.5319438576698303, + "sft_loss": 0.16579408943653107, + "step": 2624 + }, + { + "epoch": 3.7960954446854664, + "grad_norm": 1.9969119227602157, + "learning_rate": 2.4800331984642837e-06, + "logits/chosen": -0.5275101661682129, + "logits/rejected": -0.48966342210769653, + "logps/chosen": -0.21786557137966156, + "logps/rejected": -2.759644031524658, + "loss": 0.1551, + "odds_ratio_loss": 0.054346635937690735, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021786555647850037, + "rewards/margins": 0.25417786836624146, + "rewards/rejected": -0.2759644091129303, + "sft_loss": 0.21786557137966156, + "step": 2625 + }, + { + "epoch": 3.7975415762834417, + "grad_norm": 3.3918947625961184, + "learning_rate": 2.4771607538733554e-06, + "logits/chosen": -0.4635317325592041, + "logits/rejected": -0.377239465713501, + "logps/chosen": -0.12900260090827942, + "logps/rejected": -5.37921142578125, + "loss": 0.1137, + "odds_ratio_loss": 0.009468241594731808, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012900261208415031, + "rewards/margins": 0.5250208377838135, + "rewards/rejected": -0.537921130657196, + "sft_loss": 0.12900260090827942, + "step": 2626 + }, + { + "epoch": 3.7989877078814174, + "grad_norm": 2.052320299573357, + "learning_rate": 2.474289227407878e-06, + "logits/chosen": -0.6370794773101807, + "logits/rejected": -0.4276534914970398, + "logps/chosen": -0.18756850063800812, + "logps/rejected": -3.975283622741699, + "loss": 0.1866, + "odds_ratio_loss": 0.0332857221364975, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01875685155391693, + "rewards/margins": 0.37877151370048523, + "rewards/rejected": -0.39752835035324097, + "sft_loss": 0.18756850063800812, + "step": 2627 + }, + { + "epoch": 3.8004338394793926, + "grad_norm": 2.194141426402103, + "learning_rate": 2.4714186207991095e-06, + "logits/chosen": -0.5361537337303162, + "logits/rejected": -0.5779704451560974, + "logps/chosen": -0.19404172897338867, + "logps/rejected": -2.87283992767334, + "loss": 0.2097, + "odds_ratio_loss": 0.04218186438083649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019404174759984016, + "rewards/margins": 0.26787981390953064, + "rewards/rejected": -0.2872839570045471, + "sft_loss": 0.19404172897338867, + "step": 2628 + }, + { + "epoch": 3.8018799710773683, + "grad_norm": 2.0262944409100654, + "learning_rate": 2.468548935777747e-06, + "logits/chosen": -0.660569429397583, + "logits/rejected": -0.5347737073898315, + "logps/chosen": -0.16592204570770264, + "logps/rejected": -4.6749138832092285, + "loss": 0.2168, + "odds_ratio_loss": 0.030461864545941353, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016592204570770264, + "rewards/margins": 0.4508991837501526, + "rewards/rejected": -0.46749138832092285, + "sft_loss": 0.16592204570770264, + "step": 2629 + }, + { + "epoch": 3.8033261026753435, + "grad_norm": 2.0284521617244002, + "learning_rate": 2.4656801740739356e-06, + "logits/chosen": -0.5391656756401062, + "logits/rejected": -0.42175936698913574, + "logps/chosen": -0.1921410858631134, + "logps/rejected": -3.636936902999878, + "loss": 0.194, + "odds_ratio_loss": 0.053551651537418365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01921410858631134, + "rewards/margins": 0.34447959065437317, + "rewards/rejected": -0.3636936843395233, + "sft_loss": 0.1921410858631134, + "step": 2630 + }, + { + "epoch": 3.804772234273319, + "grad_norm": 2.0716752248142467, + "learning_rate": 2.46281233741726e-06, + "logits/chosen": -0.6886047124862671, + "logits/rejected": -0.5288622379302979, + "logps/chosen": -0.23665514588356018, + "logps/rejected": -2.459178924560547, + "loss": 0.2018, + "odds_ratio_loss": 0.06963891535997391, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023665515705943108, + "rewards/margins": 0.22225239872932434, + "rewards/rejected": -0.2459179013967514, + "sft_loss": 0.23665514588356018, + "step": 2631 + }, + { + "epoch": 3.8062183658712945, + "grad_norm": 2.0485758856786034, + "learning_rate": 2.4599454275367526e-06, + "logits/chosen": -0.6599227786064148, + "logits/rejected": -0.459836483001709, + "logps/chosen": -0.18189409375190735, + "logps/rejected": -2.9734206199645996, + "loss": 0.2247, + "odds_ratio_loss": 0.02089795470237732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018189409747719765, + "rewards/margins": 0.27915263175964355, + "rewards/rejected": -0.2973420321941376, + "sft_loss": 0.18189409375190735, + "step": 2632 + }, + { + "epoch": 3.8076644974692697, + "grad_norm": 2.057869407827372, + "learning_rate": 2.4570794461608816e-06, + "logits/chosen": -0.6338626146316528, + "logits/rejected": -0.7052443027496338, + "logps/chosen": -0.16895508766174316, + "logps/rejected": -1.9207273721694946, + "loss": 0.2421, + "odds_ratio_loss": 0.05102720484137535, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016895508393645287, + "rewards/margins": 0.17517723143100739, + "rewards/rejected": -0.19207274913787842, + "sft_loss": 0.16895508766174316, + "step": 2633 + }, + { + "epoch": 3.809110629067245, + "grad_norm": 1.9324861662836847, + "learning_rate": 2.4542143950175594e-06, + "logits/chosen": -0.5443887710571289, + "logits/rejected": -0.3198566436767578, + "logps/chosen": -0.16024640202522278, + "logps/rejected": -5.304378509521484, + "loss": 0.2111, + "odds_ratio_loss": 0.018941111862659454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016024641692638397, + "rewards/margins": 0.5144132375717163, + "rewards/rejected": -0.5304378867149353, + "sft_loss": 0.16024640202522278, + "step": 2634 + }, + { + "epoch": 3.8105567606652206, + "grad_norm": 2.0345855814772307, + "learning_rate": 2.4513502758341365e-06, + "logits/chosen": -0.6028193235397339, + "logits/rejected": -0.6171325445175171, + "logps/chosen": -0.3160252571105957, + "logps/rejected": -3.1160173416137695, + "loss": 0.1974, + "odds_ratio_loss": 0.12013718485832214, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03160252422094345, + "rewards/margins": 0.2799992263317108, + "rewards/rejected": -0.31160175800323486, + "sft_loss": 0.3160252571105957, + "step": 2635 + }, + { + "epoch": 3.812002892263196, + "grad_norm": 1.8867295971525198, + "learning_rate": 2.448487090337399e-06, + "logits/chosen": -0.3207634687423706, + "logits/rejected": -0.26631879806518555, + "logps/chosen": -0.11553123593330383, + "logps/rejected": -4.209815979003906, + "loss": 0.1515, + "odds_ratio_loss": 0.0758066475391388, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.011553123593330383, + "rewards/margins": 0.4094284772872925, + "rewards/rejected": -0.42098158597946167, + "sft_loss": 0.11553123593330383, + "step": 2636 + }, + { + "epoch": 3.813449023861171, + "grad_norm": 2.002878147880314, + "learning_rate": 2.4456248402535744e-06, + "logits/chosen": -0.4317575991153717, + "logits/rejected": -0.36689284443855286, + "logps/chosen": -0.1188906878232956, + "logps/rejected": -5.702237129211426, + "loss": 0.1365, + "odds_ratio_loss": 0.030404847115278244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011889069341123104, + "rewards/margins": 0.5583345890045166, + "rewards/rejected": -0.5702236890792847, + "sft_loss": 0.1188906878232956, + "step": 2637 + }, + { + "epoch": 3.814895155459147, + "grad_norm": 2.0028531170479185, + "learning_rate": 2.4427635273083205e-06, + "logits/chosen": -0.5605095624923706, + "logits/rejected": -0.6367613077163696, + "logps/chosen": -0.16176557540893555, + "logps/rejected": -2.33298397064209, + "loss": 0.1762, + "odds_ratio_loss": 0.03323551267385483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016176559031009674, + "rewards/margins": 0.21712183952331543, + "rewards/rejected": -0.2332983911037445, + "sft_loss": 0.16176557540893555, + "step": 2638 + }, + { + "epoch": 3.816341287057122, + "grad_norm": 1.9136356302803936, + "learning_rate": 2.439903153226738e-06, + "logits/chosen": -0.4376313388347626, + "logits/rejected": -0.34835922718048096, + "logps/chosen": -0.11063480377197266, + "logps/rejected": -6.628140926361084, + "loss": 0.1722, + "odds_ratio_loss": 0.02789875864982605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011063480749726295, + "rewards/margins": 0.6517506837844849, + "rewards/rejected": -0.6628141403198242, + "sft_loss": 0.11063480377197266, + "step": 2639 + }, + { + "epoch": 3.8177874186550977, + "grad_norm": 1.9359572286512576, + "learning_rate": 2.4370437197333535e-06, + "logits/chosen": -0.7120791077613831, + "logits/rejected": -0.4205125868320465, + "logps/chosen": -0.23005536198616028, + "logps/rejected": -4.266036033630371, + "loss": 0.1677, + "odds_ratio_loss": 0.02433464676141739, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023005535826086998, + "rewards/margins": 0.4035980999469757, + "rewards/rejected": -0.42660361528396606, + "sft_loss": 0.23005536198616028, + "step": 2640 + }, + { + "epoch": 3.819233550253073, + "grad_norm": 2.2466722500017187, + "learning_rate": 2.434185228552133e-06, + "logits/chosen": -0.5734530091285706, + "logits/rejected": -0.5702399015426636, + "logps/chosen": -0.2448917031288147, + "logps/rejected": -3.1988368034362793, + "loss": 0.1682, + "odds_ratio_loss": 0.03184647485613823, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02448917180299759, + "rewards/margins": 0.29539453983306885, + "rewards/rejected": -0.31988370418548584, + "sft_loss": 0.2448917031288147, + "step": 2641 + }, + { + "epoch": 3.8206796818510487, + "grad_norm": 2.5007274744169905, + "learning_rate": 2.431327681406468e-06, + "logits/chosen": -0.4124388098716736, + "logits/rejected": -0.30019381642341614, + "logps/chosen": -0.220967099070549, + "logps/rejected": -4.135493278503418, + "loss": 0.2378, + "odds_ratio_loss": 0.04712362587451935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022096708416938782, + "rewards/margins": 0.3914526104927063, + "rewards/rejected": -0.4135493040084839, + "sft_loss": 0.220967099070549, + "step": 2642 + }, + { + "epoch": 3.822125813449024, + "grad_norm": 2.2516952824095715, + "learning_rate": 2.4284710800191877e-06, + "logits/chosen": -0.6341081857681274, + "logits/rejected": -0.4541645646095276, + "logps/chosen": -0.218563973903656, + "logps/rejected": -3.7575972080230713, + "loss": 0.1739, + "odds_ratio_loss": 0.023864325135946274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0218563973903656, + "rewards/margins": 0.3539033532142639, + "rewards/rejected": -0.37575972080230713, + "sft_loss": 0.218563973903656, + "step": 2643 + }, + { + "epoch": 3.823571945046999, + "grad_norm": 1.8444139630057388, + "learning_rate": 2.425615426112545e-06, + "logits/chosen": -0.6047503352165222, + "logits/rejected": -0.41343581676483154, + "logps/chosen": -0.1336621344089508, + "logps/rejected": -4.135256290435791, + "loss": 0.1668, + "odds_ratio_loss": 0.011073876172304153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01336621306836605, + "rewards/margins": 0.40015938878059387, + "rewards/rejected": -0.41352561116218567, + "sft_loss": 0.1336621344089508, + "step": 2644 + }, + { + "epoch": 3.825018076644975, + "grad_norm": 3.148403921694642, + "learning_rate": 2.4227607214082267e-06, + "logits/chosen": -0.3584001958370209, + "logits/rejected": -0.3328199088573456, + "logps/chosen": -0.12119731307029724, + "logps/rejected": -3.8874454498291016, + "loss": 0.1786, + "odds_ratio_loss": 0.0209796205163002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012119731865823269, + "rewards/margins": 0.37662479281425476, + "rewards/rejected": -0.3887445628643036, + "sft_loss": 0.12119731307029724, + "step": 2645 + }, + { + "epoch": 3.82646420824295, + "grad_norm": 2.0977720178713906, + "learning_rate": 2.419906967627343e-06, + "logits/chosen": -0.6201465129852295, + "logits/rejected": -0.49884653091430664, + "logps/chosen": -0.18073919415473938, + "logps/rejected": -5.1349968910217285, + "loss": 0.2091, + "odds_ratio_loss": 0.04040105640888214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018073920160531998, + "rewards/margins": 0.4954257607460022, + "rewards/rejected": -0.5134996771812439, + "sft_loss": 0.18073919415473938, + "step": 2646 + }, + { + "epoch": 3.8279103398409253, + "grad_norm": 2.253181881937432, + "learning_rate": 2.417054166490433e-06, + "logits/chosen": -0.6608943939208984, + "logits/rejected": -0.5251725316047668, + "logps/chosen": -0.10730911791324615, + "logps/rejected": -3.7329394817352295, + "loss": 0.1553, + "odds_ratio_loss": 0.030384372919797897, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01073091197758913, + "rewards/margins": 0.36256301403045654, + "rewards/rejected": -0.3732939660549164, + "sft_loss": 0.10730911791324615, + "step": 2647 + }, + { + "epoch": 3.829356471438901, + "grad_norm": 3.032198838691612, + "learning_rate": 2.4142023197174625e-06, + "logits/chosen": -0.6617286205291748, + "logits/rejected": -0.5526999235153198, + "logps/chosen": -0.14543002843856812, + "logps/rejected": -4.451727390289307, + "loss": 0.1426, + "odds_ratio_loss": 0.017422260716557503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014543002471327782, + "rewards/margins": 0.4306297302246094, + "rewards/rejected": -0.4451727569103241, + "sft_loss": 0.14543002843856812, + "step": 2648 + }, + { + "epoch": 3.8308026030368763, + "grad_norm": 2.104212176731941, + "learning_rate": 2.4113514290278193e-06, + "logits/chosen": -0.49113255739212036, + "logits/rejected": -0.36438047885894775, + "logps/chosen": -0.12778490781784058, + "logps/rejected": -3.6236534118652344, + "loss": 0.1499, + "odds_ratio_loss": 0.01595112681388855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012778490781784058, + "rewards/margins": 0.3495868742465973, + "rewards/rejected": -0.36236536502838135, + "sft_loss": 0.12778490781784058, + "step": 2649 + }, + { + "epoch": 3.8322487346348515, + "grad_norm": 1.9491076169596921, + "learning_rate": 2.4085014961403168e-06, + "logits/chosen": -0.6785717010498047, + "logits/rejected": -0.5729687809944153, + "logps/chosen": -0.20131143927574158, + "logps/rejected": -4.839371681213379, + "loss": 0.171, + "odds_ratio_loss": 0.04480065777897835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020131144672632217, + "rewards/margins": 0.46380603313446045, + "rewards/rejected": -0.48393720388412476, + "sft_loss": 0.20131143927574158, + "step": 2650 + }, + { + "epoch": 3.833694866232827, + "grad_norm": 2.714118679981264, + "learning_rate": 2.4056525227731882e-06, + "logits/chosen": -0.4560430943965912, + "logits/rejected": -0.3898116648197174, + "logps/chosen": -0.2648117244243622, + "logps/rejected": -3.906280517578125, + "loss": 0.1648, + "odds_ratio_loss": 0.062475770711898804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02648117206990719, + "rewards/margins": 0.3641469180583954, + "rewards/rejected": -0.39062806963920593, + "sft_loss": 0.2648117244243622, + "step": 2651 + }, + { + "epoch": 3.835140997830803, + "grad_norm": 2.2461660391212943, + "learning_rate": 2.4028045106440933e-06, + "logits/chosen": -0.590645432472229, + "logits/rejected": -0.4207696318626404, + "logps/chosen": -0.21039243042469025, + "logps/rejected": -4.4112677574157715, + "loss": 0.1987, + "odds_ratio_loss": 0.035821445286273956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021039243787527084, + "rewards/margins": 0.4200875461101532, + "rewards/rejected": -0.4411267936229706, + "sft_loss": 0.21039243042469025, + "step": 2652 + }, + { + "epoch": 3.836587129428778, + "grad_norm": 2.166266828237972, + "learning_rate": 2.3999574614701067e-06, + "logits/chosen": -0.669924259185791, + "logits/rejected": -0.48964861035346985, + "logps/chosen": -0.15689903497695923, + "logps/rejected": -3.4739127159118652, + "loss": 0.1679, + "odds_ratio_loss": 0.04638451337814331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015689902007579803, + "rewards/margins": 0.3317013680934906, + "rewards/rejected": -0.347391277551651, + "sft_loss": 0.15689903497695923, + "step": 2653 + }, + { + "epoch": 3.8380332610267534, + "grad_norm": 2.0754765994658, + "learning_rate": 2.3971113769677263e-06, + "logits/chosen": -0.4671492278575897, + "logits/rejected": -0.4130011796951294, + "logps/chosen": -0.1233329176902771, + "logps/rejected": -4.802967071533203, + "loss": 0.1486, + "odds_ratio_loss": 0.016072288155555725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012333293445408344, + "rewards/margins": 0.46796342730522156, + "rewards/rejected": -0.48029670119285583, + "sft_loss": 0.1233329176902771, + "step": 2654 + }, + { + "epoch": 3.839479392624729, + "grad_norm": 2.198700366164306, + "learning_rate": 2.394266258852865e-06, + "logits/chosen": -0.6466750502586365, + "logits/rejected": -0.579832136631012, + "logps/chosen": -0.2253737449645996, + "logps/rejected": -3.258246660232544, + "loss": 0.197, + "odds_ratio_loss": 0.03207365423440933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02253737300634384, + "rewards/margins": 0.3032872974872589, + "rewards/rejected": -0.32582464814186096, + "sft_loss": 0.2253737449645996, + "step": 2655 + }, + { + "epoch": 3.8409255242227043, + "grad_norm": 2.806483633942708, + "learning_rate": 2.3914221088408583e-06, + "logits/chosen": -0.4588702619075775, + "logits/rejected": -0.34531545639038086, + "logps/chosen": -0.2463608831167221, + "logps/rejected": -4.684918403625488, + "loss": 0.1539, + "odds_ratio_loss": 0.06339805573225021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02463608607649803, + "rewards/margins": 0.4438557028770447, + "rewards/rejected": -0.4684918522834778, + "sft_loss": 0.2463608831167221, + "step": 2656 + }, + { + "epoch": 3.8423716558206795, + "grad_norm": 2.1236348351774277, + "learning_rate": 2.3885789286464527e-06, + "logits/chosen": -0.4032669961452484, + "logits/rejected": -0.27119266986846924, + "logps/chosen": -0.12063460052013397, + "logps/rejected": -4.50440788269043, + "loss": 0.166, + "odds_ratio_loss": 0.03388247638940811, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012063460424542427, + "rewards/margins": 0.43837735056877136, + "rewards/rejected": -0.45044082403182983, + "sft_loss": 0.12063460052013397, + "step": 2657 + }, + { + "epoch": 3.843817787418655, + "grad_norm": 1.9236357962541986, + "learning_rate": 2.385736719983813e-06, + "logits/chosen": -0.5088585615158081, + "logits/rejected": -0.4260765314102173, + "logps/chosen": -0.21049919724464417, + "logps/rejected": -4.297446250915527, + "loss": 0.1824, + "odds_ratio_loss": 0.03804740309715271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021049920469522476, + "rewards/margins": 0.4086947441101074, + "rewards/rejected": -0.4297446608543396, + "sft_loss": 0.21049919724464417, + "step": 2658 + }, + { + "epoch": 3.8452639190166304, + "grad_norm": 2.1599104065431614, + "learning_rate": 2.3828954845665153e-06, + "logits/chosen": -0.5122015476226807, + "logits/rejected": -0.5679474472999573, + "logps/chosen": -0.2392263263463974, + "logps/rejected": -5.5167236328125, + "loss": 0.2194, + "odds_ratio_loss": 0.03686853125691414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02392263151705265, + "rewards/margins": 0.52774977684021, + "rewards/rejected": -0.5516723990440369, + "sft_loss": 0.2392263263463974, + "step": 2659 + }, + { + "epoch": 3.8467100506146057, + "grad_norm": 2.251439001850937, + "learning_rate": 2.3800552241075538e-06, + "logits/chosen": -0.5718013048171997, + "logits/rejected": -0.6475985050201416, + "logps/chosen": -0.15388913452625275, + "logps/rejected": -3.5337791442871094, + "loss": 0.2118, + "odds_ratio_loss": 0.04061558097600937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01538891438394785, + "rewards/margins": 0.33798903226852417, + "rewards/rejected": -0.35337793827056885, + "sft_loss": 0.15388913452625275, + "step": 2660 + }, + { + "epoch": 3.8481561822125814, + "grad_norm": 2.3274237399359605, + "learning_rate": 2.3772159403193315e-06, + "logits/chosen": -0.5476040840148926, + "logits/rejected": -0.3083384931087494, + "logps/chosen": -0.14341911673545837, + "logps/rejected": -5.004617691040039, + "loss": 0.1523, + "odds_ratio_loss": 0.00789717212319374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014341913163661957, + "rewards/margins": 0.4861198663711548, + "rewards/rejected": -0.500461757183075, + "sft_loss": 0.14341911673545837, + "step": 2661 + }, + { + "epoch": 3.8496023138105566, + "grad_norm": 1.853220411413604, + "learning_rate": 2.3743776349136615e-06, + "logits/chosen": -0.6236512660980225, + "logits/rejected": -0.5720344185829163, + "logps/chosen": -0.27546441555023193, + "logps/rejected": -3.9568581581115723, + "loss": 0.1901, + "odds_ratio_loss": 0.06295789033174515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027546444907784462, + "rewards/margins": 0.368139386177063, + "rewards/rejected": -0.3956858515739441, + "sft_loss": 0.27546441555023193, + "step": 2662 + }, + { + "epoch": 3.8510484454085323, + "grad_norm": 2.382636427805891, + "learning_rate": 2.3715403096017713e-06, + "logits/chosen": -0.5481364727020264, + "logits/rejected": -0.35319069027900696, + "logps/chosen": -0.13607822358608246, + "logps/rejected": -6.205620288848877, + "loss": 0.1311, + "odds_ratio_loss": 0.012123924680054188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013607822358608246, + "rewards/margins": 0.6069542169570923, + "rewards/rejected": -0.6205620765686035, + "sft_loss": 0.13607822358608246, + "step": 2663 + }, + { + "epoch": 3.8524945770065075, + "grad_norm": 2.1180282826968915, + "learning_rate": 2.3687039660942926e-06, + "logits/chosen": -0.611968994140625, + "logits/rejected": -0.44643181562423706, + "logps/chosen": -0.14699847996234894, + "logps/rejected": -5.006779670715332, + "loss": 0.1661, + "odds_ratio_loss": 0.04196387901902199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014699846506118774, + "rewards/margins": 0.4859781861305237, + "rewards/rejected": -0.5006780624389648, + "sft_loss": 0.14699847996234894, + "step": 2664 + }, + { + "epoch": 3.8539407086044832, + "grad_norm": 2.104062036320424, + "learning_rate": 2.365868606101269e-06, + "logits/chosen": -0.7090559601783752, + "logits/rejected": -0.5487937927246094, + "logps/chosen": -0.11773136258125305, + "logps/rejected": -4.815327167510986, + "loss": 0.1767, + "odds_ratio_loss": 0.018653199076652527, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011773135513067245, + "rewards/margins": 0.46975958347320557, + "rewards/rejected": -0.4815327525138855, + "sft_loss": 0.11773136258125305, + "step": 2665 + }, + { + "epoch": 3.8553868402024585, + "grad_norm": 2.0999659236456902, + "learning_rate": 2.3630342313321473e-06, + "logits/chosen": -0.4976791441440582, + "logits/rejected": -0.646413266658783, + "logps/chosen": -0.21249869465827942, + "logps/rejected": -4.945645809173584, + "loss": 0.1906, + "odds_ratio_loss": 0.06591396033763885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021249869838356972, + "rewards/margins": 0.4733147621154785, + "rewards/rejected": -0.49456462264060974, + "sft_loss": 0.21249869465827942, + "step": 2666 + }, + { + "epoch": 3.8568329718004337, + "grad_norm": 2.1533503427856537, + "learning_rate": 2.360200843495786e-06, + "logits/chosen": -0.4372127056121826, + "logits/rejected": -0.4424366354942322, + "logps/chosen": -0.24950441718101501, + "logps/rejected": -2.736983299255371, + "loss": 0.1789, + "odds_ratio_loss": 0.06281781941652298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02495044469833374, + "rewards/margins": 0.24874788522720337, + "rewards/rejected": -0.2736983299255371, + "sft_loss": 0.24950441718101501, + "step": 2667 + }, + { + "epoch": 3.8582791033984094, + "grad_norm": 3.0188891770587762, + "learning_rate": 2.3573684443004425e-06, + "logits/chosen": -0.46939700841903687, + "logits/rejected": -0.5516136884689331, + "logps/chosen": -0.27107489109039307, + "logps/rejected": -3.729104518890381, + "loss": 0.1659, + "odds_ratio_loss": 0.055554620921611786, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027107488363981247, + "rewards/margins": 0.3458029627799988, + "rewards/rejected": -0.3729104697704315, + "sft_loss": 0.27107489109039307, + "step": 2668 + }, + { + "epoch": 3.8597252349963846, + "grad_norm": 2.31441815819728, + "learning_rate": 2.354537035453783e-06, + "logits/chosen": -0.8334481120109558, + "logits/rejected": -0.6476742625236511, + "logps/chosen": -0.08477871119976044, + "logps/rejected": -4.919661521911621, + "loss": 0.1985, + "odds_ratio_loss": 0.008285166695713997, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008477871306240559, + "rewards/margins": 0.4834883213043213, + "rewards/rejected": -0.491966187953949, + "sft_loss": 0.08477871119976044, + "step": 2669 + }, + { + "epoch": 3.86117136659436, + "grad_norm": 1.8740259808678605, + "learning_rate": 2.351706618662871e-06, + "logits/chosen": -0.6517231464385986, + "logits/rejected": -0.40706366300582886, + "logps/chosen": -0.22961880266666412, + "logps/rejected": -5.185815811157227, + "loss": 0.1517, + "odds_ratio_loss": 0.056328706443309784, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022961881011724472, + "rewards/margins": 0.4956197142601013, + "rewards/rejected": -0.5185816287994385, + "sft_loss": 0.22961880266666412, + "step": 2670 + }, + { + "epoch": 3.8626174981923356, + "grad_norm": 2.2545041214525696, + "learning_rate": 2.3488771956341795e-06, + "logits/chosen": -0.5102351903915405, + "logits/rejected": -0.4168573319911957, + "logps/chosen": -0.19025494158267975, + "logps/rejected": -5.506153106689453, + "loss": 0.1912, + "odds_ratio_loss": 0.05001336708664894, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019025495275855064, + "rewards/margins": 0.5315897464752197, + "rewards/rejected": -0.5506153106689453, + "sft_loss": 0.19025494158267975, + "step": 2671 + }, + { + "epoch": 3.864063629790311, + "grad_norm": 2.267911264770996, + "learning_rate": 2.346048768073575e-06, + "logits/chosen": -0.5635769963264465, + "logits/rejected": -0.511735737323761, + "logps/chosen": -0.12450949102640152, + "logps/rejected": -5.101263523101807, + "loss": 0.127, + "odds_ratio_loss": 0.04796475172042847, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012450949288904667, + "rewards/margins": 0.49767541885375977, + "rewards/rejected": -0.5101263523101807, + "sft_loss": 0.12450949102640152, + "step": 2672 + }, + { + "epoch": 3.865509761388286, + "grad_norm": 2.1719020920397387, + "learning_rate": 2.343221337686328e-06, + "logits/chosen": -0.6359919309616089, + "logits/rejected": -0.5677777528762817, + "logps/chosen": -0.18134115636348724, + "logps/rejected": -3.594672679901123, + "loss": 0.1779, + "odds_ratio_loss": 0.038205526769161224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018134113401174545, + "rewards/margins": 0.34133318066596985, + "rewards/rejected": -0.3594672977924347, + "sft_loss": 0.18134115636348724, + "step": 2673 + }, + { + "epoch": 3.8669558929862617, + "grad_norm": 2.1634017159786483, + "learning_rate": 2.3403949061771083e-06, + "logits/chosen": -0.7519722580909729, + "logits/rejected": -0.4830787181854248, + "logps/chosen": -0.19885629415512085, + "logps/rejected": -6.522568225860596, + "loss": 0.1848, + "odds_ratio_loss": 0.01121095847338438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019885629415512085, + "rewards/margins": 0.6323711276054382, + "rewards/rejected": -0.6522568464279175, + "sft_loss": 0.19885629415512085, + "step": 2674 + }, + { + "epoch": 3.8684020245842374, + "grad_norm": 1.9462707784219915, + "learning_rate": 2.33756947524998e-06, + "logits/chosen": -0.5072588920593262, + "logits/rejected": -0.3704627752304077, + "logps/chosen": -0.09452974051237106, + "logps/rejected": -4.214332103729248, + "loss": 0.1956, + "odds_ratio_loss": 0.014662055298686028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009452973492443562, + "rewards/margins": 0.4119802713394165, + "rewards/rejected": -0.4214332699775696, + "sft_loss": 0.09452974051237106, + "step": 2675 + }, + { + "epoch": 3.8698481561822127, + "grad_norm": 1.883670642460947, + "learning_rate": 2.3347450466084064e-06, + "logits/chosen": -0.4906489849090576, + "logits/rejected": -0.5329009890556335, + "logps/chosen": -0.13472123444080353, + "logps/rejected": -3.3913538455963135, + "loss": 0.1755, + "odds_ratio_loss": 0.026549160480499268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013472123071551323, + "rewards/margins": 0.3256632685661316, + "rewards/rejected": -0.33913540840148926, + "sft_loss": 0.13472123444080353, + "step": 2676 + }, + { + "epoch": 3.871294287780188, + "grad_norm": 1.7645055199572455, + "learning_rate": 2.3319216219552465e-06, + "logits/chosen": -0.7514166831970215, + "logits/rejected": -0.4781545102596283, + "logps/chosen": -0.19920554757118225, + "logps/rejected": -4.202726364135742, + "loss": 0.2171, + "odds_ratio_loss": 0.050058625638484955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019920554012060165, + "rewards/margins": 0.4003520607948303, + "rewards/rejected": -0.4202726483345032, + "sft_loss": 0.19920554757118225, + "step": 2677 + }, + { + "epoch": 3.8727404193781636, + "grad_norm": 2.272822367567261, + "learning_rate": 2.3290992029927545e-06, + "logits/chosen": -0.5296982526779175, + "logits/rejected": -0.42527490854263306, + "logps/chosen": -0.1885211169719696, + "logps/rejected": -3.3843655586242676, + "loss": 0.1662, + "odds_ratio_loss": 0.03290311619639397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0188521146774292, + "rewards/margins": 0.319584459066391, + "rewards/rejected": -0.3384365737438202, + "sft_loss": 0.1885211169719696, + "step": 2678 + }, + { + "epoch": 3.874186550976139, + "grad_norm": 2.1408885936856636, + "learning_rate": 2.326277791422574e-06, + "logits/chosen": -0.6290498971939087, + "logits/rejected": -0.4600859582424164, + "logps/chosen": -0.1297677755355835, + "logps/rejected": -4.369756698608398, + "loss": 0.1729, + "odds_ratio_loss": 0.016120584681630135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01297677680850029, + "rewards/margins": 0.4239988625049591, + "rewards/rejected": -0.4369756579399109, + "sft_loss": 0.1297677755355835, + "step": 2679 + }, + { + "epoch": 3.875632682574114, + "grad_norm": 2.392102570924333, + "learning_rate": 2.3234573889457477e-06, + "logits/chosen": -0.6287966370582581, + "logits/rejected": -0.5920383930206299, + "logps/chosen": -0.1912301480770111, + "logps/rejected": -3.680026054382324, + "loss": 0.2083, + "odds_ratio_loss": 0.06031038612127304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01912301406264305, + "rewards/margins": 0.3488796353340149, + "rewards/rejected": -0.36800259351730347, + "sft_loss": 0.1912301480770111, + "step": 2680 + }, + { + "epoch": 3.8770788141720898, + "grad_norm": 2.307929371288277, + "learning_rate": 2.3206379972627047e-06, + "logits/chosen": -0.5087276697158813, + "logits/rejected": -0.34346815943717957, + "logps/chosen": -0.22030162811279297, + "logps/rejected": -2.734839916229248, + "loss": 0.2434, + "odds_ratio_loss": 0.03812616690993309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022030163556337357, + "rewards/margins": 0.25145381689071655, + "rewards/rejected": -0.2734839916229248, + "sft_loss": 0.22030162811279297, + "step": 2681 + }, + { + "epoch": 3.878524945770065, + "grad_norm": 2.261436653549, + "learning_rate": 2.31781961807327e-06, + "logits/chosen": -0.6415372490882874, + "logits/rejected": -0.5570952892303467, + "logps/chosen": -0.1425553858280182, + "logps/rejected": -4.366990566253662, + "loss": 0.1465, + "odds_ratio_loss": 0.023208629339933395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014255540445446968, + "rewards/margins": 0.4224435091018677, + "rewards/rejected": -0.4366990923881531, + "sft_loss": 0.1425553858280182, + "step": 2682 + }, + { + "epoch": 3.8799710773680403, + "grad_norm": 1.8952349868348815, + "learning_rate": 2.31500225307665e-06, + "logits/chosen": -0.508310079574585, + "logits/rejected": -0.42341750860214233, + "logps/chosen": -0.09663991630077362, + "logps/rejected": -4.333970069885254, + "loss": 0.1367, + "odds_ratio_loss": 0.005665269680321217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009663991630077362, + "rewards/margins": 0.4237329959869385, + "rewards/rejected": -0.43339696526527405, + "sft_loss": 0.09663991630077362, + "step": 2683 + }, + { + "epoch": 3.881417208966016, + "grad_norm": 2.4567404402425863, + "learning_rate": 2.3121859039714492e-06, + "logits/chosen": -0.5046891570091248, + "logits/rejected": -0.38773348927497864, + "logps/chosen": -0.169320210814476, + "logps/rejected": -3.1759390830993652, + "loss": 0.1662, + "odds_ratio_loss": 0.05408705398440361, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01693202182650566, + "rewards/margins": 0.3006618618965149, + "rewards/rejected": -0.31759390234947205, + "sft_loss": 0.169320210814476, + "step": 2684 + }, + { + "epoch": 3.882863340563991, + "grad_norm": 3.069883800674717, + "learning_rate": 2.3093705724556527e-06, + "logits/chosen": -0.47510790824890137, + "logits/rejected": -0.37609606981277466, + "logps/chosen": -0.21013259887695312, + "logps/rejected": -2.8292295932769775, + "loss": 0.2096, + "odds_ratio_loss": 0.046919770538806915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02101326175034046, + "rewards/margins": 0.26190969347953796, + "rewards/rejected": -0.2829229533672333, + "sft_loss": 0.21013259887695312, + "step": 2685 + }, + { + "epoch": 3.884309472161967, + "grad_norm": 2.517335439887365, + "learning_rate": 2.3065562602266336e-06, + "logits/chosen": -0.5777808427810669, + "logits/rejected": -0.6327955722808838, + "logps/chosen": -0.2458532601594925, + "logps/rejected": -4.5382609367370605, + "loss": 0.1886, + "odds_ratio_loss": 0.05085287243127823, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02458532713353634, + "rewards/margins": 0.42924079298973083, + "rewards/rejected": -0.4538261294364929, + "sft_loss": 0.2458532601594925, + "step": 2686 + }, + { + "epoch": 3.885755603759942, + "grad_norm": 2.2236099260861284, + "learning_rate": 2.3037429689811535e-06, + "logits/chosen": -0.49882829189300537, + "logits/rejected": -0.46719181537628174, + "logps/chosen": -0.30639082193374634, + "logps/rejected": -3.102865219116211, + "loss": 0.2481, + "odds_ratio_loss": 0.07744499295949936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030639082193374634, + "rewards/margins": 0.2796474099159241, + "rewards/rejected": -0.3102865219116211, + "sft_loss": 0.30639082193374634, + "step": 2687 + }, + { + "epoch": 3.887201735357918, + "grad_norm": 1.9450700368153286, + "learning_rate": 2.3009307004153535e-06, + "logits/chosen": -0.6472720503807068, + "logits/rejected": -0.49304014444351196, + "logps/chosen": -0.2393447756767273, + "logps/rejected": -4.900020599365234, + "loss": 0.2072, + "odds_ratio_loss": 0.02204173430800438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02393447607755661, + "rewards/margins": 0.4660675823688507, + "rewards/rejected": -0.4900020360946655, + "sft_loss": 0.2393447756767273, + "step": 2688 + }, + { + "epoch": 3.888647866955893, + "grad_norm": 2.2910900345787035, + "learning_rate": 2.298119456224764e-06, + "logits/chosen": -0.6662863492965698, + "logits/rejected": -0.6600769758224487, + "logps/chosen": -0.13599048554897308, + "logps/rejected": -2.522681713104248, + "loss": 0.2028, + "odds_ratio_loss": 0.07435938715934753, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.013599049299955368, + "rewards/margins": 0.23866915702819824, + "rewards/rejected": -0.2522681951522827, + "sft_loss": 0.13599048554897308, + "step": 2689 + }, + { + "epoch": 3.8900939985538683, + "grad_norm": 3.7471824607256785, + "learning_rate": 2.295309238104291e-06, + "logits/chosen": -0.7487191557884216, + "logits/rejected": -0.44234901666641235, + "logps/chosen": -0.11293800920248032, + "logps/rejected": -5.536605358123779, + "loss": 0.1694, + "odds_ratio_loss": 0.017550412565469742, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011293800547719002, + "rewards/margins": 0.5423667430877686, + "rewards/rejected": -0.55366051197052, + "sft_loss": 0.11293800920248032, + "step": 2690 + }, + { + "epoch": 3.891540130151844, + "grad_norm": 1.9729125724864918, + "learning_rate": 2.2925000477482286e-06, + "logits/chosen": -0.5971082448959351, + "logits/rejected": -0.6901511549949646, + "logps/chosen": -0.1475466787815094, + "logps/rejected": -4.186668395996094, + "loss": 0.1915, + "odds_ratio_loss": 0.03244779258966446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014754666946828365, + "rewards/margins": 0.40391218662261963, + "rewards/rejected": -0.4186668395996094, + "sft_loss": 0.1475466787815094, + "step": 2691 + }, + { + "epoch": 3.892986261749819, + "grad_norm": 2.286839337755713, + "learning_rate": 2.289691886850246e-06, + "logits/chosen": -0.7391765117645264, + "logits/rejected": -0.47775739431381226, + "logps/chosen": -0.23687493801116943, + "logps/rejected": -3.8079967498779297, + "loss": 0.1872, + "odds_ratio_loss": 0.0462396964430809, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023687491193413734, + "rewards/margins": 0.35711222887039185, + "rewards/rejected": -0.38079971075057983, + "sft_loss": 0.23687493801116943, + "step": 2692 + }, + { + "epoch": 3.8944323933477945, + "grad_norm": 2.16708601581374, + "learning_rate": 2.2868847571033958e-06, + "logits/chosen": -0.65777987241745, + "logits/rejected": -0.4829615652561188, + "logps/chosen": -0.11277547478675842, + "logps/rejected": -4.166236400604248, + "loss": 0.1679, + "odds_ratio_loss": 0.024801742285490036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011277547106146812, + "rewards/margins": 0.4053461253643036, + "rewards/rejected": -0.41662368178367615, + "sft_loss": 0.11277547478675842, + "step": 2693 + }, + { + "epoch": 3.89587852494577, + "grad_norm": 1.9916804771142396, + "learning_rate": 2.284078660200105e-06, + "logits/chosen": -0.5471053123474121, + "logits/rejected": -0.4118785262107849, + "logps/chosen": -0.1762133240699768, + "logps/rejected": -5.470975875854492, + "loss": 0.1503, + "odds_ratio_loss": 0.019666478037834167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01762133277952671, + "rewards/margins": 0.5294762849807739, + "rewards/rejected": -0.5470975637435913, + "sft_loss": 0.1762133240699768, + "step": 2694 + }, + { + "epoch": 3.8973246565437454, + "grad_norm": 2.0384475244157105, + "learning_rate": 2.2812735978321823e-06, + "logits/chosen": -0.5914080142974854, + "logits/rejected": -0.5802508592605591, + "logps/chosen": -0.12550309300422668, + "logps/rejected": -3.805398941040039, + "loss": 0.1454, + "odds_ratio_loss": 0.025427283719182014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012550310231745243, + "rewards/margins": 0.36798956990242004, + "rewards/rejected": -0.3805398941040039, + "sft_loss": 0.12550309300422668, + "step": 2695 + }, + { + "epoch": 3.8987707881417206, + "grad_norm": 2.0039485066790914, + "learning_rate": 2.278469571690806e-06, + "logits/chosen": -0.6192625761032104, + "logits/rejected": -0.5282604694366455, + "logps/chosen": -0.15284359455108643, + "logps/rejected": -4.06088924407959, + "loss": 0.1777, + "odds_ratio_loss": 0.018614958971738815, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015284359455108643, + "rewards/margins": 0.39080455899238586, + "rewards/rejected": -0.4060888886451721, + "sft_loss": 0.15284359455108643, + "step": 2696 + }, + { + "epoch": 3.9002169197396963, + "grad_norm": 1.9348476449421625, + "learning_rate": 2.2756665834665386e-06, + "logits/chosen": -0.5989269018173218, + "logits/rejected": -0.46438270807266235, + "logps/chosen": -0.204659104347229, + "logps/rejected": -5.059053897857666, + "loss": 0.2055, + "odds_ratio_loss": 0.015010501258075237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0204659104347229, + "rewards/margins": 0.4854394793510437, + "rewards/rejected": -0.5059053897857666, + "sft_loss": 0.204659104347229, + "step": 2697 + }, + { + "epoch": 3.901663051337672, + "grad_norm": 2.091035380201727, + "learning_rate": 2.272864634849308e-06, + "logits/chosen": -0.47287124395370483, + "logits/rejected": -0.3687141537666321, + "logps/chosen": -0.23247979581356049, + "logps/rejected": -4.085878849029541, + "loss": 0.2085, + "odds_ratio_loss": 0.05591895431280136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023247981444001198, + "rewards/margins": 0.3853399157524109, + "rewards/rejected": -0.40858790278434753, + "sft_loss": 0.23247979581356049, + "step": 2698 + }, + { + "epoch": 3.9031091829356472, + "grad_norm": 1.9688702293823612, + "learning_rate": 2.2700637275284244e-06, + "logits/chosen": -0.4580070972442627, + "logits/rejected": -0.39984434843063354, + "logps/chosen": -0.2849455773830414, + "logps/rejected": -2.5171000957489014, + "loss": 0.1823, + "odds_ratio_loss": 0.06607170403003693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02849455736577511, + "rewards/margins": 0.22321546077728271, + "rewards/rejected": -0.2517099976539612, + "sft_loss": 0.2849455773830414, + "step": 2699 + }, + { + "epoch": 3.9045553145336225, + "grad_norm": 2.3070770756670913, + "learning_rate": 2.26726386319256e-06, + "logits/chosen": -0.5406573414802551, + "logits/rejected": -0.26209941506385803, + "logps/chosen": -0.1298496127128601, + "logps/rejected": -3.572732925415039, + "loss": 0.1791, + "odds_ratio_loss": 0.017180941998958588, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01298496127128601, + "rewards/margins": 0.34428831934928894, + "rewards/rejected": -0.35727331042289734, + "sft_loss": 0.1298496127128601, + "step": 2700 + }, + { + "epoch": 3.906001446131598, + "grad_norm": 1.915036217109782, + "learning_rate": 2.2644650435297675e-06, + "logits/chosen": -0.5574444532394409, + "logits/rejected": -0.5001909136772156, + "logps/chosen": -0.28338301181793213, + "logps/rejected": -3.268771171569824, + "loss": 0.2028, + "odds_ratio_loss": 0.07406371831893921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028338300064206123, + "rewards/margins": 0.29853883385658264, + "rewards/rejected": -0.3268771469593048, + "sft_loss": 0.28338301181793213, + "step": 2701 + }, + { + "epoch": 3.9074475777295734, + "grad_norm": 3.4689329886595868, + "learning_rate": 2.2616672702274643e-06, + "logits/chosen": -0.562351405620575, + "logits/rejected": -0.4643508791923523, + "logps/chosen": -0.1464012861251831, + "logps/rejected": -3.3796513080596924, + "loss": 0.1936, + "odds_ratio_loss": 0.03205585852265358, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01464013010263443, + "rewards/margins": 0.3233250081539154, + "rewards/rejected": -0.33796513080596924, + "sft_loss": 0.1464012861251831, + "step": 2702 + }, + { + "epoch": 3.9088937093275486, + "grad_norm": 2.3207929462257146, + "learning_rate": 2.258870544972437e-06, + "logits/chosen": -0.5727015733718872, + "logits/rejected": -0.4699394106864929, + "logps/chosen": -0.17665952444076538, + "logps/rejected": -3.051046848297119, + "loss": 0.2095, + "odds_ratio_loss": 0.060139112174510956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017665952444076538, + "rewards/margins": 0.2874387204647064, + "rewards/rejected": -0.30510464310646057, + "sft_loss": 0.17665952444076538, + "step": 2703 + }, + { + "epoch": 3.9103398409255243, + "grad_norm": 2.4119382863806433, + "learning_rate": 2.2560748694508435e-06, + "logits/chosen": -0.5257418155670166, + "logits/rejected": -0.5052908658981323, + "logps/chosen": -0.1041015014052391, + "logps/rejected": -2.8004159927368164, + "loss": 0.2212, + "odds_ratio_loss": 0.02311025746166706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01041015051305294, + "rewards/margins": 0.2696314752101898, + "rewards/rejected": -0.2800416052341461, + "sft_loss": 0.1041015014052391, + "step": 2704 + }, + { + "epoch": 3.9117859725234996, + "grad_norm": 2.2430778101447055, + "learning_rate": 2.253280245348205e-06, + "logits/chosen": -0.5410814881324768, + "logits/rejected": -0.4534750282764435, + "logps/chosen": -0.2101866453886032, + "logps/rejected": -3.4188551902770996, + "loss": 0.2123, + "odds_ratio_loss": 0.05106037110090256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02101866528391838, + "rewards/margins": 0.3208668529987335, + "rewards/rejected": -0.341885507106781, + "sft_loss": 0.2101866453886032, + "step": 2705 + }, + { + "epoch": 3.913232104121475, + "grad_norm": 2.414189461889776, + "learning_rate": 2.2504866743494134e-06, + "logits/chosen": -0.6277978420257568, + "logits/rejected": -0.39811521768569946, + "logps/chosen": -0.15590061247348785, + "logps/rejected": -5.489923477172852, + "loss": 0.2404, + "odds_ratio_loss": 0.012680593878030777, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01559006329625845, + "rewards/margins": 0.5334023237228394, + "rewards/rejected": -0.5489923357963562, + "sft_loss": 0.15590061247348785, + "step": 2706 + }, + { + "epoch": 3.9146782357194505, + "grad_norm": 2.1925742092247456, + "learning_rate": 2.24769415813872e-06, + "logits/chosen": -0.6148355007171631, + "logits/rejected": -0.5587329864501953, + "logps/chosen": -0.18322691321372986, + "logps/rejected": -3.181863784790039, + "loss": 0.1937, + "odds_ratio_loss": 0.049598000943660736, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018322691321372986, + "rewards/margins": 0.29986369609832764, + "rewards/rejected": -0.3181864023208618, + "sft_loss": 0.18322691321372986, + "step": 2707 + }, + { + "epoch": 3.9161243673174257, + "grad_norm": 3.1679965919517916, + "learning_rate": 2.2449026983997476e-06, + "logits/chosen": -0.7403485774993896, + "logits/rejected": -0.6171454191207886, + "logps/chosen": -0.2301938235759735, + "logps/rejected": -3.761472225189209, + "loss": 0.1659, + "odds_ratio_loss": 0.04906386882066727, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02301938459277153, + "rewards/margins": 0.3531278371810913, + "rewards/rejected": -0.37614724040031433, + "sft_loss": 0.2301938235759735, + "step": 2708 + }, + { + "epoch": 3.9175704989154014, + "grad_norm": 2.1201844249727153, + "learning_rate": 2.242112296815474e-06, + "logits/chosen": -0.6007668972015381, + "logits/rejected": -0.4732978045940399, + "logps/chosen": -0.2650083899497986, + "logps/rejected": -4.493795394897461, + "loss": 0.2163, + "odds_ratio_loss": 0.03872312977910042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02650083787739277, + "rewards/margins": 0.4228787422180176, + "rewards/rejected": -0.449379563331604, + "sft_loss": 0.2650083899497986, + "step": 2709 + }, + { + "epoch": 3.9190166305133767, + "grad_norm": 1.8862309748937722, + "learning_rate": 2.239322955068244e-06, + "logits/chosen": -0.4192129075527191, + "logits/rejected": -0.39664286375045776, + "logps/chosen": -0.25290870666503906, + "logps/rejected": -5.286991119384766, + "loss": 0.2261, + "odds_ratio_loss": 0.05502773076295853, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025290867313742638, + "rewards/margins": 0.5034083127975464, + "rewards/rejected": -0.5286991596221924, + "sft_loss": 0.25290870666503906, + "step": 2710 + }, + { + "epoch": 3.9204627621113524, + "grad_norm": 1.824106972408141, + "learning_rate": 2.2365346748397606e-06, + "logits/chosen": -0.4954592287540436, + "logits/rejected": -0.42046910524368286, + "logps/chosen": -0.11221830546855927, + "logps/rejected": -6.584062576293945, + "loss": 0.1433, + "odds_ratio_loss": 0.022290529683232307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011221831664443016, + "rewards/margins": 0.6471844911575317, + "rewards/rejected": -0.6584063172340393, + "sft_loss": 0.11221830546855927, + "step": 2711 + }, + { + "epoch": 3.9219088937093276, + "grad_norm": 2.073955700687443, + "learning_rate": 2.2337474578110904e-06, + "logits/chosen": -0.4804004430770874, + "logits/rejected": -0.32498830556869507, + "logps/chosen": -0.10834227502346039, + "logps/rejected": -4.200552940368652, + "loss": 0.1867, + "odds_ratio_loss": 0.015155954286456108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01083422638475895, + "rewards/margins": 0.4092210531234741, + "rewards/rejected": -0.4200552701950073, + "sft_loss": 0.10834227502346039, + "step": 2712 + }, + { + "epoch": 3.923355025307303, + "grad_norm": 3.7115801213716497, + "learning_rate": 2.230961305662655e-06, + "logits/chosen": -0.49971461296081543, + "logits/rejected": -0.3881816565990448, + "logps/chosen": -0.27020180225372314, + "logps/rejected": -4.6016716957092285, + "loss": 0.2186, + "odds_ratio_loss": 0.05096177011728287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027020180597901344, + "rewards/margins": 0.43314701318740845, + "rewards/rejected": -0.46016716957092285, + "sft_loss": 0.27020180225372314, + "step": 2713 + }, + { + "epoch": 3.9248011569052785, + "grad_norm": 2.308041112836885, + "learning_rate": 2.228176220074237e-06, + "logits/chosen": -0.8088836669921875, + "logits/rejected": -0.6313307285308838, + "logps/chosen": -0.21790727972984314, + "logps/rejected": -3.6561279296875, + "loss": 0.1934, + "odds_ratio_loss": 0.04986013099551201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021790727972984314, + "rewards/margins": 0.34382206201553345, + "rewards/rejected": -0.36561277508735657, + "sft_loss": 0.21790727972984314, + "step": 2714 + }, + { + "epoch": 3.9262472885032538, + "grad_norm": 1.9474719292583453, + "learning_rate": 2.2253922027249765e-06, + "logits/chosen": -0.4550485610961914, + "logits/rejected": -0.35971906781196594, + "logps/chosen": -0.22380274534225464, + "logps/rejected": -3.577040672302246, + "loss": 0.229, + "odds_ratio_loss": 0.03350451588630676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022380275651812553, + "rewards/margins": 0.3353238105773926, + "rewards/rejected": -0.3577041029930115, + "sft_loss": 0.22380274534225464, + "step": 2715 + }, + { + "epoch": 3.927693420101229, + "grad_norm": 1.9833196622850737, + "learning_rate": 2.222609255293367e-06, + "logits/chosen": -0.5128300189971924, + "logits/rejected": -0.3967776894569397, + "logps/chosen": -0.12300994247198105, + "logps/rejected": -3.8472933769226074, + "loss": 0.1866, + "odds_ratio_loss": 0.033817462623119354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01230099517852068, + "rewards/margins": 0.37242835760116577, + "rewards/rejected": -0.3847293257713318, + "sft_loss": 0.12300994247198105, + "step": 2716 + }, + { + "epoch": 3.9291395516992047, + "grad_norm": 1.8220600153997757, + "learning_rate": 2.219827379457256e-06, + "logits/chosen": -0.47464361786842346, + "logits/rejected": -0.4734431505203247, + "logps/chosen": -0.1375093311071396, + "logps/rejected": -4.749077796936035, + "loss": 0.1515, + "odds_ratio_loss": 0.03777370974421501, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013750933110713959, + "rewards/margins": 0.46115684509277344, + "rewards/rejected": -0.4749077558517456, + "sft_loss": 0.1375093311071396, + "step": 2717 + }, + { + "epoch": 3.93058568329718, + "grad_norm": 2.051028908996472, + "learning_rate": 2.2170465768938473e-06, + "logits/chosen": -0.6458155512809753, + "logits/rejected": -0.5764715671539307, + "logps/chosen": -0.08788472414016724, + "logps/rejected": -3.526029109954834, + "loss": 0.2008, + "odds_ratio_loss": 0.020078768953680992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008788472041487694, + "rewards/margins": 0.3438144326210022, + "rewards/rejected": -0.35260289907455444, + "sft_loss": 0.08788472414016724, + "step": 2718 + }, + { + "epoch": 3.932031814895155, + "grad_norm": 1.7076574111730232, + "learning_rate": 2.214266849279699e-06, + "logits/chosen": -0.4735933542251587, + "logits/rejected": -0.44786524772644043, + "logps/chosen": -0.10404633730649948, + "logps/rejected": -4.46885347366333, + "loss": 0.1045, + "odds_ratio_loss": 0.02645310014486313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010404633358120918, + "rewards/margins": 0.43648070096969604, + "rewards/rejected": -0.446885347366333, + "sft_loss": 0.10404633730649948, + "step": 2719 + }, + { + "epoch": 3.933477946493131, + "grad_norm": 2.0280602741537046, + "learning_rate": 2.211488198290716e-06, + "logits/chosen": -0.6490156054496765, + "logits/rejected": -0.5675171613693237, + "logps/chosen": -0.23582687973976135, + "logps/rejected": -2.961477756500244, + "loss": 0.2049, + "odds_ratio_loss": 0.06842078268527985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023582689464092255, + "rewards/margins": 0.2725651264190674, + "rewards/rejected": -0.29614779353141785, + "sft_loss": 0.23582687973976135, + "step": 2720 + }, + { + "epoch": 3.9349240780911066, + "grad_norm": 2.0813505754869426, + "learning_rate": 2.208710625602162e-06, + "logits/chosen": -0.5769675970077515, + "logits/rejected": -0.48996660113334656, + "logps/chosen": -0.20689250528812408, + "logps/rejected": -4.864871025085449, + "loss": 0.1726, + "odds_ratio_loss": 0.029630932956933975, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02068925090134144, + "rewards/margins": 0.4657978415489197, + "rewards/rejected": -0.48648712038993835, + "sft_loss": 0.20689250528812408, + "step": 2721 + }, + { + "epoch": 3.936370209689082, + "grad_norm": 2.1687754965363952, + "learning_rate": 2.205934132888641e-06, + "logits/chosen": -0.46100273728370667, + "logits/rejected": -0.36753734946250916, + "logps/chosen": -0.11003377288579941, + "logps/rejected": -4.685311794281006, + "loss": 0.1729, + "odds_ratio_loss": 0.017081379890441895, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011003376916050911, + "rewards/margins": 0.4575278162956238, + "rewards/rejected": -0.46853119134902954, + "sft_loss": 0.11003377288579941, + "step": 2722 + }, + { + "epoch": 3.937816341287057, + "grad_norm": 2.0296629482018655, + "learning_rate": 2.2031587218241148e-06, + "logits/chosen": -0.4646747410297394, + "logits/rejected": -0.39276355504989624, + "logps/chosen": -0.10434699803590775, + "logps/rejected": -4.183708190917969, + "loss": 0.175, + "odds_ratio_loss": 0.02051355130970478, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01043469924479723, + "rewards/margins": 0.4079361855983734, + "rewards/rejected": -0.41837090253829956, + "sft_loss": 0.10434699803590775, + "step": 2723 + }, + { + "epoch": 3.9392624728850327, + "grad_norm": 2.049891085377211, + "learning_rate": 2.2003843940818874e-06, + "logits/chosen": -0.6782933473587036, + "logits/rejected": -0.5610325932502747, + "logps/chosen": -0.20635099709033966, + "logps/rejected": -4.527532577514648, + "loss": 0.2131, + "odds_ratio_loss": 0.03238668665289879, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020635100081562996, + "rewards/margins": 0.4321182072162628, + "rewards/rejected": -0.4527532458305359, + "sft_loss": 0.20635099709033966, + "step": 2724 + }, + { + "epoch": 3.940708604483008, + "grad_norm": 1.86509035176643, + "learning_rate": 2.1976111513346113e-06, + "logits/chosen": -0.6634677648544312, + "logits/rejected": -0.5007971525192261, + "logps/chosen": -0.09252119809389114, + "logps/rejected": -4.644352436065674, + "loss": 0.1433, + "odds_ratio_loss": 0.020731642842292786, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00925211887806654, + "rewards/margins": 0.45518314838409424, + "rewards/rejected": -0.46443524956703186, + "sft_loss": 0.09252119809389114, + "step": 2725 + }, + { + "epoch": 3.942154736080983, + "grad_norm": 2.0492445038755394, + "learning_rate": 2.1948389952542834e-06, + "logits/chosen": -0.5211946964263916, + "logits/rejected": -0.4892616271972656, + "logps/chosen": -0.10040014237165451, + "logps/rejected": -2.7622721195220947, + "loss": 0.2018, + "odds_ratio_loss": 0.021230706945061684, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010040014050900936, + "rewards/margins": 0.2661871910095215, + "rewards/rejected": -0.2762272357940674, + "sft_loss": 0.10040014237165451, + "step": 2726 + }, + { + "epoch": 3.943600867678959, + "grad_norm": 1.9556796476216858, + "learning_rate": 2.1920679275122482e-06, + "logits/chosen": -0.7522842884063721, + "logits/rejected": -0.5683616995811462, + "logps/chosen": -0.12370572984218597, + "logps/rejected": -2.923466920852661, + "loss": 0.1666, + "odds_ratio_loss": 0.01687091588973999, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012370571494102478, + "rewards/margins": 0.27997615933418274, + "rewards/rejected": -0.292346715927124, + "sft_loss": 0.12370572984218597, + "step": 2727 + }, + { + "epoch": 3.945046999276934, + "grad_norm": 2.555360496366615, + "learning_rate": 2.1892979497791945e-06, + "logits/chosen": -0.7111763954162598, + "logits/rejected": -0.534297525882721, + "logps/chosen": -0.14246775209903717, + "logps/rejected": -3.895209550857544, + "loss": 0.1889, + "odds_ratio_loss": 0.01873032934963703, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014246775768697262, + "rewards/margins": 0.3752741813659668, + "rewards/rejected": -0.38952094316482544, + "sft_loss": 0.14246775209903717, + "step": 2728 + }, + { + "epoch": 3.9464931308749094, + "grad_norm": 2.69918533275395, + "learning_rate": 2.1865290637251494e-06, + "logits/chosen": -0.6954668760299683, + "logits/rejected": -0.4582352638244629, + "logps/chosen": -0.16472528874874115, + "logps/rejected": -2.790771245956421, + "loss": 0.189, + "odds_ratio_loss": 0.0377388596534729, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016472529619932175, + "rewards/margins": 0.26260459423065186, + "rewards/rejected": -0.2790771424770355, + "sft_loss": 0.16472528874874115, + "step": 2729 + }, + { + "epoch": 3.947939262472885, + "grad_norm": 2.1214673089172487, + "learning_rate": 2.1837612710194872e-06, + "logits/chosen": -0.5668919086456299, + "logits/rejected": -0.4885096549987793, + "logps/chosen": -0.3104791045188904, + "logps/rejected": -4.2551960945129395, + "loss": 0.1956, + "odds_ratio_loss": 0.07189791649580002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031047910451889038, + "rewards/margins": 0.3944717049598694, + "rewards/rejected": -0.4255196154117584, + "sft_loss": 0.3104791045188904, + "step": 2730 + }, + { + "epoch": 3.9493853940708603, + "grad_norm": 2.1663124391142325, + "learning_rate": 2.1809945733309193e-06, + "logits/chosen": -0.6335052847862244, + "logits/rejected": -0.5434231162071228, + "logps/chosen": -0.13023731112480164, + "logps/rejected": -3.2621257305145264, + "loss": 0.1589, + "odds_ratio_loss": 0.0158349871635437, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01302372943609953, + "rewards/margins": 0.3131888508796692, + "rewards/rejected": -0.3262125551700592, + "sft_loss": 0.13023731112480164, + "step": 2731 + }, + { + "epoch": 3.950831525668836, + "grad_norm": 2.2296369918556573, + "learning_rate": 2.1782289723274975e-06, + "logits/chosen": -0.5075691342353821, + "logits/rejected": -0.40280091762542725, + "logps/chosen": -0.21550577878952026, + "logps/rejected": -2.2407689094543457, + "loss": 0.1781, + "odds_ratio_loss": 0.041584450751543045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021550578996539116, + "rewards/margins": 0.2025263011455536, + "rewards/rejected": -0.22407689690589905, + "sft_loss": 0.21550577878952026, + "step": 2732 + }, + { + "epoch": 3.9522776572668112, + "grad_norm": 2.1585834508959945, + "learning_rate": 2.175464469676612e-06, + "logits/chosen": -0.6296189427375793, + "logits/rejected": -0.6328527331352234, + "logps/chosen": -0.20828096568584442, + "logps/rejected": -5.044098377227783, + "loss": 0.163, + "odds_ratio_loss": 0.06003636494278908, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02082809805870056, + "rewards/margins": 0.4835817217826843, + "rewards/rejected": -0.5044097900390625, + "sft_loss": 0.20828096568584442, + "step": 2733 + }, + { + "epoch": 3.953723788864787, + "grad_norm": 1.8902474968189271, + "learning_rate": 2.1727010670449945e-06, + "logits/chosen": -0.5789146423339844, + "logits/rejected": -0.4928153157234192, + "logps/chosen": -0.10002440959215164, + "logps/rejected": -4.361918926239014, + "loss": 0.1677, + "odds_ratio_loss": 0.03862610086798668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01000244077295065, + "rewards/margins": 0.42618948221206665, + "rewards/rejected": -0.4361919164657593, + "sft_loss": 0.10002440959215164, + "step": 2734 + }, + { + "epoch": 3.955169920462762, + "grad_norm": 3.1751381210221767, + "learning_rate": 2.1699387660987077e-06, + "logits/chosen": -0.5375577211380005, + "logits/rejected": -0.5367023944854736, + "logps/chosen": -0.24724946916103363, + "logps/rejected": -3.8961119651794434, + "loss": 0.2169, + "odds_ratio_loss": 0.03406355902552605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024724945425987244, + "rewards/margins": 0.36488622426986694, + "rewards/rejected": -0.3896111845970154, + "sft_loss": 0.24724946916103363, + "step": 2735 + }, + { + "epoch": 3.9566160520607374, + "grad_norm": 1.9066520362152397, + "learning_rate": 2.1671775685031563e-06, + "logits/chosen": -0.6246693134307861, + "logits/rejected": -0.5068244934082031, + "logps/chosen": -0.17706000804901123, + "logps/rejected": -3.941403865814209, + "loss": 0.1729, + "odds_ratio_loss": 0.031446393579244614, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017706003040075302, + "rewards/margins": 0.37643444538116455, + "rewards/rejected": -0.39414042234420776, + "sft_loss": 0.17706000804901123, + "step": 2736 + }, + { + "epoch": 3.958062183658713, + "grad_norm": 1.9930585146889106, + "learning_rate": 2.1644174759230736e-06, + "logits/chosen": -0.5262311100959778, + "logits/rejected": -0.4606435298919678, + "logps/chosen": -0.21938474476337433, + "logps/rejected": -4.327494144439697, + "loss": 0.1949, + "odds_ratio_loss": 0.045247986912727356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021938476711511612, + "rewards/margins": 0.4108109772205353, + "rewards/rejected": -0.4327494502067566, + "sft_loss": 0.21938474476337433, + "step": 2737 + }, + { + "epoch": 3.9595083152566883, + "grad_norm": 1.8421809853500997, + "learning_rate": 2.161658490022532e-06, + "logits/chosen": -0.6197947859764099, + "logits/rejected": -0.4551239311695099, + "logps/chosen": -0.22861793637275696, + "logps/rejected": -3.364131450653076, + "loss": 0.1774, + "odds_ratio_loss": 0.047167375683784485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022861795499920845, + "rewards/margins": 0.3135513663291931, + "rewards/rejected": -0.33641317486763, + "sft_loss": 0.22861793637275696, + "step": 2738 + }, + { + "epoch": 3.9609544468546636, + "grad_norm": 2.0397606314124266, + "learning_rate": 2.1589006124649325e-06, + "logits/chosen": -0.436599999666214, + "logits/rejected": -0.3888368010520935, + "logps/chosen": -0.12135307490825653, + "logps/rejected": -4.132548809051514, + "loss": 0.1574, + "odds_ratio_loss": 0.016933148726820946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012135308235883713, + "rewards/margins": 0.4011196196079254, + "rewards/rejected": -0.41325491666793823, + "sft_loss": 0.12135307490825653, + "step": 2739 + }, + { + "epoch": 3.9624005784526393, + "grad_norm": 1.75327087377883, + "learning_rate": 2.1561438449130124e-06, + "logits/chosen": -0.5723130106925964, + "logits/rejected": -0.5248115062713623, + "logps/chosen": -0.14575621485710144, + "logps/rejected": -5.364757537841797, + "loss": 0.1623, + "odds_ratio_loss": 0.03158080577850342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014575622044503689, + "rewards/margins": 0.5219001770019531, + "rewards/rejected": -0.5364757776260376, + "sft_loss": 0.14575621485710144, + "step": 2740 + }, + { + "epoch": 3.9638467100506145, + "grad_norm": 2.0814838546242704, + "learning_rate": 2.153388189028835e-06, + "logits/chosen": -0.5976966619491577, + "logits/rejected": -0.47446054220199585, + "logps/chosen": -0.15319940447807312, + "logps/rejected": -4.2068257331848145, + "loss": 0.1719, + "odds_ratio_loss": 0.02311069332063198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015319941565394402, + "rewards/margins": 0.40536269545555115, + "rewards/rejected": -0.4206825792789459, + "sft_loss": 0.15319940447807312, + "step": 2741 + }, + { + "epoch": 3.9652928416485898, + "grad_norm": 5.440242071490554, + "learning_rate": 2.1506336464737943e-06, + "logits/chosen": -0.5652493834495544, + "logits/rejected": -0.49528682231903076, + "logps/chosen": -0.2625923752784729, + "logps/rejected": -4.457971572875977, + "loss": 0.2107, + "odds_ratio_loss": 0.0539512000977993, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02625923603773117, + "rewards/margins": 0.4195379316806793, + "rewards/rejected": -0.4457972049713135, + "sft_loss": 0.2625923752784729, + "step": 2742 + }, + { + "epoch": 3.9667389732465654, + "grad_norm": 3.215284049864351, + "learning_rate": 2.147880218908618e-06, + "logits/chosen": -0.7117863893508911, + "logits/rejected": -0.4697888195514679, + "logps/chosen": -0.0863754153251648, + "logps/rejected": -4.418948173522949, + "loss": 0.1422, + "odds_ratio_loss": 0.011987053789198399, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00863754190504551, + "rewards/margins": 0.4332572817802429, + "rewards/rejected": -0.4418948292732239, + "sft_loss": 0.0863754153251648, + "step": 2743 + }, + { + "epoch": 3.968185104844541, + "grad_norm": 2.0169688500160268, + "learning_rate": 2.145127907993354e-06, + "logits/chosen": -0.6409890651702881, + "logits/rejected": -0.6879695057868958, + "logps/chosen": -0.153423473238945, + "logps/rejected": -4.237060546875, + "loss": 0.2012, + "odds_ratio_loss": 0.030342232435941696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0153423473238945, + "rewards/margins": 0.4083637297153473, + "rewards/rejected": -0.4237060844898224, + "sft_loss": 0.153423473238945, + "step": 2744 + }, + { + "epoch": 3.9696312364425164, + "grad_norm": 2.266381185127771, + "learning_rate": 2.1423767153873845e-06, + "logits/chosen": -0.5684958100318909, + "logits/rejected": -0.49004459381103516, + "logps/chosen": -0.2129286527633667, + "logps/rejected": -3.743337869644165, + "loss": 0.207, + "odds_ratio_loss": 0.025867803022265434, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02129286341369152, + "rewards/margins": 0.3530409336090088, + "rewards/rejected": -0.37433379888534546, + "sft_loss": 0.2129286527633667, + "step": 2745 + }, + { + "epoch": 3.9710773680404916, + "grad_norm": 1.92859371400278, + "learning_rate": 2.13962664274941e-06, + "logits/chosen": -0.5195826292037964, + "logits/rejected": -0.5544252395629883, + "logps/chosen": -0.23682743310928345, + "logps/rejected": -3.3036885261535645, + "loss": 0.1898, + "odds_ratio_loss": 0.08838633447885513, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023682741448283195, + "rewards/margins": 0.3066861033439636, + "rewards/rejected": -0.3303688168525696, + "sft_loss": 0.23682743310928345, + "step": 2746 + }, + { + "epoch": 3.9725234996384673, + "grad_norm": 1.8564797001366269, + "learning_rate": 2.1368776917374623e-06, + "logits/chosen": -0.5486931204795837, + "logits/rejected": -0.48903441429138184, + "logps/chosen": -0.19572268426418304, + "logps/rejected": -5.466530799865723, + "loss": 0.1608, + "odds_ratio_loss": 0.022454869002103806, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019572269171476364, + "rewards/margins": 0.527080774307251, + "rewards/rejected": -0.5466530919075012, + "sft_loss": 0.19572268426418304, + "step": 2747 + }, + { + "epoch": 3.9739696312364425, + "grad_norm": 2.2883512574071077, + "learning_rate": 2.134129864008894e-06, + "logits/chosen": -0.6515862345695496, + "logits/rejected": -0.4136117398738861, + "logps/chosen": -0.2721066474914551, + "logps/rejected": -4.8516316413879395, + "loss": 0.2038, + "odds_ratio_loss": 0.0357467383146286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027210667729377747, + "rewards/margins": 0.4579525291919708, + "rewards/rejected": -0.48516321182250977, + "sft_loss": 0.2721066474914551, + "step": 2748 + }, + { + "epoch": 3.9754157628344178, + "grad_norm": 2.3359102349897753, + "learning_rate": 2.1313831612203796e-06, + "logits/chosen": -0.7950201034545898, + "logits/rejected": -0.706078290939331, + "logps/chosen": -0.1814843863248825, + "logps/rejected": -3.375770092010498, + "loss": 0.2099, + "odds_ratio_loss": 0.035325754433870316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01814843900501728, + "rewards/margins": 0.31942859292030334, + "rewards/rejected": -0.3375770151615143, + "sft_loss": 0.1814843863248825, + "step": 2749 + }, + { + "epoch": 3.9768618944323935, + "grad_norm": 2.019886300027027, + "learning_rate": 2.1286375850279154e-06, + "logits/chosen": -0.7477739453315735, + "logits/rejected": -0.6434758901596069, + "logps/chosen": -0.1750190109014511, + "logps/rejected": -4.2602996826171875, + "loss": 0.1752, + "odds_ratio_loss": 0.042220983654260635, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01750190183520317, + "rewards/margins": 0.40852802991867065, + "rewards/rejected": -0.42602992057800293, + "sft_loss": 0.1750190109014511, + "step": 2750 + }, + { + "epoch": 3.9783080260303687, + "grad_norm": 1.6827104281078238, + "learning_rate": 2.1258931370868224e-06, + "logits/chosen": -0.6645622253417969, + "logits/rejected": -0.5144686698913574, + "logps/chosen": -0.09467566013336182, + "logps/rejected": -4.5071258544921875, + "loss": 0.15, + "odds_ratio_loss": 0.017522595822811127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009467566385865211, + "rewards/margins": 0.44124501943588257, + "rewards/rejected": -0.4507125914096832, + "sft_loss": 0.09467566013336182, + "step": 2751 + }, + { + "epoch": 3.979754157628344, + "grad_norm": 2.795885951210386, + "learning_rate": 2.1231498190517355e-06, + "logits/chosen": -0.43349555134773254, + "logits/rejected": -0.4150978624820709, + "logps/chosen": -0.1894543319940567, + "logps/rejected": -6.349620819091797, + "loss": 0.1768, + "odds_ratio_loss": 0.03964434191584587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01894543319940567, + "rewards/margins": 0.6160166263580322, + "rewards/rejected": -0.6349620819091797, + "sft_loss": 0.1894543319940567, + "step": 2752 + }, + { + "epoch": 3.9812002892263196, + "grad_norm": 1.8162552078369851, + "learning_rate": 2.1204076325766124e-06, + "logits/chosen": -0.5967478156089783, + "logits/rejected": -0.4460592269897461, + "logps/chosen": -0.17963387072086334, + "logps/rejected": -3.825800895690918, + "loss": 0.1539, + "odds_ratio_loss": 0.030072186142206192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017963387072086334, + "rewards/margins": 0.3646166920661926, + "rewards/rejected": -0.38258010149002075, + "sft_loss": 0.17963387072086334, + "step": 2753 + }, + { + "epoch": 3.982646420824295, + "grad_norm": 2.151401772662356, + "learning_rate": 2.1176665793147296e-06, + "logits/chosen": -0.721175491809845, + "logits/rejected": -0.4136351943016052, + "logps/chosen": -0.27765095233917236, + "logps/rejected": -2.8129587173461914, + "loss": 0.2211, + "odds_ratio_loss": 0.053692497313022614, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027765095233917236, + "rewards/margins": 0.2535307705402374, + "rewards/rejected": -0.28129586577415466, + "sft_loss": 0.27765095233917236, + "step": 2754 + }, + { + "epoch": 3.9840925524222706, + "grad_norm": 1.8567141033496175, + "learning_rate": 2.1149266609186767e-06, + "logits/chosen": -0.5536510944366455, + "logits/rejected": -0.5295378565788269, + "logps/chosen": -0.14767736196517944, + "logps/rejected": -5.057055473327637, + "loss": 0.195, + "odds_ratio_loss": 0.041959308087825775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01476773526519537, + "rewards/margins": 0.4909377992153168, + "rewards/rejected": -0.5057055950164795, + "sft_loss": 0.14767736196517944, + "step": 2755 + }, + { + "epoch": 3.985538684020246, + "grad_norm": 2.0893400371062523, + "learning_rate": 2.1121878790403607e-06, + "logits/chosen": -0.6446054577827454, + "logits/rejected": -0.49732160568237305, + "logps/chosen": -0.1490086168050766, + "logps/rejected": -4.0585784912109375, + "loss": 0.2023, + "odds_ratio_loss": 0.03372291848063469, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01490086317062378, + "rewards/margins": 0.39095696806907654, + "rewards/rejected": -0.4058578610420227, + "sft_loss": 0.1490086168050766, + "step": 2756 + }, + { + "epoch": 3.9869848156182215, + "grad_norm": 2.408244901425215, + "learning_rate": 2.1094502353310026e-06, + "logits/chosen": -0.656318187713623, + "logits/rejected": -0.4849543273448944, + "logps/chosen": -0.21277806162834167, + "logps/rejected": -5.278512477874756, + "loss": 0.1842, + "odds_ratio_loss": 0.037755995988845825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021277807652950287, + "rewards/margins": 0.5065734386444092, + "rewards/rejected": -0.5278512239456177, + "sft_loss": 0.21277806162834167, + "step": 2757 + }, + { + "epoch": 3.9884309472161967, + "grad_norm": 2.1234436431541797, + "learning_rate": 2.1067137314411394e-06, + "logits/chosen": -0.5390723347663879, + "logits/rejected": -0.4621487855911255, + "logps/chosen": -0.28014615178108215, + "logps/rejected": -3.016232490539551, + "loss": 0.2125, + "odds_ratio_loss": 0.05021138861775398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028014618903398514, + "rewards/margins": 0.27360865473747253, + "rewards/rejected": -0.30162325501441956, + "sft_loss": 0.28014615178108215, + "step": 2758 + }, + { + "epoch": 3.989877078814172, + "grad_norm": 2.0758920916498522, + "learning_rate": 2.103978369020618e-06, + "logits/chosen": -0.756662130355835, + "logits/rejected": -0.6477712392807007, + "logps/chosen": -0.22675055265426636, + "logps/rejected": -2.2667582035064697, + "loss": 0.1676, + "odds_ratio_loss": 0.05150124430656433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022675054147839546, + "rewards/margins": 0.20400077104568481, + "rewards/rejected": -0.2266758382320404, + "sft_loss": 0.22675055265426636, + "step": 2759 + }, + { + "epoch": 3.9913232104121477, + "grad_norm": 2.966514602001194, + "learning_rate": 2.1012441497186006e-06, + "logits/chosen": -0.4243288040161133, + "logits/rejected": -0.2724151611328125, + "logps/chosen": -0.08447768539190292, + "logps/rejected": -5.80491828918457, + "loss": 0.1971, + "odds_ratio_loss": 0.013431225903332233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008447768166661263, + "rewards/margins": 0.5720440745353699, + "rewards/rejected": -0.580491840839386, + "sft_loss": 0.08447768539190292, + "step": 2760 + }, + { + "epoch": 3.992769342010123, + "grad_norm": 2.0690569247897566, + "learning_rate": 2.0985110751835554e-06, + "logits/chosen": -0.5802749991416931, + "logits/rejected": -0.4594845473766327, + "logps/chosen": -0.21454155445098877, + "logps/rejected": -3.6675314903259277, + "loss": 0.1895, + "odds_ratio_loss": 0.054889433085918427, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021454155445098877, + "rewards/margins": 0.34529903531074524, + "rewards/rejected": -0.36675316095352173, + "sft_loss": 0.21454155445098877, + "step": 2761 + }, + { + "epoch": 3.994215473608098, + "grad_norm": 2.130452441813903, + "learning_rate": 2.0957791470632668e-06, + "logits/chosen": -0.3942503035068512, + "logits/rejected": -0.28423938155174255, + "logps/chosen": -0.1051185354590416, + "logps/rejected": -5.772364616394043, + "loss": 0.1506, + "odds_ratio_loss": 0.02300701104104519, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010511854663491249, + "rewards/margins": 0.5667246580123901, + "rewards/rejected": -0.5772364735603333, + "sft_loss": 0.1051185354590416, + "step": 2762 + }, + { + "epoch": 3.995661605206074, + "grad_norm": 2.273248642486303, + "learning_rate": 2.0930483670048225e-06, + "logits/chosen": -0.6677214503288269, + "logits/rejected": -0.47213077545166016, + "logps/chosen": -0.10058383643627167, + "logps/rejected": -5.465931415557861, + "loss": 0.1752, + "odds_ratio_loss": 0.018684882670640945, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010058384388685226, + "rewards/margins": 0.5365347862243652, + "rewards/rejected": -0.546593189239502, + "sft_loss": 0.10058383643627167, + "step": 2763 + }, + { + "epoch": 3.997107736804049, + "grad_norm": 1.873279739010511, + "learning_rate": 2.0903187366546196e-06, + "logits/chosen": -0.7388254404067993, + "logits/rejected": -0.7730780839920044, + "logps/chosen": -0.23930414021015167, + "logps/rejected": -4.100466728210449, + "loss": 0.202, + "odds_ratio_loss": 0.048742517828941345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023930413648486137, + "rewards/margins": 0.38611626625061035, + "rewards/rejected": -0.41004669666290283, + "sft_loss": 0.23930414021015167, + "step": 2764 + }, + { + "epoch": 3.9985538684020243, + "grad_norm": 2.039276418562883, + "learning_rate": 2.0875902576583613e-06, + "logits/chosen": -0.6181426048278809, + "logits/rejected": -0.440829873085022, + "logps/chosen": -0.12418884038925171, + "logps/rejected": -3.6102936267852783, + "loss": 0.1564, + "odds_ratio_loss": 0.015171993523836136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012418882921338081, + "rewards/margins": 0.34861046075820923, + "rewards/rejected": -0.36102938652038574, + "sft_loss": 0.12418884038925171, + "step": 2765 + }, + { + "epoch": 4.0, + "grad_norm": 2.2517115073863265, + "learning_rate": 2.084862931661061e-06, + "logits/chosen": -0.5372803211212158, + "logits/rejected": -0.3119491636753082, + "logps/chosen": -0.1669417768716812, + "logps/rejected": -4.001537799835205, + "loss": 0.1403, + "odds_ratio_loss": 0.028092026710510254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01669418066740036, + "rewards/margins": 0.3834596276283264, + "rewards/rejected": -0.400153785943985, + "sft_loss": 0.1669417768716812, + "step": 2766 + }, + { + "epoch": 4.001446131597976, + "grad_norm": 1.9342629816067212, + "learning_rate": 2.08213676030703e-06, + "logits/chosen": -0.6452394127845764, + "logits/rejected": -0.4118797481060028, + "logps/chosen": -0.08616970479488373, + "logps/rejected": -3.6615328788757324, + "loss": 0.0679, + "odds_ratio_loss": 0.016679398715496063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008616969920694828, + "rewards/margins": 0.35753631591796875, + "rewards/rejected": -0.3661532998085022, + "sft_loss": 0.08616970479488373, + "step": 2767 + }, + { + "epoch": 4.0028922631959505, + "grad_norm": 1.8309926478353908, + "learning_rate": 2.0794117452398896e-06, + "logits/chosen": -0.7527530789375305, + "logits/rejected": -0.5017178058624268, + "logps/chosen": -0.0876193642616272, + "logps/rejected": -2.9753198623657227, + "loss": 0.0645, + "odds_ratio_loss": 0.01615132950246334, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00876193679869175, + "rewards/margins": 0.28877004981040955, + "rewards/rejected": -0.29753196239471436, + "sft_loss": 0.0876193642616272, + "step": 2768 + }, + { + "epoch": 4.004338394793926, + "grad_norm": 2.0775734258132093, + "learning_rate": 2.0766878881025626e-06, + "logits/chosen": -0.6768192052841187, + "logits/rejected": -0.5599380135536194, + "logps/chosen": -0.11010854691267014, + "logps/rejected": -4.613247871398926, + "loss": 0.0943, + "odds_ratio_loss": 0.01849268190562725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011010855436325073, + "rewards/margins": 0.450313925743103, + "rewards/rejected": -0.4613247513771057, + "sft_loss": 0.11010854691267014, + "step": 2769 + }, + { + "epoch": 4.005784526391902, + "grad_norm": 1.598856868986213, + "learning_rate": 2.0739651905372706e-06, + "logits/chosen": -0.4616875946521759, + "logits/rejected": -0.34646421670913696, + "logps/chosen": -0.07392052561044693, + "logps/rejected": -4.814556121826172, + "loss": 0.0724, + "odds_ratio_loss": 0.00886174663901329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007392052561044693, + "rewards/margins": 0.47406357526779175, + "rewards/rejected": -0.48145565390586853, + "sft_loss": 0.07392052561044693, + "step": 2770 + }, + { + "epoch": 4.007230657989877, + "grad_norm": 2.0273765071761405, + "learning_rate": 2.0712436541855387e-06, + "logits/chosen": -0.6958746314048767, + "logits/rejected": -0.42994117736816406, + "logps/chosen": -0.14068825542926788, + "logps/rejected": -4.38588809967041, + "loss": 0.1447, + "odds_ratio_loss": 0.008284801617264748, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014068827033042908, + "rewards/margins": 0.42451998591423035, + "rewards/rejected": -0.43858882784843445, + "sft_loss": 0.14068825542926788, + "step": 2771 + }, + { + "epoch": 4.008676789587852, + "grad_norm": 1.409673623556823, + "learning_rate": 2.0685232806881896e-06, + "logits/chosen": -0.6632593870162964, + "logits/rejected": -0.5344269275665283, + "logps/chosen": -0.03422532230615616, + "logps/rejected": -4.707058906555176, + "loss": 0.0583, + "odds_ratio_loss": 0.004966989159584045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003422531997784972, + "rewards/margins": 0.46728330850601196, + "rewards/rejected": -0.47070592641830444, + "sft_loss": 0.03422532230615616, + "step": 2772 + }, + { + "epoch": 4.010122921185828, + "grad_norm": 1.5195324066013671, + "learning_rate": 2.06580407168535e-06, + "logits/chosen": -0.744748055934906, + "logits/rejected": -0.6185660362243652, + "logps/chosen": -0.03680463880300522, + "logps/rejected": -5.538668632507324, + "loss": 0.0651, + "odds_ratio_loss": 0.004509086720645428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0036804641131311655, + "rewards/margins": 0.5501863360404968, + "rewards/rejected": -0.5538668632507324, + "sft_loss": 0.03680463880300522, + "step": 2773 + }, + { + "epoch": 4.011569052783804, + "grad_norm": 2.124422736113147, + "learning_rate": 2.063086028816437e-06, + "logits/chosen": -0.6991645693778992, + "logits/rejected": -0.5606277585029602, + "logps/chosen": -0.1650342047214508, + "logps/rejected": -3.594874382019043, + "loss": 0.0973, + "odds_ratio_loss": 0.01484596822410822, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01650342158973217, + "rewards/margins": 0.34298405051231384, + "rewards/rejected": -0.3594874441623688, + "sft_loss": 0.1650342047214508, + "step": 2774 + }, + { + "epoch": 4.0130151843817785, + "grad_norm": 2.395553922175963, + "learning_rate": 2.0603691537201737e-06, + "logits/chosen": -0.8897903561592102, + "logits/rejected": -0.5353154540061951, + "logps/chosen": -0.0864485427737236, + "logps/rejected": -5.473728656768799, + "loss": 0.0823, + "odds_ratio_loss": 0.005243867635726929, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008644853718578815, + "rewards/margins": 0.5387280583381653, + "rewards/rejected": -0.5473728775978088, + "sft_loss": 0.0864485427737236, + "step": 2775 + }, + { + "epoch": 4.014461315979754, + "grad_norm": 2.6106970833473215, + "learning_rate": 2.057653448034569e-06, + "logits/chosen": -0.7756266593933105, + "logits/rejected": -0.49392417073249817, + "logps/chosen": -0.17186208069324493, + "logps/rejected": -5.397519588470459, + "loss": 0.1117, + "odds_ratio_loss": 0.0077825989574193954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017186209559440613, + "rewards/margins": 0.5225657820701599, + "rewards/rejected": -0.5397520065307617, + "sft_loss": 0.17186208069324493, + "step": 2776 + }, + { + "epoch": 4.01590744757773, + "grad_norm": 3.5803120207407293, + "learning_rate": 2.0549389133969366e-06, + "logits/chosen": -0.7232028245925903, + "logits/rejected": -0.5350125432014465, + "logps/chosen": -0.08477292954921722, + "logps/rejected": -6.003397464752197, + "loss": 0.0854, + "odds_ratio_loss": 0.011778164654970169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008477292954921722, + "rewards/margins": 0.5918624997138977, + "rewards/rejected": -0.6003397703170776, + "sft_loss": 0.08477292954921722, + "step": 2777 + }, + { + "epoch": 4.017353579175705, + "grad_norm": 2.2746691520235833, + "learning_rate": 2.0522255514438775e-06, + "logits/chosen": -0.7820395231246948, + "logits/rejected": -0.539219856262207, + "logps/chosen": -0.10004076361656189, + "logps/rejected": -6.174839496612549, + "loss": 0.0983, + "odds_ratio_loss": 0.006524843629449606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010004077106714249, + "rewards/margins": 0.6074798703193665, + "rewards/rejected": -0.6174839735031128, + "sft_loss": 0.10004076361656189, + "step": 2778 + }, + { + "epoch": 4.01879971077368, + "grad_norm": 2.8298145277154583, + "learning_rate": 2.0495133638112876e-06, + "logits/chosen": -0.8696098327636719, + "logits/rejected": -0.6498009562492371, + "logps/chosen": -0.07407721132040024, + "logps/rejected": -5.003327369689941, + "loss": 0.0689, + "odds_ratio_loss": 0.005369211081415415, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007407721597701311, + "rewards/margins": 0.4929249882698059, + "rewards/rejected": -0.5003327131271362, + "sft_loss": 0.07407721132040024, + "step": 2779 + }, + { + "epoch": 4.020245842371656, + "grad_norm": 1.9598064141005225, + "learning_rate": 2.0468023521343535e-06, + "logits/chosen": -0.7513258457183838, + "logits/rejected": -0.6734272241592407, + "logps/chosen": -0.16317948698997498, + "logps/rejected": -3.4593756198883057, + "loss": 0.1124, + "odds_ratio_loss": 0.021505819633603096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016317948698997498, + "rewards/margins": 0.3296195864677429, + "rewards/rejected": -0.3459375500679016, + "sft_loss": 0.16317948698997498, + "step": 2780 + }, + { + "epoch": 4.021691973969631, + "grad_norm": 1.7165495887574607, + "learning_rate": 2.044092518047556e-06, + "logits/chosen": -0.9572099447250366, + "logits/rejected": -0.7057690620422363, + "logps/chosen": -0.05946547910571098, + "logps/rejected": -3.578216552734375, + "loss": 0.092, + "odds_ratio_loss": 0.010709418915212154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005946548189967871, + "rewards/margins": 0.3518751263618469, + "rewards/rejected": -0.35782164335250854, + "sft_loss": 0.05946547910571098, + "step": 2781 + }, + { + "epoch": 4.0231381055676065, + "grad_norm": 1.5373982677542486, + "learning_rate": 2.0413838631846655e-06, + "logits/chosen": -0.9308152794837952, + "logits/rejected": -0.6596737504005432, + "logps/chosen": -0.0882471352815628, + "logps/rejected": -3.0405995845794678, + "loss": 0.0911, + "odds_ratio_loss": 0.009932536631822586, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00882471352815628, + "rewards/margins": 0.295235276222229, + "rewards/rejected": -0.3040599822998047, + "sft_loss": 0.0882471352815628, + "step": 2782 + }, + { + "epoch": 4.024584237165582, + "grad_norm": 1.6064838122981433, + "learning_rate": 2.038676389178737e-06, + "logits/chosen": -0.912672758102417, + "logits/rejected": -0.6855254769325256, + "logps/chosen": -0.10411486029624939, + "logps/rejected": -5.8816142082214355, + "loss": 0.0905, + "odds_ratio_loss": 0.012567874044179916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010411485098302364, + "rewards/margins": 0.5777499675750732, + "rewards/rejected": -0.5881614089012146, + "sft_loss": 0.10411486029624939, + "step": 2783 + }, + { + "epoch": 4.026030368763558, + "grad_norm": 1.533022173291985, + "learning_rate": 2.0359700976621192e-06, + "logits/chosen": -0.7604274153709412, + "logits/rejected": -0.6002693176269531, + "logps/chosen": -0.09835337847471237, + "logps/rejected": -4.979158401489258, + "loss": 0.0734, + "odds_ratio_loss": 0.00728287547826767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009835338220000267, + "rewards/margins": 0.4880805015563965, + "rewards/rejected": -0.4979158341884613, + "sft_loss": 0.09835337847471237, + "step": 2784 + }, + { + "epoch": 4.027476500361533, + "grad_norm": 1.8914491919734018, + "learning_rate": 2.0332649902664435e-06, + "logits/chosen": -0.734957218170166, + "logits/rejected": -0.5849780440330505, + "logps/chosen": -0.07099728286266327, + "logps/rejected": -5.679014205932617, + "loss": 0.0875, + "odds_ratio_loss": 0.010899766348302364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007099728100001812, + "rewards/margins": 0.5608016848564148, + "rewards/rejected": -0.5679014325141907, + "sft_loss": 0.07099728286266327, + "step": 2785 + }, + { + "epoch": 4.028922631959508, + "grad_norm": 1.4716068999371834, + "learning_rate": 2.030561068622631e-06, + "logits/chosen": -0.641997754573822, + "logits/rejected": -0.6025215983390808, + "logps/chosen": -0.11185550689697266, + "logps/rejected": -3.5561747550964355, + "loss": 0.0741, + "odds_ratio_loss": 0.01804935932159424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011185551062226295, + "rewards/margins": 0.34443193674087524, + "rewards/rejected": -0.355617493391037, + "sft_loss": 0.11185550689697266, + "step": 2786 + }, + { + "epoch": 4.030368763557484, + "grad_norm": 1.45988684450543, + "learning_rate": 2.0278583343608855e-06, + "logits/chosen": -0.6629144549369812, + "logits/rejected": -0.45586591958999634, + "logps/chosen": -0.14175164699554443, + "logps/rejected": -5.194118499755859, + "loss": 0.1214, + "odds_ratio_loss": 0.014719752594828606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014175164513289928, + "rewards/margins": 0.5052367448806763, + "rewards/rejected": -0.5194118618965149, + "sft_loss": 0.14175164699554443, + "step": 2787 + }, + { + "epoch": 4.031814895155459, + "grad_norm": 1.2245497852271376, + "learning_rate": 2.0251567891106953e-06, + "logits/chosen": -0.6399087905883789, + "logits/rejected": -0.47924333810806274, + "logps/chosen": -0.05142543837428093, + "logps/rejected": -4.47532844543457, + "loss": 0.0544, + "odds_ratio_loss": 0.004084172658622265, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005142544396221638, + "rewards/margins": 0.4423903226852417, + "rewards/rejected": -0.44753289222717285, + "sft_loss": 0.05142543837428093, + "step": 2788 + }, + { + "epoch": 4.033261026753435, + "grad_norm": 2.06127829155153, + "learning_rate": 2.022456434500831e-06, + "logits/chosen": -0.8269731998443604, + "logits/rejected": -0.5844072103500366, + "logps/chosen": -0.0957033634185791, + "logps/rejected": -4.2631611824035645, + "loss": 0.11, + "odds_ratio_loss": 0.006857863627374172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009570336900651455, + "rewards/margins": 0.41674575209617615, + "rewards/rejected": -0.42631611227989197, + "sft_loss": 0.0957033634185791, + "step": 2789 + }, + { + "epoch": 4.03470715835141, + "grad_norm": 1.687212211680361, + "learning_rate": 2.019757272159348e-06, + "logits/chosen": -0.903799295425415, + "logits/rejected": -0.6986560821533203, + "logps/chosen": -0.09332535415887833, + "logps/rejected": -3.8312535285949707, + "loss": 0.0843, + "odds_ratio_loss": 0.012101933360099792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009332534857094288, + "rewards/margins": 0.3737927973270416, + "rewards/rejected": -0.383125364780426, + "sft_loss": 0.09332535415887833, + "step": 2790 + }, + { + "epoch": 4.036153289949385, + "grad_norm": 1.5214741277296162, + "learning_rate": 2.01705930371358e-06, + "logits/chosen": -0.8658531308174133, + "logits/rejected": -0.5357121825218201, + "logps/chosen": -0.03709006682038307, + "logps/rejected": -4.968294143676758, + "loss": 0.0464, + "odds_ratio_loss": 0.004839232191443443, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0037090065889060497, + "rewards/margins": 0.49312037229537964, + "rewards/rejected": -0.49682939052581787, + "sft_loss": 0.03709006682038307, + "step": 2791 + }, + { + "epoch": 4.037599421547361, + "grad_norm": 1.8959372352641202, + "learning_rate": 2.0143625307901445e-06, + "logits/chosen": -0.7801449298858643, + "logits/rejected": -0.6802493929862976, + "logps/chosen": -0.04702619090676308, + "logps/rejected": -3.90498423576355, + "loss": 0.0703, + "odds_ratio_loss": 0.0023806714452803135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004702619276940823, + "rewards/margins": 0.38579580187797546, + "rewards/rejected": -0.39049839973449707, + "sft_loss": 0.04702619090676308, + "step": 2792 + }, + { + "epoch": 4.039045553145336, + "grad_norm": 2.415858200709156, + "learning_rate": 2.0116669550149326e-06, + "logits/chosen": -0.6672720909118652, + "logits/rejected": -0.6657229661941528, + "logps/chosen": -0.08343811333179474, + "logps/rejected": -3.9901604652404785, + "loss": 0.0964, + "odds_ratio_loss": 0.0074556972831487656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008343812078237534, + "rewards/margins": 0.3906722366809845, + "rewards/rejected": -0.3990160822868347, + "sft_loss": 0.08343811333179474, + "step": 2793 + }, + { + "epoch": 4.040491684743311, + "grad_norm": 1.7239051085866066, + "learning_rate": 2.008972578013121e-06, + "logits/chosen": -0.8345197439193726, + "logits/rejected": -0.62155681848526, + "logps/chosen": -0.0501299649477005, + "logps/rejected": -6.423168182373047, + "loss": 0.0885, + "odds_ratio_loss": 0.012613892555236816, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00501299649477005, + "rewards/margins": 0.6373038291931152, + "rewards/rejected": -0.6423167586326599, + "sft_loss": 0.0501299649477005, + "step": 2794 + }, + { + "epoch": 4.041937816341287, + "grad_norm": 2.2864873355325317, + "learning_rate": 2.0062794014091566e-06, + "logits/chosen": -0.7575594186782837, + "logits/rejected": -0.5298593640327454, + "logps/chosen": -0.08866608142852783, + "logps/rejected": -3.8082284927368164, + "loss": 0.0898, + "odds_ratio_loss": 0.02208029478788376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008866608142852783, + "rewards/margins": 0.3719562888145447, + "rewards/rejected": -0.38082289695739746, + "sft_loss": 0.08866608142852783, + "step": 2795 + }, + { + "epoch": 4.043383947939263, + "grad_norm": 2.0832074173183006, + "learning_rate": 2.0035874268267652e-06, + "logits/chosen": -0.9552007913589478, + "logits/rejected": -0.6479728817939758, + "logps/chosen": -0.19561168551445007, + "logps/rejected": -4.241243839263916, + "loss": 0.1215, + "odds_ratio_loss": 0.02996988594532013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019561167806386948, + "rewards/margins": 0.4045632481575012, + "rewards/rejected": -0.4241243898868561, + "sft_loss": 0.19561168551445007, + "step": 2796 + }, + { + "epoch": 4.044830079537238, + "grad_norm": 1.376822495445301, + "learning_rate": 2.0008966558889518e-06, + "logits/chosen": -0.8547458648681641, + "logits/rejected": -0.6141563653945923, + "logps/chosen": -0.02995988354086876, + "logps/rejected": -5.119405746459961, + "loss": 0.0598, + "odds_ratio_loss": 0.001512437593191862, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0029959888197481632, + "rewards/margins": 0.508944571018219, + "rewards/rejected": -0.511940598487854, + "sft_loss": 0.02995988354086876, + "step": 2797 + }, + { + "epoch": 4.046276211135213, + "grad_norm": 1.7513413495327788, + "learning_rate": 1.9982070902179885e-06, + "logits/chosen": -0.876068651676178, + "logits/rejected": -0.6037436723709106, + "logps/chosen": -0.031491801142692566, + "logps/rejected": -4.855276584625244, + "loss": 0.0784, + "odds_ratio_loss": 0.0038668958004564047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0031491799745708704, + "rewards/margins": 0.4823784828186035, + "rewards/rejected": -0.4855276346206665, + "sft_loss": 0.031491801142692566, + "step": 2798 + }, + { + "epoch": 4.047722342733189, + "grad_norm": 1.607844504092351, + "learning_rate": 1.995518731435427e-06, + "logits/chosen": -0.69386887550354, + "logits/rejected": -0.537121593952179, + "logps/chosen": -0.10619297623634338, + "logps/rejected": -5.933511734008789, + "loss": 0.0766, + "odds_ratio_loss": 0.012091580778360367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010619297623634338, + "rewards/margins": 0.5827318429946899, + "rewards/rejected": -0.5933511257171631, + "sft_loss": 0.10619297623634338, + "step": 2799 + }, + { + "epoch": 4.0491684743311644, + "grad_norm": 2.041163818914151, + "learning_rate": 1.992831581162086e-06, + "logits/chosen": -0.8246333599090576, + "logits/rejected": -0.7129614353179932, + "logps/chosen": -0.10542631149291992, + "logps/rejected": -4.061657905578613, + "loss": 0.0915, + "odds_ratio_loss": 0.011720533482730389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010542631149291992, + "rewards/margins": 0.39562320709228516, + "rewards/rejected": -0.4061657786369324, + "sft_loss": 0.10542631149291992, + "step": 2800 + }, + { + "epoch": 4.050614605929139, + "grad_norm": 1.5198161109339767, + "learning_rate": 1.9901456410180626e-06, + "logits/chosen": -0.8532893657684326, + "logits/rejected": -0.9210831522941589, + "logps/chosen": -0.10792700946331024, + "logps/rejected": -3.295602560043335, + "loss": 0.0825, + "odds_ratio_loss": 0.009402144700288773, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010792701505124569, + "rewards/margins": 0.3187675476074219, + "rewards/rejected": -0.3295602798461914, + "sft_loss": 0.10792700946331024, + "step": 2801 + }, + { + "epoch": 4.052060737527115, + "grad_norm": 1.6587673101071005, + "learning_rate": 1.987460912622717e-06, + "logits/chosen": -0.7868859171867371, + "logits/rejected": -0.6162922382354736, + "logps/chosen": -0.08886329084634781, + "logps/rejected": -4.913092613220215, + "loss": 0.0714, + "odds_ratio_loss": 0.011058435775339603, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008886328898370266, + "rewards/margins": 0.48242297768592834, + "rewards/rejected": -0.4913092851638794, + "sft_loss": 0.08886329084634781, + "step": 2802 + }, + { + "epoch": 4.053506869125091, + "grad_norm": 1.4545531255335953, + "learning_rate": 1.9847773975946833e-06, + "logits/chosen": -0.3921070694923401, + "logits/rejected": -0.3834128975868225, + "logps/chosen": -0.046835917979478836, + "logps/rejected": -6.639843940734863, + "loss": 0.1102, + "odds_ratio_loss": 0.010030764155089855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004683591891080141, + "rewards/margins": 0.6593008041381836, + "rewards/rejected": -0.6639844179153442, + "sft_loss": 0.046835917979478836, + "step": 2803 + }, + { + "epoch": 4.054953000723065, + "grad_norm": 1.603097030697306, + "learning_rate": 1.98209509755186e-06, + "logits/chosen": -0.8556368947029114, + "logits/rejected": -0.5742295980453491, + "logps/chosen": -0.12034314125776291, + "logps/rejected": -3.737192392349243, + "loss": 0.096, + "odds_ratio_loss": 0.020050739869475365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012034314684569836, + "rewards/margins": 0.3616849184036255, + "rewards/rejected": -0.3737192451953888, + "sft_loss": 0.12034314125776291, + "step": 2804 + }, + { + "epoch": 4.056399132321041, + "grad_norm": 1.5499176234984071, + "learning_rate": 1.97941401411142e-06, + "logits/chosen": -0.8785386085510254, + "logits/rejected": -0.736790657043457, + "logps/chosen": -0.09029129147529602, + "logps/rejected": -3.283236503601074, + "loss": 0.0651, + "odds_ratio_loss": 0.010353603400290012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009029129520058632, + "rewards/margins": 0.3192945122718811, + "rewards/rejected": -0.328323632478714, + "sft_loss": 0.09029129147529602, + "step": 2805 + }, + { + "epoch": 4.057845263919017, + "grad_norm": 1.497023625579859, + "learning_rate": 1.976734148889794e-06, + "logits/chosen": -0.9096969366073608, + "logits/rejected": -0.563471794128418, + "logps/chosen": -0.09378468990325928, + "logps/rejected": -4.701470851898193, + "loss": 0.0833, + "odds_ratio_loss": 0.01099737174808979, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009378468617796898, + "rewards/margins": 0.46076861023902893, + "rewards/rejected": -0.4701470732688904, + "sft_loss": 0.09378468990325928, + "step": 2806 + }, + { + "epoch": 4.0592913955169925, + "grad_norm": 1.299958296890451, + "learning_rate": 1.9740555035026856e-06, + "logits/chosen": -0.9414892792701721, + "logits/rejected": -0.8532160520553589, + "logps/chosen": -0.09001899510622025, + "logps/rejected": -4.150521278381348, + "loss": 0.0668, + "odds_ratio_loss": 0.01099309604614973, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0090019004419446, + "rewards/margins": 0.40605026483535767, + "rewards/rejected": -0.4150521159172058, + "sft_loss": 0.09001899510622025, + "step": 2807 + }, + { + "epoch": 4.060737527114967, + "grad_norm": 1.4888656980066968, + "learning_rate": 1.971378079565061e-06, + "logits/chosen": -0.7717404961585999, + "logits/rejected": -0.703227162361145, + "logps/chosen": -0.06621512025594711, + "logps/rejected": -3.4783830642700195, + "loss": 0.0637, + "odds_ratio_loss": 0.006453595124185085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006621511187404394, + "rewards/margins": 0.3412168025970459, + "rewards/rejected": -0.34783831238746643, + "sft_loss": 0.06621512025594711, + "step": 2808 + }, + { + "epoch": 4.062183658712943, + "grad_norm": 1.4317984222561302, + "learning_rate": 1.9687018786911477e-06, + "logits/chosen": -0.8258988857269287, + "logits/rejected": -0.6008090376853943, + "logps/chosen": -0.04819875583052635, + "logps/rejected": -5.592133522033691, + "loss": 0.0731, + "odds_ratio_loss": 0.0023457964416593313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004819875583052635, + "rewards/margins": 0.554393470287323, + "rewards/rejected": -0.5592133402824402, + "sft_loss": 0.04819875583052635, + "step": 2809 + }, + { + "epoch": 4.063629790310919, + "grad_norm": 1.5889562813928515, + "learning_rate": 1.9660269024944367e-06, + "logits/chosen": -0.857683539390564, + "logits/rejected": -0.8138494491577148, + "logps/chosen": -0.13486728072166443, + "logps/rejected": -2.3455417156219482, + "loss": 0.086, + "odds_ratio_loss": 0.014339055866003036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013486729003489017, + "rewards/margins": 0.22106747329235077, + "rewards/rejected": -0.2345542013645172, + "sft_loss": 0.13486728072166443, + "step": 2810 + }, + { + "epoch": 4.065075921908893, + "grad_norm": 1.7040159776179822, + "learning_rate": 1.9633531525876804e-06, + "logits/chosen": -0.9003788232803345, + "logits/rejected": -0.46565985679626465, + "logps/chosen": -0.08736556768417358, + "logps/rejected": -5.143779754638672, + "loss": 0.084, + "odds_ratio_loss": 0.006077864672988653, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008736557327210903, + "rewards/margins": 0.5056413412094116, + "rewards/rejected": -0.5143779516220093, + "sft_loss": 0.08736556768417358, + "step": 2811 + }, + { + "epoch": 4.066522053506869, + "grad_norm": 1.6114194827080601, + "learning_rate": 1.960680630582895e-06, + "logits/chosen": -0.8223308324813843, + "logits/rejected": -0.520995020866394, + "logps/chosen": -0.03943537548184395, + "logps/rejected": -4.510629177093506, + "loss": 0.0935, + "odds_ratio_loss": 0.002579466672614217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003943537827581167, + "rewards/margins": 0.44711941480636597, + "rewards/rejected": -0.4510629177093506, + "sft_loss": 0.03943537548184395, + "step": 2812 + }, + { + "epoch": 4.067968185104845, + "grad_norm": 2.896809649533277, + "learning_rate": 1.9580093380913516e-06, + "logits/chosen": -0.7096729874610901, + "logits/rejected": -0.45962074398994446, + "logps/chosen": -0.09003637731075287, + "logps/rejected": -5.000641822814941, + "loss": 0.0789, + "odds_ratio_loss": 0.004351029638200998, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009003638289868832, + "rewards/margins": 0.4910605549812317, + "rewards/rejected": -0.5000641942024231, + "sft_loss": 0.09003637731075287, + "step": 2813 + }, + { + "epoch": 4.06941431670282, + "grad_norm": 1.6830278975699327, + "learning_rate": 1.955339276723584e-06, + "logits/chosen": -0.8133095502853394, + "logits/rejected": -0.6556833982467651, + "logps/chosen": -0.10588407516479492, + "logps/rejected": -4.726909160614014, + "loss": 0.0942, + "odds_ratio_loss": 0.008478911593556404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010588408447802067, + "rewards/margins": 0.4621025025844574, + "rewards/rejected": -0.4726909101009369, + "sft_loss": 0.10588407516479492, + "step": 2814 + }, + { + "epoch": 4.070860448300795, + "grad_norm": 1.3555935048586099, + "learning_rate": 1.952670448089381e-06, + "logits/chosen": -0.7163006067276001, + "logits/rejected": -0.587743878364563, + "logps/chosen": -0.07247428596019745, + "logps/rejected": -5.259910583496094, + "loss": 0.0539, + "odds_ratio_loss": 0.013823432847857475, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007247428875416517, + "rewards/margins": 0.5187435746192932, + "rewards/rejected": -0.5259910225868225, + "sft_loss": 0.07247428596019745, + "step": 2815 + }, + { + "epoch": 4.072306579898771, + "grad_norm": 1.6383073489651063, + "learning_rate": 1.9500028537977916e-06, + "logits/chosen": -0.8243443965911865, + "logits/rejected": -0.5978400707244873, + "logps/chosen": -0.07016627490520477, + "logps/rejected": -5.391256809234619, + "loss": 0.0842, + "odds_ratio_loss": 0.007298076990991831, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007016628049314022, + "rewards/margins": 0.5321090221405029, + "rewards/rejected": -0.5391257405281067, + "sft_loss": 0.07016627490520477, + "step": 2816 + }, + { + "epoch": 4.073752711496746, + "grad_norm": 1.4638936787417978, + "learning_rate": 1.9473364954571156e-06, + "logits/chosen": -0.852425217628479, + "logits/rejected": -0.6924081444740295, + "logps/chosen": -0.03700890764594078, + "logps/rejected": -5.5652289390563965, + "loss": 0.0678, + "odds_ratio_loss": 0.0037209701258689165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003700891276821494, + "rewards/margins": 0.5528219938278198, + "rewards/rejected": -0.5565229654312134, + "sft_loss": 0.03700890764594078, + "step": 2817 + }, + { + "epoch": 4.0751988430947215, + "grad_norm": 1.4007257608951111, + "learning_rate": 1.9446713746749124e-06, + "logits/chosen": -0.7752952575683594, + "logits/rejected": -0.5839805603027344, + "logps/chosen": -0.07095170021057129, + "logps/rejected": -5.140481948852539, + "loss": 0.0557, + "odds_ratio_loss": 0.006458982825279236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007095170207321644, + "rewards/margins": 0.5069530010223389, + "rewards/rejected": -0.5140482187271118, + "sft_loss": 0.07095170021057129, + "step": 2818 + }, + { + "epoch": 4.076644974692697, + "grad_norm": 1.753771771087421, + "learning_rate": 1.94200749305799e-06, + "logits/chosen": -0.6933737397193909, + "logits/rejected": -0.544430673122406, + "logps/chosen": -0.07938231527805328, + "logps/rejected": -3.3233370780944824, + "loss": 0.089, + "odds_ratio_loss": 0.006840870250016451, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007938231341540813, + "rewards/margins": 0.32439547777175903, + "rewards/rejected": -0.3323337137699127, + "sft_loss": 0.07938231527805328, + "step": 2819 + }, + { + "epoch": 4.078091106290673, + "grad_norm": 1.377106669791675, + "learning_rate": 1.9393448522124154e-06, + "logits/chosen": -1.001451015472412, + "logits/rejected": -0.8603689670562744, + "logps/chosen": -0.04609488323330879, + "logps/rejected": -5.648879051208496, + "loss": 0.0742, + "odds_ratio_loss": 0.0034265825524926186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004609488416463137, + "rewards/margins": 0.5602784752845764, + "rewards/rejected": -0.5648879408836365, + "sft_loss": 0.04609488323330879, + "step": 2820 + }, + { + "epoch": 4.079537237888648, + "grad_norm": 1.6696172775689424, + "learning_rate": 1.9366834537435052e-06, + "logits/chosen": -0.7560070753097534, + "logits/rejected": -0.5555533170700073, + "logps/chosen": -0.09860974550247192, + "logps/rejected": -5.792730808258057, + "loss": 0.1054, + "odds_ratio_loss": 0.007586261723190546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009860975667834282, + "rewards/margins": 0.569412112236023, + "rewards/rejected": -0.5792730450630188, + "sft_loss": 0.09860974550247192, + "step": 2821 + }, + { + "epoch": 4.080983369486623, + "grad_norm": 1.7064321016532233, + "learning_rate": 1.9340232992558242e-06, + "logits/chosen": -0.862777590751648, + "logits/rejected": -0.6179209351539612, + "logps/chosen": -0.15015126764774323, + "logps/rejected": -3.6706156730651855, + "loss": 0.1044, + "odds_ratio_loss": 0.017487984150648117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015015127137303352, + "rewards/margins": 0.3520464301109314, + "rewards/rejected": -0.3670615553855896, + "sft_loss": 0.15015126764774323, + "step": 2822 + }, + { + "epoch": 4.082429501084599, + "grad_norm": 1.7723353244915958, + "learning_rate": 1.9313643903531916e-06, + "logits/chosen": -0.8052210211753845, + "logits/rejected": -0.7118884325027466, + "logps/chosen": -0.07638125866651535, + "logps/rejected": -3.0936851501464844, + "loss": 0.1025, + "odds_ratio_loss": 0.007743437774479389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00763812568038702, + "rewards/margins": 0.30173036456108093, + "rewards/rejected": -0.3093685209751129, + "sft_loss": 0.07638125866651535, + "step": 2823 + }, + { + "epoch": 4.083875632682574, + "grad_norm": 1.603899631457365, + "learning_rate": 1.9287067286386735e-06, + "logits/chosen": -0.7378439903259277, + "logits/rejected": -0.6304211616516113, + "logps/chosen": -0.10273067653179169, + "logps/rejected": -4.914271354675293, + "loss": 0.0856, + "odds_ratio_loss": 0.0075208027847111225, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010273068211972713, + "rewards/margins": 0.48115405440330505, + "rewards/rejected": -0.49142712354660034, + "sft_loss": 0.10273067653179169, + "step": 2824 + }, + { + "epoch": 4.0853217642805495, + "grad_norm": 1.815738070132041, + "learning_rate": 1.926050315714582e-06, + "logits/chosen": -0.7988303899765015, + "logits/rejected": -0.7211605310440063, + "logps/chosen": -0.06291496008634567, + "logps/rejected": -4.587538719177246, + "loss": 0.0739, + "odds_ratio_loss": 0.005865362472832203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006291495636105537, + "rewards/margins": 0.4524623453617096, + "rewards/rejected": -0.4587538540363312, + "sft_loss": 0.06291496008634567, + "step": 2825 + }, + { + "epoch": 4.086767895878525, + "grad_norm": 1.5383179861936356, + "learning_rate": 1.923395153182478e-06, + "logits/chosen": -0.8308844566345215, + "logits/rejected": -0.715095043182373, + "logps/chosen": -0.053292643278837204, + "logps/rejected": -3.6435461044311523, + "loss": 0.0703, + "odds_ratio_loss": 0.0045709628611803055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005329264793545008, + "rewards/margins": 0.35902532935142517, + "rewards/rejected": -0.36435458064079285, + "sft_loss": 0.053292643278837204, + "step": 2826 + }, + { + "epoch": 4.0882140274765, + "grad_norm": 1.5371817275721964, + "learning_rate": 1.920741242643172e-06, + "logits/chosen": -0.9177175164222717, + "logits/rejected": -0.7014694213867188, + "logps/chosen": -0.0635892003774643, + "logps/rejected": -4.349488258361816, + "loss": 0.0495, + "odds_ratio_loss": 0.0026953257620334625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0063589196652174, + "rewards/margins": 0.4285898804664612, + "rewards/rejected": -0.4349488317966461, + "sft_loss": 0.0635892003774643, + "step": 2827 + }, + { + "epoch": 4.089660159074476, + "grad_norm": 1.9984594647479759, + "learning_rate": 1.9180885856967133e-06, + "logits/chosen": -0.9267109036445618, + "logits/rejected": -0.71535325050354, + "logps/chosen": -0.0746922492980957, + "logps/rejected": -4.762479782104492, + "loss": 0.117, + "odds_ratio_loss": 0.00817443523555994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007469225209206343, + "rewards/margins": 0.4687787592411041, + "rewards/rejected": -0.47624802589416504, + "sft_loss": 0.0746922492980957, + "step": 2828 + }, + { + "epoch": 4.091106290672451, + "grad_norm": 1.826210488228371, + "learning_rate": 1.9154371839424014e-06, + "logits/chosen": -0.9434868097305298, + "logits/rejected": -0.7802065014839172, + "logps/chosen": -0.04058940336108208, + "logps/rejected": -6.8330464363098145, + "loss": 0.0835, + "odds_ratio_loss": 0.004032096825540066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004058940336108208, + "rewards/margins": 0.6792457103729248, + "rewards/rejected": -0.6833046674728394, + "sft_loss": 0.04058940336108208, + "step": 2829 + }, + { + "epoch": 4.092552422270427, + "grad_norm": 1.9463274698751436, + "learning_rate": 1.912787038978774e-06, + "logits/chosen": -0.7986459732055664, + "logits/rejected": -0.5903900861740112, + "logps/chosen": -0.06586778163909912, + "logps/rejected": -4.728672504425049, + "loss": 0.0968, + "odds_ratio_loss": 0.005637805908918381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006586778908967972, + "rewards/margins": 0.4662804901599884, + "rewards/rejected": -0.4728672504425049, + "sft_loss": 0.06586778163909912, + "step": 2830 + }, + { + "epoch": 4.093998553868402, + "grad_norm": 1.9187071058912697, + "learning_rate": 1.910138152403616e-06, + "logits/chosen": -0.7255648970603943, + "logits/rejected": -0.5702151656150818, + "logps/chosen": -0.15819592773914337, + "logps/rejected": -4.314438819885254, + "loss": 0.0972, + "odds_ratio_loss": 0.014649780467152596, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015819592401385307, + "rewards/margins": 0.4156242907047272, + "rewards/rejected": -0.43144387006759644, + "sft_loss": 0.15819592773914337, + "step": 2831 + }, + { + "epoch": 4.0954446854663775, + "grad_norm": 1.678493531054628, + "learning_rate": 1.907490525813947e-06, + "logits/chosen": -0.5866957306861877, + "logits/rejected": -0.3991992175579071, + "logps/chosen": -0.10705491900444031, + "logps/rejected": -6.210849761962891, + "loss": 0.0792, + "odds_ratio_loss": 0.005011118017137051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010705491527915001, + "rewards/margins": 0.6103795170783997, + "rewards/rejected": -0.621084988117218, + "sft_loss": 0.10705491900444031, + "step": 2832 + }, + { + "epoch": 4.096890817064353, + "grad_norm": 1.6178290940716764, + "learning_rate": 1.9048441608060358e-06, + "logits/chosen": -0.6119937896728516, + "logits/rejected": -0.4112843871116638, + "logps/chosen": -0.044672608375549316, + "logps/rejected": -4.956965446472168, + "loss": 0.0581, + "odds_ratio_loss": 0.0036439471878111362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0044672610238194466, + "rewards/margins": 0.4912292957305908, + "rewards/rejected": -0.4956965446472168, + "sft_loss": 0.044672608375549316, + "step": 2833 + }, + { + "epoch": 4.098336948662328, + "grad_norm": 1.844544275796571, + "learning_rate": 1.9021990589753827e-06, + "logits/chosen": -0.6817110180854797, + "logits/rejected": -0.5111923217773438, + "logps/chosen": -0.07816684991121292, + "logps/rejected": -5.29497766494751, + "loss": 0.0771, + "odds_ratio_loss": 0.00837037805467844, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007816685363650322, + "rewards/margins": 0.5216810703277588, + "rewards/rejected": -0.5294978022575378, + "sft_loss": 0.07816684991121292, + "step": 2834 + }, + { + "epoch": 4.099783080260304, + "grad_norm": 1.3457218950980514, + "learning_rate": 1.8995552219167284e-06, + "logits/chosen": -0.6739135384559631, + "logits/rejected": -0.555662989616394, + "logps/chosen": -0.04297306016087532, + "logps/rejected": -6.009502410888672, + "loss": 0.0573, + "odds_ratio_loss": 0.0026276421267539263, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004297305829823017, + "rewards/margins": 0.5966529846191406, + "rewards/rejected": -0.6009502410888672, + "sft_loss": 0.04297306016087532, + "step": 2835 + }, + { + "epoch": 4.101229211858279, + "grad_norm": 2.3566402902558714, + "learning_rate": 1.8969126512240555e-06, + "logits/chosen": -0.8081563711166382, + "logits/rejected": -0.6554782390594482, + "logps/chosen": -0.13750715553760529, + "logps/rejected": -3.394568920135498, + "loss": 0.0946, + "odds_ratio_loss": 0.016596950590610504, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013750715181231499, + "rewards/margins": 0.3257061839103699, + "rewards/rejected": -0.3394568860530853, + "sft_loss": 0.13750715553760529, + "step": 2836 + }, + { + "epoch": 4.102675343456254, + "grad_norm": 1.7832953293812144, + "learning_rate": 1.8942713484905761e-06, + "logits/chosen": -0.6158128976821899, + "logits/rejected": -0.46845024824142456, + "logps/chosen": -0.03573101758956909, + "logps/rejected": -4.8370561599731445, + "loss": 0.0443, + "odds_ratio_loss": 0.005472586024552584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0035731021780520678, + "rewards/margins": 0.480132520198822, + "rewards/rejected": -0.48370563983917236, + "sft_loss": 0.03573101758956909, + "step": 2837 + }, + { + "epoch": 4.10412147505423, + "grad_norm": 1.4666250779657488, + "learning_rate": 1.891631315308745e-06, + "logits/chosen": -0.8202795386314392, + "logits/rejected": -0.8174320459365845, + "logps/chosen": -0.050609488040208817, + "logps/rejected": -3.721074104309082, + "loss": 0.0552, + "odds_ratio_loss": 0.006177366711199284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005060948897153139, + "rewards/margins": 0.3670464754104614, + "rewards/rejected": -0.3721074163913727, + "sft_loss": 0.050609488040208817, + "step": 2838 + }, + { + "epoch": 4.1055676066522055, + "grad_norm": 1.4677580796088394, + "learning_rate": 1.888992553270245e-06, + "logits/chosen": -0.6891862154006958, + "logits/rejected": -0.5416716933250427, + "logps/chosen": -0.05191085860133171, + "logps/rejected": -3.703627586364746, + "loss": 0.0697, + "odds_ratio_loss": 0.0027856866363435984, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005191085860133171, + "rewards/margins": 0.3651717007160187, + "rewards/rejected": -0.370362788438797, + "sft_loss": 0.05191085860133171, + "step": 2839 + }, + { + "epoch": 4.10701373825018, + "grad_norm": 1.7383613915672071, + "learning_rate": 1.8863550639659983e-06, + "logits/chosen": -0.9624236822128296, + "logits/rejected": -0.6738488078117371, + "logps/chosen": -0.13818246126174927, + "logps/rejected": -4.537891387939453, + "loss": 0.1094, + "odds_ratio_loss": 0.015407193452119827, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013818246312439442, + "rewards/margins": 0.43997085094451904, + "rewards/rejected": -0.4537891447544098, + "sft_loss": 0.13818246126174927, + "step": 2840 + }, + { + "epoch": 4.108459869848156, + "grad_norm": 1.769141927099359, + "learning_rate": 1.883718848986155e-06, + "logits/chosen": -0.9640853404998779, + "logits/rejected": -0.7224030494689941, + "logps/chosen": -0.07164276391267776, + "logps/rejected": -5.423728942871094, + "loss": 0.1005, + "odds_ratio_loss": 0.002488494850695133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007164277136325836, + "rewards/margins": 0.5352085828781128, + "rewards/rejected": -0.5423728823661804, + "sft_loss": 0.07164276391267776, + "step": 2841 + }, + { + "epoch": 4.109906001446132, + "grad_norm": 1.624625898692095, + "learning_rate": 1.8810839099201004e-06, + "logits/chosen": -0.8588815927505493, + "logits/rejected": -0.7229719161987305, + "logps/chosen": -0.08476316183805466, + "logps/rejected": -3.678792953491211, + "loss": 0.079, + "odds_ratio_loss": 0.010054201819002628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008476316928863525, + "rewards/margins": 0.35940298438072205, + "rewards/rejected": -0.36787930130958557, + "sft_loss": 0.08476316183805466, + "step": 2842 + }, + { + "epoch": 4.111352133044107, + "grad_norm": 1.7008537404128115, + "learning_rate": 1.878450248356446e-06, + "logits/chosen": -0.8767229318618774, + "logits/rejected": -0.6937390565872192, + "logps/chosen": -0.12689150869846344, + "logps/rejected": -5.257368564605713, + "loss": 0.082, + "odds_ratio_loss": 0.012621680274605751, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012689150869846344, + "rewards/margins": 0.5130476951599121, + "rewards/rejected": -0.5257368683815002, + "sft_loss": 0.12689150869846344, + "step": 2843 + }, + { + "epoch": 4.112798264642082, + "grad_norm": 1.5221274028586664, + "learning_rate": 1.87581786588304e-06, + "logits/chosen": -0.9948400259017944, + "logits/rejected": -0.6448618173599243, + "logps/chosen": -0.11695563793182373, + "logps/rejected": -5.0134406089782715, + "loss": 0.0724, + "odds_ratio_loss": 0.008317345753312111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011695563793182373, + "rewards/margins": 0.4896485209465027, + "rewards/rejected": -0.5013440847396851, + "sft_loss": 0.11695563793182373, + "step": 2844 + }, + { + "epoch": 4.114244396240058, + "grad_norm": 1.8872482751675728, + "learning_rate": 1.8731867640869528e-06, + "logits/chosen": -0.982231080532074, + "logits/rejected": -0.7305710911750793, + "logps/chosen": -0.09674321860074997, + "logps/rejected": -4.068989276885986, + "loss": 0.1131, + "odds_ratio_loss": 0.006529256701469421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009674321860074997, + "rewards/margins": 0.3972246050834656, + "rewards/rejected": -0.4068989157676697, + "sft_loss": 0.09674321860074997, + "step": 2845 + }, + { + "epoch": 4.115690527838034, + "grad_norm": 1.6369506462728376, + "learning_rate": 1.8705569445544875e-06, + "logits/chosen": -0.7939417362213135, + "logits/rejected": -0.5607274770736694, + "logps/chosen": -0.08005495369434357, + "logps/rejected": -6.112852573394775, + "loss": 0.0843, + "odds_ratio_loss": 0.006115018390119076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008005496114492416, + "rewards/margins": 0.603279709815979, + "rewards/rejected": -0.6112852096557617, + "sft_loss": 0.08005495369434357, + "step": 2846 + }, + { + "epoch": 4.117136659436008, + "grad_norm": 1.7432584620353744, + "learning_rate": 1.8679284088711703e-06, + "logits/chosen": -1.0481350421905518, + "logits/rejected": -0.642801821231842, + "logps/chosen": -0.04570764675736427, + "logps/rejected": -5.874081611633301, + "loss": 0.062, + "odds_ratio_loss": 0.0012314720079302788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0045707649551332, + "rewards/margins": 0.5828374028205872, + "rewards/rejected": -0.587408185005188, + "sft_loss": 0.04570764675736427, + "step": 2847 + }, + { + "epoch": 4.118582791033984, + "grad_norm": 2.034327837129782, + "learning_rate": 1.8653011586217575e-06, + "logits/chosen": -0.9006962180137634, + "logits/rejected": -0.524931013584137, + "logps/chosen": -0.07342912256717682, + "logps/rejected": -5.247374534606934, + "loss": 0.0688, + "odds_ratio_loss": 0.0020064630080014467, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007342912256717682, + "rewards/margins": 0.5173946022987366, + "rewards/rejected": -0.5247374773025513, + "sft_loss": 0.07342912256717682, + "step": 2848 + }, + { + "epoch": 4.12002892263196, + "grad_norm": 1.5094420598404346, + "learning_rate": 1.8626751953902265e-06, + "logits/chosen": -0.8094779849052429, + "logits/rejected": -0.6073752641677856, + "logps/chosen": -0.061085619032382965, + "logps/rejected": -4.160881519317627, + "loss": 0.0567, + "odds_ratio_loss": 0.0034529813565313816, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006108561530709267, + "rewards/margins": 0.40997958183288574, + "rewards/rejected": -0.41608819365501404, + "sft_loss": 0.061085619032382965, + "step": 2849 + }, + { + "epoch": 4.1214750542299345, + "grad_norm": 1.4627918976068797, + "learning_rate": 1.8600505207597789e-06, + "logits/chosen": -0.5969792604446411, + "logits/rejected": -0.4297393262386322, + "logps/chosen": -0.030321719124913216, + "logps/rejected": -6.4586896896362305, + "loss": 0.0626, + "odds_ratio_loss": 0.0009324885904788971, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030321720987558365, + "rewards/margins": 0.6428368091583252, + "rewards/rejected": -0.6458690166473389, + "sft_loss": 0.030321719124913216, + "step": 2850 + }, + { + "epoch": 4.12292118582791, + "grad_norm": 1.4982677272369433, + "learning_rate": 1.857427136312844e-06, + "logits/chosen": -0.876201331615448, + "logits/rejected": -0.6428571343421936, + "logps/chosen": -0.07151912152767181, + "logps/rejected": -4.5147175788879395, + "loss": 0.0806, + "odds_ratio_loss": 0.0038516269996762276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007151912897825241, + "rewards/margins": 0.44431987404823303, + "rewards/rejected": -0.45147180557250977, + "sft_loss": 0.07151912152767181, + "step": 2851 + }, + { + "epoch": 4.124367317425886, + "grad_norm": 1.5257893605138617, + "learning_rate": 1.8548050436310669e-06, + "logits/chosen": -0.8841822743415833, + "logits/rejected": -0.5845543146133423, + "logps/chosen": -0.04778134822845459, + "logps/rejected": -5.190133094787598, + "loss": 0.0625, + "odds_ratio_loss": 0.004256582818925381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004778134636580944, + "rewards/margins": 0.5142351984977722, + "rewards/rejected": -0.5190134048461914, + "sft_loss": 0.04778134822845459, + "step": 2852 + }, + { + "epoch": 4.125813449023862, + "grad_norm": 1.2651179338589529, + "learning_rate": 1.8521842442953198e-06, + "logits/chosen": -0.7237213850021362, + "logits/rejected": -0.6026439070701599, + "logps/chosen": -0.1049027293920517, + "logps/rejected": -5.724381923675537, + "loss": 0.0709, + "odds_ratio_loss": 0.008922792971134186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010490273125469685, + "rewards/margins": 0.5619478821754456, + "rewards/rejected": -0.5724381804466248, + "sft_loss": 0.1049027293920517, + "step": 2853 + }, + { + "epoch": 4.127259580621836, + "grad_norm": 1.4252690236427485, + "learning_rate": 1.84956473988569e-06, + "logits/chosen": -0.9042859077453613, + "logits/rejected": -0.5686567425727844, + "logps/chosen": -0.056515954434871674, + "logps/rejected": -5.483891487121582, + "loss": 0.0745, + "odds_ratio_loss": 0.005240194033831358, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005651596002280712, + "rewards/margins": 0.542737603187561, + "rewards/rejected": -0.548389196395874, + "sft_loss": 0.056515954434871674, + "step": 2854 + }, + { + "epoch": 4.128705712219812, + "grad_norm": 1.3935884148488344, + "learning_rate": 1.846946531981489e-06, + "logits/chosen": -0.8212897777557373, + "logits/rejected": -0.5057724118232727, + "logps/chosen": -0.039713770151138306, + "logps/rejected": -4.20341157913208, + "loss": 0.0708, + "odds_ratio_loss": 0.0034693204797804356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00397137738764286, + "rewards/margins": 0.41636979579925537, + "rewards/rejected": -0.4203411638736725, + "sft_loss": 0.039713770151138306, + "step": 2855 + }, + { + "epoch": 4.130151843817788, + "grad_norm": 1.4700289842978596, + "learning_rate": 1.8443296221612426e-06, + "logits/chosen": -0.8127454519271851, + "logits/rejected": -0.6734610199928284, + "logps/chosen": -0.07819758355617523, + "logps/rejected": -4.370360374450684, + "loss": 0.0792, + "odds_ratio_loss": 0.010140936821699142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007819757796823978, + "rewards/margins": 0.42921626567840576, + "rewards/rejected": -0.43703603744506836, + "sft_loss": 0.07819758355617523, + "step": 2856 + }, + { + "epoch": 4.131597975415763, + "grad_norm": 1.6312862182934151, + "learning_rate": 1.8417140120026954e-06, + "logits/chosen": -0.6778440475463867, + "logits/rejected": -0.5362757444381714, + "logps/chosen": -0.054850347340106964, + "logps/rejected": -3.9211716651916504, + "loss": 0.0752, + "odds_ratio_loss": 0.0033841035328805447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0054850345477461815, + "rewards/margins": 0.38663214445114136, + "rewards/rejected": -0.39211714267730713, + "sft_loss": 0.054850347340106964, + "step": 2857 + }, + { + "epoch": 4.133044107013738, + "grad_norm": 1.3563265826587092, + "learning_rate": 1.8390997030828074e-06, + "logits/chosen": -0.9520107507705688, + "logits/rejected": -0.7877506017684937, + "logps/chosen": -0.07438983023166656, + "logps/rejected": -4.451206684112549, + "loss": 0.0519, + "odds_ratio_loss": 0.004518951755017042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007438982836902142, + "rewards/margins": 0.4376816749572754, + "rewards/rejected": -0.4451206624507904, + "sft_loss": 0.07438983023166656, + "step": 2858 + }, + { + "epoch": 4.134490238611714, + "grad_norm": 1.343261093042372, + "learning_rate": 1.836486696977758e-06, + "logits/chosen": -0.7512749433517456, + "logits/rejected": -0.6193314790725708, + "logps/chosen": -0.03754514455795288, + "logps/rejected": -5.197000026702881, + "loss": 0.0499, + "odds_ratio_loss": 0.006154801230877638, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003754514502361417, + "rewards/margins": 0.5159454941749573, + "rewards/rejected": -0.5197000503540039, + "sft_loss": 0.03754514455795288, + "step": 2859 + }, + { + "epoch": 4.135936370209689, + "grad_norm": 1.6673291219985102, + "learning_rate": 1.8338749952629353e-06, + "logits/chosen": -0.8384102582931519, + "logits/rejected": -0.6935494542121887, + "logps/chosen": -0.05776578187942505, + "logps/rejected": -3.5844650268554688, + "loss": 0.0767, + "odds_ratio_loss": 0.008841626346111298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005776578560471535, + "rewards/margins": 0.35266995429992676, + "rewards/rejected": -0.35844647884368896, + "sft_loss": 0.05776578187942505, + "step": 2860 + }, + { + "epoch": 4.137382501807664, + "grad_norm": 1.9142765839106552, + "learning_rate": 1.831264599512945e-06, + "logits/chosen": -0.8089092969894409, + "logits/rejected": -0.6261400580406189, + "logps/chosen": -0.16545385122299194, + "logps/rejected": -4.192132949829102, + "loss": 0.1065, + "odds_ratio_loss": 0.02832021750509739, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016545386984944344, + "rewards/margins": 0.40266790986061096, + "rewards/rejected": -0.41921332478523254, + "sft_loss": 0.16545385122299194, + "step": 2861 + }, + { + "epoch": 4.13882863340564, + "grad_norm": 1.8508060502337165, + "learning_rate": 1.828655511301607e-06, + "logits/chosen": -1.029259204864502, + "logits/rejected": -0.7229580283164978, + "logps/chosen": -0.08430210500955582, + "logps/rejected": -4.091892242431641, + "loss": 0.0872, + "odds_ratio_loss": 0.007961390540003777, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008430209942162037, + "rewards/margins": 0.4007590413093567, + "rewards/rejected": -0.40918928384780884, + "sft_loss": 0.08430210500955582, + "step": 2862 + }, + { + "epoch": 4.140274765003615, + "grad_norm": 1.4236489094364113, + "learning_rate": 1.8260477322019478e-06, + "logits/chosen": -0.9422646760940552, + "logits/rejected": -0.6158483028411865, + "logps/chosen": -0.043667394667863846, + "logps/rejected": -5.095708847045898, + "loss": 0.049, + "odds_ratio_loss": 0.0031190637964755297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0043667396530508995, + "rewards/margins": 0.5052041411399841, + "rewards/rejected": -0.5095708966255188, + "sft_loss": 0.043667394667863846, + "step": 2863 + }, + { + "epoch": 4.141720896601591, + "grad_norm": 2.237085057444961, + "learning_rate": 1.8234412637862078e-06, + "logits/chosen": -0.7573752403259277, + "logits/rejected": -0.5755871534347534, + "logps/chosen": -0.036390505731105804, + "logps/rejected": -4.381635665893555, + "loss": 0.11, + "odds_ratio_loss": 0.003340129740536213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003639050293713808, + "rewards/margins": 0.4345245063304901, + "rewards/rejected": -0.43816354870796204, + "sft_loss": 0.036390505731105804, + "step": 2864 + }, + { + "epoch": 4.143167028199566, + "grad_norm": 1.6249757873443114, + "learning_rate": 1.8208361076258347e-06, + "logits/chosen": -0.8179000616073608, + "logits/rejected": -0.6705646514892578, + "logps/chosen": -0.12193619459867477, + "logps/rejected": -4.738859176635742, + "loss": 0.087, + "odds_ratio_loss": 0.0028922702185809612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012193620204925537, + "rewards/margins": 0.46169233322143555, + "rewards/rejected": -0.4738859534263611, + "sft_loss": 0.12193619459867477, + "step": 2865 + }, + { + "epoch": 4.144613159797542, + "grad_norm": 1.6742919109805106, + "learning_rate": 1.8182322652914897e-06, + "logits/chosen": -0.6485641002655029, + "logits/rejected": -0.580163300037384, + "logps/chosen": -0.030021870508790016, + "logps/rejected": -4.312215328216553, + "loss": 0.0654, + "odds_ratio_loss": 0.003744534682482481, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030021872371435165, + "rewards/margins": 0.42821940779685974, + "rewards/rejected": -0.4312215745449066, + "sft_loss": 0.030021870508790016, + "step": 2866 + }, + { + "epoch": 4.146059291395517, + "grad_norm": 1.5207496797985913, + "learning_rate": 1.8156297383530363e-06, + "logits/chosen": -0.8672469258308411, + "logits/rejected": -0.7313282489776611, + "logps/chosen": -0.051602356135845184, + "logps/rejected": -6.097007751464844, + "loss": 0.0676, + "odds_ratio_loss": 0.002992212073877454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005160235799849033, + "rewards/margins": 0.6045405864715576, + "rewards/rejected": -0.6097007989883423, + "sft_loss": 0.051602356135845184, + "step": 2867 + }, + { + "epoch": 4.1475054229934925, + "grad_norm": 1.6381642095253177, + "learning_rate": 1.8130285283795508e-06, + "logits/chosen": -0.7583048343658447, + "logits/rejected": -0.5695833563804626, + "logps/chosen": -0.13401220738887787, + "logps/rejected": -4.404128074645996, + "loss": 0.0939, + "odds_ratio_loss": 0.010725535452365875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013401219621300697, + "rewards/margins": 0.42701154947280884, + "rewards/rejected": -0.4404127597808838, + "sft_loss": 0.13401220738887787, + "step": 2868 + }, + { + "epoch": 4.148951554591468, + "grad_norm": 1.4680257306046172, + "learning_rate": 1.8104286369393087e-06, + "logits/chosen": -0.8767812252044678, + "logits/rejected": -0.5856321454048157, + "logps/chosen": -0.05747556686401367, + "logps/rejected": -5.090399742126465, + "loss": 0.0814, + "odds_ratio_loss": 0.003675619838759303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0057475571520626545, + "rewards/margins": 0.503292441368103, + "rewards/rejected": -0.5090399980545044, + "sft_loss": 0.05747556686401367, + "step": 2869 + }, + { + "epoch": 4.150397686189443, + "grad_norm": 1.7573313537408448, + "learning_rate": 1.807830065599798e-06, + "logits/chosen": -0.7944881916046143, + "logits/rejected": -0.6999379992485046, + "logps/chosen": -0.0795753002166748, + "logps/rejected": -5.302948474884033, + "loss": 0.0808, + "odds_ratio_loss": 0.0038214053492993116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00795752927660942, + "rewards/margins": 0.5223373174667358, + "rewards/rejected": -0.5302948355674744, + "sft_loss": 0.0795753002166748, + "step": 2870 + }, + { + "epoch": 4.151843817787419, + "grad_norm": 1.6291075861205213, + "learning_rate": 1.8052328159277054e-06, + "logits/chosen": -0.7494406700134277, + "logits/rejected": -0.7102495431900024, + "logps/chosen": -0.06435263156890869, + "logps/rejected": -4.151299953460693, + "loss": 0.0722, + "odds_ratio_loss": 0.010602910071611404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006435262970626354, + "rewards/margins": 0.4086947441101074, + "rewards/rejected": -0.41513001918792725, + "sft_loss": 0.06435263156890869, + "step": 2871 + }, + { + "epoch": 4.153289949385394, + "grad_norm": 1.6436961304644404, + "learning_rate": 1.802636889488922e-06, + "logits/chosen": -0.8582144379615784, + "logits/rejected": -0.6094192266464233, + "logps/chosen": -0.05906105786561966, + "logps/rejected": -5.2758402824401855, + "loss": 0.0643, + "odds_ratio_loss": 0.004993016365915537, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005906105972826481, + "rewards/margins": 0.5216779708862305, + "rewards/rejected": -0.5275840759277344, + "sft_loss": 0.05906105786561966, + "step": 2872 + }, + { + "epoch": 4.154736080983369, + "grad_norm": 1.5066252354267282, + "learning_rate": 1.8000422878485403e-06, + "logits/chosen": -0.8553987741470337, + "logits/rejected": -0.684226393699646, + "logps/chosen": -0.12316957116127014, + "logps/rejected": -3.842941999435425, + "loss": 0.0926, + "odds_ratio_loss": 0.019793318584561348, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012316957116127014, + "rewards/margins": 0.3719772398471832, + "rewards/rejected": -0.38429421186447144, + "sft_loss": 0.12316957116127014, + "step": 2873 + }, + { + "epoch": 4.156182212581345, + "grad_norm": 1.7609833375052197, + "learning_rate": 1.7974490125708575e-06, + "logits/chosen": -0.9975032210350037, + "logits/rejected": -0.7744351029396057, + "logps/chosen": -0.12477146834135056, + "logps/rejected": -3.4359312057495117, + "loss": 0.0988, + "odds_ratio_loss": 0.009972745552659035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0124771473929286, + "rewards/margins": 0.3311160206794739, + "rewards/rejected": -0.34359318017959595, + "sft_loss": 0.12477146834135056, + "step": 2874 + }, + { + "epoch": 4.1576283441793205, + "grad_norm": 1.518218164522744, + "learning_rate": 1.7948570652193688e-06, + "logits/chosen": -0.6091439127922058, + "logits/rejected": -0.5392559766769409, + "logps/chosen": -0.16808342933654785, + "logps/rejected": -2.7966959476470947, + "loss": 0.0894, + "odds_ratio_loss": 0.03804006054997444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016808344051241875, + "rewards/margins": 0.2628612816333771, + "rewards/rejected": -0.2796696126461029, + "sft_loss": 0.16808342933654785, + "step": 2875 + }, + { + "epoch": 4.159074475777295, + "grad_norm": 1.6362781472337709, + "learning_rate": 1.7922664473567672e-06, + "logits/chosen": -0.7585278153419495, + "logits/rejected": -0.6109793782234192, + "logps/chosen": -0.12611831724643707, + "logps/rejected": -4.219643592834473, + "loss": 0.1081, + "odds_ratio_loss": 0.021449480205774307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012611833401024342, + "rewards/margins": 0.40935254096984863, + "rewards/rejected": -0.4219644367694855, + "sft_loss": 0.12611831724643707, + "step": 2876 + }, + { + "epoch": 4.160520607375271, + "grad_norm": 3.2649128823647127, + "learning_rate": 1.7896771605449485e-06, + "logits/chosen": -0.7754852175712585, + "logits/rejected": -0.6101595163345337, + "logps/chosen": -0.1011345386505127, + "logps/rejected": -3.0544514656066895, + "loss": 0.0989, + "odds_ratio_loss": 0.009415505453944206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010113454423844814, + "rewards/margins": 0.2953317165374756, + "rewards/rejected": -0.3054451644420624, + "sft_loss": 0.1011345386505127, + "step": 2877 + }, + { + "epoch": 4.161966738973247, + "grad_norm": 1.3888856417078521, + "learning_rate": 1.7870892063450001e-06, + "logits/chosen": -0.753406822681427, + "logits/rejected": -0.4384850263595581, + "logps/chosen": -0.08360215276479721, + "logps/rejected": -3.66217303276062, + "loss": 0.0711, + "odds_ratio_loss": 0.0076039680279791355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008360215462744236, + "rewards/margins": 0.3578570485115051, + "rewards/rejected": -0.36621731519699097, + "sft_loss": 0.08360215276479721, + "step": 2878 + }, + { + "epoch": 4.163412870571222, + "grad_norm": 1.529348069798455, + "learning_rate": 1.7845025863172127e-06, + "logits/chosen": -0.685241162776947, + "logits/rejected": -0.6223734617233276, + "logps/chosen": -0.06466331332921982, + "logps/rejected": -4.321099758148193, + "loss": 0.0725, + "odds_ratio_loss": 0.009548970498144627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006466331891715527, + "rewards/margins": 0.4256436228752136, + "rewards/rejected": -0.4321099519729614, + "sft_loss": 0.06466331332921982, + "step": 2879 + }, + { + "epoch": 4.164859002169197, + "grad_norm": 1.3802640702535711, + "learning_rate": 1.781917302021067e-06, + "logits/chosen": -0.743414044380188, + "logits/rejected": -0.6212598085403442, + "logps/chosen": -0.08406087011098862, + "logps/rejected": -3.3538153171539307, + "loss": 0.0755, + "odds_ratio_loss": 0.007001823280006647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008406086824834347, + "rewards/margins": 0.32697543501853943, + "rewards/rejected": -0.33538153767585754, + "sft_loss": 0.08406087011098862, + "step": 2880 + }, + { + "epoch": 4.166305133767173, + "grad_norm": 1.7454934799111688, + "learning_rate": 1.7793333550152413e-06, + "logits/chosen": -0.9296549558639526, + "logits/rejected": -0.6040893197059631, + "logps/chosen": -0.09559880197048187, + "logps/rejected": -6.568416118621826, + "loss": 0.0685, + "odds_ratio_loss": 0.002591322874650359, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009559880942106247, + "rewards/margins": 0.6472817063331604, + "rewards/rejected": -0.6568415760993958, + "sft_loss": 0.09559880197048187, + "step": 2881 + }, + { + "epoch": 4.1677512653651485, + "grad_norm": 1.378926994916888, + "learning_rate": 1.776750746857605e-06, + "logits/chosen": -1.0216419696807861, + "logits/rejected": -0.5682446956634521, + "logps/chosen": -0.07705073058605194, + "logps/rejected": -5.844695091247559, + "loss": 0.0884, + "odds_ratio_loss": 0.0033558798022568226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0077050733380019665, + "rewards/margins": 0.5767644643783569, + "rewards/rejected": -0.5844695568084717, + "sft_loss": 0.07705073058605194, + "step": 2882 + }, + { + "epoch": 4.169197396963123, + "grad_norm": 1.8249380721129438, + "learning_rate": 1.7741694791052248e-06, + "logits/chosen": -0.7748250961303711, + "logits/rejected": -0.5840004682540894, + "logps/chosen": -0.08540331572294235, + "logps/rejected": -5.7757792472839355, + "loss": 0.0915, + "odds_ratio_loss": 0.0048137810081243515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008540332317352295, + "rewards/margins": 0.5690376162528992, + "rewards/rejected": -0.5775779485702515, + "sft_loss": 0.08540331572294235, + "step": 2883 + }, + { + "epoch": 4.170643528561099, + "grad_norm": 1.6439984729479422, + "learning_rate": 1.7715895533143543e-06, + "logits/chosen": -0.9461988806724548, + "logits/rejected": -0.7191964983940125, + "logps/chosen": -0.08565716445446014, + "logps/rejected": -4.468839168548584, + "loss": 0.085, + "odds_ratio_loss": 0.003522283863276243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008565716445446014, + "rewards/margins": 0.4383182227611542, + "rewards/rejected": -0.4468839466571808, + "sft_loss": 0.08565716445446014, + "step": 2884 + }, + { + "epoch": 4.172089660159075, + "grad_norm": 1.9779228119632557, + "learning_rate": 1.7690109710404433e-06, + "logits/chosen": -0.8428412675857544, + "logits/rejected": -0.5799111127853394, + "logps/chosen": -0.10756391286849976, + "logps/rejected": -4.175439357757568, + "loss": 0.1062, + "odds_ratio_loss": 0.017258528620004654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010756392031908035, + "rewards/margins": 0.40678757429122925, + "rewards/rejected": -0.4175439774990082, + "sft_loss": 0.10756391286849976, + "step": 2885 + }, + { + "epoch": 4.1735357917570495, + "grad_norm": 1.5163132148550575, + "learning_rate": 1.7664337338381253e-06, + "logits/chosen": -0.7353506088256836, + "logits/rejected": -0.6050468683242798, + "logps/chosen": -0.09535052627325058, + "logps/rejected": -3.8978962898254395, + "loss": 0.086, + "odds_ratio_loss": 0.013053749687969685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009535052813589573, + "rewards/margins": 0.3802545666694641, + "rewards/rejected": -0.3897896409034729, + "sft_loss": 0.09535052627325058, + "step": 2886 + }, + { + "epoch": 4.174981923355025, + "grad_norm": 4.89411470614702, + "learning_rate": 1.7638578432612294e-06, + "logits/chosen": -0.5543168783187866, + "logits/rejected": -0.49109578132629395, + "logps/chosen": -0.07704973220825195, + "logps/rejected": -5.383688449859619, + "loss": 0.0643, + "odds_ratio_loss": 0.00851721502840519, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007704972755163908, + "rewards/margins": 0.5306639671325684, + "rewards/rejected": -0.5383689403533936, + "sft_loss": 0.07704973220825195, + "step": 2887 + }, + { + "epoch": 4.176428054953001, + "grad_norm": 1.6508265050328592, + "learning_rate": 1.761283300862768e-06, + "logits/chosen": -0.732459545135498, + "logits/rejected": -0.6587421298027039, + "logps/chosen": -0.061337437480688095, + "logps/rejected": -3.2690553665161133, + "loss": 0.0917, + "odds_ratio_loss": 0.026827432215213776, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006133743561804295, + "rewards/margins": 0.32077181339263916, + "rewards/rejected": -0.3269055485725403, + "sft_loss": 0.061337437480688095, + "step": 2888 + }, + { + "epoch": 4.1778741865509765, + "grad_norm": 1.5687060094740681, + "learning_rate": 1.7587101081949406e-06, + "logits/chosen": -0.871804416179657, + "logits/rejected": -0.6253595352172852, + "logps/chosen": -0.09505133330821991, + "logps/rejected": -4.816754341125488, + "loss": 0.1232, + "odds_ratio_loss": 0.0039987508207559586, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009505132213234901, + "rewards/margins": 0.47217029333114624, + "rewards/rejected": -0.4816754162311554, + "sft_loss": 0.09505133330821991, + "step": 2889 + }, + { + "epoch": 4.179320318148951, + "grad_norm": 1.574633627267037, + "learning_rate": 1.7561382668091383e-06, + "logits/chosen": -0.8467705249786377, + "logits/rejected": -0.5515745282173157, + "logps/chosen": -0.12236753851175308, + "logps/rejected": -6.257351875305176, + "loss": 0.1053, + "odds_ratio_loss": 0.012112165801227093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012236754409968853, + "rewards/margins": 0.6134985089302063, + "rewards/rejected": -0.6257352232933044, + "sft_loss": 0.12236753851175308, + "step": 2890 + }, + { + "epoch": 4.180766449746927, + "grad_norm": 1.6617557779312644, + "learning_rate": 1.7535677782559306e-06, + "logits/chosen": -0.6538142561912537, + "logits/rejected": -0.5575007200241089, + "logps/chosen": -0.0687403753399849, + "logps/rejected": -4.553154945373535, + "loss": 0.091, + "odds_ratio_loss": 0.008355571888387203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006874037906527519, + "rewards/margins": 0.44844144582748413, + "rewards/rejected": -0.455315500497818, + "sft_loss": 0.0687403753399849, + "step": 2891 + }, + { + "epoch": 4.182212581344903, + "grad_norm": 1.3237025011825474, + "learning_rate": 1.7509986440850773e-06, + "logits/chosen": -0.8017591834068298, + "logits/rejected": -0.6542471051216125, + "logps/chosen": -0.04119560867547989, + "logps/rejected": -5.426454067230225, + "loss": 0.0717, + "odds_ratio_loss": 0.006445649079978466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004119561053812504, + "rewards/margins": 0.5385258197784424, + "rewards/rejected": -0.5426453948020935, + "sft_loss": 0.04119560867547989, + "step": 2892 + }, + { + "epoch": 4.1836587129428775, + "grad_norm": 1.6426153001965083, + "learning_rate": 1.748430865845516e-06, + "logits/chosen": -0.7238603234291077, + "logits/rejected": -0.7122807502746582, + "logps/chosen": -0.09050285816192627, + "logps/rejected": -3.2810425758361816, + "loss": 0.09, + "odds_ratio_loss": 0.011915099807083607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009050285443663597, + "rewards/margins": 0.31905397772789, + "rewards/rejected": -0.32810425758361816, + "sft_loss": 0.09050285816192627, + "step": 2893 + }, + { + "epoch": 4.185104844540853, + "grad_norm": 1.5864270160960752, + "learning_rate": 1.745864445085373e-06, + "logits/chosen": -1.0129873752593994, + "logits/rejected": -0.6170842051506042, + "logps/chosen": -0.045922402292490005, + "logps/rejected": -5.002997398376465, + "loss": 0.0947, + "odds_ratio_loss": 0.0038542391266673803, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004592240788042545, + "rewards/margins": 0.49570751190185547, + "rewards/rejected": -0.5002997517585754, + "sft_loss": 0.045922402292490005, + "step": 2894 + }, + { + "epoch": 4.186550976138829, + "grad_norm": 4.730722961719703, + "learning_rate": 1.7432993833519514e-06, + "logits/chosen": -0.7456246018409729, + "logits/rejected": -0.638870120048523, + "logps/chosen": -0.09694996476173401, + "logps/rejected": -5.234409809112549, + "loss": 0.0961, + "odds_ratio_loss": 0.014911282807588577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009694996289908886, + "rewards/margins": 0.5137460231781006, + "rewards/rejected": -0.5234410166740417, + "sft_loss": 0.09694996476173401, + "step": 2895 + }, + { + "epoch": 4.187997107736804, + "grad_norm": 1.7372021790081584, + "learning_rate": 1.7407356821917362e-06, + "logits/chosen": -0.7853217124938965, + "logits/rejected": -0.5458134412765503, + "logps/chosen": -0.1326024830341339, + "logps/rejected": -5.669880390167236, + "loss": 0.1053, + "odds_ratio_loss": 0.015873540192842484, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01326024904847145, + "rewards/margins": 0.5537277460098267, + "rewards/rejected": -0.5669880509376526, + "sft_loss": 0.1326024830341339, + "step": 2896 + }, + { + "epoch": 4.189443239334779, + "grad_norm": 1.7437826506465477, + "learning_rate": 1.7381733431503919e-06, + "logits/chosen": -0.7777459621429443, + "logits/rejected": -0.5212811827659607, + "logps/chosen": -0.06154448539018631, + "logps/rejected": -6.10374641418457, + "loss": 0.0601, + "odds_ratio_loss": 0.00723948935046792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006154448725283146, + "rewards/margins": 0.6042202711105347, + "rewards/rejected": -0.6103746891021729, + "sft_loss": 0.06154448539018631, + "step": 2897 + }, + { + "epoch": 4.190889370932755, + "grad_norm": 1.6087185240794226, + "learning_rate": 1.7356123677727634e-06, + "logits/chosen": -0.8092604875564575, + "logits/rejected": -0.6327940225601196, + "logps/chosen": -0.0712791383266449, + "logps/rejected": -6.498178958892822, + "loss": 0.0952, + "odds_ratio_loss": 0.015109434723854065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007127914112061262, + "rewards/margins": 0.6426900029182434, + "rewards/rejected": -0.649817943572998, + "sft_loss": 0.0712791383266449, + "step": 2898 + }, + { + "epoch": 4.19233550253073, + "grad_norm": 1.810061844063369, + "learning_rate": 1.7330527576028713e-06, + "logits/chosen": -1.077024221420288, + "logits/rejected": -0.8910109996795654, + "logps/chosen": -0.14297831058502197, + "logps/rejected": -3.079667806625366, + "loss": 0.105, + "odds_ratio_loss": 0.01574338600039482, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014297831803560257, + "rewards/margins": 0.2936689555644989, + "rewards/rejected": -0.30796679854393005, + "sft_loss": 0.14297831058502197, + "step": 2899 + }, + { + "epoch": 4.1937816341287055, + "grad_norm": 1.4080726234451635, + "learning_rate": 1.7304945141839156e-06, + "logits/chosen": -0.8134248852729797, + "logits/rejected": -0.6756539344787598, + "logps/chosen": -0.06007365137338638, + "logps/rejected": -5.442124366760254, + "loss": 0.0702, + "odds_ratio_loss": 0.0070592500269412994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006007364951074123, + "rewards/margins": 0.5382050275802612, + "rewards/rejected": -0.5442124605178833, + "sft_loss": 0.06007365137338638, + "step": 2900 + }, + { + "epoch": 4.195227765726681, + "grad_norm": 1.8129753504901793, + "learning_rate": 1.7279376390582683e-06, + "logits/chosen": -0.814599871635437, + "logits/rejected": -0.5579865574836731, + "logps/chosen": -0.06395604461431503, + "logps/rejected": -6.123435974121094, + "loss": 0.085, + "odds_ratio_loss": 0.0038474637549370527, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006395603530108929, + "rewards/margins": 0.6059479713439941, + "rewards/rejected": -0.6123435497283936, + "sft_loss": 0.06395604461431503, + "step": 2901 + }, + { + "epoch": 4.196673897324657, + "grad_norm": 1.7496011317153113, + "learning_rate": 1.725382133767482e-06, + "logits/chosen": -0.9168472290039062, + "logits/rejected": -0.6373493671417236, + "logps/chosen": -0.14649318158626556, + "logps/rejected": -6.127131462097168, + "loss": 0.0838, + "odds_ratio_loss": 0.01623375155031681, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014649318531155586, + "rewards/margins": 0.5980637669563293, + "rewards/rejected": -0.612713098526001, + "sft_loss": 0.14649318158626556, + "step": 2902 + }, + { + "epoch": 4.198120028922632, + "grad_norm": 1.8093572721032025, + "learning_rate": 1.7228279998522791e-06, + "logits/chosen": -0.8479722738265991, + "logits/rejected": -0.6456978917121887, + "logps/chosen": -0.0714811310172081, + "logps/rejected": -4.065431594848633, + "loss": 0.0833, + "odds_ratio_loss": 0.015304593369364738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00714811310172081, + "rewards/margins": 0.39939504861831665, + "rewards/rejected": -0.40654319524765015, + "sft_loss": 0.0714811310172081, + "step": 2903 + }, + { + "epoch": 4.199566160520607, + "grad_norm": 1.7050155520074486, + "learning_rate": 1.7202752388525546e-06, + "logits/chosen": -0.7397294044494629, + "logits/rejected": -0.6496505737304688, + "logps/chosen": -0.05917609483003616, + "logps/rejected": -4.717207908630371, + "loss": 0.0827, + "odds_ratio_loss": 0.0097846370190382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005917609669268131, + "rewards/margins": 0.46580320596694946, + "rewards/rejected": -0.471720814704895, + "sft_loss": 0.05917609483003616, + "step": 2904 + }, + { + "epoch": 4.201012292118583, + "grad_norm": 1.8064910660008877, + "learning_rate": 1.7177238523073804e-06, + "logits/chosen": -0.7527694702148438, + "logits/rejected": -0.6076477766036987, + "logps/chosen": -0.12672166526317596, + "logps/rejected": -3.709519863128662, + "loss": 0.0817, + "odds_ratio_loss": 0.025193804875016212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012672166340053082, + "rewards/margins": 0.358279824256897, + "rewards/rejected": -0.3709520101547241, + "sft_loss": 0.12672166526317596, + "step": 2905 + }, + { + "epoch": 4.202458423716558, + "grad_norm": 1.4769855495178907, + "learning_rate": 1.7151738417549945e-06, + "logits/chosen": -0.8476330041885376, + "logits/rejected": -0.7080752849578857, + "logps/chosen": -0.08576930314302444, + "logps/rejected": -3.353492021560669, + "loss": 0.0825, + "odds_ratio_loss": 0.0258241668343544, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008576931431889534, + "rewards/margins": 0.3267722725868225, + "rewards/rejected": -0.3353492021560669, + "sft_loss": 0.08576930314302444, + "step": 2906 + }, + { + "epoch": 4.2039045553145336, + "grad_norm": 1.6047502239398708, + "learning_rate": 1.7126252087328106e-06, + "logits/chosen": -0.7401002645492554, + "logits/rejected": -0.48686888813972473, + "logps/chosen": -0.09672141075134277, + "logps/rejected": -5.065194129943848, + "loss": 0.0748, + "odds_ratio_loss": 0.011058647185564041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009672141633927822, + "rewards/margins": 0.4968472421169281, + "rewards/rejected": -0.5065193772315979, + "sft_loss": 0.09672141075134277, + "step": 2907 + }, + { + "epoch": 4.205350686912509, + "grad_norm": 1.784826024305332, + "learning_rate": 1.710077954777406e-06, + "logits/chosen": -0.947332501411438, + "logits/rejected": -0.6513444185256958, + "logps/chosen": -0.11616423726081848, + "logps/rejected": -5.848725318908691, + "loss": 0.0847, + "odds_ratio_loss": 0.021783018484711647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011616423726081848, + "rewards/margins": 0.5732560753822327, + "rewards/rejected": -0.5848724842071533, + "sft_loss": 0.11616423726081848, + "step": 2908 + }, + { + "epoch": 4.206796818510484, + "grad_norm": 1.6840418840730764, + "learning_rate": 1.7075320814245325e-06, + "logits/chosen": -0.7306966781616211, + "logits/rejected": -0.5873762369155884, + "logps/chosen": -0.10011477768421173, + "logps/rejected": -4.062368392944336, + "loss": 0.089, + "odds_ratio_loss": 0.009407087229192257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010011478327214718, + "rewards/margins": 0.39622533321380615, + "rewards/rejected": -0.40623682737350464, + "sft_loss": 0.10011477768421173, + "step": 2909 + }, + { + "epoch": 4.20824295010846, + "grad_norm": 2.1766607536331595, + "learning_rate": 1.7049875902091046e-06, + "logits/chosen": -0.9869181513786316, + "logits/rejected": -0.6725652813911438, + "logps/chosen": -0.07069652527570724, + "logps/rejected": -5.175534725189209, + "loss": 0.0693, + "odds_ratio_loss": 0.004759188741445541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007069652434438467, + "rewards/margins": 0.5104838013648987, + "rewards/rejected": -0.517553448677063, + "sft_loss": 0.07069652527570724, + "step": 2910 + }, + { + "epoch": 4.209689081706435, + "grad_norm": 1.3402883750903443, + "learning_rate": 1.7024444826652067e-06, + "logits/chosen": -0.5692568421363831, + "logits/rejected": -0.4671230912208557, + "logps/chosen": -0.046070732176303864, + "logps/rejected": -3.63814640045166, + "loss": 0.0817, + "odds_ratio_loss": 0.004079668782651424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004607073031365871, + "rewards/margins": 0.3592075705528259, + "rewards/rejected": -0.36381465196609497, + "sft_loss": 0.046070732176303864, + "step": 2911 + }, + { + "epoch": 4.211135213304411, + "grad_norm": 1.8269769593137486, + "learning_rate": 1.6999027603260853e-06, + "logits/chosen": -1.0144546031951904, + "logits/rejected": -0.7266017198562622, + "logps/chosen": -0.12296151369810104, + "logps/rejected": -4.764224529266357, + "loss": 0.0957, + "odds_ratio_loss": 0.00944902841001749, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012296151369810104, + "rewards/margins": 0.464126318693161, + "rewards/rejected": -0.4764224588871002, + "sft_loss": 0.12296151369810104, + "step": 2912 + }, + { + "epoch": 4.212581344902386, + "grad_norm": 1.6723520264588212, + "learning_rate": 1.697362424724158e-06, + "logits/chosen": -0.8207547664642334, + "logits/rejected": -0.6812379956245422, + "logps/chosen": -0.06191746145486832, + "logps/rejected": -5.384179592132568, + "loss": 0.0729, + "odds_ratio_loss": 0.006640473380684853, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006191745866090059, + "rewards/margins": 0.5322262644767761, + "rewards/rejected": -0.5384179353713989, + "sft_loss": 0.06191746145486832, + "step": 2913 + }, + { + "epoch": 4.214027476500362, + "grad_norm": 1.6031455868403621, + "learning_rate": 1.6948234773909995e-06, + "logits/chosen": -0.9777011871337891, + "logits/rejected": -0.8530696630477905, + "logps/chosen": -0.09695122390985489, + "logps/rejected": -4.301798343658447, + "loss": 0.1181, + "odds_ratio_loss": 0.008988775312900543, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009695122018456459, + "rewards/margins": 0.4204846918582916, + "rewards/rejected": -0.4301798343658447, + "sft_loss": 0.09695122390985489, + "step": 2914 + }, + { + "epoch": 4.215473608098337, + "grad_norm": 1.6686746446757759, + "learning_rate": 1.6922859198573516e-06, + "logits/chosen": -0.7078945636749268, + "logits/rejected": -0.5031204223632812, + "logps/chosen": -0.05042143911123276, + "logps/rejected": -4.290203094482422, + "loss": 0.0847, + "odds_ratio_loss": 0.007931235246360302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005042144097387791, + "rewards/margins": 0.42397817969322205, + "rewards/rejected": -0.4290202856063843, + "sft_loss": 0.05042143911123276, + "step": 2915 + }, + { + "epoch": 4.216919739696312, + "grad_norm": 1.3601920987396054, + "learning_rate": 1.6897497536531188e-06, + "logits/chosen": -0.8085181713104248, + "logits/rejected": -0.473737895488739, + "logps/chosen": -0.04474177956581116, + "logps/rejected": -4.583406448364258, + "loss": 0.0655, + "odds_ratio_loss": 0.0023093842901289463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004474177956581116, + "rewards/margins": 0.45386651158332825, + "rewards/rejected": -0.4583406448364258, + "sft_loss": 0.04474177956581116, + "step": 2916 + }, + { + "epoch": 4.218365871294288, + "grad_norm": 1.9745610489643932, + "learning_rate": 1.6872149803073642e-06, + "logits/chosen": -0.9446933269500732, + "logits/rejected": -0.632041871547699, + "logps/chosen": -0.07594672590494156, + "logps/rejected": -4.689000129699707, + "loss": 0.0593, + "odds_ratio_loss": 0.012281806208193302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007594672497361898, + "rewards/margins": 0.4613053798675537, + "rewards/rejected": -0.46890002489089966, + "sft_loss": 0.07594672590494156, + "step": 2917 + }, + { + "epoch": 4.219812002892263, + "grad_norm": 1.5163951778369975, + "learning_rate": 1.6846816013483114e-06, + "logits/chosen": -0.8689394593238831, + "logits/rejected": -0.5733392834663391, + "logps/chosen": -0.04879632219672203, + "logps/rejected": -3.380754232406616, + "loss": 0.0642, + "odds_ratio_loss": 0.004706839565187693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004879632964730263, + "rewards/margins": 0.3331957459449768, + "rewards/rejected": -0.3380753993988037, + "sft_loss": 0.04879632219672203, + "step": 2918 + }, + { + "epoch": 4.221258134490238, + "grad_norm": 1.5403147100350525, + "learning_rate": 1.6821496183033426e-06, + "logits/chosen": -0.6562444567680359, + "logits/rejected": -0.5436285138130188, + "logps/chosen": -0.06507329642772675, + "logps/rejected": -3.918442964553833, + "loss": 0.1034, + "odds_ratio_loss": 0.004862302914261818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006507330108433962, + "rewards/margins": 0.3853369653224945, + "rewards/rejected": -0.3918442726135254, + "sft_loss": 0.06507329642772675, + "step": 2919 + }, + { + "epoch": 4.222704266088214, + "grad_norm": 1.9639610620744994, + "learning_rate": 1.6796190326990035e-06, + "logits/chosen": -0.7960516810417175, + "logits/rejected": -0.5733585357666016, + "logps/chosen": -0.0999024510383606, + "logps/rejected": -5.8444013595581055, + "loss": 0.0888, + "odds_ratio_loss": 0.001729599549435079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00999024510383606, + "rewards/margins": 0.574449896812439, + "rewards/rejected": -0.5844402313232422, + "sft_loss": 0.0999024510383606, + "step": 2920 + }, + { + "epoch": 4.22415039768619, + "grad_norm": 1.2356981106459832, + "learning_rate": 1.6770898460609898e-06, + "logits/chosen": -0.9516746401786804, + "logits/rejected": -0.6569518446922302, + "logps/chosen": -0.04203661531209946, + "logps/rejected": -4.199306488037109, + "loss": 0.0365, + "odds_ratio_loss": 0.004309549927711487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004203661344945431, + "rewards/margins": 0.4157269597053528, + "rewards/rejected": -0.4199306070804596, + "sft_loss": 0.04203661531209946, + "step": 2921 + }, + { + "epoch": 4.225596529284164, + "grad_norm": 1.7615373206972187, + "learning_rate": 1.674562059914161e-06, + "logits/chosen": -0.8067028522491455, + "logits/rejected": -0.7631421089172363, + "logps/chosen": -0.07550188153982162, + "logps/rejected": -5.086093902587891, + "loss": 0.1018, + "odds_ratio_loss": 0.0082497987896204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007550188340246677, + "rewards/margins": 0.5010592341423035, + "rewards/rejected": -0.508609414100647, + "sft_loss": 0.07550188153982162, + "step": 2922 + }, + { + "epoch": 4.22704266088214, + "grad_norm": 1.6479858436312316, + "learning_rate": 1.6720356757825256e-06, + "logits/chosen": -0.8297333717346191, + "logits/rejected": -0.7453821301460266, + "logps/chosen": -0.16031071543693542, + "logps/rejected": -4.488302230834961, + "loss": 0.098, + "odds_ratio_loss": 0.019046666100621223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016031071543693542, + "rewards/margins": 0.43279916048049927, + "rewards/rejected": -0.448830246925354, + "sft_loss": 0.16031071543693542, + "step": 2923 + }, + { + "epoch": 4.228488792480116, + "grad_norm": 1.7918703336478432, + "learning_rate": 1.669510695189253e-06, + "logits/chosen": -0.833388090133667, + "logits/rejected": -0.6928578615188599, + "logps/chosen": -0.0595964752137661, + "logps/rejected": -4.915972709655762, + "loss": 0.0828, + "odds_ratio_loss": 0.005791317671537399, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005959647241979837, + "rewards/margins": 0.4856376349925995, + "rewards/rejected": -0.4915972352027893, + "sft_loss": 0.0595964752137661, + "step": 2924 + }, + { + "epoch": 4.2299349240780915, + "grad_norm": 1.7973661105477898, + "learning_rate": 1.6669871196566607e-06, + "logits/chosen": -0.6049919128417969, + "logits/rejected": -0.48009204864501953, + "logps/chosen": -0.09875990450382233, + "logps/rejected": -4.366959571838379, + "loss": 0.088, + "odds_ratio_loss": 0.009393557906150818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009875990450382233, + "rewards/margins": 0.4268200099468231, + "rewards/rejected": -0.43669599294662476, + "sft_loss": 0.09875990450382233, + "step": 2925 + }, + { + "epoch": 4.231381055676066, + "grad_norm": 1.3643896640774793, + "learning_rate": 1.6644649507062241e-06, + "logits/chosen": -0.8335921764373779, + "logits/rejected": -0.5632571578025818, + "logps/chosen": -0.057263340801000595, + "logps/rejected": -5.25675630569458, + "loss": 0.0885, + "odds_ratio_loss": 0.0035503399558365345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00572633370757103, + "rewards/margins": 0.5199492573738098, + "rewards/rejected": -0.5256756544113159, + "sft_loss": 0.057263340801000595, + "step": 2926 + }, + { + "epoch": 4.232827187274042, + "grad_norm": 1.7117069678343029, + "learning_rate": 1.6619441898585676e-06, + "logits/chosen": -0.860699474811554, + "logits/rejected": -0.7995709180831909, + "logps/chosen": -0.06467077136039734, + "logps/rejected": -3.649376392364502, + "loss": 0.1072, + "odds_ratio_loss": 0.009878093376755714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006467076949775219, + "rewards/margins": 0.3584705591201782, + "rewards/rejected": -0.36493760347366333, + "sft_loss": 0.06467077136039734, + "step": 2927 + }, + { + "epoch": 4.234273318872018, + "grad_norm": 1.4866554904397025, + "learning_rate": 1.6594248386334649e-06, + "logits/chosen": -0.9298467040061951, + "logits/rejected": -0.7085505723953247, + "logps/chosen": -0.07608020305633545, + "logps/rejected": -4.444356918334961, + "loss": 0.0761, + "odds_ratio_loss": 0.004563465248793364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007608020678162575, + "rewards/margins": 0.43682771921157837, + "rewards/rejected": -0.444435715675354, + "sft_loss": 0.07608020305633545, + "step": 2928 + }, + { + "epoch": 4.235719450469992, + "grad_norm": 1.4014182752061624, + "learning_rate": 1.6569068985498457e-06, + "logits/chosen": -0.943458616733551, + "logits/rejected": -0.849433183670044, + "logps/chosen": -0.051664408296346664, + "logps/rejected": -4.948188781738281, + "loss": 0.055, + "odds_ratio_loss": 0.006422580685466528, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005166441202163696, + "rewards/margins": 0.4896524250507355, + "rewards/rejected": -0.49481892585754395, + "sft_loss": 0.051664408296346664, + "step": 2929 + }, + { + "epoch": 4.237165582067968, + "grad_norm": 1.854606938644546, + "learning_rate": 1.6543903711257832e-06, + "logits/chosen": -0.7278324365615845, + "logits/rejected": -0.4564022421836853, + "logps/chosen": -0.10963063687086105, + "logps/rejected": -4.744562149047852, + "loss": 0.0732, + "odds_ratio_loss": 0.009700143709778786, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01096306461840868, + "rewards/margins": 0.46349310874938965, + "rewards/rejected": -0.47445619106292725, + "sft_loss": 0.10963063687086105, + "step": 2930 + }, + { + "epoch": 4.238611713665944, + "grad_norm": 1.3679336336823964, + "learning_rate": 1.651875257878503e-06, + "logits/chosen": -0.8306885361671448, + "logits/rejected": -0.7784897089004517, + "logps/chosen": -0.05908175930380821, + "logps/rejected": -4.5930585861206055, + "loss": 0.0494, + "odds_ratio_loss": 0.011198028922080994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005908175837248564, + "rewards/margins": 0.4533976912498474, + "rewards/rejected": -0.45930585265159607, + "sft_loss": 0.05908175930380821, + "step": 2931 + }, + { + "epoch": 4.240057845263919, + "grad_norm": 1.495799549172853, + "learning_rate": 1.6493615603243733e-06, + "logits/chosen": -0.5538917183876038, + "logits/rejected": -0.5523948073387146, + "logps/chosen": -0.07916896045207977, + "logps/rejected": -4.839788436889648, + "loss": 0.0927, + "odds_ratio_loss": 0.06816166639328003, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.007916895672678947, + "rewards/margins": 0.47606196999549866, + "rewards/rejected": -0.48397886753082275, + "sft_loss": 0.07916896045207977, + "step": 2932 + }, + { + "epoch": 4.241503976861894, + "grad_norm": 1.9770485452901845, + "learning_rate": 1.6468492799789155e-06, + "logits/chosen": -0.8710498809814453, + "logits/rejected": -0.7079587578773499, + "logps/chosen": -0.037493444979190826, + "logps/rejected": -4.81158447265625, + "loss": 0.0908, + "odds_ratio_loss": 0.004065762739628553, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0037493444979190826, + "rewards/margins": 0.47740909457206726, + "rewards/rejected": -0.48115843534469604, + "sft_loss": 0.037493444979190826, + "step": 2933 + }, + { + "epoch": 4.24295010845987, + "grad_norm": 1.7584110563510973, + "learning_rate": 1.6443384183567907e-06, + "logits/chosen": -0.779168963432312, + "logits/rejected": -0.7387828826904297, + "logps/chosen": -0.12338142096996307, + "logps/rejected": -4.761046409606934, + "loss": 0.0934, + "odds_ratio_loss": 0.015843737870454788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012338140979409218, + "rewards/margins": 0.463766485452652, + "rewards/rejected": -0.47610461711883545, + "sft_loss": 0.12338142096996307, + "step": 2934 + }, + { + "epoch": 4.244396240057846, + "grad_norm": 1.7602609188825338, + "learning_rate": 1.6418289769718072e-06, + "logits/chosen": -0.8570691347122192, + "logits/rejected": -0.7762001752853394, + "logps/chosen": -0.09432914108037949, + "logps/rejected": -4.270380973815918, + "loss": 0.1054, + "odds_ratio_loss": 0.009115164168179035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009432912804186344, + "rewards/margins": 0.4176052212715149, + "rewards/rejected": -0.4270381033420563, + "sft_loss": 0.09432914108037949, + "step": 2935 + }, + { + "epoch": 4.2458423716558205, + "grad_norm": 1.7361866931127048, + "learning_rate": 1.6393209573369146e-06, + "logits/chosen": -0.934897780418396, + "logits/rejected": -0.7846497297286987, + "logps/chosen": -0.07415792346000671, + "logps/rejected": -4.217360496520996, + "loss": 0.1052, + "odds_ratio_loss": 0.011329401284456253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007415792904794216, + "rewards/margins": 0.41432029008865356, + "rewards/rejected": -0.42173606157302856, + "sft_loss": 0.07415792346000671, + "step": 2936 + }, + { + "epoch": 4.247288503253796, + "grad_norm": 1.622613194938224, + "learning_rate": 1.6368143609642102e-06, + "logits/chosen": -0.8236643075942993, + "logits/rejected": -0.6638423204421997, + "logps/chosen": -0.10684844851493835, + "logps/rejected": -4.309247016906738, + "loss": 0.0832, + "odds_ratio_loss": 0.015189705416560173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01068484503775835, + "rewards/margins": 0.4202398657798767, + "rewards/rejected": -0.4309247136116028, + "sft_loss": 0.10684844851493835, + "step": 2937 + }, + { + "epoch": 4.248734634851772, + "grad_norm": 1.4281181979339255, + "learning_rate": 1.6343091893649282e-06, + "logits/chosen": -0.64753657579422, + "logits/rejected": -0.5138456225395203, + "logps/chosen": -0.029012585058808327, + "logps/rejected": -6.18804931640625, + "loss": 0.0521, + "odds_ratio_loss": 0.0036126519553363323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002901258412748575, + "rewards/margins": 0.6159037351608276, + "rewards/rejected": -0.618804931640625, + "sft_loss": 0.029012585058808327, + "step": 2938 + }, + { + "epoch": 4.250180766449747, + "grad_norm": 1.5929975903122693, + "learning_rate": 1.6318054440494473e-06, + "logits/chosen": -0.7193521857261658, + "logits/rejected": -0.6249187588691711, + "logps/chosen": -0.07214511930942535, + "logps/rejected": -5.506035804748535, + "loss": 0.1076, + "odds_ratio_loss": 0.010497845709323883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007214512676000595, + "rewards/margins": 0.543389081954956, + "rewards/rejected": -0.5506036281585693, + "sft_loss": 0.07214511930942535, + "step": 2939 + }, + { + "epoch": 4.251626898047722, + "grad_norm": 1.6636318072838059, + "learning_rate": 1.6293031265272834e-06, + "logits/chosen": -0.7778177261352539, + "logits/rejected": -0.7233993411064148, + "logps/chosen": -0.05430297553539276, + "logps/rejected": -3.162222385406494, + "loss": 0.0946, + "odds_ratio_loss": 0.010178321041166782, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005430297926068306, + "rewards/margins": 0.310791939496994, + "rewards/rejected": -0.31622225046157837, + "sft_loss": 0.05430297553539276, + "step": 2940 + }, + { + "epoch": 4.253073029645698, + "grad_norm": 1.677532945172881, + "learning_rate": 1.6268022383070949e-06, + "logits/chosen": -0.6373997926712036, + "logits/rejected": -0.579150915145874, + "logps/chosen": -0.07864230126142502, + "logps/rejected": -4.474689483642578, + "loss": 0.0894, + "odds_ratio_loss": 0.022512556985020638, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007864230312407017, + "rewards/margins": 0.4396046996116638, + "rewards/rejected": -0.44746896624565125, + "sft_loss": 0.07864230126142502, + "step": 2941 + }, + { + "epoch": 4.254519161243673, + "grad_norm": 1.6819675796686997, + "learning_rate": 1.6243027808966763e-06, + "logits/chosen": -0.9810624122619629, + "logits/rejected": -0.6949055194854736, + "logps/chosen": -0.1464422345161438, + "logps/rejected": -4.525935649871826, + "loss": 0.0808, + "odds_ratio_loss": 0.008545887656509876, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01464422419667244, + "rewards/margins": 0.43794935941696167, + "rewards/rejected": -0.4525935649871826, + "sft_loss": 0.1464422345161438, + "step": 2942 + }, + { + "epoch": 4.2559652928416485, + "grad_norm": 1.5653710384809179, + "learning_rate": 1.6218047558029574e-06, + "logits/chosen": -0.9486204385757446, + "logits/rejected": -0.6342113018035889, + "logps/chosen": -0.09254883229732513, + "logps/rejected": -4.13349723815918, + "loss": 0.1043, + "odds_ratio_loss": 0.009536804631352425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009254883043467999, + "rewards/margins": 0.4040948152542114, + "rewards/rejected": -0.4133497178554535, + "sft_loss": 0.09254883229732513, + "step": 2943 + }, + { + "epoch": 4.257411424439624, + "grad_norm": 1.8681108971211955, + "learning_rate": 1.6193081645320098e-06, + "logits/chosen": -0.7743822336196899, + "logits/rejected": -0.5420173406600952, + "logps/chosen": -0.0828114002943039, + "logps/rejected": -4.951780319213867, + "loss": 0.0878, + "odds_ratio_loss": 0.01936756819486618, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00828113965690136, + "rewards/margins": 0.4868968725204468, + "rewards/rejected": -0.4951780438423157, + "sft_loss": 0.0828114002943039, + "step": 2944 + }, + { + "epoch": 4.258857556037599, + "grad_norm": 1.736072625058886, + "learning_rate": 1.6168130085890353e-06, + "logits/chosen": -0.9736768007278442, + "logits/rejected": -0.6207584738731384, + "logps/chosen": -0.07031136751174927, + "logps/rejected": -2.8038110733032227, + "loss": 0.0748, + "odds_ratio_loss": 0.009815733879804611, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007031137123703957, + "rewards/margins": 0.27334994077682495, + "rewards/rejected": -0.28038108348846436, + "sft_loss": 0.07031136751174927, + "step": 2945 + }, + { + "epoch": 4.260303687635575, + "grad_norm": 1.8051801795448943, + "learning_rate": 1.6143192894783751e-06, + "logits/chosen": -0.671993613243103, + "logits/rejected": -0.5487344264984131, + "logps/chosen": -0.1085626408457756, + "logps/rejected": -4.188941478729248, + "loss": 0.0862, + "odds_ratio_loss": 0.004737728741019964, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010856264270842075, + "rewards/margins": 0.4080379009246826, + "rewards/rejected": -0.4188941717147827, + "sft_loss": 0.1085626408457756, + "step": 2946 + }, + { + "epoch": 4.26174981923355, + "grad_norm": 1.3776328659708403, + "learning_rate": 1.611827008703499e-06, + "logits/chosen": -0.7267762422561646, + "logits/rejected": -0.4366953372955322, + "logps/chosen": -0.10525096952915192, + "logps/rejected": -3.887083053588867, + "loss": 0.0738, + "odds_ratio_loss": 0.007737172767519951, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010525096207857132, + "rewards/margins": 0.3781832158565521, + "rewards/rejected": -0.38870832324028015, + "sft_loss": 0.10525096952915192, + "step": 2947 + }, + { + "epoch": 4.263195950831526, + "grad_norm": 1.6865062513248021, + "learning_rate": 1.6093361677670157e-06, + "logits/chosen": -0.5784760117530823, + "logits/rejected": -0.5577380657196045, + "logps/chosen": -0.03789515048265457, + "logps/rejected": -4.079635143280029, + "loss": 0.0948, + "odds_ratio_loss": 0.008148334920406342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003789515234529972, + "rewards/margins": 0.4041740298271179, + "rewards/rejected": -0.4079635441303253, + "sft_loss": 0.03789515048265457, + "step": 2948 + }, + { + "epoch": 4.264642082429501, + "grad_norm": 1.573544466630816, + "learning_rate": 1.6068467681706602e-06, + "logits/chosen": -0.6730165481567383, + "logits/rejected": -0.38399767875671387, + "logps/chosen": -0.05993356555700302, + "logps/rejected": -5.047731399536133, + "loss": 0.0711, + "odds_ratio_loss": 0.002913933712989092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005993356462568045, + "rewards/margins": 0.4987798035144806, + "rewards/rejected": -0.5047731399536133, + "sft_loss": 0.05993356555700302, + "step": 2949 + }, + { + "epoch": 4.2660882140274765, + "grad_norm": 1.4803755685437572, + "learning_rate": 1.6043588114153016e-06, + "logits/chosen": -0.8042940497398376, + "logits/rejected": -0.545076847076416, + "logps/chosen": -0.04276617616415024, + "logps/rejected": -4.222831726074219, + "loss": 0.0499, + "odds_ratio_loss": 0.004342740401625633, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0042766183614730835, + "rewards/margins": 0.4180065393447876, + "rewards/rejected": -0.4222831726074219, + "sft_loss": 0.04276617616415024, + "step": 2950 + }, + { + "epoch": 4.267534345625452, + "grad_norm": 1.546044165144899, + "learning_rate": 1.601872299000936e-06, + "logits/chosen": -0.830041766166687, + "logits/rejected": -0.49972018599510193, + "logps/chosen": -0.17610527575016022, + "logps/rejected": -4.997128486633301, + "loss": 0.1124, + "odds_ratio_loss": 0.016635987907648087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01761052943766117, + "rewards/margins": 0.48210230469703674, + "rewards/rejected": -0.49971282482147217, + "sft_loss": 0.17610527575016022, + "step": 2951 + }, + { + "epoch": 4.268980477223427, + "grad_norm": 1.6893484365315183, + "learning_rate": 1.599387232426695e-06, + "logits/chosen": -0.8119892477989197, + "logits/rejected": -0.5879321694374084, + "logps/chosen": -0.09631012380123138, + "logps/rejected": -4.048828601837158, + "loss": 0.081, + "odds_ratio_loss": 0.006678726989775896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009631011635065079, + "rewards/margins": 0.3952518701553345, + "rewards/rejected": -0.40488284826278687, + "sft_loss": 0.09631012380123138, + "step": 2952 + }, + { + "epoch": 4.270426608821403, + "grad_norm": 1.507223223978156, + "learning_rate": 1.5969036131908302e-06, + "logits/chosen": -0.9762201905250549, + "logits/rejected": -0.7047141790390015, + "logps/chosen": -0.05365900322794914, + "logps/rejected": -4.9685821533203125, + "loss": 0.0596, + "odds_ratio_loss": 0.00955482292920351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005365900695323944, + "rewards/margins": 0.4914923310279846, + "rewards/rejected": -0.49685823917388916, + "sft_loss": 0.05365900322794914, + "step": 2953 + }, + { + "epoch": 4.271872740419378, + "grad_norm": 1.6075610959081317, + "learning_rate": 1.5944214427907277e-06, + "logits/chosen": -0.8845387697219849, + "logits/rejected": -0.6668776869773865, + "logps/chosen": -0.07586217671632767, + "logps/rejected": -4.427034378051758, + "loss": 0.0802, + "odds_ratio_loss": 0.00986341293901205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007586217485368252, + "rewards/margins": 0.435117244720459, + "rewards/rejected": -0.4427034556865692, + "sft_loss": 0.07586217671632767, + "step": 2954 + }, + { + "epoch": 4.273318872017353, + "grad_norm": 1.4176098445474563, + "learning_rate": 1.5919407227228976e-06, + "logits/chosen": -0.9498851299285889, + "logits/rejected": -0.5791040062904358, + "logps/chosen": -0.051679011434316635, + "logps/rejected": -4.279007434844971, + "loss": 0.0494, + "odds_ratio_loss": 0.0026139526162296534, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005167901515960693, + "rewards/margins": 0.4227328300476074, + "rewards/rejected": -0.4279007315635681, + "sft_loss": 0.051679011434316635, + "step": 2955 + }, + { + "epoch": 4.274765003615329, + "grad_norm": 1.6257258837083226, + "learning_rate": 1.5894614544829747e-06, + "logits/chosen": -0.7169545888900757, + "logits/rejected": -0.42061808705329895, + "logps/chosen": -0.14304006099700928, + "logps/rejected": -6.230022430419922, + "loss": 0.0903, + "odds_ratio_loss": 0.005613164976239204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014304005540907383, + "rewards/margins": 0.6086981892585754, + "rewards/rejected": -0.6230022311210632, + "sft_loss": 0.14304006099700928, + "step": 2956 + }, + { + "epoch": 4.2762111352133045, + "grad_norm": 1.8611753096508106, + "learning_rate": 1.5869836395657185e-06, + "logits/chosen": -0.6850326061248779, + "logits/rejected": -0.4503934979438782, + "logps/chosen": -0.03483531251549721, + "logps/rejected": -5.079290866851807, + "loss": 0.0554, + "odds_ratio_loss": 0.0030643048230558634, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0034835312981158495, + "rewards/margins": 0.5044455528259277, + "rewards/rejected": -0.5079290866851807, + "sft_loss": 0.03483531251549721, + "step": 2957 + }, + { + "epoch": 4.27765726681128, + "grad_norm": 1.4420781987404232, + "learning_rate": 1.5845072794650118e-06, + "logits/chosen": -0.765277087688446, + "logits/rejected": -0.6213148832321167, + "logps/chosen": -0.1533205658197403, + "logps/rejected": -4.846577167510986, + "loss": 0.0834, + "odds_ratio_loss": 0.022611288353800774, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015332056209445, + "rewards/margins": 0.46932563185691833, + "rewards/rejected": -0.4846577048301697, + "sft_loss": 0.1533205658197403, + "step": 2958 + }, + { + "epoch": 4.279103398409255, + "grad_norm": 1.5171362176056595, + "learning_rate": 1.5820323756738643e-06, + "logits/chosen": -0.8511385917663574, + "logits/rejected": -0.5496898293495178, + "logps/chosen": -0.048428021371364594, + "logps/rejected": -5.54318380355835, + "loss": 0.0619, + "odds_ratio_loss": 0.006257210858166218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004842801950871944, + "rewards/margins": 0.5494755506515503, + "rewards/rejected": -0.554318368434906, + "sft_loss": 0.048428021371364594, + "step": 2959 + }, + { + "epoch": 4.280549530007231, + "grad_norm": 1.7435049856938651, + "learning_rate": 1.579558929684401e-06, + "logits/chosen": -0.6494778394699097, + "logits/rejected": -0.5383796095848083, + "logps/chosen": -0.14231713116168976, + "logps/rejected": -3.891136646270752, + "loss": 0.1077, + "odds_ratio_loss": 0.02279626578092575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01423171442002058, + "rewards/margins": 0.37488192319869995, + "rewards/rejected": -0.3891136646270752, + "sft_loss": 0.14231713116168976, + "step": 2960 + }, + { + "epoch": 4.281995661605206, + "grad_norm": 1.6390575823940858, + "learning_rate": 1.5770869429878752e-06, + "logits/chosen": -0.9379094243049622, + "logits/rejected": -0.8066362738609314, + "logps/chosen": -0.08813867717981339, + "logps/rejected": -3.6948931217193604, + "loss": 0.0692, + "odds_ratio_loss": 0.01244413573294878, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008813867345452309, + "rewards/margins": 0.3606754541397095, + "rewards/rejected": -0.3694893419742584, + "sft_loss": 0.08813867717981339, + "step": 2961 + }, + { + "epoch": 4.283441793203181, + "grad_norm": 1.6843204785703874, + "learning_rate": 1.5746164170746542e-06, + "logits/chosen": -0.8207695484161377, + "logits/rejected": -0.7455189824104309, + "logps/chosen": -0.019551407545804977, + "logps/rejected": -4.0999836921691895, + "loss": 0.0764, + "odds_ratio_loss": 0.0014808757696300745, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001955140847712755, + "rewards/margins": 0.40804323554039, + "rewards/rejected": -0.40999835729599, + "sft_loss": 0.019551407545804977, + "step": 2962 + }, + { + "epoch": 4.284887924801157, + "grad_norm": 1.7963696202728958, + "learning_rate": 1.5721473534342296e-06, + "logits/chosen": -0.8363921642303467, + "logits/rejected": -0.680959939956665, + "logps/chosen": -0.0901620164513588, + "logps/rejected": -5.3850202560424805, + "loss": 0.0799, + "odds_ratio_loss": 0.008571883663535118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009016201831400394, + "rewards/margins": 0.529485821723938, + "rewards/rejected": -0.538502037525177, + "sft_loss": 0.0901620164513588, + "step": 2963 + }, + { + "epoch": 4.286334056399133, + "grad_norm": 1.7375869804762725, + "learning_rate": 1.5696797535552078e-06, + "logits/chosen": -0.9396668672561646, + "logits/rejected": -0.5902308821678162, + "logps/chosen": -0.05397862195968628, + "logps/rejected": -3.661261558532715, + "loss": 0.1087, + "odds_ratio_loss": 0.006330575793981552, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005397862754762173, + "rewards/margins": 0.36072826385498047, + "rewards/rejected": -0.3661261796951294, + "sft_loss": 0.05397862195968628, + "step": 2964 + }, + { + "epoch": 4.287780187997107, + "grad_norm": 1.759220854338626, + "learning_rate": 1.5672136189253143e-06, + "logits/chosen": -0.8728683590888977, + "logits/rejected": -0.5772249698638916, + "logps/chosen": -0.12193911522626877, + "logps/rejected": -6.481555461883545, + "loss": 0.0981, + "odds_ratio_loss": 0.013428367674350739, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012193911708891392, + "rewards/margins": 0.635961651802063, + "rewards/rejected": -0.6481555104255676, + "sft_loss": 0.12193911522626877, + "step": 2965 + }, + { + "epoch": 4.289226319595083, + "grad_norm": 1.4307744608227613, + "learning_rate": 1.5647489510313894e-06, + "logits/chosen": -0.595003068447113, + "logits/rejected": -0.4218134582042694, + "logps/chosen": -0.09229257702827454, + "logps/rejected": -4.479331970214844, + "loss": 0.0776, + "odds_ratio_loss": 0.007592702284455299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009229256771504879, + "rewards/margins": 0.4387039542198181, + "rewards/rejected": -0.44793322682380676, + "sft_loss": 0.09229257702827454, + "step": 2966 + }, + { + "epoch": 4.290672451193059, + "grad_norm": 1.5528265033967315, + "learning_rate": 1.562285751359393e-06, + "logits/chosen": -0.5394838452339172, + "logits/rejected": -0.5432738661766052, + "logps/chosen": -0.051818784326314926, + "logps/rejected": -5.729471206665039, + "loss": 0.0756, + "odds_ratio_loss": 0.008107408881187439, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0051818788051605225, + "rewards/margins": 0.5677652359008789, + "rewards/rejected": -0.5729471445083618, + "sft_loss": 0.051818784326314926, + "step": 2967 + }, + { + "epoch": 4.2921185827910335, + "grad_norm": 1.5265022495863947, + "learning_rate": 1.5598240213943945e-06, + "logits/chosen": -0.8859455585479736, + "logits/rejected": -0.606169581413269, + "logps/chosen": -0.068026602268219, + "logps/rejected": -3.8222718238830566, + "loss": 0.0673, + "odds_ratio_loss": 0.005845806561410427, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006802660878747702, + "rewards/margins": 0.3754245638847351, + "rewards/rejected": -0.38222724199295044, + "sft_loss": 0.068026602268219, + "step": 2968 + }, + { + "epoch": 4.293564714389009, + "grad_norm": 1.7731344063337215, + "learning_rate": 1.5573637626205818e-06, + "logits/chosen": -0.7323688864707947, + "logits/rejected": -0.6176028847694397, + "logps/chosen": -0.05627206712961197, + "logps/rejected": -4.247992038726807, + "loss": 0.0782, + "odds_ratio_loss": 0.004452931694686413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005627206526696682, + "rewards/margins": 0.4191719889640808, + "rewards/rejected": -0.42479920387268066, + "sft_loss": 0.05627206712961197, + "step": 2969 + }, + { + "epoch": 4.295010845986985, + "grad_norm": 1.6054365983973067, + "learning_rate": 1.5549049765212554e-06, + "logits/chosen": -0.6429954767227173, + "logits/rejected": -0.5249923467636108, + "logps/chosen": -0.14273762702941895, + "logps/rejected": -4.192305564880371, + "loss": 0.0963, + "odds_ratio_loss": 0.06254995614290237, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.01427376363426447, + "rewards/margins": 0.4049568176269531, + "rewards/rejected": -0.419230580329895, + "sft_loss": 0.14273762702941895, + "step": 2970 + }, + { + "epoch": 4.296456977584961, + "grad_norm": 1.527875617710426, + "learning_rate": 1.5524476645788238e-06, + "logits/chosen": -0.8400360941886902, + "logits/rejected": -0.5458084344863892, + "logps/chosen": -0.06384026259183884, + "logps/rejected": -5.687685489654541, + "loss": 0.0735, + "odds_ratio_loss": 0.0056127458810806274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006384026259183884, + "rewards/margins": 0.5623845458030701, + "rewards/rejected": -0.5687686204910278, + "sft_loss": 0.06384026259183884, + "step": 2971 + }, + { + "epoch": 4.297903109182935, + "grad_norm": 1.5936572274065168, + "learning_rate": 1.5499918282748122e-06, + "logits/chosen": -0.8916709423065186, + "logits/rejected": -0.6111676692962646, + "logps/chosen": -0.0904655009508133, + "logps/rejected": -4.318139553070068, + "loss": 0.0742, + "odds_ratio_loss": 0.0061836340464651585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009046549908816814, + "rewards/margins": 0.42276743054389954, + "rewards/rejected": -0.43181395530700684, + "sft_loss": 0.0904655009508133, + "step": 2972 + }, + { + "epoch": 4.299349240780911, + "grad_norm": 1.5231636245400404, + "learning_rate": 1.5475374690898519e-06, + "logits/chosen": -0.7592072486877441, + "logits/rejected": -0.706114649772644, + "logps/chosen": -0.02892283909022808, + "logps/rejected": -4.623927116394043, + "loss": 0.0602, + "odds_ratio_loss": 0.006367319729179144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002892283955588937, + "rewards/margins": 0.45950043201446533, + "rewards/rejected": -0.4623927175998688, + "sft_loss": 0.02892283909022808, + "step": 2973 + }, + { + "epoch": 4.300795372378887, + "grad_norm": 1.363212769075135, + "learning_rate": 1.5450845885036858e-06, + "logits/chosen": -0.7811589241027832, + "logits/rejected": -0.6956539750099182, + "logps/chosen": -0.13292838633060455, + "logps/rejected": -3.8825864791870117, + "loss": 0.0763, + "odds_ratio_loss": 0.009715279564261436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01329283881932497, + "rewards/margins": 0.3749657869338989, + "rewards/rejected": -0.3882586359977722, + "sft_loss": 0.13292838633060455, + "step": 2974 + }, + { + "epoch": 4.302241503976862, + "grad_norm": 1.6827472496502511, + "learning_rate": 1.5426331879951628e-06, + "logits/chosen": -1.0225802659988403, + "logits/rejected": -0.652125895023346, + "logps/chosen": -0.06771662831306458, + "logps/rejected": -5.0016303062438965, + "loss": 0.0796, + "odds_ratio_loss": 0.008105105720460415, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006771662272512913, + "rewards/margins": 0.49339133501052856, + "rewards/rejected": -0.5001630187034607, + "sft_loss": 0.06771662831306458, + "step": 2975 + }, + { + "epoch": 4.303687635574837, + "grad_norm": 1.928130002839635, + "learning_rate": 1.5401832690422448e-06, + "logits/chosen": -0.7736106514930725, + "logits/rejected": -0.6177071332931519, + "logps/chosen": -0.14021962881088257, + "logps/rejected": -3.8993277549743652, + "loss": 0.1028, + "odds_ratio_loss": 0.008643961511552334, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014021963812410831, + "rewards/margins": 0.37591081857681274, + "rewards/rejected": -0.389932781457901, + "sft_loss": 0.14021962881088257, + "step": 2976 + }, + { + "epoch": 4.305133767172813, + "grad_norm": 1.4138003491700672, + "learning_rate": 1.5377348331219934e-06, + "logits/chosen": -0.6652264595031738, + "logits/rejected": -0.5014814734458923, + "logps/chosen": -0.045446865260601044, + "logps/rejected": -4.814835548400879, + "loss": 0.0547, + "odds_ratio_loss": 0.003532176371663809, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004544686526060104, + "rewards/margins": 0.4769388735294342, + "rewards/rejected": -0.4814835786819458, + "sft_loss": 0.045446865260601044, + "step": 2977 + }, + { + "epoch": 4.306579898770788, + "grad_norm": 1.2321565482718624, + "learning_rate": 1.535287881710583e-06, + "logits/chosen": -0.6869823336601257, + "logits/rejected": -0.485745906829834, + "logps/chosen": -0.016159405931830406, + "logps/rejected": -7.835278511047363, + "loss": 0.0353, + "odds_ratio_loss": 0.0009945888305082917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016159405931830406, + "rewards/margins": 0.7819118499755859, + "rewards/rejected": -0.7835278511047363, + "sft_loss": 0.016159405931830406, + "step": 2978 + }, + { + "epoch": 4.308026030368763, + "grad_norm": 1.761381333961773, + "learning_rate": 1.5328424162832869e-06, + "logits/chosen": -0.8571205139160156, + "logits/rejected": -0.7177923917770386, + "logps/chosen": -0.09584394842386246, + "logps/rejected": -5.071878433227539, + "loss": 0.0894, + "odds_ratio_loss": 0.01330016739666462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00958439614623785, + "rewards/margins": 0.4976034164428711, + "rewards/rejected": -0.5071878433227539, + "sft_loss": 0.09584394842386246, + "step": 2979 + }, + { + "epoch": 4.309472161966739, + "grad_norm": 1.560676269187292, + "learning_rate": 1.5303984383144881e-06, + "logits/chosen": -0.8674488067626953, + "logits/rejected": -0.5158795714378357, + "logps/chosen": -0.08388251066207886, + "logps/rejected": -4.4099578857421875, + "loss": 0.0683, + "odds_ratio_loss": 0.00677723903208971, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008388251066207886, + "rewards/margins": 0.432607501745224, + "rewards/rejected": -0.4409957528114319, + "sft_loss": 0.08388251066207886, + "step": 2980 + }, + { + "epoch": 4.310918293564715, + "grad_norm": 1.5808008085636467, + "learning_rate": 1.5279559492776688e-06, + "logits/chosen": -0.6280909180641174, + "logits/rejected": -0.4606649577617645, + "logps/chosen": -0.06359011679887772, + "logps/rejected": -3.5109429359436035, + "loss": 0.0882, + "odds_ratio_loss": 0.008953486569225788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0063590118661522865, + "rewards/margins": 0.3447352945804596, + "rewards/rejected": -0.3510943055152893, + "sft_loss": 0.06359011679887772, + "step": 2981 + }, + { + "epoch": 4.31236442516269, + "grad_norm": 2.3823337309263204, + "learning_rate": 1.5255149506454127e-06, + "logits/chosen": -0.663183867931366, + "logits/rejected": -0.63742595911026, + "logps/chosen": -0.06929976493120193, + "logps/rejected": -3.3315176963806152, + "loss": 0.0753, + "odds_ratio_loss": 0.006415051873773336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006929976865649223, + "rewards/margins": 0.3262217938899994, + "rewards/rejected": -0.33315175771713257, + "sft_loss": 0.06929976493120193, + "step": 2982 + }, + { + "epoch": 4.313810556760665, + "grad_norm": 1.9637548548003436, + "learning_rate": 1.523075443889411e-06, + "logits/chosen": -0.8092422485351562, + "logits/rejected": -0.5086420774459839, + "logps/chosen": -0.09017635136842728, + "logps/rejected": -4.9352827072143555, + "loss": 0.0883, + "odds_ratio_loss": 0.004887173883616924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009017635136842728, + "rewards/margins": 0.4845106899738312, + "rewards/rejected": -0.4935283064842224, + "sft_loss": 0.09017635136842728, + "step": 2983 + }, + { + "epoch": 4.315256688358641, + "grad_norm": 1.7855616405277308, + "learning_rate": 1.520637430480447e-06, + "logits/chosen": -0.6320347785949707, + "logits/rejected": -0.5742474794387817, + "logps/chosen": -0.10536934435367584, + "logps/rejected": -5.842680931091309, + "loss": 0.0956, + "odds_ratio_loss": 0.009043958969414234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01053693424910307, + "rewards/margins": 0.5737311244010925, + "rewards/rejected": -0.5842680931091309, + "sft_loss": 0.10536934435367584, + "step": 2984 + }, + { + "epoch": 4.316702819956616, + "grad_norm": 1.4993141920071105, + "learning_rate": 1.518200911888412e-06, + "logits/chosen": -0.676465630531311, + "logits/rejected": -0.4837293028831482, + "logps/chosen": -0.03564174845814705, + "logps/rejected": -3.781062364578247, + "loss": 0.0672, + "odds_ratio_loss": 0.002124813385307789, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003564174985513091, + "rewards/margins": 0.3745420575141907, + "rewards/rejected": -0.3781062364578247, + "sft_loss": 0.03564174845814705, + "step": 2985 + }, + { + "epoch": 4.318148951554591, + "grad_norm": 1.7181779387692628, + "learning_rate": 1.5157658895822892e-06, + "logits/chosen": -0.9328091144561768, + "logits/rejected": -0.6783272624015808, + "logps/chosen": -0.10630530118942261, + "logps/rejected": -5.30551290512085, + "loss": 0.1007, + "odds_ratio_loss": 0.011326825246214867, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010630530305206776, + "rewards/margins": 0.519920825958252, + "rewards/rejected": -0.5305513739585876, + "sft_loss": 0.10630530118942261, + "step": 2986 + }, + { + "epoch": 4.319595083152567, + "grad_norm": 1.5179729610740627, + "learning_rate": 1.5133323650301653e-06, + "logits/chosen": -0.8240708112716675, + "logits/rejected": -0.643516480922699, + "logps/chosen": -0.07202492654323578, + "logps/rejected": -4.5885725021362305, + "loss": 0.067, + "odds_ratio_loss": 0.007801711093634367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007202493026852608, + "rewards/margins": 0.4516547620296478, + "rewards/rejected": -0.45885729789733887, + "sft_loss": 0.07202492654323578, + "step": 2987 + }, + { + "epoch": 4.321041214750542, + "grad_norm": 1.948795780060964, + "learning_rate": 1.5109003396992196e-06, + "logits/chosen": -1.0349688529968262, + "logits/rejected": -0.46029579639434814, + "logps/chosen": -0.06265415251255035, + "logps/rejected": -4.769626140594482, + "loss": 0.0644, + "odds_ratio_loss": 0.003265473060309887, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0062654148787260056, + "rewards/margins": 0.47069722414016724, + "rewards/rejected": -0.4769626259803772, + "sft_loss": 0.06265415251255035, + "step": 2988 + }, + { + "epoch": 4.322487346348518, + "grad_norm": 1.5111670667012016, + "learning_rate": 1.5084698150557294e-06, + "logits/chosen": -0.8330080509185791, + "logits/rejected": -0.5925476551055908, + "logps/chosen": -0.05622435361146927, + "logps/rejected": -3.154797315597534, + "loss": 0.0729, + "odds_ratio_loss": 0.006971028633415699, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005622435361146927, + "rewards/margins": 0.3098573088645935, + "rewards/rejected": -0.31547972559928894, + "sft_loss": 0.05622435361146927, + "step": 2989 + }, + { + "epoch": 4.323933477946493, + "grad_norm": 1.9164595471589108, + "learning_rate": 1.506040792565066e-06, + "logits/chosen": -0.763274610042572, + "logits/rejected": -0.5204155445098877, + "logps/chosen": -0.0702725499868393, + "logps/rejected": -5.756369113922119, + "loss": 0.0946, + "odds_ratio_loss": 0.003552860114723444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007027255371212959, + "rewards/margins": 0.5686096549034119, + "rewards/rejected": -0.5756369233131409, + "sft_loss": 0.0702725499868393, + "step": 2990 + }, + { + "epoch": 4.325379609544468, + "grad_norm": 1.5652675124707198, + "learning_rate": 1.5036132736916986e-06, + "logits/chosen": -0.9880766272544861, + "logits/rejected": -0.8328273892402649, + "logps/chosen": -0.16169053316116333, + "logps/rejected": -4.504159927368164, + "loss": 0.0928, + "odds_ratio_loss": 0.015480546280741692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016169054433703423, + "rewards/margins": 0.4342469573020935, + "rewards/rejected": -0.4504159986972809, + "sft_loss": 0.16169053316116333, + "step": 2991 + }, + { + "epoch": 4.326825741142444, + "grad_norm": 1.386084019741126, + "learning_rate": 1.5011872598991845e-06, + "logits/chosen": -0.891055703163147, + "logits/rejected": -0.6146432161331177, + "logps/chosen": -0.06819592416286469, + "logps/rejected": -5.212233543395996, + "loss": 0.058, + "odds_ratio_loss": 0.007073340006172657, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006819593720138073, + "rewards/margins": 0.514403760433197, + "rewards/rejected": -0.5212233662605286, + "sft_loss": 0.06819592416286469, + "step": 2992 + }, + { + "epoch": 4.3282718727404195, + "grad_norm": 1.701412371797771, + "learning_rate": 1.4987627526501797e-06, + "logits/chosen": -0.8779382705688477, + "logits/rejected": -0.7203929424285889, + "logps/chosen": -0.07334870100021362, + "logps/rejected": -5.043249130249023, + "loss": 0.0908, + "odds_ratio_loss": 0.008891316130757332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007334871217608452, + "rewards/margins": 0.49699005484580994, + "rewards/rejected": -0.5043249726295471, + "sft_loss": 0.07334870100021362, + "step": 2993 + }, + { + "epoch": 4.329718004338395, + "grad_norm": 1.499832025218408, + "learning_rate": 1.4963397534064255e-06, + "logits/chosen": -0.6140452027320862, + "logits/rejected": -0.5003236532211304, + "logps/chosen": -0.0780247375369072, + "logps/rejected": -3.900202512741089, + "loss": 0.0825, + "odds_ratio_loss": 0.006760553922504187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007802474312484264, + "rewards/margins": 0.38221779465675354, + "rewards/rejected": -0.3900202810764313, + "sft_loss": 0.0780247375369072, + "step": 2994 + }, + { + "epoch": 4.33116413593637, + "grad_norm": 1.1920435557793356, + "learning_rate": 1.4939182636287594e-06, + "logits/chosen": -0.6822051405906677, + "logits/rejected": -0.5335265398025513, + "logps/chosen": -0.05268733948469162, + "logps/rejected": -5.311867713928223, + "loss": 0.0537, + "odds_ratio_loss": 0.010495830327272415, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005268733948469162, + "rewards/margins": 0.5259181261062622, + "rewards/rejected": -0.5311868190765381, + "sft_loss": 0.05268733948469162, + "step": 2995 + }, + { + "epoch": 4.332610267534346, + "grad_norm": 1.4166781490558604, + "learning_rate": 1.4914982847771063e-06, + "logits/chosen": -0.8763676881790161, + "logits/rejected": -0.6532676815986633, + "logps/chosen": -0.07074081897735596, + "logps/rejected": -4.6171064376831055, + "loss": 0.0696, + "odds_ratio_loss": 0.006446932442486286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007074081804603338, + "rewards/margins": 0.4546365737915039, + "rewards/rejected": -0.4617106318473816, + "sft_loss": 0.07074081897735596, + "step": 2996 + }, + { + "epoch": 4.334056399132321, + "grad_norm": 1.6560432823034414, + "learning_rate": 1.4890798183104788e-06, + "logits/chosen": -0.5278356075286865, + "logits/rejected": -0.3531097173690796, + "logps/chosen": -0.08435966819524765, + "logps/rejected": -7.50329065322876, + "loss": 0.0967, + "odds_ratio_loss": 0.008475156500935555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008435966446995735, + "rewards/margins": 0.7418931722640991, + "rewards/rejected": -0.7503290772438049, + "sft_loss": 0.08435966819524765, + "step": 2997 + }, + { + "epoch": 4.335502530730296, + "grad_norm": 1.8409694247477042, + "learning_rate": 1.4866628656869816e-06, + "logits/chosen": -0.8288428783416748, + "logits/rejected": -0.5312150716781616, + "logps/chosen": -0.10413511097431183, + "logps/rejected": -3.546949625015259, + "loss": 0.1194, + "odds_ratio_loss": 0.012981563806533813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010413511656224728, + "rewards/margins": 0.34428146481513977, + "rewards/rejected": -0.3546949625015259, + "sft_loss": 0.10413511097431183, + "step": 2998 + }, + { + "epoch": 4.336948662328272, + "grad_norm": 1.7431106907764917, + "learning_rate": 1.484247428363802e-06, + "logits/chosen": -1.054826259613037, + "logits/rejected": -0.7220063209533691, + "logps/chosen": -0.0636000782251358, + "logps/rejected": -4.4596781730651855, + "loss": 0.0817, + "odds_ratio_loss": 0.004126345738768578, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006360008381307125, + "rewards/margins": 0.43960779905319214, + "rewards/rejected": -0.44596779346466064, + "sft_loss": 0.0636000782251358, + "step": 2999 + }, + { + "epoch": 4.3383947939262475, + "grad_norm": 1.6516343056764413, + "learning_rate": 1.4818335077972188e-06, + "logits/chosen": -0.9266605377197266, + "logits/rejected": -0.6105201840400696, + "logps/chosen": -0.05521458014845848, + "logps/rejected": -4.643217086791992, + "loss": 0.0699, + "odds_ratio_loss": 0.004646781831979752, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0055214581079781055, + "rewards/margins": 0.4588002860546112, + "rewards/rejected": -0.46432173252105713, + "sft_loss": 0.05521458014845848, + "step": 3000 + }, + { + "epoch": 4.339840925524222, + "grad_norm": 1.4289864542975454, + "learning_rate": 1.479421105442591e-06, + "logits/chosen": -0.9717822074890137, + "logits/rejected": -0.62735515832901, + "logps/chosen": -0.05365077406167984, + "logps/rejected": -4.997622489929199, + "loss": 0.0743, + "odds_ratio_loss": 0.003372207749634981, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005365077406167984, + "rewards/margins": 0.4943971633911133, + "rewards/rejected": -0.49976223707199097, + "sft_loss": 0.05365077406167984, + "step": 3001 + }, + { + "epoch": 4.341287057122198, + "grad_norm": 1.6480211924822543, + "learning_rate": 1.4770102227543678e-06, + "logits/chosen": -0.8481490612030029, + "logits/rejected": -0.6115496158599854, + "logps/chosen": -0.0740613266825676, + "logps/rejected": -5.005758285522461, + "loss": 0.0932, + "odds_ratio_loss": 0.009503107517957687, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00740613229572773, + "rewards/margins": 0.49316978454589844, + "rewards/rejected": -0.5005759000778198, + "sft_loss": 0.0740613266825676, + "step": 3002 + }, + { + "epoch": 4.342733188720174, + "grad_norm": 1.3298459528138336, + "learning_rate": 1.474600861186078e-06, + "logits/chosen": -0.622434139251709, + "logits/rejected": -0.5628266930580139, + "logps/chosen": -0.057216983288526535, + "logps/rejected": -3.126250982284546, + "loss": 0.0674, + "odds_ratio_loss": 0.006634141784161329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005721698515117168, + "rewards/margins": 0.3069033920764923, + "rewards/rejected": -0.31262508034706116, + "sft_loss": 0.057216983288526535, + "step": 3003 + }, + { + "epoch": 4.344179320318149, + "grad_norm": 1.8040818693788596, + "learning_rate": 1.4721930221903342e-06, + "logits/chosen": -0.7807648181915283, + "logits/rejected": -0.6595614552497864, + "logps/chosen": -0.04600096866488457, + "logps/rejected": -4.082098484039307, + "loss": 0.0867, + "odds_ratio_loss": 0.003212960669770837, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004600096959620714, + "rewards/margins": 0.4036097526550293, + "rewards/rejected": -0.4082098603248596, + "sft_loss": 0.04600096866488457, + "step": 3004 + }, + { + "epoch": 4.345625451916124, + "grad_norm": 1.8544771440367331, + "learning_rate": 1.469786707218831e-06, + "logits/chosen": -0.8745797872543335, + "logits/rejected": -0.6506798267364502, + "logps/chosen": -0.13814522325992584, + "logps/rejected": -5.566080570220947, + "loss": 0.0952, + "odds_ratio_loss": 0.019235284999012947, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013814522884786129, + "rewards/margins": 0.5427935719490051, + "rewards/rejected": -0.5566080808639526, + "sft_loss": 0.13814522325992584, + "step": 3005 + }, + { + "epoch": 4.3470715835141, + "grad_norm": 1.507690637374228, + "learning_rate": 1.4673819177223466e-06, + "logits/chosen": -0.8183923959732056, + "logits/rejected": -0.669651210308075, + "logps/chosen": -0.18851414322853088, + "logps/rejected": -3.478821277618408, + "loss": 0.1186, + "odds_ratio_loss": 0.018825456500053406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01885141432285309, + "rewards/margins": 0.32903075218200684, + "rewards/rejected": -0.34788215160369873, + "sft_loss": 0.18851414322853088, + "step": 3006 + }, + { + "epoch": 4.3485177151120755, + "grad_norm": 1.4777687325743487, + "learning_rate": 1.4649786551507354e-06, + "logits/chosen": -0.7968933582305908, + "logits/rejected": -0.5578837394714355, + "logps/chosen": -0.06734529882669449, + "logps/rejected": -4.371576309204102, + "loss": 0.0649, + "odds_ratio_loss": 0.004955535754561424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006734529510140419, + "rewards/margins": 0.4304230809211731, + "rewards/rejected": -0.43715763092041016, + "sft_loss": 0.06734529882669449, + "step": 3007 + }, + { + "epoch": 4.34996384671005, + "grad_norm": 1.3447914227375248, + "learning_rate": 1.4625769209529342e-06, + "logits/chosen": -0.7508656978607178, + "logits/rejected": -0.6274343729019165, + "logps/chosen": -0.06294934451580048, + "logps/rejected": -5.2878594398498535, + "loss": 0.0614, + "odds_ratio_loss": 0.005506738089025021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006294934079051018, + "rewards/margins": 0.5224910378456116, + "rewards/rejected": -0.5287859439849854, + "sft_loss": 0.06294934451580048, + "step": 3008 + }, + { + "epoch": 4.351409978308026, + "grad_norm": 2.018138682950108, + "learning_rate": 1.460176716576959e-06, + "logits/chosen": -0.7540772557258606, + "logits/rejected": -0.6052873730659485, + "logps/chosen": -0.051301129162311554, + "logps/rejected": -6.2577667236328125, + "loss": 0.0804, + "odds_ratio_loss": 0.005588476546108723, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00513011310249567, + "rewards/margins": 0.620646595954895, + "rewards/rejected": -0.6257766485214233, + "sft_loss": 0.051301129162311554, + "step": 3009 + }, + { + "epoch": 4.352856109906002, + "grad_norm": 1.3678448408498183, + "learning_rate": 1.4577780434699012e-06, + "logits/chosen": -0.8656454086303711, + "logits/rejected": -0.6067763566970825, + "logps/chosen": -0.07266905903816223, + "logps/rejected": -5.833568096160889, + "loss": 0.0738, + "odds_ratio_loss": 0.013198381289839745, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007266905624419451, + "rewards/margins": 0.5760899186134338, + "rewards/rejected": -0.5833568572998047, + "sft_loss": 0.07266905903816223, + "step": 3010 + }, + { + "epoch": 4.3543022415039765, + "grad_norm": 1.663768760253496, + "learning_rate": 1.4553809030779287e-06, + "logits/chosen": -0.7601166367530823, + "logits/rejected": -0.6807395219802856, + "logps/chosen": -0.1088540330529213, + "logps/rejected": -3.1109838485717773, + "loss": 0.0937, + "odds_ratio_loss": 0.009050026535987854, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010885403491556644, + "rewards/margins": 0.3002129793167114, + "rewards/rejected": -0.3110983967781067, + "sft_loss": 0.1088540330529213, + "step": 3011 + }, + { + "epoch": 4.355748373101952, + "grad_norm": 1.5594475178551062, + "learning_rate": 1.4529852968462858e-06, + "logits/chosen": -0.7941527366638184, + "logits/rejected": -0.585971474647522, + "logps/chosen": -0.07313597947359085, + "logps/rejected": -4.013620376586914, + "loss": 0.0593, + "odds_ratio_loss": 0.009553024545311928, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007313598413020372, + "rewards/margins": 0.39404845237731934, + "rewards/rejected": -0.4013620615005493, + "sft_loss": 0.07313597947359085, + "step": 3012 + }, + { + "epoch": 4.357194504699928, + "grad_norm": 1.9061263595083393, + "learning_rate": 1.450591226219295e-06, + "logits/chosen": -1.0241869688034058, + "logits/rejected": -0.8023190498352051, + "logps/chosen": -0.05216866359114647, + "logps/rejected": -4.660516738891602, + "loss": 0.073, + "odds_ratio_loss": 0.01102613378316164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0052168662659823895, + "rewards/margins": 0.460834801197052, + "rewards/rejected": -0.46605169773101807, + "sft_loss": 0.05216866359114647, + "step": 3013 + }, + { + "epoch": 4.358640636297903, + "grad_norm": 2.652329974259841, + "learning_rate": 1.4481986926403473e-06, + "logits/chosen": -0.9176286458969116, + "logits/rejected": -0.8531794548034668, + "logps/chosen": -0.18076550960540771, + "logps/rejected": -4.112682342529297, + "loss": 0.1192, + "odds_ratio_loss": 0.042850561439991, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01807655207812786, + "rewards/margins": 0.39319172501564026, + "rewards/rejected": -0.4112682640552521, + "sft_loss": 0.18076550960540771, + "step": 3014 + }, + { + "epoch": 4.360086767895878, + "grad_norm": 1.4776010369065984, + "learning_rate": 1.445807697551913e-06, + "logits/chosen": -0.8927839398384094, + "logits/rejected": -0.6999744176864624, + "logps/chosen": -0.04306114837527275, + "logps/rejected": -5.057520866394043, + "loss": 0.0499, + "odds_ratio_loss": 0.00421172333881259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004306115210056305, + "rewards/margins": 0.5014459490776062, + "rewards/rejected": -0.5057520270347595, + "sft_loss": 0.04306114837527275, + "step": 3015 + }, + { + "epoch": 4.361532899493854, + "grad_norm": 1.7796610779704267, + "learning_rate": 1.4434182423955296e-06, + "logits/chosen": -0.7437611222267151, + "logits/rejected": -0.5102348327636719, + "logps/chosen": -0.057212602347135544, + "logps/rejected": -6.495122909545898, + "loss": 0.087, + "odds_ratio_loss": 0.00607125461101532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005721260327845812, + "rewards/margins": 0.6437910199165344, + "rewards/rejected": -0.6495122909545898, + "sft_loss": 0.057212602347135544, + "step": 3016 + }, + { + "epoch": 4.36297903109183, + "grad_norm": 1.7363048835289525, + "learning_rate": 1.4410303286118106e-06, + "logits/chosen": -0.8889349699020386, + "logits/rejected": -0.6399192214012146, + "logps/chosen": -0.07837576419115067, + "logps/rejected": -5.654128551483154, + "loss": 0.0973, + "odds_ratio_loss": 0.0068669687025249004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007837576791644096, + "rewards/margins": 0.5575752854347229, + "rewards/rejected": -0.5654128789901733, + "sft_loss": 0.07837576419115067, + "step": 3017 + }, + { + "epoch": 4.3644251626898045, + "grad_norm": 1.7329045014223299, + "learning_rate": 1.438643957640436e-06, + "logits/chosen": -0.6671526432037354, + "logits/rejected": -0.48754367232322693, + "logps/chosen": -0.06948922574520111, + "logps/rejected": -4.8217668533325195, + "loss": 0.1362, + "odds_ratio_loss": 0.006450343877077103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006948922295123339, + "rewards/margins": 0.4752277433872223, + "rewards/rejected": -0.48217666149139404, + "sft_loss": 0.06948922574520111, + "step": 3018 + }, + { + "epoch": 4.36587129428778, + "grad_norm": 1.547783584280245, + "learning_rate": 1.4362591309201618e-06, + "logits/chosen": -0.6410992741584778, + "logits/rejected": -0.5067773461341858, + "logps/chosen": -0.06263686716556549, + "logps/rejected": -4.180117607116699, + "loss": 0.0802, + "odds_ratio_loss": 0.0073380540125072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006263687275350094, + "rewards/margins": 0.4117480516433716, + "rewards/rejected": -0.41801172494888306, + "sft_loss": 0.06263686716556549, + "step": 3019 + }, + { + "epoch": 4.367317425885756, + "grad_norm": 1.7173950891688827, + "learning_rate": 1.4338758498888028e-06, + "logits/chosen": -0.7619547843933105, + "logits/rejected": -0.5084792375564575, + "logps/chosen": -0.06390223652124405, + "logps/rejected": -4.947268486022949, + "loss": 0.1019, + "odds_ratio_loss": 0.006950300186872482, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006390223745256662, + "rewards/margins": 0.4883366525173187, + "rewards/rejected": -0.49472689628601074, + "sft_loss": 0.06390223652124405, + "step": 3020 + }, + { + "epoch": 4.368763557483731, + "grad_norm": 1.6770132351279181, + "learning_rate": 1.4314941159832516e-06, + "logits/chosen": -0.7331417798995972, + "logits/rejected": -0.5848100781440735, + "logps/chosen": -0.0755729153752327, + "logps/rejected": -4.190185070037842, + "loss": 0.0927, + "odds_ratio_loss": 0.004576533567160368, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007557292003184557, + "rewards/margins": 0.4114612340927124, + "rewards/rejected": -0.4190185070037842, + "sft_loss": 0.0755729153752327, + "step": 3021 + }, + { + "epoch": 4.370209689081706, + "grad_norm": 1.4978278317319662, + "learning_rate": 1.4291139306394651e-06, + "logits/chosen": -0.9728338718414307, + "logits/rejected": -0.6097639799118042, + "logps/chosen": -0.06788711994886398, + "logps/rejected": -4.2939534187316895, + "loss": 0.0838, + "odds_ratio_loss": 0.0016902622301131487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006788711994886398, + "rewards/margins": 0.4226066470146179, + "rewards/rejected": -0.4293953776359558, + "sft_loss": 0.06788711994886398, + "step": 3022 + }, + { + "epoch": 4.371655820679682, + "grad_norm": 1.5576854515553027, + "learning_rate": 1.4267352952924632e-06, + "logits/chosen": -0.5633742213249207, + "logits/rejected": -0.47203364968299866, + "logps/chosen": -0.04447542876005173, + "logps/rejected": -5.289262294769287, + "loss": 0.0457, + "odds_ratio_loss": 0.003245576983317733, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0044475425966084, + "rewards/margins": 0.5244786739349365, + "rewards/rejected": -0.5289261937141418, + "sft_loss": 0.04447542876005173, + "step": 3023 + }, + { + "epoch": 4.373101952277657, + "grad_norm": 1.5445070302860024, + "learning_rate": 1.4243582113763376e-06, + "logits/chosen": -0.9106212258338928, + "logits/rejected": -0.4883584976196289, + "logps/chosen": -0.08085661381483078, + "logps/rejected": -6.208954811096191, + "loss": 0.0742, + "odds_ratio_loss": 0.0067681861110031605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008085661567747593, + "rewards/margins": 0.6128097772598267, + "rewards/rejected": -0.620895504951477, + "sft_loss": 0.08085661381483078, + "step": 3024 + }, + { + "epoch": 4.3745480838756325, + "grad_norm": 1.5797770050022457, + "learning_rate": 1.4219826803242372e-06, + "logits/chosen": -0.9211044907569885, + "logits/rejected": -0.6510443091392517, + "logps/chosen": -0.031016170978546143, + "logps/rejected": -3.0293474197387695, + "loss": 0.0921, + "odds_ratio_loss": 0.0027869718614965677, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0031016170978546143, + "rewards/margins": 0.2998330891132355, + "rewards/rejected": -0.3029347360134125, + "sft_loss": 0.031016170978546143, + "step": 3025 + }, + { + "epoch": 4.375994215473608, + "grad_norm": 1.8973584967663573, + "learning_rate": 1.4196087035683818e-06, + "logits/chosen": -0.7323670387268066, + "logits/rejected": -0.6421462893486023, + "logps/chosen": -0.08696135878562927, + "logps/rejected": -4.719554901123047, + "loss": 0.0594, + "odds_ratio_loss": 0.010816432535648346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008696136064827442, + "rewards/margins": 0.46325939893722534, + "rewards/rejected": -0.4719555377960205, + "sft_loss": 0.08696135878562927, + "step": 3026 + }, + { + "epoch": 4.377440347071584, + "grad_norm": 1.4381222411520438, + "learning_rate": 1.4172362825400499e-06, + "logits/chosen": -0.6959972381591797, + "logits/rejected": -0.5631358623504639, + "logps/chosen": -0.07375743985176086, + "logps/rejected": -4.761948108673096, + "loss": 0.0651, + "odds_ratio_loss": 0.004052689298987389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007375743240118027, + "rewards/margins": 0.4688190221786499, + "rewards/rejected": -0.4761947989463806, + "sft_loss": 0.07375743985176086, + "step": 3027 + }, + { + "epoch": 4.378886478669559, + "grad_norm": 1.8112701096091208, + "learning_rate": 1.4148654186695818e-06, + "logits/chosen": -0.7027171850204468, + "logits/rejected": -0.6286312341690063, + "logps/chosen": -0.10041756927967072, + "logps/rejected": -4.47844123840332, + "loss": 0.1241, + "odds_ratio_loss": 0.01177249290049076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010041756555438042, + "rewards/margins": 0.43780237436294556, + "rewards/rejected": -0.44784414768218994, + "sft_loss": 0.10041756927967072, + "step": 3028 + }, + { + "epoch": 4.380332610267534, + "grad_norm": 1.729400708490863, + "learning_rate": 1.4124961133863802e-06, + "logits/chosen": -0.950899064540863, + "logits/rejected": -0.5882576107978821, + "logps/chosen": -0.07639265805482864, + "logps/rejected": -5.41416072845459, + "loss": 0.0693, + "odds_ratio_loss": 0.0038110874593257904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007639266550540924, + "rewards/margins": 0.5337768197059631, + "rewards/rejected": -0.5414160490036011, + "sft_loss": 0.07639265805482864, + "step": 3029 + }, + { + "epoch": 4.38177874186551, + "grad_norm": 1.877374137773176, + "learning_rate": 1.4101283681189097e-06, + "logits/chosen": -0.7628424167633057, + "logits/rejected": -0.5627432465553284, + "logps/chosen": -0.08173272758722305, + "logps/rejected": -6.892059326171875, + "loss": 0.1018, + "odds_ratio_loss": 0.003274757880717516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00817327294498682, + "rewards/margins": 0.6810327172279358, + "rewards/rejected": -0.6892059445381165, + "sft_loss": 0.08173272758722305, + "step": 3030 + }, + { + "epoch": 4.383224873463485, + "grad_norm": 1.8461184307068432, + "learning_rate": 1.4077621842946905e-06, + "logits/chosen": -0.7165622711181641, + "logits/rejected": -0.5991818308830261, + "logps/chosen": -0.08831194043159485, + "logps/rejected": -3.3247945308685303, + "loss": 0.0721, + "odds_ratio_loss": 0.01518079824745655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008831193670630455, + "rewards/margins": 0.32364824414253235, + "rewards/rejected": -0.33247941732406616, + "sft_loss": 0.08831194043159485, + "step": 3031 + }, + { + "epoch": 4.384671005061461, + "grad_norm": 1.5390651399829893, + "learning_rate": 1.4053975633403062e-06, + "logits/chosen": -0.903634786605835, + "logits/rejected": -0.5671402812004089, + "logps/chosen": -0.07350382208824158, + "logps/rejected": -5.765620708465576, + "loss": 0.0779, + "odds_ratio_loss": 0.008814067579805851, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007350381929427385, + "rewards/margins": 0.5692117214202881, + "rewards/rejected": -0.5765621066093445, + "sft_loss": 0.07350382208824158, + "step": 3032 + }, + { + "epoch": 4.386117136659436, + "grad_norm": 2.6024145275267627, + "learning_rate": 1.4030345066813927e-06, + "logits/chosen": -0.9356105327606201, + "logits/rejected": -0.6585832834243774, + "logps/chosen": -0.11158432811498642, + "logps/rejected": -3.363126277923584, + "loss": 0.0882, + "odds_ratio_loss": 0.006204155273735523, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011158432811498642, + "rewards/margins": 0.325154185295105, + "rewards/rejected": -0.3363126218318939, + "sft_loss": 0.11158432811498642, + "step": 3033 + }, + { + "epoch": 4.387563268257411, + "grad_norm": 1.500933012566789, + "learning_rate": 1.400673015742649e-06, + "logits/chosen": -0.8368505239486694, + "logits/rejected": -0.660033106803894, + "logps/chosen": -0.07904787361621857, + "logps/rejected": -3.45121169090271, + "loss": 0.0637, + "odds_ratio_loss": 0.009564665146172047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007904788479208946, + "rewards/margins": 0.3372163772583008, + "rewards/rejected": -0.3451211750507355, + "sft_loss": 0.07904787361621857, + "step": 3034 + }, + { + "epoch": 4.389009399855387, + "grad_norm": 1.695864175616555, + "learning_rate": 1.3983130919478248e-06, + "logits/chosen": -0.908363401889801, + "logits/rejected": -0.6203923225402832, + "logps/chosen": -0.06778942048549652, + "logps/rejected": -3.6424999237060547, + "loss": 0.0752, + "odds_ratio_loss": 0.005892588756978512, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006778942421078682, + "rewards/margins": 0.3574710488319397, + "rewards/rejected": -0.36424997448921204, + "sft_loss": 0.06778942048549652, + "step": 3035 + }, + { + "epoch": 4.390455531453362, + "grad_norm": 1.5718319470054316, + "learning_rate": 1.3959547367197262e-06, + "logits/chosen": -0.6267784833908081, + "logits/rejected": -0.5574684739112854, + "logps/chosen": -0.08968807756900787, + "logps/rejected": -5.226409435272217, + "loss": 0.0907, + "odds_ratio_loss": 0.0179790947586298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008968808688223362, + "rewards/margins": 0.5136721134185791, + "rewards/rejected": -0.5226409435272217, + "sft_loss": 0.08968807756900787, + "step": 3036 + }, + { + "epoch": 4.391901663051337, + "grad_norm": 1.5139362649763861, + "learning_rate": 1.3935979514802166e-06, + "logits/chosen": -0.628441572189331, + "logits/rejected": -0.4122357964515686, + "logps/chosen": -0.05464586615562439, + "logps/rejected": -3.6563820838928223, + "loss": 0.085, + "odds_ratio_loss": 0.008319716900587082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005464586429297924, + "rewards/margins": 0.36017361283302307, + "rewards/rejected": -0.36563819646835327, + "sft_loss": 0.05464586615562439, + "step": 3037 + }, + { + "epoch": 4.393347794649313, + "grad_norm": 1.4656706139150852, + "learning_rate": 1.3912427376502075e-06, + "logits/chosen": -0.8293111324310303, + "logits/rejected": -0.5837162137031555, + "logps/chosen": -0.04411669075489044, + "logps/rejected": -4.333014011383057, + "loss": 0.078, + "odds_ratio_loss": 0.0032760649919509888, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004411669448018074, + "rewards/margins": 0.4288897216320038, + "rewards/rejected": -0.4333013892173767, + "sft_loss": 0.04411669075489044, + "step": 3038 + }, + { + "epoch": 4.394793926247289, + "grad_norm": 1.6543272906345592, + "learning_rate": 1.3888890966496698e-06, + "logits/chosen": -0.6298922300338745, + "logits/rejected": -0.4950665831565857, + "logps/chosen": -0.05027003213763237, + "logps/rejected": -5.746697425842285, + "loss": 0.0852, + "odds_ratio_loss": 0.003421762026846409, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005027003586292267, + "rewards/margins": 0.5696427226066589, + "rewards/rejected": -0.5746697187423706, + "sft_loss": 0.05027003213763237, + "step": 3039 + }, + { + "epoch": 4.396240057845264, + "grad_norm": 1.7242446848815338, + "learning_rate": 1.3865370298976188e-06, + "logits/chosen": -0.7039051055908203, + "logits/rejected": -0.6944054365158081, + "logps/chosen": -0.07602061331272125, + "logps/rejected": -3.5846824645996094, + "loss": 0.0798, + "odds_ratio_loss": 0.008702849969267845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007602061610668898, + "rewards/margins": 0.3508661985397339, + "rewards/rejected": -0.35846826434135437, + "sft_loss": 0.07602061331272125, + "step": 3040 + }, + { + "epoch": 4.397686189443239, + "grad_norm": 1.5606431444503155, + "learning_rate": 1.3841865388121275e-06, + "logits/chosen": -0.6780179738998413, + "logits/rejected": -0.5885179042816162, + "logps/chosen": -0.1251131147146225, + "logps/rejected": -4.6654558181762695, + "loss": 0.085, + "odds_ratio_loss": 0.02848890796303749, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01251131109893322, + "rewards/margins": 0.4540342688560486, + "rewards/rejected": -0.46654558181762695, + "sft_loss": 0.1251131147146225, + "step": 3041 + }, + { + "epoch": 4.399132321041215, + "grad_norm": 1.6872868354444532, + "learning_rate": 1.3818376248103144e-06, + "logits/chosen": -0.9445464611053467, + "logits/rejected": -0.6897961497306824, + "logps/chosen": -0.06516368687152863, + "logps/rejected": -5.009978771209717, + "loss": 0.0916, + "odds_ratio_loss": 0.011260229162871838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006516368594020605, + "rewards/margins": 0.49448153376579285, + "rewards/rejected": -0.5009979009628296, + "sft_loss": 0.06516368687152863, + "step": 3042 + }, + { + "epoch": 4.4005784526391905, + "grad_norm": 1.9080324274666887, + "learning_rate": 1.3794902893083485e-06, + "logits/chosen": -0.7691489458084106, + "logits/rejected": -0.589622974395752, + "logps/chosen": -0.09429274499416351, + "logps/rejected": -3.665755033493042, + "loss": 0.1116, + "odds_ratio_loss": 0.0055620986968278885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009429275058209896, + "rewards/margins": 0.3571462631225586, + "rewards/rejected": -0.3665755093097687, + "sft_loss": 0.09429274499416351, + "step": 3043 + }, + { + "epoch": 4.402024584237165, + "grad_norm": 1.5782764306018584, + "learning_rate": 1.377144533721445e-06, + "logits/chosen": -0.8291411399841309, + "logits/rejected": -0.6172857880592346, + "logps/chosen": -0.11644075810909271, + "logps/rejected": -6.257889747619629, + "loss": 0.0623, + "odds_ratio_loss": 0.0051756082102656364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011644075624644756, + "rewards/margins": 0.6141449213027954, + "rewards/rejected": -0.6257889866828918, + "sft_loss": 0.11644075810909271, + "step": 3044 + }, + { + "epoch": 4.403470715835141, + "grad_norm": 1.6116329655987056, + "learning_rate": 1.3748003594638728e-06, + "logits/chosen": -0.6932000517845154, + "logits/rejected": -0.6144500374794006, + "logps/chosen": -0.033836688846349716, + "logps/rejected": -3.270808219909668, + "loss": 0.0628, + "odds_ratio_loss": 0.006877487525343895, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0033836690708994865, + "rewards/margins": 0.32369717955589294, + "rewards/rejected": -0.3270808458328247, + "sft_loss": 0.033836688846349716, + "step": 3045 + }, + { + "epoch": 4.404916847433117, + "grad_norm": 1.5976320593278748, + "learning_rate": 1.3724577679489393e-06, + "logits/chosen": -0.6334445476531982, + "logits/rejected": -0.5385106801986694, + "logps/chosen": -0.0667404979467392, + "logps/rejected": -5.588997840881348, + "loss": 0.0673, + "odds_ratio_loss": 0.006370170041918755, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0066740503534674644, + "rewards/margins": 0.5522257089614868, + "rewards/rejected": -0.5588997602462769, + "sft_loss": 0.0667404979467392, + "step": 3046 + }, + { + "epoch": 4.406362979031091, + "grad_norm": 1.5601781729780566, + "learning_rate": 1.3701167605890054e-06, + "logits/chosen": -0.7392277717590332, + "logits/rejected": -0.5148305296897888, + "logps/chosen": -0.0697949156165123, + "logps/rejected": -3.349868059158325, + "loss": 0.0715, + "odds_ratio_loss": 0.007275932468473911, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00697949156165123, + "rewards/margins": 0.3280073404312134, + "rewards/rejected": -0.3349868059158325, + "sft_loss": 0.0697949156165123, + "step": 3047 + }, + { + "epoch": 4.407809110629067, + "grad_norm": 1.6185824118809349, + "learning_rate": 1.3677773387954696e-06, + "logits/chosen": -0.7123594284057617, + "logits/rejected": -0.3868655562400818, + "logps/chosen": -0.10565149039030075, + "logps/rejected": -7.379734992980957, + "loss": 0.0901, + "odds_ratio_loss": 0.00902761984616518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010565148666501045, + "rewards/margins": 0.7274083495140076, + "rewards/rejected": -0.7379735112190247, + "sft_loss": 0.10565149039030075, + "step": 3048 + }, + { + "epoch": 4.409255242227043, + "grad_norm": 1.4439418842601006, + "learning_rate": 1.3654395039787808e-06, + "logits/chosen": -0.8884992599487305, + "logits/rejected": -0.5133688449859619, + "logps/chosen": -0.01734546385705471, + "logps/rejected": -4.463611602783203, + "loss": 0.0663, + "odds_ratio_loss": 0.00035680370638146996, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001734546385705471, + "rewards/margins": 0.44462665915489197, + "rewards/rejected": -0.4463611841201782, + "sft_loss": 0.01734546385705471, + "step": 3049 + }, + { + "epoch": 4.4107013738250185, + "grad_norm": 1.4545245283187038, + "learning_rate": 1.3631032575484276e-06, + "logits/chosen": -0.7421780228614807, + "logits/rejected": -0.8050009608268738, + "logps/chosen": -0.12215909361839294, + "logps/rejected": -2.9910664558410645, + "loss": 0.0897, + "odds_ratio_loss": 0.03483511507511139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01221590954810381, + "rewards/margins": 0.28689074516296387, + "rewards/rejected": -0.2991066575050354, + "sft_loss": 0.12215909361839294, + "step": 3050 + }, + { + "epoch": 4.412147505422993, + "grad_norm": 1.7163056525259084, + "learning_rate": 1.3607686009129395e-06, + "logits/chosen": -0.8548030853271484, + "logits/rejected": -0.7793415188789368, + "logps/chosen": -0.10555724054574966, + "logps/rejected": -6.055647373199463, + "loss": 0.0987, + "odds_ratio_loss": 0.013582490384578705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010555723682045937, + "rewards/margins": 0.5950090289115906, + "rewards/rejected": -0.6055647730827332, + "sft_loss": 0.10555724054574966, + "step": 3051 + }, + { + "epoch": 4.413593637020969, + "grad_norm": 1.8039202268622319, + "learning_rate": 1.3584355354798933e-06, + "logits/chosen": -0.894365668296814, + "logits/rejected": -0.652009129524231, + "logps/chosen": -0.10110814869403839, + "logps/rejected": -5.039724826812744, + "loss": 0.0822, + "odds_ratio_loss": 0.007859394885599613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010110815986990929, + "rewards/margins": 0.49386167526245117, + "rewards/rejected": -0.5039725303649902, + "sft_loss": 0.10110814869403839, + "step": 3052 + }, + { + "epoch": 4.415039768618945, + "grad_norm": 1.5687944501539148, + "learning_rate": 1.3561040626558993e-06, + "logits/chosen": -0.8314445614814758, + "logits/rejected": -0.5648901462554932, + "logps/chosen": -0.12590952217578888, + "logps/rejected": -5.708104610443115, + "loss": 0.0936, + "odds_ratio_loss": 0.007341315969824791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012590952217578888, + "rewards/margins": 0.5582195520401001, + "rewards/rejected": -0.5708104372024536, + "sft_loss": 0.12590952217578888, + "step": 3053 + }, + { + "epoch": 4.4164859002169194, + "grad_norm": 1.7372836321558947, + "learning_rate": 1.3537741838466144e-06, + "logits/chosen": -0.753945529460907, + "logits/rejected": -0.6398801207542419, + "logps/chosen": -0.10332097113132477, + "logps/rejected": -3.84722900390625, + "loss": 0.093, + "odds_ratio_loss": 0.014396404847502708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010332096368074417, + "rewards/margins": 0.3743908405303955, + "rewards/rejected": -0.38472288846969604, + "sft_loss": 0.10332097113132477, + "step": 3054 + }, + { + "epoch": 4.417932031814895, + "grad_norm": 1.653094550223046, + "learning_rate": 1.3514459004567282e-06, + "logits/chosen": -0.7897300124168396, + "logits/rejected": -0.7482852339744568, + "logps/chosen": -0.046823035925626755, + "logps/rejected": -5.072388648986816, + "loss": 0.0919, + "odds_ratio_loss": 0.005791356787085533, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0046823034062981606, + "rewards/margins": 0.502556562423706, + "rewards/rejected": -0.5072388648986816, + "sft_loss": 0.046823035925626755, + "step": 3055 + }, + { + "epoch": 4.419378163412871, + "grad_norm": 1.7870939070781757, + "learning_rate": 1.3491192138899746e-06, + "logits/chosen": -0.7878323793411255, + "logits/rejected": -0.6233074069023132, + "logps/chosen": -0.07858381420373917, + "logps/rejected": -5.563356399536133, + "loss": 0.0799, + "odds_ratio_loss": 0.020533833652734756, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007858381606638432, + "rewards/margins": 0.5484772324562073, + "rewards/rejected": -0.5563355684280396, + "sft_loss": 0.07858381420373917, + "step": 3056 + }, + { + "epoch": 4.420824295010846, + "grad_norm": 1.4064986944225257, + "learning_rate": 1.3467941255491191e-06, + "logits/chosen": -0.6935770511627197, + "logits/rejected": -0.5368286967277527, + "logps/chosen": -0.061862025409936905, + "logps/rejected": -6.563525199890137, + "loss": 0.0847, + "odds_ratio_loss": 0.005179527681320906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006186202168464661, + "rewards/margins": 0.6501663327217102, + "rewards/rejected": -0.6563525795936584, + "sft_loss": 0.061862025409936905, + "step": 3057 + }, + { + "epoch": 4.422270426608821, + "grad_norm": 1.4615415491387482, + "learning_rate": 1.3444706368359673e-06, + "logits/chosen": -0.6259520053863525, + "logits/rejected": -0.5293348431587219, + "logps/chosen": -0.051740460097789764, + "logps/rejected": -3.2695064544677734, + "loss": 0.0548, + "odds_ratio_loss": 0.004323053639382124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005174045916646719, + "rewards/margins": 0.3217766284942627, + "rewards/rejected": -0.32695066928863525, + "sft_loss": 0.051740460097789764, + "step": 3058 + }, + { + "epoch": 4.423716558206797, + "grad_norm": 2.5391363837179877, + "learning_rate": 1.3421487491513577e-06, + "logits/chosen": -0.7854698300361633, + "logits/rejected": -0.6050273180007935, + "logps/chosen": -0.06993037462234497, + "logps/rejected": -6.034948348999023, + "loss": 0.1104, + "odds_ratio_loss": 0.009069676510989666, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006993037648499012, + "rewards/margins": 0.5965018272399902, + "rewards/rejected": -0.6034948229789734, + "sft_loss": 0.06993037462234497, + "step": 3059 + }, + { + "epoch": 4.425162689804772, + "grad_norm": 1.6143254545828654, + "learning_rate": 1.3398284638951674e-06, + "logits/chosen": -0.8341240882873535, + "logits/rejected": -0.8235796093940735, + "logps/chosen": -0.11071653664112091, + "logps/rejected": -3.5441067218780518, + "loss": 0.0679, + "odds_ratio_loss": 0.009739990346133709, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011071654036641121, + "rewards/margins": 0.3433390259742737, + "rewards/rejected": -0.35441067814826965, + "sft_loss": 0.11071653664112091, + "step": 3060 + }, + { + "epoch": 4.4266088214027475, + "grad_norm": 1.7933967896292562, + "learning_rate": 1.3375097824663022e-06, + "logits/chosen": -0.7039732336997986, + "logits/rejected": -0.5318971872329712, + "logps/chosen": -0.08936135470867157, + "logps/rejected": -5.843852519989014, + "loss": 0.1061, + "odds_ratio_loss": 0.014453625306487083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008936136029660702, + "rewards/margins": 0.5754491686820984, + "rewards/rejected": -0.5843852162361145, + "sft_loss": 0.08936135470867157, + "step": 3061 + }, + { + "epoch": 4.428054953000723, + "grad_norm": 1.739041043864036, + "learning_rate": 1.3351927062627053e-06, + "logits/chosen": -0.817838728427887, + "logits/rejected": -0.5577576160430908, + "logps/chosen": -0.07707401365041733, + "logps/rejected": -4.251495361328125, + "loss": 0.1131, + "odds_ratio_loss": 0.008257195353507996, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007707401178777218, + "rewards/margins": 0.41744211316108704, + "rewards/rejected": -0.425149530172348, + "sft_loss": 0.07707401365041733, + "step": 3062 + }, + { + "epoch": 4.429501084598699, + "grad_norm": 1.4160499745882797, + "learning_rate": 1.332877236681352e-06, + "logits/chosen": -0.8466053009033203, + "logits/rejected": -0.5242434740066528, + "logps/chosen": -0.10801158845424652, + "logps/rejected": -3.6215646266937256, + "loss": 0.1005, + "odds_ratio_loss": 0.00946379266679287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010801158845424652, + "rewards/margins": 0.35135528445243835, + "rewards/rejected": -0.3621564507484436, + "sft_loss": 0.10801158845424652, + "step": 3063 + }, + { + "epoch": 4.430947216196674, + "grad_norm": 1.5466882379686322, + "learning_rate": 1.330563375118245e-06, + "logits/chosen": -0.7832115292549133, + "logits/rejected": -0.536728024482727, + "logps/chosen": -0.07185237854719162, + "logps/rejected": -4.798366546630859, + "loss": 0.066, + "odds_ratio_loss": 0.0033283275552093983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007185238413512707, + "rewards/margins": 0.4726513922214508, + "rewards/rejected": -0.479836642742157, + "sft_loss": 0.07185237854719162, + "step": 3064 + }, + { + "epoch": 4.432393347794649, + "grad_norm": 1.4196276378868817, + "learning_rate": 1.3282511229684223e-06, + "logits/chosen": -0.6810212135314941, + "logits/rejected": -0.709541916847229, + "logps/chosen": -0.059914518147706985, + "logps/rejected": -4.5397047996521, + "loss": 0.0743, + "odds_ratio_loss": 0.01058815699070692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005991451907902956, + "rewards/margins": 0.44797906279563904, + "rewards/rejected": -0.4539704918861389, + "sft_loss": 0.059914518147706985, + "step": 3065 + }, + { + "epoch": 4.433839479392625, + "grad_norm": 1.3472066677998193, + "learning_rate": 1.3259404816259481e-06, + "logits/chosen": -0.751527726650238, + "logits/rejected": -0.6476584076881409, + "logps/chosen": -0.07095544040203094, + "logps/rejected": -3.7621631622314453, + "loss": 0.0746, + "odds_ratio_loss": 0.0034617676865309477, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007095544598996639, + "rewards/margins": 0.3691207766532898, + "rewards/rejected": -0.3762163519859314, + "sft_loss": 0.07095544040203094, + "step": 3066 + }, + { + "epoch": 4.4352856109906, + "grad_norm": 1.628631458493376, + "learning_rate": 1.3236314524839172e-06, + "logits/chosen": -0.8099523782730103, + "logits/rejected": -0.6404974460601807, + "logps/chosen": -0.09229202568531036, + "logps/rejected": -3.195460081100464, + "loss": 0.0832, + "odds_ratio_loss": 0.007787358481436968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009229202754795551, + "rewards/margins": 0.310316801071167, + "rewards/rejected": -0.31954601407051086, + "sft_loss": 0.09229202568531036, + "step": 3067 + }, + { + "epoch": 4.4367317425885755, + "grad_norm": 1.4118554915639228, + "learning_rate": 1.3213240369344498e-06, + "logits/chosen": -0.8080687522888184, + "logits/rejected": -0.50105881690979, + "logps/chosen": -0.04950429126620293, + "logps/rejected": -4.707045555114746, + "loss": 0.0551, + "odds_ratio_loss": 0.006314246449619532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004950429312884808, + "rewards/margins": 0.4657541513442993, + "rewards/rejected": -0.4707045555114746, + "sft_loss": 0.04950429126620293, + "step": 3068 + }, + { + "epoch": 4.438177874186551, + "grad_norm": 1.409611417063734, + "learning_rate": 1.319018236368698e-06, + "logits/chosen": -0.8788902759552002, + "logits/rejected": -0.6900272369384766, + "logps/chosen": -0.04148077964782715, + "logps/rejected": -3.2429776191711426, + "loss": 0.0471, + "odds_ratio_loss": 0.005179783795028925, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00414807815104723, + "rewards/margins": 0.320149689912796, + "rewards/rejected": -0.32429778575897217, + "sft_loss": 0.04148077964782715, + "step": 3069 + }, + { + "epoch": 4.439624005784526, + "grad_norm": 2.262076676746834, + "learning_rate": 1.3167140521768359e-06, + "logits/chosen": -0.9927254915237427, + "logits/rejected": -0.6981010437011719, + "logps/chosen": -0.08163347840309143, + "logps/rejected": -5.204900741577148, + "loss": 0.069, + "odds_ratio_loss": 0.006048885174095631, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008163347840309143, + "rewards/margins": 0.512326717376709, + "rewards/rejected": -0.5204900503158569, + "sft_loss": 0.08163347840309143, + "step": 3070 + }, + { + "epoch": 4.441070137382502, + "grad_norm": 1.593186487495553, + "learning_rate": 1.3144114857480664e-06, + "logits/chosen": -0.8842880129814148, + "logits/rejected": -0.6513842344284058, + "logps/chosen": -0.06474480032920837, + "logps/rejected": -3.6384944915771484, + "loss": 0.0741, + "odds_ratio_loss": 0.0073405965231359005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00647447956725955, + "rewards/margins": 0.35737502574920654, + "rewards/rejected": -0.3638494610786438, + "sft_loss": 0.06474480032920837, + "step": 3071 + }, + { + "epoch": 4.442516268980477, + "grad_norm": 3.063001386453148, + "learning_rate": 1.312110538470613e-06, + "logits/chosen": -0.8223741054534912, + "logits/rejected": -0.6300759315490723, + "logps/chosen": -0.08572448790073395, + "logps/rejected": -5.912900447845459, + "loss": 0.1048, + "odds_ratio_loss": 0.04846544191241264, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.00857244897633791, + "rewards/margins": 0.5827176570892334, + "rewards/rejected": -0.5912900567054749, + "sft_loss": 0.08572448790073395, + "step": 3072 + }, + { + "epoch": 4.443962400578453, + "grad_norm": 1.638978270409598, + "learning_rate": 1.3098112117317279e-06, + "logits/chosen": -0.6799906492233276, + "logits/rejected": -0.49231547117233276, + "logps/chosen": -0.08531919121742249, + "logps/rejected": -5.583248615264893, + "loss": 0.0791, + "odds_ratio_loss": 0.006705442443490028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008531919680535793, + "rewards/margins": 0.5497928857803345, + "rewards/rejected": -0.5583248138427734, + "sft_loss": 0.08531919121742249, + "step": 3073 + }, + { + "epoch": 4.445408532176428, + "grad_norm": 2.1057000892118083, + "learning_rate": 1.307513506917683e-06, + "logits/chosen": -0.761457085609436, + "logits/rejected": -0.5390462875366211, + "logps/chosen": -0.1640089601278305, + "logps/rejected": -5.138252258300781, + "loss": 0.1403, + "odds_ratio_loss": 0.014966591261327267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01640089601278305, + "rewards/margins": 0.4974243640899658, + "rewards/rejected": -0.5138252973556519, + "sft_loss": 0.1640089601278305, + "step": 3074 + }, + { + "epoch": 4.4468546637744035, + "grad_norm": 1.4700041819270353, + "learning_rate": 1.3052174254137712e-06, + "logits/chosen": -0.6377707719802856, + "logits/rejected": -0.5121666789054871, + "logps/chosen": -0.024024605751037598, + "logps/rejected": -4.902261257171631, + "loss": 0.0515, + "odds_ratio_loss": 0.0034545832313597202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0024024604354053736, + "rewards/margins": 0.4878236651420593, + "rewards/rejected": -0.490226149559021, + "sft_loss": 0.024024605751037598, + "step": 3075 + }, + { + "epoch": 4.448300795372379, + "grad_norm": 1.5186502267359456, + "learning_rate": 1.3029229686043111e-06, + "logits/chosen": -0.8834959268569946, + "logits/rejected": -0.5727044343948364, + "logps/chosen": -0.036372192203998566, + "logps/rejected": -6.26273775100708, + "loss": 0.0695, + "odds_ratio_loss": 0.0046208943240344524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003637219313532114, + "rewards/margins": 0.6226365566253662, + "rewards/rejected": -0.6262738108634949, + "sft_loss": 0.036372192203998566, + "step": 3076 + }, + { + "epoch": 4.449746926970354, + "grad_norm": 1.5327777476835118, + "learning_rate": 1.300630137872637e-06, + "logits/chosen": -0.7297614812850952, + "logits/rejected": -0.5649856328964233, + "logps/chosen": -0.0725630521774292, + "logps/rejected": -5.728961944580078, + "loss": 0.0704, + "odds_ratio_loss": 0.007540358696132898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007256305776536465, + "rewards/margins": 0.5656399130821228, + "rewards/rejected": -0.5728961825370789, + "sft_loss": 0.0725630521774292, + "step": 3077 + }, + { + "epoch": 4.45119305856833, + "grad_norm": 1.486418897043296, + "learning_rate": 1.2983389346011079e-06, + "logits/chosen": -0.6607635617256165, + "logits/rejected": -0.6320513486862183, + "logps/chosen": -0.0793953388929367, + "logps/rejected": -6.55406379699707, + "loss": 0.0727, + "odds_ratio_loss": 0.01003860030323267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007939533330500126, + "rewards/margins": 0.6474668979644775, + "rewards/rejected": -0.6554064154624939, + "sft_loss": 0.0793953388929367, + "step": 3078 + }, + { + "epoch": 4.452639190166305, + "grad_norm": 1.5748827094661648, + "learning_rate": 1.2960493601710956e-06, + "logits/chosen": -0.8328260183334351, + "logits/rejected": -0.5687087774276733, + "logps/chosen": -0.09150678664445877, + "logps/rejected": -4.2515997886657715, + "loss": 0.0911, + "odds_ratio_loss": 0.006226380355656147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009150679223239422, + "rewards/margins": 0.4160093069076538, + "rewards/rejected": -0.4251599907875061, + "sft_loss": 0.09150678664445877, + "step": 3079 + }, + { + "epoch": 4.45408532176428, + "grad_norm": 1.5380901831808957, + "learning_rate": 1.293761415962996e-06, + "logits/chosen": -0.7865862846374512, + "logits/rejected": -0.6210229396820068, + "logps/chosen": -0.03755870461463928, + "logps/rejected": -5.537697792053223, + "loss": 0.0941, + "odds_ratio_loss": 0.005131867248564959, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0037558707408607006, + "rewards/margins": 0.5500138998031616, + "rewards/rejected": -0.5537697672843933, + "sft_loss": 0.03755870461463928, + "step": 3080 + }, + { + "epoch": 4.455531453362256, + "grad_norm": 1.2266009128818411, + "learning_rate": 1.2914751033562178e-06, + "logits/chosen": -0.8513956069946289, + "logits/rejected": -0.5696482062339783, + "logps/chosen": -0.08961950242519379, + "logps/rejected": -5.04107141494751, + "loss": 0.0765, + "odds_ratio_loss": 0.0022056095767766237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008961950428783894, + "rewards/margins": 0.49514520168304443, + "rewards/rejected": -0.5041071772575378, + "sft_loss": 0.08961950242519379, + "step": 3081 + }, + { + "epoch": 4.4569775849602316, + "grad_norm": 1.4910928711147577, + "learning_rate": 1.2891904237291873e-06, + "logits/chosen": -0.6649074554443359, + "logits/rejected": -0.6417031288146973, + "logps/chosen": -0.053086064755916595, + "logps/rejected": -2.805670976638794, + "loss": 0.1115, + "odds_ratio_loss": 0.018563320860266685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005308606196194887, + "rewards/margins": 0.27525848150253296, + "rewards/rejected": -0.28056707978248596, + "sft_loss": 0.053086064755916595, + "step": 3082 + }, + { + "epoch": 4.458423716558206, + "grad_norm": 1.5262137978721781, + "learning_rate": 1.2869073784593453e-06, + "logits/chosen": -0.7511852979660034, + "logits/rejected": -0.6438009142875671, + "logps/chosen": -0.08350561559200287, + "logps/rejected": -3.083378553390503, + "loss": 0.0719, + "odds_ratio_loss": 0.012275271117687225, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008350562304258347, + "rewards/margins": 0.2999872863292694, + "rewards/rejected": -0.30833783745765686, + "sft_loss": 0.08350561559200287, + "step": 3083 + }, + { + "epoch": 4.459869848156182, + "grad_norm": 1.3219017121775374, + "learning_rate": 1.2846259689231506e-06, + "logits/chosen": -0.8914992809295654, + "logits/rejected": -0.5301820039749146, + "logps/chosen": -0.057186149060726166, + "logps/rejected": -5.1810760498046875, + "loss": 0.0701, + "odds_ratio_loss": 0.0078020961955189705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005718615371733904, + "rewards/margins": 0.512389063835144, + "rewards/rejected": -0.5181075930595398, + "sft_loss": 0.057186149060726166, + "step": 3084 + }, + { + "epoch": 4.461315979754158, + "grad_norm": 2.145105467061074, + "learning_rate": 1.2823461964960713e-06, + "logits/chosen": -0.837430477142334, + "logits/rejected": -0.699951171875, + "logps/chosen": -0.1344883143901825, + "logps/rejected": -3.640721321105957, + "loss": 0.1181, + "odds_ratio_loss": 0.015670428052544594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01344883069396019, + "rewards/margins": 0.3506232500076294, + "rewards/rejected": -0.36407211422920227, + "sft_loss": 0.1344883143901825, + "step": 3085 + }, + { + "epoch": 4.462762111352133, + "grad_norm": 1.7994304164534771, + "learning_rate": 1.2800680625525933e-06, + "logits/chosen": -0.7815194129943848, + "logits/rejected": -0.45796847343444824, + "logps/chosen": -0.08412434160709381, + "logps/rejected": -6.533374786376953, + "loss": 0.0596, + "odds_ratio_loss": 0.0032411282882094383, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008412433788180351, + "rewards/margins": 0.6449249982833862, + "rewards/rejected": -0.6533374190330505, + "sft_loss": 0.08412434160709381, + "step": 3086 + }, + { + "epoch": 4.464208242950108, + "grad_norm": 1.5619771864210943, + "learning_rate": 1.2777915684662088e-06, + "logits/chosen": -0.7581502199172974, + "logits/rejected": -0.6802593469619751, + "logps/chosen": -0.15440955758094788, + "logps/rejected": -4.527318477630615, + "loss": 0.0886, + "odds_ratio_loss": 0.02079537883400917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015440955758094788, + "rewards/margins": 0.4372909367084503, + "rewards/rejected": -0.4527318775653839, + "sft_loss": 0.15440955758094788, + "step": 3087 + }, + { + "epoch": 4.465654374548084, + "grad_norm": 2.034043709736984, + "learning_rate": 1.2755167156094278e-06, + "logits/chosen": -0.670623242855072, + "logits/rejected": -0.5299463272094727, + "logps/chosen": -0.06610743701457977, + "logps/rejected": -4.180477142333984, + "loss": 0.0689, + "odds_ratio_loss": 0.005267998669296503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006610743701457977, + "rewards/margins": 0.41143694519996643, + "rewards/rejected": -0.418047696352005, + "sft_loss": 0.06610743701457977, + "step": 3088 + }, + { + "epoch": 4.46710050614606, + "grad_norm": 1.7924040631150486, + "learning_rate": 1.2732435053537657e-06, + "logits/chosen": -1.0045921802520752, + "logits/rejected": -0.7713348269462585, + "logps/chosen": -0.04516824334859848, + "logps/rejected": -5.035297870635986, + "loss": 0.063, + "odds_ratio_loss": 0.0038515704218298197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004516824148595333, + "rewards/margins": 0.49901291728019714, + "rewards/rejected": -0.5035297870635986, + "sft_loss": 0.04516824334859848, + "step": 3089 + }, + { + "epoch": 4.468546637744034, + "grad_norm": 1.6682105497831299, + "learning_rate": 1.2709719390697484e-06, + "logits/chosen": -0.6681150794029236, + "logits/rejected": -0.5994194746017456, + "logps/chosen": -0.08257712423801422, + "logps/rejected": -4.70539665222168, + "loss": 0.058, + "odds_ratio_loss": 0.010448940098285675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008257712237536907, + "rewards/margins": 0.4622820019721985, + "rewards/rejected": -0.4705396592617035, + "sft_loss": 0.08257712423801422, + "step": 3090 + }, + { + "epoch": 4.46999276934201, + "grad_norm": 1.7738282748311598, + "learning_rate": 1.2687020181269147e-06, + "logits/chosen": -0.7731041312217712, + "logits/rejected": -0.6896120309829712, + "logps/chosen": -0.13104453682899475, + "logps/rejected": -5.16012716293335, + "loss": 0.0768, + "odds_ratio_loss": 0.023107830435037613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0131044527515769, + "rewards/margins": 0.5029082298278809, + "rewards/rejected": -0.5160127878189087, + "sft_loss": 0.13104453682899475, + "step": 3091 + }, + { + "epoch": 4.471438900939986, + "grad_norm": 1.6843927662054177, + "learning_rate": 1.2664337438938052e-06, + "logits/chosen": -0.8892203569412231, + "logits/rejected": -0.742080807685852, + "logps/chosen": -0.12089746445417404, + "logps/rejected": -4.260397911071777, + "loss": 0.0922, + "odds_ratio_loss": 0.012745104730129242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012089746072888374, + "rewards/margins": 0.41395002603530884, + "rewards/rejected": -0.42603975534439087, + "sft_loss": 0.12089746445417404, + "step": 3092 + }, + { + "epoch": 4.4728850325379605, + "grad_norm": 1.5276591685861431, + "learning_rate": 1.264167117737974e-06, + "logits/chosen": -0.7408867478370667, + "logits/rejected": -0.5432535409927368, + "logps/chosen": -0.055726416409015656, + "logps/rejected": -5.040454387664795, + "loss": 0.0677, + "odds_ratio_loss": 0.01312476396560669, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00557264219969511, + "rewards/margins": 0.49847283959388733, + "rewards/rejected": -0.5040454268455505, + "sft_loss": 0.055726416409015656, + "step": 3093 + }, + { + "epoch": 4.474331164135936, + "grad_norm": 1.6530262946310774, + "learning_rate": 1.2619021410259749e-06, + "logits/chosen": -0.705237627029419, + "logits/rejected": -0.39580756425857544, + "logps/chosen": -0.06976450234651566, + "logps/rejected": -4.587730884552002, + "loss": 0.0955, + "odds_ratio_loss": 0.003844949882477522, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006976449862122536, + "rewards/margins": 0.45179668068885803, + "rewards/rejected": -0.458773136138916, + "sft_loss": 0.06976450234651566, + "step": 3094 + }, + { + "epoch": 4.475777295733912, + "grad_norm": 1.8816842919449692, + "learning_rate": 1.2596388151233749e-06, + "logits/chosen": -0.6847041845321655, + "logits/rejected": -0.5424312353134155, + "logps/chosen": -0.09019088745117188, + "logps/rejected": -5.533663749694824, + "loss": 0.1015, + "odds_ratio_loss": 0.006966853979974985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009019088931381702, + "rewards/margins": 0.54434734582901, + "rewards/rejected": -0.5533663630485535, + "sft_loss": 0.09019088745117188, + "step": 3095 + }, + { + "epoch": 4.477223427331888, + "grad_norm": 1.5343422977668435, + "learning_rate": 1.2573771413947385e-06, + "logits/chosen": -0.8714608550071716, + "logits/rejected": -0.5927603244781494, + "logps/chosen": -0.04954008013010025, + "logps/rejected": -5.682701587677002, + "loss": 0.0522, + "odds_ratio_loss": 0.005804221611469984, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00495400745421648, + "rewards/margins": 0.5633162260055542, + "rewards/rejected": -0.568270206451416, + "sft_loss": 0.04954008013010025, + "step": 3096 + }, + { + "epoch": 4.478669558929862, + "grad_norm": 1.3475022711559164, + "learning_rate": 1.2551171212036388e-06, + "logits/chosen": -0.8582381010055542, + "logits/rejected": -0.744134783744812, + "logps/chosen": -0.05268469080328941, + "logps/rejected": -4.953915596008301, + "loss": 0.0539, + "odds_ratio_loss": 0.005596311762928963, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005268468987196684, + "rewards/margins": 0.49012309312820435, + "rewards/rejected": -0.4953915476799011, + "sft_loss": 0.05268469080328941, + "step": 3097 + }, + { + "epoch": 4.480115690527838, + "grad_norm": 1.6798491379007103, + "learning_rate": 1.2528587559126482e-06, + "logits/chosen": -0.7833631038665771, + "logits/rejected": -0.5663323998451233, + "logps/chosen": -0.04492802917957306, + "logps/rejected": -3.8920435905456543, + "loss": 0.0682, + "odds_ratio_loss": 0.002451021457090974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004492803476750851, + "rewards/margins": 0.3847115635871887, + "rewards/rejected": -0.38920438289642334, + "sft_loss": 0.04492802917957306, + "step": 3098 + }, + { + "epoch": 4.481561822125814, + "grad_norm": 1.4371191416192433, + "learning_rate": 1.2506020468833467e-06, + "logits/chosen": -0.8777024149894714, + "logits/rejected": -0.6330994963645935, + "logps/chosen": -0.1080128401517868, + "logps/rejected": -5.859587669372559, + "loss": 0.0664, + "odds_ratio_loss": 0.007772314827889204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010801284573972225, + "rewards/margins": 0.5751575231552124, + "rewards/rejected": -0.5859587788581848, + "sft_loss": 0.1080128401517868, + "step": 3099 + }, + { + "epoch": 4.483007953723789, + "grad_norm": 1.6584298264925346, + "learning_rate": 1.2483469954763096e-06, + "logits/chosen": -0.7187407612800598, + "logits/rejected": -0.6401557326316833, + "logps/chosen": -0.19415581226348877, + "logps/rejected": -5.682297706604004, + "loss": 0.1093, + "odds_ratio_loss": 0.006913043558597565, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019415581598877907, + "rewards/margins": 0.5488142371177673, + "rewards/rejected": -0.5682297945022583, + "sft_loss": 0.19415581226348877, + "step": 3100 + }, + { + "epoch": 4.484454085321764, + "grad_norm": 1.490297451192404, + "learning_rate": 1.2460936030511184e-06, + "logits/chosen": -1.0072453022003174, + "logits/rejected": -0.7015312314033508, + "logps/chosen": -0.04474491626024246, + "logps/rejected": -3.6979777812957764, + "loss": 0.062, + "odds_ratio_loss": 0.0034636668860912323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0044744922779500484, + "rewards/margins": 0.3653232455253601, + "rewards/rejected": -0.36979779601097107, + "sft_loss": 0.04474491626024246, + "step": 3101 + }, + { + "epoch": 4.48590021691974, + "grad_norm": 1.4188491354073367, + "learning_rate": 1.2438418709663489e-06, + "logits/chosen": -0.6652334928512573, + "logits/rejected": -0.4683679938316345, + "logps/chosen": -0.0296041090041399, + "logps/rejected": -4.450751304626465, + "loss": 0.0804, + "odds_ratio_loss": 0.0026001809164881706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0029604113660752773, + "rewards/margins": 0.4421147406101227, + "rewards/rejected": -0.445075124502182, + "sft_loss": 0.0296041090041399, + "step": 3102 + }, + { + "epoch": 4.487346348517715, + "grad_norm": 1.6423753492513558, + "learning_rate": 1.2415918005795823e-06, + "logits/chosen": -0.928438663482666, + "logits/rejected": -0.5258166193962097, + "logps/chosen": -0.12412159889936447, + "logps/rejected": -5.318241119384766, + "loss": 0.0877, + "odds_ratio_loss": 0.0032795346342027187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012412158772349358, + "rewards/margins": 0.5194119215011597, + "rewards/rejected": -0.5318241119384766, + "sft_loss": 0.12412159889936447, + "step": 3103 + }, + { + "epoch": 4.48879248011569, + "grad_norm": 2.2357703682444248, + "learning_rate": 1.2393433932473922e-06, + "logits/chosen": -0.8368874192237854, + "logits/rejected": -0.5514189004898071, + "logps/chosen": -0.0364052839577198, + "logps/rejected": -4.619117736816406, + "loss": 0.0767, + "odds_ratio_loss": 0.0018499845173209906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003640528302639723, + "rewards/margins": 0.4582712650299072, + "rewards/rejected": -0.46191176772117615, + "sft_loss": 0.0364052839577198, + "step": 3104 + }, + { + "epoch": 4.490238611713666, + "grad_norm": 1.475287637368776, + "learning_rate": 1.237096650325351e-06, + "logits/chosen": -1.124525785446167, + "logits/rejected": -0.7751089334487915, + "logps/chosen": -0.0581241250038147, + "logps/rejected": -4.338222503662109, + "loss": 0.0609, + "odds_ratio_loss": 0.005026786122471094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005812412593513727, + "rewards/margins": 0.4280098080635071, + "rewards/rejected": -0.4338222146034241, + "sft_loss": 0.0581241250038147, + "step": 3105 + }, + { + "epoch": 4.491684743311641, + "grad_norm": 1.457490704264928, + "learning_rate": 1.2348515731680306e-06, + "logits/chosen": -0.7537250518798828, + "logits/rejected": -0.6656769514083862, + "logps/chosen": -0.15097872912883759, + "logps/rejected": -4.950982093811035, + "loss": 0.0875, + "odds_ratio_loss": 0.01871381886303425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015097874216735363, + "rewards/margins": 0.480000376701355, + "rewards/rejected": -0.4950982332229614, + "sft_loss": 0.15097872912883759, + "step": 3106 + }, + { + "epoch": 4.493130874909617, + "grad_norm": 1.4918833440545058, + "learning_rate": 1.2326081631289941e-06, + "logits/chosen": -0.787826657295227, + "logits/rejected": -0.6312362551689148, + "logps/chosen": -0.07020243257284164, + "logps/rejected": -6.140321731567383, + "loss": 0.0824, + "odds_ratio_loss": 0.011244947090744972, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007020242977887392, + "rewards/margins": 0.6070119142532349, + "rewards/rejected": -0.6140321493148804, + "sft_loss": 0.07020243257284164, + "step": 3107 + }, + { + "epoch": 4.494577006507592, + "grad_norm": 1.9028191693388976, + "learning_rate": 1.230366421560804e-06, + "logits/chosen": -0.8860039710998535, + "logits/rejected": -0.6597675681114197, + "logps/chosen": -0.05819562450051308, + "logps/rejected": -4.343313694000244, + "loss": 0.0756, + "odds_ratio_loss": 0.009293823502957821, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00581956235691905, + "rewards/margins": 0.4285118281841278, + "rewards/rejected": -0.43433141708374023, + "sft_loss": 0.05819562450051308, + "step": 3108 + }, + { + "epoch": 4.496023138105568, + "grad_norm": 1.6718646559326362, + "learning_rate": 1.2281263498150125e-06, + "logits/chosen": -0.8646371960639954, + "logits/rejected": -0.9306697845458984, + "logps/chosen": -0.06505996733903885, + "logps/rejected": -6.015259265899658, + "loss": 0.0827, + "odds_ratio_loss": 0.0021365510765463114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006505997385829687, + "rewards/margins": 0.5950199365615845, + "rewards/rejected": -0.6015259027481079, + "sft_loss": 0.06505996733903885, + "step": 3109 + }, + { + "epoch": 4.497469269703543, + "grad_norm": 1.4464144348963581, + "learning_rate": 1.2258879492421695e-06, + "logits/chosen": -0.8180409669876099, + "logits/rejected": -0.5849642753601074, + "logps/chosen": -0.08725761622190475, + "logps/rejected": -6.156432151794434, + "loss": 0.074, + "odds_ratio_loss": 0.002803635550662875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00872576143592596, + "rewards/margins": 0.6069174408912659, + "rewards/rejected": -0.6156432032585144, + "sft_loss": 0.08725761622190475, + "step": 3110 + }, + { + "epoch": 4.4989154013015185, + "grad_norm": 1.603659515241102, + "learning_rate": 1.2236512211918125e-06, + "logits/chosen": -0.7529808282852173, + "logits/rejected": -0.6013559103012085, + "logps/chosen": -0.06795186549425125, + "logps/rejected": -5.375178813934326, + "loss": 0.0702, + "odds_ratio_loss": 0.009950753301382065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006795186549425125, + "rewards/margins": 0.5307227373123169, + "rewards/rejected": -0.5375179052352905, + "sft_loss": 0.06795186549425125, + "step": 3111 + }, + { + "epoch": 4.500361532899494, + "grad_norm": 1.6272200025476848, + "learning_rate": 1.2214161670124767e-06, + "logits/chosen": -0.8545068502426147, + "logits/rejected": -0.6369377970695496, + "logps/chosen": -0.06836562603712082, + "logps/rejected": -4.065532207489014, + "loss": 0.0764, + "odds_ratio_loss": 0.006054166704416275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006836562883108854, + "rewards/margins": 0.39971664547920227, + "rewards/rejected": -0.40655317902565, + "sft_loss": 0.06836562603712082, + "step": 3112 + }, + { + "epoch": 4.501807664497469, + "grad_norm": 1.7318462854388412, + "learning_rate": 1.2191827880516804e-06, + "logits/chosen": -0.8255729675292969, + "logits/rejected": -0.6984921097755432, + "logps/chosen": -0.045626379549503326, + "logps/rejected": -3.896320343017578, + "loss": 0.0746, + "odds_ratio_loss": 0.008824576623737812, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004562637768685818, + "rewards/margins": 0.3850694000720978, + "rewards/rejected": -0.3896320164203644, + "sft_loss": 0.045626379549503326, + "step": 3113 + }, + { + "epoch": 4.503253796095445, + "grad_norm": 1.553585178335543, + "learning_rate": 1.216951085655939e-06, + "logits/chosen": -0.8265068531036377, + "logits/rejected": -0.6123427152633667, + "logps/chosen": -0.0894949659705162, + "logps/rejected": -4.58646297454834, + "loss": 0.0653, + "odds_ratio_loss": 0.009005776606500149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008949496783316135, + "rewards/margins": 0.449696809053421, + "rewards/rejected": -0.458646297454834, + "sft_loss": 0.0894949659705162, + "step": 3114 + }, + { + "epoch": 4.50469992769342, + "grad_norm": 1.6527698553956316, + "learning_rate": 1.214721061170752e-06, + "logits/chosen": -0.9693940877914429, + "logits/rejected": -0.6829290390014648, + "logps/chosen": -0.10791851580142975, + "logps/rejected": -4.9538068771362305, + "loss": 0.0856, + "odds_ratio_loss": 0.005190132651478052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010791851207613945, + "rewards/margins": 0.4845888018608093, + "rewards/rejected": -0.4953806698322296, + "sft_loss": 0.10791851580142975, + "step": 3115 + }, + { + "epoch": 4.506146059291396, + "grad_norm": 1.6640371254845585, + "learning_rate": 1.2124927159406108e-06, + "logits/chosen": -0.7707617878913879, + "logits/rejected": -0.6420580148696899, + "logps/chosen": -0.05679768696427345, + "logps/rejected": -3.126427173614502, + "loss": 0.0911, + "odds_ratio_loss": 0.005179098807275295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00567976851016283, + "rewards/margins": 0.30696290731430054, + "rewards/rejected": -0.3126426935195923, + "sft_loss": 0.05679768696427345, + "step": 3116 + }, + { + "epoch": 4.507592190889371, + "grad_norm": 2.0472468137344806, + "learning_rate": 1.210266051308994e-06, + "logits/chosen": -0.8093677759170532, + "logits/rejected": -0.6810101270675659, + "logps/chosen": -0.06034237891435623, + "logps/rejected": -4.68641471862793, + "loss": 0.0849, + "odds_ratio_loss": 0.009722420014441013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006034237798303366, + "rewards/margins": 0.4626072943210602, + "rewards/rejected": -0.4686414897441864, + "sft_loss": 0.06034237891435623, + "step": 3117 + }, + { + "epoch": 4.5090383224873465, + "grad_norm": 1.5325845079517588, + "learning_rate": 1.208041068618364e-06, + "logits/chosen": -0.7304118871688843, + "logits/rejected": -0.6360498070716858, + "logps/chosen": -0.057711161673069, + "logps/rejected": -4.730152130126953, + "loss": 0.0471, + "odds_ratio_loss": 0.006703021004796028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005771116353571415, + "rewards/margins": 0.46724411845207214, + "rewards/rejected": -0.4730152487754822, + "sft_loss": 0.057711161673069, + "step": 3118 + }, + { + "epoch": 4.510484454085322, + "grad_norm": 1.6384690787658085, + "learning_rate": 1.205817769210173e-06, + "logits/chosen": -0.596650242805481, + "logits/rejected": -0.5212622284889221, + "logps/chosen": -0.10295553505420685, + "logps/rejected": -5.2416276931762695, + "loss": 0.0868, + "odds_ratio_loss": 0.01541278325021267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010295553132891655, + "rewards/margins": 0.5138672590255737, + "rewards/rejected": -0.524162769317627, + "sft_loss": 0.10295553505420685, + "step": 3119 + }, + { + "epoch": 4.511930585683297, + "grad_norm": 3.2031844268630087, + "learning_rate": 1.2035961544248557e-06, + "logits/chosen": -0.8341730833053589, + "logits/rejected": -0.5385291576385498, + "logps/chosen": -0.05033033713698387, + "logps/rejected": -5.8386383056640625, + "loss": 0.1294, + "odds_ratio_loss": 0.005003097467124462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005033033899962902, + "rewards/margins": 0.5788308382034302, + "rewards/rejected": -0.5838638544082642, + "sft_loss": 0.05033033713698387, + "step": 3120 + }, + { + "epoch": 4.513376717281273, + "grad_norm": 1.8825190631616322, + "learning_rate": 1.2013762256018316e-06, + "logits/chosen": -0.5786645412445068, + "logits/rejected": -0.44824934005737305, + "logps/chosen": -0.08105679601430893, + "logps/rejected": -6.404214859008789, + "loss": 0.0781, + "odds_ratio_loss": 0.009483350440859795, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008105679415166378, + "rewards/margins": 0.6323158144950867, + "rewards/rejected": -0.6404215097427368, + "sft_loss": 0.08105679601430893, + "step": 3121 + }, + { + "epoch": 4.514822848879248, + "grad_norm": 1.6008860726040326, + "learning_rate": 1.1991579840795037e-06, + "logits/chosen": -0.6722384691238403, + "logits/rejected": -0.48127275705337524, + "logps/chosen": -0.06287876516580582, + "logps/rejected": -6.787384986877441, + "loss": 0.0912, + "odds_ratio_loss": 0.007575103081762791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006287876050919294, + "rewards/margins": 0.6724505424499512, + "rewards/rejected": -0.6787384152412415, + "sft_loss": 0.06287876516580582, + "step": 3122 + }, + { + "epoch": 4.516268980477223, + "grad_norm": 1.342589231387517, + "learning_rate": 1.1969414311952593e-06, + "logits/chosen": -0.8379350304603577, + "logits/rejected": -0.6947819590568542, + "logps/chosen": -0.034005191177129745, + "logps/rejected": -5.846619606018066, + "loss": 0.0581, + "odds_ratio_loss": 0.0031600147485733032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0034005185589194298, + "rewards/margins": 0.581261396408081, + "rewards/rejected": -0.5846619606018066, + "sft_loss": 0.034005191177129745, + "step": 3123 + }, + { + "epoch": 4.517715112075199, + "grad_norm": 1.5147641999518209, + "learning_rate": 1.1947265682854645e-06, + "logits/chosen": -0.7644755840301514, + "logits/rejected": -0.6600558161735535, + "logps/chosen": -0.11227733641862869, + "logps/rejected": -5.182661056518555, + "loss": 0.0626, + "odds_ratio_loss": 0.019188618287444115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011227734386920929, + "rewards/margins": 0.5070383548736572, + "rewards/rejected": -0.5182661414146423, + "sft_loss": 0.11227733641862869, + "step": 3124 + }, + { + "epoch": 4.5191612436731745, + "grad_norm": 1.5468387885414232, + "learning_rate": 1.192513396685471e-06, + "logits/chosen": -0.7222833037376404, + "logits/rejected": -0.5326530933380127, + "logps/chosen": -0.02052503637969494, + "logps/rejected": -5.6258955001831055, + "loss": 0.0875, + "odds_ratio_loss": 0.002066811081022024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002052503637969494, + "rewards/margins": 0.5605370402336121, + "rewards/rejected": -0.5625895261764526, + "sft_loss": 0.02052503637969494, + "step": 3125 + }, + { + "epoch": 4.520607375271149, + "grad_norm": 1.665196211024822, + "learning_rate": 1.190301917729606e-06, + "logits/chosen": -0.7487690448760986, + "logits/rejected": -0.7268781661987305, + "logps/chosen": -0.09351005405187607, + "logps/rejected": -5.104481220245361, + "loss": 0.1071, + "odds_ratio_loss": 0.0030038892291486263, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009351005777716637, + "rewards/margins": 0.5010970830917358, + "rewards/rejected": -0.5104480981826782, + "sft_loss": 0.09351005405187607, + "step": 3126 + }, + { + "epoch": 4.522053506869125, + "grad_norm": 1.8089219923251223, + "learning_rate": 1.1880921327511799e-06, + "logits/chosen": -0.7335822582244873, + "logits/rejected": -0.635443389415741, + "logps/chosen": -0.06432907283306122, + "logps/rejected": -3.4109225273132324, + "loss": 0.087, + "odds_ratio_loss": 0.005904484074562788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006432907655835152, + "rewards/margins": 0.3346593677997589, + "rewards/rejected": -0.3410922884941101, + "sft_loss": 0.06432907283306122, + "step": 3127 + }, + { + "epoch": 4.523499638467101, + "grad_norm": 1.444253112444959, + "learning_rate": 1.1858840430824798e-06, + "logits/chosen": -0.8454017639160156, + "logits/rejected": -0.6811554431915283, + "logps/chosen": -0.045448124408721924, + "logps/rejected": -4.554635047912598, + "loss": 0.0643, + "odds_ratio_loss": 0.005859294906258583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004544812720268965, + "rewards/margins": 0.45091870427131653, + "rewards/rejected": -0.4554634988307953, + "sft_loss": 0.045448124408721924, + "step": 3128 + }, + { + "epoch": 4.5249457700650755, + "grad_norm": 1.4456132302971754, + "learning_rate": 1.1836776500547698e-06, + "logits/chosen": -0.804023802280426, + "logits/rejected": -0.6073635816574097, + "logps/chosen": -0.04560978338122368, + "logps/rejected": -3.941932201385498, + "loss": 0.0558, + "odds_ratio_loss": 0.006999202072620392, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004560978151857853, + "rewards/margins": 0.3896322250366211, + "rewards/rejected": -0.394193172454834, + "sft_loss": 0.04560978338122368, + "step": 3129 + }, + { + "epoch": 4.526391901663051, + "grad_norm": 1.7544043039159367, + "learning_rate": 1.181472954998295e-06, + "logits/chosen": -0.7125527858734131, + "logits/rejected": -0.5077294707298279, + "logps/chosen": -0.1320764422416687, + "logps/rejected": -4.950249671936035, + "loss": 0.0929, + "odds_ratio_loss": 0.011006537824869156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013207645155489445, + "rewards/margins": 0.4818173050880432, + "rewards/rejected": -0.49502497911453247, + "sft_loss": 0.1320764422416687, + "step": 3130 + }, + { + "epoch": 4.527838033261027, + "grad_norm": 1.4651207792497591, + "learning_rate": 1.1792699592422714e-06, + "logits/chosen": -0.6505829691886902, + "logits/rejected": -0.49591994285583496, + "logps/chosen": -0.06142151355743408, + "logps/rejected": -4.931596755981445, + "loss": 0.0811, + "odds_ratio_loss": 0.008902262896299362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006142151076346636, + "rewards/margins": 0.48701754212379456, + "rewards/rejected": -0.493159681558609, + "sft_loss": 0.06142151355743408, + "step": 3131 + }, + { + "epoch": 4.5292841648590025, + "grad_norm": 1.37934565512855, + "learning_rate": 1.1770686641148966e-06, + "logits/chosen": -0.9451402425765991, + "logits/rejected": -0.652174174785614, + "logps/chosen": -0.042112577706575394, + "logps/rejected": -4.959356307983398, + "loss": 0.0686, + "odds_ratio_loss": 0.00329545047134161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004211257677525282, + "rewards/margins": 0.49172443151474, + "rewards/rejected": -0.49593567848205566, + "sft_loss": 0.042112577706575394, + "step": 3132 + }, + { + "epoch": 4.530730296456977, + "grad_norm": 1.8806322433014044, + "learning_rate": 1.1748690709433361e-06, + "logits/chosen": -0.6774564981460571, + "logits/rejected": -0.47243747115135193, + "logps/chosen": -0.038698915392160416, + "logps/rejected": -5.015310764312744, + "loss": 0.0835, + "odds_ratio_loss": 0.003635758301243186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0038698913995176554, + "rewards/margins": 0.49766120314598083, + "rewards/rejected": -0.5015311241149902, + "sft_loss": 0.038698915392160416, + "step": 3133 + }, + { + "epoch": 4.532176428054953, + "grad_norm": 1.9020404331515046, + "learning_rate": 1.1726711810537366e-06, + "logits/chosen": -0.700002133846283, + "logits/rejected": -0.5288843512535095, + "logps/chosen": -0.15027688443660736, + "logps/rejected": -4.056690216064453, + "loss": 0.0959, + "odds_ratio_loss": 0.005189661867916584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015027688816189766, + "rewards/margins": 0.39064133167266846, + "rewards/rejected": -0.40566903352737427, + "sft_loss": 0.15027688443660736, + "step": 3134 + }, + { + "epoch": 4.533622559652929, + "grad_norm": 1.4043050533639423, + "learning_rate": 1.1704749957712117e-06, + "logits/chosen": -0.8214952945709229, + "logits/rejected": -0.5681087374687195, + "logps/chosen": -0.04724892973899841, + "logps/rejected": -4.8302321434021, + "loss": 0.0588, + "odds_ratio_loss": 0.007608736399561167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004724893253296614, + "rewards/margins": 0.4782983064651489, + "rewards/rejected": -0.4830232262611389, + "sft_loss": 0.04724892973899841, + "step": 3135 + }, + { + "epoch": 4.5350686912509035, + "grad_norm": 1.6404462461922473, + "learning_rate": 1.1682805164198502e-06, + "logits/chosen": -0.6733072996139526, + "logits/rejected": -0.5581374764442444, + "logps/chosen": -0.04132752865552902, + "logps/rejected": -4.327413082122803, + "loss": 0.0587, + "odds_ratio_loss": 0.004820593632757664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004132753238081932, + "rewards/margins": 0.4286085367202759, + "rewards/rejected": -0.43274131417274475, + "sft_loss": 0.04132752865552902, + "step": 3136 + }, + { + "epoch": 4.536514822848879, + "grad_norm": 1.8346210650122372, + "learning_rate": 1.1660877443227106e-06, + "logits/chosen": -0.8485831022262573, + "logits/rejected": -0.7402029037475586, + "logps/chosen": -0.10031185299158096, + "logps/rejected": -3.5375704765319824, + "loss": 0.1088, + "odds_ratio_loss": 0.013819573447108269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010031186044216156, + "rewards/margins": 0.3437258303165436, + "rewards/rejected": -0.35375702381134033, + "sft_loss": 0.10031185299158096, + "step": 3137 + }, + { + "epoch": 4.537960954446855, + "grad_norm": 1.4162384065240734, + "learning_rate": 1.1638966808018258e-06, + "logits/chosen": -0.6201837062835693, + "logits/rejected": -0.5881571769714355, + "logps/chosen": -0.05904054269194603, + "logps/rejected": -2.439631938934326, + "loss": 0.077, + "odds_ratio_loss": 0.009300749748945236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00590405473485589, + "rewards/margins": 0.2380591332912445, + "rewards/rejected": -0.24396318197250366, + "sft_loss": 0.05904054269194603, + "step": 3138 + }, + { + "epoch": 4.539407086044831, + "grad_norm": 1.4561234361716575, + "learning_rate": 1.1617073271781937e-06, + "logits/chosen": -0.8747523427009583, + "logits/rejected": -0.5830885767936707, + "logps/chosen": -0.06753970682621002, + "logps/rejected": -3.148228168487549, + "loss": 0.0605, + "odds_ratio_loss": 0.00502714142203331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006753971334546804, + "rewards/margins": 0.3080688416957855, + "rewards/rejected": -0.314822793006897, + "sft_loss": 0.06753970682621002, + "step": 3139 + }, + { + "epoch": 4.540853217642805, + "grad_norm": 1.4928781049529505, + "learning_rate": 1.1595196847717858e-06, + "logits/chosen": -0.537721574306488, + "logits/rejected": -0.4988245964050293, + "logps/chosen": -0.07080557942390442, + "logps/rejected": -4.220191955566406, + "loss": 0.085, + "odds_ratio_loss": 0.013362744823098183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007080557756125927, + "rewards/margins": 0.41493868827819824, + "rewards/rejected": -0.42201921343803406, + "sft_loss": 0.07080557942390442, + "step": 3140 + }, + { + "epoch": 4.542299349240781, + "grad_norm": 1.717320927515359, + "learning_rate": 1.1573337549015384e-06, + "logits/chosen": -0.7508547306060791, + "logits/rejected": -0.548348069190979, + "logps/chosen": -0.03961345553398132, + "logps/rejected": -4.493158340454102, + "loss": 0.0565, + "odds_ratio_loss": 0.003105518640950322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00396134564653039, + "rewards/margins": 0.44535452127456665, + "rewards/rejected": -0.4493158459663391, + "sft_loss": 0.03961345553398132, + "step": 3141 + }, + { + "epoch": 4.543745480838757, + "grad_norm": 1.9170535110296663, + "learning_rate": 1.1551495388853583e-06, + "logits/chosen": -0.6952974796295166, + "logits/rejected": -0.6106499433517456, + "logps/chosen": -0.12642665207386017, + "logps/rejected": -5.010893821716309, + "loss": 0.1007, + "odds_ratio_loss": 0.012112841010093689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012642664834856987, + "rewards/margins": 0.4884466528892517, + "rewards/rejected": -0.501089334487915, + "sft_loss": 0.12642665207386017, + "step": 3142 + }, + { + "epoch": 4.5451916124367315, + "grad_norm": 1.7576434765191336, + "learning_rate": 1.1529670380401166e-06, + "logits/chosen": -0.7798545360565186, + "logits/rejected": -0.6895734667778015, + "logps/chosen": -0.10837902128696442, + "logps/rejected": -3.75516414642334, + "loss": 0.0832, + "odds_ratio_loss": 0.010169276036322117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010837902314960957, + "rewards/margins": 0.3646785318851471, + "rewards/rejected": -0.375516414642334, + "sft_loss": 0.10837902128696442, + "step": 3143 + }, + { + "epoch": 4.546637744034707, + "grad_norm": 1.5294315441059878, + "learning_rate": 1.15078625368165e-06, + "logits/chosen": -0.8288568258285522, + "logits/rejected": -0.5528225898742676, + "logps/chosen": -0.08221597224473953, + "logps/rejected": -4.284519195556641, + "loss": 0.0672, + "odds_ratio_loss": 0.0071078077889978886, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008221596479415894, + "rewards/margins": 0.42023032903671265, + "rewards/rejected": -0.4284519553184509, + "sft_loss": 0.08221597224473953, + "step": 3144 + }, + { + "epoch": 4.548083875632683, + "grad_norm": 1.6890116765857413, + "learning_rate": 1.1486071871247637e-06, + "logits/chosen": -0.8703626990318298, + "logits/rejected": -0.65482097864151, + "logps/chosen": -0.05179030820727348, + "logps/rejected": -4.451789855957031, + "loss": 0.0888, + "odds_ratio_loss": 0.004219289869070053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005179030820727348, + "rewards/margins": 0.43999993801116943, + "rewards/rejected": -0.4451789855957031, + "sft_loss": 0.05179030820727348, + "step": 3145 + }, + { + "epoch": 4.549530007230658, + "grad_norm": 1.7118473076649063, + "learning_rate": 1.1464298396832232e-06, + "logits/chosen": -0.746877133846283, + "logits/rejected": -0.5233524441719055, + "logps/chosen": -0.061596743762493134, + "logps/rejected": -3.4973483085632324, + "loss": 0.0606, + "odds_ratio_loss": 0.0044494750909507275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006159674376249313, + "rewards/margins": 0.3435751795768738, + "rewards/rejected": -0.3497348725795746, + "sft_loss": 0.061596743762493134, + "step": 3146 + }, + { + "epoch": 4.550976138828633, + "grad_norm": 1.6246389022279693, + "learning_rate": 1.144254212669761e-06, + "logits/chosen": -0.6675397753715515, + "logits/rejected": -0.5680756568908691, + "logps/chosen": -0.12102605402469635, + "logps/rejected": -4.196258544921875, + "loss": 0.089, + "odds_ratio_loss": 0.015979530289769173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012102605774998665, + "rewards/margins": 0.4075232446193695, + "rewards/rejected": -0.41962581872940063, + "sft_loss": 0.12102605402469635, + "step": 3147 + }, + { + "epoch": 4.552422270426609, + "grad_norm": 1.4363813660078086, + "learning_rate": 1.142080307396069e-06, + "logits/chosen": -0.9800047874450684, + "logits/rejected": -0.6176702976226807, + "logps/chosen": -0.056100714951753616, + "logps/rejected": -4.86061954498291, + "loss": 0.0655, + "odds_ratio_loss": 0.007833914831280708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005610071122646332, + "rewards/margins": 0.4804518520832062, + "rewards/rejected": -0.4860619306564331, + "sft_loss": 0.056100714951753616, + "step": 3148 + }, + { + "epoch": 4.553868402024584, + "grad_norm": 1.7603116527742264, + "learning_rate": 1.1399081251728047e-06, + "logits/chosen": -0.7957381010055542, + "logits/rejected": -0.6939653754234314, + "logps/chosen": -0.07760776579380035, + "logps/rejected": -4.140201568603516, + "loss": 0.0792, + "odds_ratio_loss": 0.015399527736008167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0077607762068510056, + "rewards/margins": 0.4062593877315521, + "rewards/rejected": -0.4140201508998871, + "sft_loss": 0.07760776579380035, + "step": 3149 + }, + { + "epoch": 4.55531453362256, + "grad_norm": 1.7762852522056913, + "learning_rate": 1.1377376673095836e-06, + "logits/chosen": -0.7090476751327515, + "logits/rejected": -0.6018462777137756, + "logps/chosen": -0.06537199765443802, + "logps/rejected": -5.667291641235352, + "loss": 0.085, + "odds_ratio_loss": 0.010719805024564266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006537200417369604, + "rewards/margins": 0.5601919889450073, + "rewards/rejected": -0.5667291879653931, + "sft_loss": 0.06537199765443802, + "step": 3150 + }, + { + "epoch": 4.556760665220535, + "grad_norm": 1.7313737183629967, + "learning_rate": 1.1355689351149837e-06, + "logits/chosen": -0.7923053503036499, + "logits/rejected": -0.6517032384872437, + "logps/chosen": -0.08571956306695938, + "logps/rejected": -5.058340549468994, + "loss": 0.0809, + "odds_ratio_loss": 0.005782403517514467, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008571955375373363, + "rewards/margins": 0.4972621202468872, + "rewards/rejected": -0.5058341026306152, + "sft_loss": 0.08571956306695938, + "step": 3151 + }, + { + "epoch": 4.55820679681851, + "grad_norm": 1.4263625628365386, + "learning_rate": 1.1334019298965394e-06, + "logits/chosen": -0.6988058090209961, + "logits/rejected": -0.5728355050086975, + "logps/chosen": -0.03624889254570007, + "logps/rejected": -3.573759078979492, + "loss": 0.064, + "odds_ratio_loss": 0.0032424121163785458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003624889301136136, + "rewards/margins": 0.3537510633468628, + "rewards/rejected": -0.3573759198188782, + "sft_loss": 0.03624889254570007, + "step": 3152 + }, + { + "epoch": 4.559652928416486, + "grad_norm": 1.6847747000268567, + "learning_rate": 1.1312366529607493e-06, + "logits/chosen": -1.2507294416427612, + "logits/rejected": -0.6984279751777649, + "logps/chosen": -0.07849462330341339, + "logps/rejected": -4.19482421875, + "loss": 0.0845, + "odds_ratio_loss": 0.00989564135670662, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007849462330341339, + "rewards/margins": 0.4116330146789551, + "rewards/rejected": -0.4194824695587158, + "sft_loss": 0.07849462330341339, + "step": 3153 + }, + { + "epoch": 4.561099060014461, + "grad_norm": 1.4166334580337705, + "learning_rate": 1.1290731056130645e-06, + "logits/chosen": -1.1231694221496582, + "logits/rejected": -0.7405078411102295, + "logps/chosen": -0.09685708582401276, + "logps/rejected": -4.7172393798828125, + "loss": 0.074, + "odds_ratio_loss": 0.010411504656076431, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00968570914119482, + "rewards/margins": 0.4620382487773895, + "rewards/rejected": -0.4717239737510681, + "sft_loss": 0.09685708582401276, + "step": 3154 + }, + { + "epoch": 4.562545191612437, + "grad_norm": 1.472571547360005, + "learning_rate": 1.1269112891578964e-06, + "logits/chosen": -0.8097749352455139, + "logits/rejected": -0.6882657408714294, + "logps/chosen": -0.09382905811071396, + "logps/rejected": -3.311452627182007, + "loss": 0.0875, + "odds_ratio_loss": 0.010742062702775002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009382905438542366, + "rewards/margins": 0.321762353181839, + "rewards/rejected": -0.3311452567577362, + "sft_loss": 0.09382905811071396, + "step": 3155 + }, + { + "epoch": 4.563991323210412, + "grad_norm": 2.522302408631471, + "learning_rate": 1.124751204898614e-06, + "logits/chosen": -1.0051918029785156, + "logits/rejected": -0.6038036346435547, + "logps/chosen": -0.0715935081243515, + "logps/rejected": -5.134657382965088, + "loss": 0.0943, + "odds_ratio_loss": 0.004194090608507395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007159349974244833, + "rewards/margins": 0.5063064098358154, + "rewards/rejected": -0.5134657621383667, + "sft_loss": 0.0715935081243515, + "step": 3156 + }, + { + "epoch": 4.565437454808388, + "grad_norm": 1.3968773246812638, + "learning_rate": 1.1225928541375376e-06, + "logits/chosen": -0.8158324956893921, + "logits/rejected": -0.6068318486213684, + "logps/chosen": -0.07950488477945328, + "logps/rejected": -9.155324935913086, + "loss": 0.0772, + "odds_ratio_loss": 0.005621365271508694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007950487546622753, + "rewards/margins": 0.9075820446014404, + "rewards/rejected": -0.9155324697494507, + "sft_loss": 0.07950488477945328, + "step": 3157 + }, + { + "epoch": 4.566883586406363, + "grad_norm": 1.6581570448756777, + "learning_rate": 1.1204362381759485e-06, + "logits/chosen": -1.0890898704528809, + "logits/rejected": -0.6594418287277222, + "logps/chosen": -0.07467133551836014, + "logps/rejected": -5.057663917541504, + "loss": 0.0913, + "odds_ratio_loss": 0.001974435057491064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007467133458703756, + "rewards/margins": 0.4982993006706238, + "rewards/rejected": -0.5057663917541504, + "sft_loss": 0.07467133551836014, + "step": 3158 + }, + { + "epoch": 4.568329718004338, + "grad_norm": 2.052336604338936, + "learning_rate": 1.1182813583140736e-06, + "logits/chosen": -0.9036225080490112, + "logits/rejected": -0.690902054309845, + "logps/chosen": -0.10851224511861801, + "logps/rejected": -6.375421047210693, + "loss": 0.1284, + "odds_ratio_loss": 0.010672297328710556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010851224884390831, + "rewards/margins": 0.6266908645629883, + "rewards/rejected": -0.6375421285629272, + "sft_loss": 0.10851224511861801, + "step": 3159 + }, + { + "epoch": 4.569775849602314, + "grad_norm": 1.3507760008718734, + "learning_rate": 1.1161282158511016e-06, + "logits/chosen": -0.692068874835968, + "logits/rejected": -0.607538104057312, + "logps/chosen": -0.05230974406003952, + "logps/rejected": -4.683342456817627, + "loss": 0.0577, + "odds_ratio_loss": 0.0022822008468210697, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005230974406003952, + "rewards/margins": 0.4631032943725586, + "rewards/rejected": -0.46833428740501404, + "sft_loss": 0.05230974406003952, + "step": 3160 + }, + { + "epoch": 4.571221981200289, + "grad_norm": 1.3867036048678973, + "learning_rate": 1.1139768120851677e-06, + "logits/chosen": -0.8184716701507568, + "logits/rejected": -0.6516551971435547, + "logps/chosen": -0.04152075573801994, + "logps/rejected": -6.1955413818359375, + "loss": 0.0513, + "odds_ratio_loss": 0.0025585112161934376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004152075387537479, + "rewards/margins": 0.6154020428657532, + "rewards/rejected": -0.6195541024208069, + "sft_loss": 0.04152075573801994, + "step": 3161 + }, + { + "epoch": 4.572668112798265, + "grad_norm": 1.5935727445354109, + "learning_rate": 1.1118271483133638e-06, + "logits/chosen": -0.6832992434501648, + "logits/rejected": -0.6606971621513367, + "logps/chosen": -0.031898848712444305, + "logps/rejected": -4.598211288452148, + "loss": 0.0939, + "odds_ratio_loss": 0.0026514395140111446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0031898850575089455, + "rewards/margins": 0.45663124322891235, + "rewards/rejected": -0.4598211646080017, + "sft_loss": 0.031898848712444305, + "step": 3162 + }, + { + "epoch": 4.57411424439624, + "grad_norm": 1.6348180305027182, + "learning_rate": 1.1096792258317273e-06, + "logits/chosen": -0.8901809453964233, + "logits/rejected": -0.6974164247512817, + "logps/chosen": -0.07738389074802399, + "logps/rejected": -4.344188690185547, + "loss": 0.1015, + "odds_ratio_loss": 0.00792864803224802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007738389540463686, + "rewards/margins": 0.42668044567108154, + "rewards/rejected": -0.43441885709762573, + "sft_loss": 0.07738389074802399, + "step": 3163 + }, + { + "epoch": 4.575560375994216, + "grad_norm": 1.452333306176182, + "learning_rate": 1.1075330459352517e-06, + "logits/chosen": -0.8866585493087769, + "logits/rejected": -0.6407424211502075, + "logps/chosen": -0.0721864402294159, + "logps/rejected": -4.096374988555908, + "loss": 0.0696, + "odds_ratio_loss": 0.010981040075421333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007218644022941589, + "rewards/margins": 0.402418851852417, + "rewards/rejected": -0.4096375107765198, + "sft_loss": 0.0721864402294159, + "step": 3164 + }, + { + "epoch": 4.577006507592191, + "grad_norm": 1.6162076523042697, + "learning_rate": 1.1053886099178745e-06, + "logits/chosen": -0.7911291718482971, + "logits/rejected": -0.6464878916740417, + "logps/chosen": -0.10432671755552292, + "logps/rejected": -3.5140199661254883, + "loss": 0.0838, + "odds_ratio_loss": 0.008694362826645374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010432671755552292, + "rewards/margins": 0.3409693241119385, + "rewards/rejected": -0.35140201449394226, + "sft_loss": 0.10432671755552292, + "step": 3165 + }, + { + "epoch": 4.578452639190166, + "grad_norm": 1.8954756696938493, + "learning_rate": 1.1032459190724858e-06, + "logits/chosen": -0.8337982892990112, + "logits/rejected": -0.7270399332046509, + "logps/chosen": -0.11213445663452148, + "logps/rejected": -4.699167251586914, + "loss": 0.1142, + "odds_ratio_loss": 0.007468009367585182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011213446035981178, + "rewards/margins": 0.45870327949523926, + "rewards/rejected": -0.46991676092147827, + "sft_loss": 0.11213445663452148, + "step": 3166 + }, + { + "epoch": 4.579898770788142, + "grad_norm": 1.972473660722877, + "learning_rate": 1.1011049746909216e-06, + "logits/chosen": -0.9359748363494873, + "logits/rejected": -0.7544848918914795, + "logps/chosen": -0.1299418807029724, + "logps/rejected": -4.122316360473633, + "loss": 0.085, + "odds_ratio_loss": 0.009792567230761051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012994188815355301, + "rewards/margins": 0.3992374539375305, + "rewards/rejected": -0.4122316241264343, + "sft_loss": 0.1299418807029724, + "step": 3167 + }, + { + "epoch": 4.5813449023861175, + "grad_norm": 2.1773388410438055, + "learning_rate": 1.0989657780639632e-06, + "logits/chosen": -0.8588944673538208, + "logits/rejected": -0.5516154766082764, + "logps/chosen": -0.08692246675491333, + "logps/rejected": -6.501953125, + "loss": 0.0723, + "odds_ratio_loss": 0.009055457077920437, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008692245930433273, + "rewards/margins": 0.641503095626831, + "rewards/rejected": -0.6501953601837158, + "sft_loss": 0.08692246675491333, + "step": 3168 + }, + { + "epoch": 4.582791033984092, + "grad_norm": 1.71320771588054, + "learning_rate": 1.0968283304813435e-06, + "logits/chosen": -0.7051694989204407, + "logits/rejected": -0.5732921361923218, + "logps/chosen": -0.054236263036727905, + "logps/rejected": -5.548681735992432, + "loss": 0.1064, + "odds_ratio_loss": 0.002435041591525078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005423626862466335, + "rewards/margins": 0.5494445562362671, + "rewards/rejected": -0.5548681616783142, + "sft_loss": 0.054236263036727905, + "step": 3169 + }, + { + "epoch": 4.584237165582068, + "grad_norm": 1.8240270589071466, + "learning_rate": 1.0946926332317344e-06, + "logits/chosen": -0.7067397832870483, + "logits/rejected": -0.8004418015480042, + "logps/chosen": -0.10146773606538773, + "logps/rejected": -3.536372184753418, + "loss": 0.1123, + "odds_ratio_loss": 0.024081986397504807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010146773420274258, + "rewards/margins": 0.34349045157432556, + "rewards/rejected": -0.3536372184753418, + "sft_loss": 0.10146773606538773, + "step": 3170 + }, + { + "epoch": 4.585683297180044, + "grad_norm": 2.3802481561976525, + "learning_rate": 1.092558687602758e-06, + "logits/chosen": -0.8300359845161438, + "logits/rejected": -0.5264511704444885, + "logps/chosen": -0.13310036063194275, + "logps/rejected": -5.091741561889648, + "loss": 0.1434, + "odds_ratio_loss": 0.0062806615605950356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01331003662198782, + "rewards/margins": 0.4958640933036804, + "rewards/rejected": -0.5091741681098938, + "sft_loss": 0.13310036063194275, + "step": 3171 + }, + { + "epoch": 4.587129428778018, + "grad_norm": 1.4752608857026752, + "learning_rate": 1.0904264948809769e-06, + "logits/chosen": -0.8397960662841797, + "logits/rejected": -0.6188733577728271, + "logps/chosen": -0.056685350835323334, + "logps/rejected": -3.9327101707458496, + "loss": 0.0549, + "odds_ratio_loss": 0.009330632165074348, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005668535362929106, + "rewards/margins": 0.3876025080680847, + "rewards/rejected": -0.3932710289955139, + "sft_loss": 0.056685350835323334, + "step": 3172 + }, + { + "epoch": 4.588575560375994, + "grad_norm": 1.4437816871536389, + "learning_rate": 1.0882960563518993e-06, + "logits/chosen": -0.6703709959983826, + "logits/rejected": -0.5640783905982971, + "logps/chosen": -0.03526798635721207, + "logps/rejected": -4.020233154296875, + "loss": 0.0908, + "odds_ratio_loss": 0.004295479506254196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003526798915117979, + "rewards/margins": 0.3984965980052948, + "rewards/rejected": -0.4020233452320099, + "sft_loss": 0.03526798635721207, + "step": 3173 + }, + { + "epoch": 4.59002169197397, + "grad_norm": 1.50024639564137, + "learning_rate": 1.0861673732999737e-06, + "logits/chosen": -0.6850433349609375, + "logits/rejected": -0.5941628217697144, + "logps/chosen": -0.17146408557891846, + "logps/rejected": -4.675539970397949, + "loss": 0.0941, + "odds_ratio_loss": 0.01795043796300888, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017146408557891846, + "rewards/margins": 0.45040759444236755, + "rewards/rejected": -0.4675540030002594, + "sft_loss": 0.17146408557891846, + "step": 3174 + }, + { + "epoch": 4.591467823571945, + "grad_norm": 1.757910834910301, + "learning_rate": 1.0840404470085908e-06, + "logits/chosen": -0.8096254467964172, + "logits/rejected": -0.5914109945297241, + "logps/chosen": -0.06457695364952087, + "logps/rejected": -3.9802322387695312, + "loss": 0.149, + "odds_ratio_loss": 0.009919540025293827, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006457695737481117, + "rewards/margins": 0.3915655314922333, + "rewards/rejected": -0.39802321791648865, + "sft_loss": 0.06457695364952087, + "step": 3175 + }, + { + "epoch": 4.59291395516992, + "grad_norm": 3.319954432745719, + "learning_rate": 1.0819152787600815e-06, + "logits/chosen": -0.8060805201530457, + "logits/rejected": -0.4811224341392517, + "logps/chosen": -0.12582726776599884, + "logps/rejected": -4.696778297424316, + "loss": 0.0859, + "odds_ratio_loss": 0.0025469374377280474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012582727707922459, + "rewards/margins": 0.45709505677223206, + "rewards/rejected": -0.46967780590057373, + "sft_loss": 0.12582726776599884, + "step": 3176 + }, + { + "epoch": 4.594360086767896, + "grad_norm": 1.4319642017221001, + "learning_rate": 1.07979186983572e-06, + "logits/chosen": -0.7860830426216125, + "logits/rejected": -0.6465473771095276, + "logps/chosen": -0.08816933631896973, + "logps/rejected": -3.463515281677246, + "loss": 0.0742, + "odds_ratio_loss": 0.005800843238830566, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008816934190690517, + "rewards/margins": 0.3375346064567566, + "rewards/rejected": -0.3463515043258667, + "sft_loss": 0.08816933631896973, + "step": 3177 + }, + { + "epoch": 4.595806218365872, + "grad_norm": 1.4844982954091979, + "learning_rate": 1.0776702215157153e-06, + "logits/chosen": -0.8793070912361145, + "logits/rejected": -0.7314832210540771, + "logps/chosen": -0.0764409527182579, + "logps/rejected": -3.413910388946533, + "loss": 0.06, + "odds_ratio_loss": 0.010066686198115349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007644095458090305, + "rewards/margins": 0.333746999502182, + "rewards/rejected": -0.34139108657836914, + "sft_loss": 0.0764409527182579, + "step": 3178 + }, + { + "epoch": 4.5972523499638465, + "grad_norm": 1.536625642676757, + "learning_rate": 1.0755503350792188e-06, + "logits/chosen": -0.7018544673919678, + "logits/rejected": -0.6762786507606506, + "logps/chosen": -0.06070394814014435, + "logps/rejected": -2.954383373260498, + "loss": 0.1113, + "odds_ratio_loss": 0.007944751530885696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00607039500027895, + "rewards/margins": 0.2893679738044739, + "rewards/rejected": -0.29543834924697876, + "sft_loss": 0.06070394814014435, + "step": 3179 + }, + { + "epoch": 4.598698481561822, + "grad_norm": 1.3876750626742878, + "learning_rate": 1.0734322118043158e-06, + "logits/chosen": -0.861190676689148, + "logits/rejected": -0.7343040704727173, + "logps/chosen": -0.03276204317808151, + "logps/rejected": -7.235999584197998, + "loss": 0.0495, + "odds_ratio_loss": 0.001321395393460989, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0032762044575065374, + "rewards/margins": 0.7203238010406494, + "rewards/rejected": -0.7236000299453735, + "sft_loss": 0.03276204317808151, + "step": 3180 + }, + { + "epoch": 4.600144613159798, + "grad_norm": 1.4689343043016123, + "learning_rate": 1.0713158529680336e-06, + "logits/chosen": -0.8702647686004639, + "logits/rejected": -0.7496334314346313, + "logps/chosen": -0.05915086343884468, + "logps/rejected": -3.7460100650787354, + "loss": 0.0443, + "odds_ratio_loss": 0.00663282535970211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005915086250752211, + "rewards/margins": 0.36868590116500854, + "rewards/rejected": -0.37460100650787354, + "sft_loss": 0.05915086343884468, + "step": 3181 + }, + { + "epoch": 4.601590744757773, + "grad_norm": 2.1623315858903696, + "learning_rate": 1.069201259846331e-06, + "logits/chosen": -0.8386332988739014, + "logits/rejected": -0.6750970482826233, + "logps/chosen": -0.08837445080280304, + "logps/rejected": -4.1748151779174805, + "loss": 0.0652, + "odds_ratio_loss": 0.009183772839605808, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008837445639073849, + "rewards/margins": 0.40864408016204834, + "rewards/rejected": -0.41748151183128357, + "sft_loss": 0.08837445080280304, + "step": 3182 + }, + { + "epoch": 4.603036876355748, + "grad_norm": 1.4154943795887955, + "learning_rate": 1.0670884337141028e-06, + "logits/chosen": -0.6935139894485474, + "logits/rejected": -0.42883455753326416, + "logps/chosen": -0.10457340627908707, + "logps/rejected": -5.574097156524658, + "loss": 0.0871, + "odds_ratio_loss": 0.016215285286307335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010457340627908707, + "rewards/margins": 0.5469523668289185, + "rewards/rejected": -0.5574097037315369, + "sft_loss": 0.10457340627908707, + "step": 3183 + }, + { + "epoch": 4.604483007953724, + "grad_norm": 1.7813193278087818, + "learning_rate": 1.0649773758451832e-06, + "logits/chosen": -0.8007773160934448, + "logits/rejected": -0.6058996915817261, + "logps/chosen": -0.07291635125875473, + "logps/rejected": -5.96047306060791, + "loss": 0.0747, + "odds_ratio_loss": 0.005711485166102648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0072916364297270775, + "rewards/margins": 0.58875572681427, + "rewards/rejected": -0.5960473418235779, + "sft_loss": 0.07291635125875473, + "step": 3184 + }, + { + "epoch": 4.6059291395517, + "grad_norm": 1.8367648936309657, + "learning_rate": 1.0628680875123327e-06, + "logits/chosen": -0.5505800843238831, + "logits/rejected": -0.4273523986339569, + "logps/chosen": -0.06284250319004059, + "logps/rejected": -3.4022233486175537, + "loss": 0.0687, + "odds_ratio_loss": 0.006926396396011114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006284250877797604, + "rewards/margins": 0.3339381217956543, + "rewards/rejected": -0.3402223289012909, + "sft_loss": 0.06284250319004059, + "step": 3185 + }, + { + "epoch": 4.6073752711496745, + "grad_norm": 1.7856939273411114, + "learning_rate": 1.0607605699872534e-06, + "logits/chosen": -0.7774771451950073, + "logits/rejected": -0.6703144311904907, + "logps/chosen": -0.18263757228851318, + "logps/rejected": -4.344770908355713, + "loss": 0.099, + "odds_ratio_loss": 0.021513303741812706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01826375722885132, + "rewards/margins": 0.41621333360671997, + "rewards/rejected": -0.4344770908355713, + "sft_loss": 0.18263757228851318, + "step": 3186 + }, + { + "epoch": 4.60882140274765, + "grad_norm": 1.779759710533555, + "learning_rate": 1.0586548245405715e-06, + "logits/chosen": -0.5290587544441223, + "logits/rejected": -0.5527068972587585, + "logps/chosen": -0.20977208018302917, + "logps/rejected": -4.67911434173584, + "loss": 0.1121, + "odds_ratio_loss": 0.024520423263311386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020977208390831947, + "rewards/margins": 0.44693419337272644, + "rewards/rejected": -0.46791142225265503, + "sft_loss": 0.20977208018302917, + "step": 3187 + }, + { + "epoch": 4.610267534345626, + "grad_norm": 1.6500003465840882, + "learning_rate": 1.0565508524418522e-06, + "logits/chosen": -0.826015293598175, + "logits/rejected": -0.6062042713165283, + "logps/chosen": -0.09776744246482849, + "logps/rejected": -3.359614849090576, + "loss": 0.0841, + "odds_ratio_loss": 0.008568299002945423, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009776744991540909, + "rewards/margins": 0.3261847198009491, + "rewards/rejected": -0.3359614908695221, + "sft_loss": 0.09776744246482849, + "step": 3188 + }, + { + "epoch": 4.611713665943601, + "grad_norm": 1.4752869233630663, + "learning_rate": 1.0544486549595868e-06, + "logits/chosen": -0.8045449256896973, + "logits/rejected": -0.71222323179245, + "logps/chosen": -0.04860668256878853, + "logps/rejected": -3.213080883026123, + "loss": 0.0518, + "odds_ratio_loss": 0.007026593200862408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004860668908804655, + "rewards/margins": 0.3164474368095398, + "rewards/rejected": -0.32130810618400574, + "sft_loss": 0.04860668256878853, + "step": 3189 + }, + { + "epoch": 4.613159797541576, + "grad_norm": 1.349596427506426, + "learning_rate": 1.0523482333611973e-06, + "logits/chosen": -0.9437330961227417, + "logits/rejected": -0.670153021812439, + "logps/chosen": -0.027105463668704033, + "logps/rejected": -5.5081562995910645, + "loss": 0.0588, + "odds_ratio_loss": 0.003837391035631299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002710546599701047, + "rewards/margins": 0.5481050610542297, + "rewards/rejected": -0.5508156418800354, + "sft_loss": 0.027105463668704033, + "step": 3190 + }, + { + "epoch": 4.614605929139552, + "grad_norm": 1.6136975666084887, + "learning_rate": 1.0502495889130348e-06, + "logits/chosen": -0.6746639609336853, + "logits/rejected": -0.5449431538581848, + "logps/chosen": -0.04978754371404648, + "logps/rejected": -4.931973934173584, + "loss": 0.0676, + "odds_ratio_loss": 0.01146079320460558, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004978754557669163, + "rewards/margins": 0.48821866512298584, + "rewards/rejected": -0.49319738149642944, + "sft_loss": 0.04978754371404648, + "step": 3191 + }, + { + "epoch": 4.616052060737527, + "grad_norm": 1.8053309321240607, + "learning_rate": 1.0481527228803825e-06, + "logits/chosen": -0.7316631078720093, + "logits/rejected": -0.48841989040374756, + "logps/chosen": -0.05751391872763634, + "logps/rejected": -7.884844779968262, + "loss": 0.0968, + "odds_ratio_loss": 0.0013119642389938235, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005751391872763634, + "rewards/margins": 0.7827330827713013, + "rewards/rejected": -0.7884844541549683, + "sft_loss": 0.05751391872763634, + "step": 3192 + }, + { + "epoch": 4.6174981923355025, + "grad_norm": 1.6906203577332746, + "learning_rate": 1.0460576365274464e-06, + "logits/chosen": -0.5634068846702576, + "logits/rejected": -0.4004989564418793, + "logps/chosen": -0.09039582312107086, + "logps/rejected": -6.0387139320373535, + "loss": 0.0674, + "odds_ratio_loss": 0.010964239947497845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00903958361595869, + "rewards/margins": 0.5948318243026733, + "rewards/rejected": -0.6038714051246643, + "sft_loss": 0.09039582312107086, + "step": 3193 + }, + { + "epoch": 4.618944323933478, + "grad_norm": 1.5504675757012243, + "learning_rate": 1.043964331117364e-06, + "logits/chosen": -0.6690958142280579, + "logits/rejected": -0.7253610491752625, + "logps/chosen": -0.04411579668521881, + "logps/rejected": -3.1955485343933105, + "loss": 0.0484, + "odds_ratio_loss": 0.011617464944720268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004411579575389624, + "rewards/margins": 0.31514328718185425, + "rewards/rejected": -0.3195548951625824, + "sft_loss": 0.04411579668521881, + "step": 3194 + }, + { + "epoch": 4.620390455531453, + "grad_norm": 1.9494225748283929, + "learning_rate": 1.0418728079121946e-06, + "logits/chosen": -0.7559603452682495, + "logits/rejected": -0.6388688087463379, + "logps/chosen": -0.08622848987579346, + "logps/rejected": -4.276580810546875, + "loss": 0.0798, + "odds_ratio_loss": 0.00728376442566514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008622849360108376, + "rewards/margins": 0.4190352261066437, + "rewards/rejected": -0.4276581108570099, + "sft_loss": 0.08622848987579346, + "step": 3195 + }, + { + "epoch": 4.621836587129429, + "grad_norm": 1.5533084791527914, + "learning_rate": 1.039783068172928e-06, + "logits/chosen": -0.704337477684021, + "logits/rejected": -0.5278608202934265, + "logps/chosen": -0.11907921731472015, + "logps/rejected": -3.9224257469177246, + "loss": 0.1027, + "odds_ratio_loss": 0.014725545421242714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011907922104001045, + "rewards/margins": 0.38033467531204224, + "rewards/rejected": -0.3922426104545593, + "sft_loss": 0.11907921731472015, + "step": 3196 + }, + { + "epoch": 4.623282718727404, + "grad_norm": 1.5224963198530153, + "learning_rate": 1.0376951131594745e-06, + "logits/chosen": -1.0034908056259155, + "logits/rejected": -0.6819223165512085, + "logps/chosen": -0.044741638004779816, + "logps/rejected": -5.437371730804443, + "loss": 0.0572, + "odds_ratio_loss": 0.0024968015495687723, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0044741639867424965, + "rewards/margins": 0.5392630100250244, + "rewards/rejected": -0.5437371730804443, + "sft_loss": 0.044741638004779816, + "step": 3197 + }, + { + "epoch": 4.624728850325379, + "grad_norm": 2.572168604966729, + "learning_rate": 1.0356089441306685e-06, + "logits/chosen": -1.0177311897277832, + "logits/rejected": -0.6533507108688354, + "logps/chosen": -0.08731850981712341, + "logps/rejected": -4.7309112548828125, + "loss": 0.0765, + "odds_ratio_loss": 0.01737801730632782, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008731850422918797, + "rewards/margins": 0.4643592834472656, + "rewards/rejected": -0.47309115529060364, + "sft_loss": 0.08731850981712341, + "step": 3198 + }, + { + "epoch": 4.626174981923355, + "grad_norm": 1.7934135984111654, + "learning_rate": 1.0335245623442724e-06, + "logits/chosen": -0.8442277908325195, + "logits/rejected": -0.6539605259895325, + "logps/chosen": -0.10159482061862946, + "logps/rejected": -4.887598991394043, + "loss": 0.1044, + "odds_ratio_loss": 0.010155798867344856, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01015948224812746, + "rewards/margins": 0.4786003828048706, + "rewards/rejected": -0.4887598752975464, + "sft_loss": 0.10159482061862946, + "step": 3199 + }, + { + "epoch": 4.6276211135213305, + "grad_norm": 1.4702477282441089, + "learning_rate": 1.0314419690569645e-06, + "logits/chosen": -0.7017852067947388, + "logits/rejected": -0.6079646348953247, + "logps/chosen": -0.05808115005493164, + "logps/rejected": -4.254312992095947, + "loss": 0.0666, + "odds_ratio_loss": 0.00289735640399158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005808115005493164, + "rewards/margins": 0.4196231961250305, + "rewards/rejected": -0.4254313111305237, + "sft_loss": 0.05808115005493164, + "step": 3200 + }, + { + "epoch": 4.629067245119306, + "grad_norm": 1.8173043025233961, + "learning_rate": 1.0293611655243508e-06, + "logits/chosen": -0.8450669646263123, + "logits/rejected": -0.6467236280441284, + "logps/chosen": -0.17060142755508423, + "logps/rejected": -6.41766357421875, + "loss": 0.1134, + "odds_ratio_loss": 0.03092418983578682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017060142010450363, + "rewards/margins": 0.6247062087059021, + "rewards/rejected": -0.641766369342804, + "sft_loss": 0.17060142755508423, + "step": 3201 + }, + { + "epoch": 4.630513376717281, + "grad_norm": 1.7247881706158277, + "learning_rate": 1.0272821530009528e-06, + "logits/chosen": -0.7554938793182373, + "logits/rejected": -0.631575882434845, + "logps/chosen": -0.051126834005117416, + "logps/rejected": -5.378612518310547, + "loss": 0.0964, + "odds_ratio_loss": 0.004960625432431698, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005112683400511742, + "rewards/margins": 0.5327485799789429, + "rewards/rejected": -0.5378612279891968, + "sft_loss": 0.051126834005117416, + "step": 3202 + }, + { + "epoch": 4.631959508315257, + "grad_norm": 1.698964824312715, + "learning_rate": 1.025204932740218e-06, + "logits/chosen": -0.8689213991165161, + "logits/rejected": -0.4361826181411743, + "logps/chosen": -0.0891483798623085, + "logps/rejected": -5.1415205001831055, + "loss": 0.0987, + "odds_ratio_loss": 0.04759196564555168, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.008914838545024395, + "rewards/margins": 0.5052372217178345, + "rewards/rejected": -0.5141521096229553, + "sft_loss": 0.0891483798623085, + "step": 3203 + }, + { + "epoch": 4.633405639913232, + "grad_norm": 1.543659408045758, + "learning_rate": 1.0231295059945084e-06, + "logits/chosen": -0.7963493466377258, + "logits/rejected": -0.6935844421386719, + "logps/chosen": -0.054386600852012634, + "logps/rejected": -3.769763708114624, + "loss": 0.0832, + "odds_ratio_loss": 0.0008780553471297026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005438660271465778, + "rewards/margins": 0.3715377449989319, + "rewards/rejected": -0.3769764006137848, + "sft_loss": 0.054386600852012634, + "step": 3204 + }, + { + "epoch": 4.634851771511207, + "grad_norm": 1.5964525768359796, + "learning_rate": 1.0210558740151065e-06, + "logits/chosen": -0.7700487375259399, + "logits/rejected": -0.5274595618247986, + "logps/chosen": -0.08021484315395355, + "logps/rejected": -5.129361152648926, + "loss": 0.0822, + "odds_ratio_loss": 0.013991568237543106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00802148412913084, + "rewards/margins": 0.5049146413803101, + "rewards/rejected": -0.5129361152648926, + "sft_loss": 0.08021484315395355, + "step": 3205 + }, + { + "epoch": 4.636297903109183, + "grad_norm": 1.7217177993051314, + "learning_rate": 1.0189840380522118e-06, + "logits/chosen": -0.7825113534927368, + "logits/rejected": -0.5335025787353516, + "logps/chosen": -0.048637744039297104, + "logps/rejected": -4.941579341888428, + "loss": 0.0923, + "odds_ratio_loss": 0.006592496298253536, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00486377440392971, + "rewards/margins": 0.4892942011356354, + "rewards/rejected": -0.49415794014930725, + "sft_loss": 0.048637744039297104, + "step": 3206 + }, + { + "epoch": 4.637744034707159, + "grad_norm": 1.6148713420823113, + "learning_rate": 1.0169139993549443e-06, + "logits/chosen": -0.6278676390647888, + "logits/rejected": -0.5886056423187256, + "logps/chosen": -0.09759227186441422, + "logps/rejected": -6.357240676879883, + "loss": 0.076, + "odds_ratio_loss": 0.010194497182965279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009759227745234966, + "rewards/margins": 0.6259648203849792, + "rewards/rejected": -0.6357241272926331, + "sft_loss": 0.09759227186441422, + "step": 3207 + }, + { + "epoch": 4.639190166305134, + "grad_norm": 1.6617822163455997, + "learning_rate": 1.0148457591713358e-06, + "logits/chosen": -0.7911767363548279, + "logits/rejected": -0.7584560513496399, + "logps/chosen": -0.12109269201755524, + "logps/rejected": -3.515918254852295, + "loss": 0.12, + "odds_ratio_loss": 0.022366242483258247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012109269388020039, + "rewards/margins": 0.33948254585266113, + "rewards/rejected": -0.3515918254852295, + "sft_loss": 0.12109269201755524, + "step": 3208 + }, + { + "epoch": 4.640636297903109, + "grad_norm": 1.5696273536778012, + "learning_rate": 1.0127793187483367e-06, + "logits/chosen": -0.7384767532348633, + "logits/rejected": -0.5731381177902222, + "logps/chosen": -0.07455767691135406, + "logps/rejected": -5.048724174499512, + "loss": 0.0901, + "odds_ratio_loss": 0.0019971770234405994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007455767132341862, + "rewards/margins": 0.497416615486145, + "rewards/rejected": -0.5048723816871643, + "sft_loss": 0.07455767691135406, + "step": 3209 + }, + { + "epoch": 4.642082429501085, + "grad_norm": 1.3018747419669137, + "learning_rate": 1.010714679331813e-06, + "logits/chosen": -0.6185959577560425, + "logits/rejected": -0.5798184871673584, + "logps/chosen": -0.08040968328714371, + "logps/rejected": -4.064995288848877, + "loss": 0.0642, + "odds_ratio_loss": 0.05532249063253403, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.00804096832871437, + "rewards/margins": 0.3984585702419281, + "rewards/rejected": -0.4064995348453522, + "sft_loss": 0.08040968328714371, + "step": 3210 + }, + { + "epoch": 4.64352856109906, + "grad_norm": 1.5401199624080288, + "learning_rate": 1.0086518421665417e-06, + "logits/chosen": -0.8253481984138489, + "logits/rejected": -0.4594198763370514, + "logps/chosen": -0.06197258457541466, + "logps/rejected": -4.424698352813721, + "loss": 0.0879, + "odds_ratio_loss": 0.05480283871293068, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.006197258364409208, + "rewards/margins": 0.4362725615501404, + "rewards/rejected": -0.44246983528137207, + "sft_loss": 0.06197258457541466, + "step": 3211 + }, + { + "epoch": 4.644974692697035, + "grad_norm": 1.7590357910590537, + "learning_rate": 1.0065908084962166e-06, + "logits/chosen": -0.6786633133888245, + "logits/rejected": -0.5764570236206055, + "logps/chosen": -0.11568009108304977, + "logps/rejected": -3.1933226585388184, + "loss": 0.0924, + "odds_ratio_loss": 0.018704205751419067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011568009853363037, + "rewards/margins": 0.30776429176330566, + "rewards/rejected": -0.3193323016166687, + "sft_loss": 0.11568009108304977, + "step": 3212 + }, + { + "epoch": 4.646420824295011, + "grad_norm": 1.4765110012013127, + "learning_rate": 1.0045315795634416e-06, + "logits/chosen": -0.8340771794319153, + "logits/rejected": -0.63593989610672, + "logps/chosen": -0.06669916212558746, + "logps/rejected": -3.186521530151367, + "loss": 0.0646, + "odds_ratio_loss": 0.008953748270869255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006669916212558746, + "rewards/margins": 0.3119822144508362, + "rewards/rejected": -0.3186521530151367, + "sft_loss": 0.06669916212558746, + "step": 3213 + }, + { + "epoch": 4.647866955892987, + "grad_norm": 1.4826650637112055, + "learning_rate": 1.002474156609734e-06, + "logits/chosen": -0.7655255794525146, + "logits/rejected": -0.5246065855026245, + "logps/chosen": -0.029604678973555565, + "logps/rejected": -7.7826080322265625, + "loss": 0.085, + "odds_ratio_loss": 0.0013552091550081968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0029604679439216852, + "rewards/margins": 0.7753003835678101, + "rewards/rejected": -0.7782608270645142, + "sft_loss": 0.029604678973555565, + "step": 3214 + }, + { + "epoch": 4.649313087490961, + "grad_norm": 1.5306493087569484, + "learning_rate": 1.0004185408755196e-06, + "logits/chosen": -0.9814145565032959, + "logits/rejected": -0.6695619821548462, + "logps/chosen": -0.0681711882352829, + "logps/rejected": -5.490948677062988, + "loss": 0.0671, + "odds_ratio_loss": 0.008766880258917809, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006817118730396032, + "rewards/margins": 0.5422778129577637, + "rewards/rejected": -0.5490949153900146, + "sft_loss": 0.0681711882352829, + "step": 3215 + }, + { + "epoch": 4.650759219088937, + "grad_norm": 1.4412286059471762, + "learning_rate": 9.983647336001401e-07, + "logits/chosen": -0.6026754379272461, + "logits/rejected": -0.6260681748390198, + "logps/chosen": -0.09057678282260895, + "logps/rejected": -3.460932731628418, + "loss": 0.0653, + "odds_ratio_loss": 0.015110738575458527, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009057678282260895, + "rewards/margins": 0.3370356261730194, + "rewards/rejected": -0.3460932672023773, + "sft_loss": 0.09057678282260895, + "step": 3216 + }, + { + "epoch": 4.652205350686913, + "grad_norm": 1.334960822259661, + "learning_rate": 9.963127360218409e-07, + "logits/chosen": -0.8631464242935181, + "logits/rejected": -0.5723793506622314, + "logps/chosen": -0.05538426339626312, + "logps/rejected": -5.14091157913208, + "loss": 0.0499, + "odds_ratio_loss": 0.004457543138414621, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005538426339626312, + "rewards/margins": 0.5085527300834656, + "rewards/rejected": -0.5140911340713501, + "sft_loss": 0.05538426339626312, + "step": 3217 + }, + { + "epoch": 4.653651482284888, + "grad_norm": 1.655925383862883, + "learning_rate": 9.94262549377781e-07, + "logits/chosen": -0.6246498227119446, + "logits/rejected": -0.5415275692939758, + "logps/chosen": -0.06165987253189087, + "logps/rejected": -3.9687774181365967, + "loss": 0.0864, + "odds_ratio_loss": 0.015562493354082108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006165987346321344, + "rewards/margins": 0.3907117545604706, + "rewards/rejected": -0.3968777060508728, + "sft_loss": 0.06165987253189087, + "step": 3218 + }, + { + "epoch": 4.655097613882863, + "grad_norm": 1.551288429706046, + "learning_rate": 9.922141749040238e-07, + "logits/chosen": -0.8679819703102112, + "logits/rejected": -0.5541244745254517, + "logps/chosen": -0.06448002904653549, + "logps/rejected": -4.471212387084961, + "loss": 0.0783, + "odds_ratio_loss": 0.004951273091137409, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006448003463447094, + "rewards/margins": 0.44067320227622986, + "rewards/rejected": -0.44712120294570923, + "sft_loss": 0.06448002904653549, + "step": 3219 + }, + { + "epoch": 4.656543745480839, + "grad_norm": 1.5372933816352716, + "learning_rate": 9.901676138355438e-07, + "logits/chosen": -0.6660540699958801, + "logits/rejected": -0.5739330649375916, + "logps/chosen": -0.04592113569378853, + "logps/rejected": -4.954591274261475, + "loss": 0.0778, + "odds_ratio_loss": 0.004109309054911137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004592114128172398, + "rewards/margins": 0.4908669590950012, + "rewards/rejected": -0.49545910954475403, + "sft_loss": 0.04592113569378853, + "step": 3220 + }, + { + "epoch": 4.657989877078814, + "grad_norm": 1.5844634244020717, + "learning_rate": 9.88122867406219e-07, + "logits/chosen": -0.7254536151885986, + "logits/rejected": -0.5857067704200745, + "logps/chosen": -0.1282578706741333, + "logps/rejected": -5.345030307769775, + "loss": 0.0979, + "odds_ratio_loss": 0.006788488943129778, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01282578706741333, + "rewards/margins": 0.5216772556304932, + "rewards/rejected": -0.5345030426979065, + "sft_loss": 0.1282578706741333, + "step": 3221 + }, + { + "epoch": 4.659436008676789, + "grad_norm": 1.5352851093014344, + "learning_rate": 9.860799368488338e-07, + "logits/chosen": -0.8546849489212036, + "logits/rejected": -0.49446427822113037, + "logps/chosen": -0.058683719485998154, + "logps/rejected": -6.933520793914795, + "loss": 0.0789, + "odds_ratio_loss": 0.003612469183281064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005868372041732073, + "rewards/margins": 0.6874837279319763, + "rewards/rejected": -0.6933520436286926, + "sft_loss": 0.058683719485998154, + "step": 3222 + }, + { + "epoch": 4.660882140274765, + "grad_norm": 1.9348714464409837, + "learning_rate": 9.840388233950809e-07, + "logits/chosen": -0.878996729850769, + "logits/rejected": -0.6803721785545349, + "logps/chosen": -0.08524107933044434, + "logps/rejected": -3.973269462585449, + "loss": 0.1024, + "odds_ratio_loss": 0.00521640432998538, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008524107746779919, + "rewards/margins": 0.38880282640457153, + "rewards/rejected": -0.3973269760608673, + "sft_loss": 0.08524107933044434, + "step": 3223 + }, + { + "epoch": 4.662328271872741, + "grad_norm": 2.0220508369894223, + "learning_rate": 9.819995282755526e-07, + "logits/chosen": -0.7756155133247375, + "logits/rejected": -0.6541730761528015, + "logps/chosen": -0.14658205211162567, + "logps/rejected": -4.749970436096191, + "loss": 0.1, + "odds_ratio_loss": 0.013168347999453545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014658206142485142, + "rewards/margins": 0.460338830947876, + "rewards/rejected": -0.47499704360961914, + "sft_loss": 0.14658205211162567, + "step": 3224 + }, + { + "epoch": 4.663774403470716, + "grad_norm": 1.6531658948928443, + "learning_rate": 9.799620527197503e-07, + "logits/chosen": -0.7469644546508789, + "logits/rejected": -0.509243369102478, + "logps/chosen": -0.1062968522310257, + "logps/rejected": -4.866016864776611, + "loss": 0.0707, + "odds_ratio_loss": 0.007952062413096428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0106296855956316, + "rewards/margins": 0.4759719967842102, + "rewards/rejected": -0.48660171031951904, + "sft_loss": 0.1062968522310257, + "step": 3225 + }, + { + "epoch": 4.665220535068691, + "grad_norm": 1.5363536198298622, + "learning_rate": 9.779263979560735e-07, + "logits/chosen": -0.8867968916893005, + "logits/rejected": -0.7032777070999146, + "logps/chosen": -0.07279931008815765, + "logps/rejected": -3.2184600830078125, + "loss": 0.0761, + "odds_ratio_loss": 0.006781273987144232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0072799306362867355, + "rewards/margins": 0.31456607580184937, + "rewards/rejected": -0.32184600830078125, + "sft_loss": 0.07279931008815765, + "step": 3226 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 1.585570187302912, + "learning_rate": 9.758925652118276e-07, + "logits/chosen": -0.729887068271637, + "logits/rejected": -0.5777206420898438, + "logps/chosen": -0.05206802114844322, + "logps/rejected": -4.347285747528076, + "loss": 0.093, + "odds_ratio_loss": 0.003964191768318415, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005206801928579807, + "rewards/margins": 0.42952173948287964, + "rewards/rejected": -0.43472856283187866, + "sft_loss": 0.05206802114844322, + "step": 3227 + }, + { + "epoch": 4.668112798264642, + "grad_norm": 1.1830202726874117, + "learning_rate": 9.738605557132168e-07, + "logits/chosen": -0.7481473088264465, + "logits/rejected": -0.5761478543281555, + "logps/chosen": -0.03131213039159775, + "logps/rejected": -5.612335205078125, + "loss": 0.0631, + "odds_ratio_loss": 0.003878758754581213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0031312128994613886, + "rewards/margins": 0.5581023097038269, + "rewards/rejected": -0.5612335801124573, + "sft_loss": 0.03131213039159775, + "step": 3228 + }, + { + "epoch": 4.6695589298626174, + "grad_norm": 1.4602396396613635, + "learning_rate": 9.71830370685348e-07, + "logits/chosen": -0.80008864402771, + "logits/rejected": -0.8370683789253235, + "logps/chosen": -0.08197289705276489, + "logps/rejected": -4.320422172546387, + "loss": 0.0699, + "odds_ratio_loss": 0.010543855838477612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008197290822863579, + "rewards/margins": 0.4238448441028595, + "rewards/rejected": -0.4320421516895294, + "sft_loss": 0.08197289705276489, + "step": 3229 + }, + { + "epoch": 4.671005061460593, + "grad_norm": 1.4630280771677762, + "learning_rate": 9.698020113522253e-07, + "logits/chosen": -0.6244576573371887, + "logits/rejected": -0.5266187787055969, + "logps/chosen": -0.04099838435649872, + "logps/rejected": -6.273877143859863, + "loss": 0.0721, + "odds_ratio_loss": 0.0028168242424726486, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004099838435649872, + "rewards/margins": 0.6232879161834717, + "rewards/rejected": -0.6273877620697021, + "sft_loss": 0.04099838435649872, + "step": 3230 + }, + { + "epoch": 4.672451193058569, + "grad_norm": 1.6816336361544946, + "learning_rate": 9.677754789367569e-07, + "logits/chosen": -1.0483098030090332, + "logits/rejected": -0.6950076818466187, + "logps/chosen": -0.10204799473285675, + "logps/rejected": -5.153908729553223, + "loss": 0.1092, + "odds_ratio_loss": 0.00561350304633379, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0102047985419631, + "rewards/margins": 0.5051860809326172, + "rewards/rejected": -0.5153908729553223, + "sft_loss": 0.10204799473285675, + "step": 3231 + }, + { + "epoch": 4.673897324656544, + "grad_norm": 1.6650569138582232, + "learning_rate": 9.657507746607442e-07, + "logits/chosen": -0.8510163426399231, + "logits/rejected": -0.6006070971488953, + "logps/chosen": -0.061348456889390945, + "logps/rejected": -4.452303886413574, + "loss": 0.0874, + "odds_ratio_loss": 0.008031385019421577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006134845782071352, + "rewards/margins": 0.43909552693367004, + "rewards/rejected": -0.4452304244041443, + "sft_loss": 0.061348456889390945, + "step": 3232 + }, + { + "epoch": 4.675343456254519, + "grad_norm": 1.4129358383161126, + "learning_rate": 9.637278997448919e-07, + "logits/chosen": -1.053891658782959, + "logits/rejected": -0.7565010190010071, + "logps/chosen": -0.03904981166124344, + "logps/rejected": -4.420421600341797, + "loss": 0.0511, + "odds_ratio_loss": 0.0059059979394078255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0039049815386533737, + "rewards/margins": 0.4381372034549713, + "rewards/rejected": -0.44204211235046387, + "sft_loss": 0.03904981166124344, + "step": 3233 + }, + { + "epoch": 4.676789587852495, + "grad_norm": 1.6502092599765257, + "learning_rate": 9.617068554087953e-07, + "logits/chosen": -0.7731687426567078, + "logits/rejected": -0.6064621806144714, + "logps/chosen": -0.07107666879892349, + "logps/rejected": -4.7771806716918945, + "loss": 0.0717, + "odds_ratio_loss": 0.004392960108816624, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007107666693627834, + "rewards/margins": 0.4706103801727295, + "rewards/rejected": -0.4777180850505829, + "sft_loss": 0.07107666879892349, + "step": 3234 + }, + { + "epoch": 4.67823571945047, + "grad_norm": 1.828353358209516, + "learning_rate": 9.596876428709531e-07, + "logits/chosen": -0.7077252864837646, + "logits/rejected": -0.49990472197532654, + "logps/chosen": -0.09420859813690186, + "logps/rejected": -4.479310035705566, + "loss": 0.0771, + "odds_ratio_loss": 0.012622407637536526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00942085962742567, + "rewards/margins": 0.4385101795196533, + "rewards/rejected": -0.4479310214519501, + "sft_loss": 0.09420859813690186, + "step": 3235 + }, + { + "epoch": 4.6796818510484455, + "grad_norm": 1.7821117898301133, + "learning_rate": 9.576702633487537e-07, + "logits/chosen": -0.8034138679504395, + "logits/rejected": -0.6696498990058899, + "logps/chosen": -0.14594009518623352, + "logps/rejected": -3.874324321746826, + "loss": 0.0969, + "odds_ratio_loss": 0.024632243439555168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014594011008739471, + "rewards/margins": 0.37283843755722046, + "rewards/rejected": -0.38743242621421814, + "sft_loss": 0.14594009518623352, + "step": 3236 + }, + { + "epoch": 4.681127982646421, + "grad_norm": 1.9260742548491816, + "learning_rate": 9.556547180584828e-07, + "logits/chosen": -0.6212836503982544, + "logits/rejected": -0.5890661478042603, + "logps/chosen": -0.15807095170021057, + "logps/rejected": -3.948227882385254, + "loss": 0.126, + "odds_ratio_loss": 0.0165469441562891, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01580709218978882, + "rewards/margins": 0.3790157437324524, + "rewards/rejected": -0.3948228359222412, + "sft_loss": 0.15807095170021057, + "step": 3237 + }, + { + "epoch": 4.682574114244396, + "grad_norm": 1.7002914000121425, + "learning_rate": 9.536410082153215e-07, + "logits/chosen": -0.7422010898590088, + "logits/rejected": -0.556551992893219, + "logps/chosen": -0.19723354279994965, + "logps/rejected": -5.211292743682861, + "loss": 0.106, + "odds_ratio_loss": 0.009756670333445072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019723355770111084, + "rewards/margins": 0.5014059543609619, + "rewards/rejected": -0.521129310131073, + "sft_loss": 0.19723354279994965, + "step": 3238 + }, + { + "epoch": 4.684020245842372, + "grad_norm": 1.3402956800471524, + "learning_rate": 9.516291350333414e-07, + "logits/chosen": -0.8872632384300232, + "logits/rejected": -0.7011750936508179, + "logps/chosen": -0.04245447367429733, + "logps/rejected": -4.0774006843566895, + "loss": 0.053, + "odds_ratio_loss": 0.0019003519555553794, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004245447926223278, + "rewards/margins": 0.40349453687667847, + "rewards/rejected": -0.4077400267124176, + "sft_loss": 0.04245447367429733, + "step": 3239 + }, + { + "epoch": 4.685466377440347, + "grad_norm": 1.6582515741923567, + "learning_rate": 9.496190997255098e-07, + "logits/chosen": -0.6277506947517395, + "logits/rejected": -0.443684846162796, + "logps/chosen": -0.07038243860006332, + "logps/rejected": -4.194167137145996, + "loss": 0.0861, + "odds_ratio_loss": 0.014950907789170742, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00703824358060956, + "rewards/margins": 0.4123784899711609, + "rewards/rejected": -0.41941672563552856, + "sft_loss": 0.07038243860006332, + "step": 3240 + }, + { + "epoch": 4.686912509038322, + "grad_norm": 1.5661539655850352, + "learning_rate": 9.476109035036831e-07, + "logits/chosen": -0.833951473236084, + "logits/rejected": -0.7562543153762817, + "logps/chosen": -0.10855622589588165, + "logps/rejected": -3.645920753479004, + "loss": 0.076, + "odds_ratio_loss": 0.010784967802464962, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010855622589588165, + "rewards/margins": 0.35373643040657043, + "rewards/rejected": -0.3645920753479004, + "sft_loss": 0.10855622589588165, + "step": 3241 + }, + { + "epoch": 4.688358640636298, + "grad_norm": 1.4776366552591538, + "learning_rate": 9.456045475786121e-07, + "logits/chosen": -0.7049121260643005, + "logits/rejected": -0.518916130065918, + "logps/chosen": -0.04573969542980194, + "logps/rejected": -5.782866954803467, + "loss": 0.0754, + "odds_ratio_loss": 0.006768940016627312, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004573969170451164, + "rewards/margins": 0.5737127065658569, + "rewards/rejected": -0.5782866477966309, + "sft_loss": 0.04573969542980194, + "step": 3242 + }, + { + "epoch": 4.6898047722342735, + "grad_norm": 1.615380623509659, + "learning_rate": 9.436000331599347e-07, + "logits/chosen": -0.865658164024353, + "logits/rejected": -0.7146108150482178, + "logps/chosen": -0.038462672382593155, + "logps/rejected": -4.004945755004883, + "loss": 0.0563, + "odds_ratio_loss": 0.007846795953810215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003846267005428672, + "rewards/margins": 0.3966482877731323, + "rewards/rejected": -0.40049460530281067, + "sft_loss": 0.038462672382593155, + "step": 3243 + }, + { + "epoch": 4.691250903832248, + "grad_norm": 1.7145212188015946, + "learning_rate": 9.415973614561812e-07, + "logits/chosen": -0.6654316782951355, + "logits/rejected": -0.5937279462814331, + "logps/chosen": -0.07255206257104874, + "logps/rejected": -5.105835914611816, + "loss": 0.0939, + "odds_ratio_loss": 0.010402324609458447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007255205884575844, + "rewards/margins": 0.5033283233642578, + "rewards/rejected": -0.5105835199356079, + "sft_loss": 0.07255206257104874, + "step": 3244 + }, + { + "epoch": 4.692697035430224, + "grad_norm": 1.5620231531870004, + "learning_rate": 9.395965336747677e-07, + "logits/chosen": -0.7461257576942444, + "logits/rejected": -0.6894717216491699, + "logps/chosen": -0.06823474168777466, + "logps/rejected": -5.050882339477539, + "loss": 0.0625, + "odds_ratio_loss": 0.007007937878370285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006823473609983921, + "rewards/margins": 0.49826475977897644, + "rewards/rejected": -0.505088210105896, + "sft_loss": 0.06823474168777466, + "step": 3245 + }, + { + "epoch": 4.6941431670282, + "grad_norm": 1.5250744458372743, + "learning_rate": 9.375975510220033e-07, + "logits/chosen": -0.6381784677505493, + "logits/rejected": -0.47818252444267273, + "logps/chosen": -0.06893990188837051, + "logps/rejected": -4.53413200378418, + "loss": 0.0681, + "odds_ratio_loss": 0.006821871735155582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006893990095704794, + "rewards/margins": 0.446519136428833, + "rewards/rejected": -0.4534131586551666, + "sft_loss": 0.06893990188837051, + "step": 3246 + }, + { + "epoch": 4.695589298626175, + "grad_norm": 1.3255197344195617, + "learning_rate": 9.356004147030798e-07, + "logits/chosen": -0.7541120052337646, + "logits/rejected": -0.6116973161697388, + "logps/chosen": -0.10232388228178024, + "logps/rejected": -4.735118865966797, + "loss": 0.0695, + "odds_ratio_loss": 0.005717164371162653, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01023238804191351, + "rewards/margins": 0.4632795453071594, + "rewards/rejected": -0.4735119044780731, + "sft_loss": 0.10232388228178024, + "step": 3247 + }, + { + "epoch": 4.69703543022415, + "grad_norm": 1.6754504716143828, + "learning_rate": 9.336051259220807e-07, + "logits/chosen": -0.813551127910614, + "logits/rejected": -0.5718523859977722, + "logps/chosen": -0.08019101619720459, + "logps/rejected": -4.289122104644775, + "loss": 0.0796, + "odds_ratio_loss": 0.008219133131206036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008019101805984974, + "rewards/margins": 0.4208931028842926, + "rewards/rejected": -0.4289122223854065, + "sft_loss": 0.08019101619720459, + "step": 3248 + }, + { + "epoch": 4.698481561822126, + "grad_norm": 1.4682579826607338, + "learning_rate": 9.316116858819704e-07, + "logits/chosen": -0.9900127053260803, + "logits/rejected": -0.6790282726287842, + "logps/chosen": -0.04047767072916031, + "logps/rejected": -4.796731948852539, + "loss": 0.0736, + "odds_ratio_loss": 0.002619100734591484, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0040477667935192585, + "rewards/margins": 0.4756254553794861, + "rewards/rejected": -0.47967323660850525, + "sft_loss": 0.04047767072916031, + "step": 3249 + }, + { + "epoch": 4.6999276934201015, + "grad_norm": 1.7451548851827552, + "learning_rate": 9.296200957846028e-07, + "logits/chosen": -0.8112927079200745, + "logits/rejected": -0.715489387512207, + "logps/chosen": -0.07249405980110168, + "logps/rejected": -4.894453525543213, + "loss": 0.0778, + "odds_ratio_loss": 0.00706165935844183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007249406538903713, + "rewards/margins": 0.48219597339630127, + "rewards/rejected": -0.48944535851478577, + "sft_loss": 0.07249405980110168, + "step": 3250 + }, + { + "epoch": 4.701373825018076, + "grad_norm": 1.649219451393204, + "learning_rate": 9.276303568307167e-07, + "logits/chosen": -0.9887948036193848, + "logits/rejected": -0.7613665461540222, + "logps/chosen": -0.0671960636973381, + "logps/rejected": -3.415968179702759, + "loss": 0.098, + "odds_ratio_loss": 0.003112174803391099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006719606928527355, + "rewards/margins": 0.33487722277641296, + "rewards/rejected": -0.3415968418121338, + "sft_loss": 0.0671960636973381, + "step": 3251 + }, + { + "epoch": 4.702819956616052, + "grad_norm": 2.515817384014982, + "learning_rate": 9.25642470219929e-07, + "logits/chosen": -0.8247559070587158, + "logits/rejected": -0.6773849129676819, + "logps/chosen": -0.07466698437929153, + "logps/rejected": -4.479544162750244, + "loss": 0.0735, + "odds_ratio_loss": 0.0051119765266776085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007466698996722698, + "rewards/margins": 0.44048771262168884, + "rewards/rejected": -0.4479544162750244, + "sft_loss": 0.07466698437929153, + "step": 3252 + }, + { + "epoch": 4.704266088214028, + "grad_norm": 1.491972360970979, + "learning_rate": 9.236564371507474e-07, + "logits/chosen": -0.7877534031867981, + "logits/rejected": -0.6748812794685364, + "logps/chosen": -0.13032501935958862, + "logps/rejected": -4.983989715576172, + "loss": 0.0713, + "odds_ratio_loss": 0.01147688739001751, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013032502494752407, + "rewards/margins": 0.48536643385887146, + "rewards/rejected": -0.49839890003204346, + "sft_loss": 0.13032501935958862, + "step": 3253 + }, + { + "epoch": 4.705712219812003, + "grad_norm": 1.624021210716625, + "learning_rate": 9.216722588205561e-07, + "logits/chosen": -0.8625186085700989, + "logits/rejected": -0.8469754457473755, + "logps/chosen": -0.08548222482204437, + "logps/rejected": -4.792725086212158, + "loss": 0.0718, + "odds_ratio_loss": 0.008853597566485405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008548222482204437, + "rewards/margins": 0.47072431445121765, + "rewards/rejected": -0.4792725443840027, + "sft_loss": 0.08548222482204437, + "step": 3254 + }, + { + "epoch": 4.707158351409978, + "grad_norm": 1.7508042191690136, + "learning_rate": 9.196899364256259e-07, + "logits/chosen": -0.6333081722259521, + "logits/rejected": -0.4871371388435364, + "logps/chosen": -0.08562222123146057, + "logps/rejected": -4.179487705230713, + "loss": 0.1186, + "odds_ratio_loss": 0.008961411193013191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008562222123146057, + "rewards/margins": 0.4093865156173706, + "rewards/rejected": -0.41794872283935547, + "sft_loss": 0.08562222123146057, + "step": 3255 + }, + { + "epoch": 4.708604483007954, + "grad_norm": 1.4369748515968988, + "learning_rate": 9.177094711611041e-07, + "logits/chosen": -0.7007789611816406, + "logits/rejected": -0.4575268030166626, + "logps/chosen": -0.034006789326667786, + "logps/rejected": -4.8226637840271, + "loss": 0.0648, + "odds_ratio_loss": 0.001526236068457365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0034006789792329073, + "rewards/margins": 0.47886568307876587, + "rewards/rejected": -0.4822664260864258, + "sft_loss": 0.034006789326667786, + "step": 3256 + }, + { + "epoch": 4.7100506146059296, + "grad_norm": 1.93287908188121, + "learning_rate": 9.157308642210235e-07, + "logits/chosen": -0.7941340804100037, + "logits/rejected": -0.5466873049736023, + "logps/chosen": -0.06813658028841019, + "logps/rejected": -3.8739757537841797, + "loss": 0.0663, + "odds_ratio_loss": 0.014670598320662975, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0068136584013700485, + "rewards/margins": 0.3805839419364929, + "rewards/rejected": -0.3873975872993469, + "sft_loss": 0.06813658028841019, + "step": 3257 + }, + { + "epoch": 4.711496746203904, + "grad_norm": 1.6984279813334189, + "learning_rate": 9.137541167982905e-07, + "logits/chosen": -0.997580349445343, + "logits/rejected": -0.6903654932975769, + "logps/chosen": -0.11518067121505737, + "logps/rejected": -4.90092658996582, + "loss": 0.0901, + "odds_ratio_loss": 0.011495430953800678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011518066748976707, + "rewards/margins": 0.47857460379600525, + "rewards/rejected": -0.4900926947593689, + "sft_loss": 0.11518067121505737, + "step": 3258 + }, + { + "epoch": 4.71294287780188, + "grad_norm": 1.5717400046835743, + "learning_rate": 9.117792300846958e-07, + "logits/chosen": -0.8282598257064819, + "logits/rejected": -0.6946208477020264, + "logps/chosen": -0.031053097918629646, + "logps/rejected": -4.154017925262451, + "loss": 0.0545, + "odds_ratio_loss": 0.0034298747777938843, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0031053097918629646, + "rewards/margins": 0.41229650378227234, + "rewards/rejected": -0.41540175676345825, + "sft_loss": 0.031053097918629646, + "step": 3259 + }, + { + "epoch": 4.714389009399856, + "grad_norm": 1.4768072947611934, + "learning_rate": 9.098062052709052e-07, + "logits/chosen": -0.8885550498962402, + "logits/rejected": -0.587072491645813, + "logps/chosen": -0.07804380357265472, + "logps/rejected": -6.696857929229736, + "loss": 0.0927, + "odds_ratio_loss": 0.008093244396150112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007804380729794502, + "rewards/margins": 0.6618814468383789, + "rewards/rejected": -0.6696858406066895, + "sft_loss": 0.07804380357265472, + "step": 3260 + }, + { + "epoch": 4.7158351409978305, + "grad_norm": 1.5119276549388956, + "learning_rate": 9.078350435464627e-07, + "logits/chosen": -0.7504903078079224, + "logits/rejected": -0.6390421390533447, + "logps/chosen": -0.11923964321613312, + "logps/rejected": -4.330544471740723, + "loss": 0.0777, + "odds_ratio_loss": 0.007041964679956436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011923965066671371, + "rewards/margins": 0.4211304783821106, + "rewards/rejected": -0.4330544173717499, + "sft_loss": 0.11923964321613312, + "step": 3261 + }, + { + "epoch": 4.717281272595806, + "grad_norm": 1.848013472334746, + "learning_rate": 9.058657460997876e-07, + "logits/chosen": -1.02030611038208, + "logits/rejected": -0.9156399965286255, + "logps/chosen": -0.07417614012956619, + "logps/rejected": -3.798497200012207, + "loss": 0.0791, + "odds_ratio_loss": 0.008186925202608109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007417613640427589, + "rewards/margins": 0.3724321126937866, + "rewards/rejected": -0.37984973192214966, + "sft_loss": 0.07417614012956619, + "step": 3262 + }, + { + "epoch": 4.718727404193782, + "grad_norm": 2.0126285877800028, + "learning_rate": 9.03898314118178e-07, + "logits/chosen": -0.9355478882789612, + "logits/rejected": -0.703206479549408, + "logps/chosen": -0.1144571453332901, + "logps/rejected": -6.172203063964844, + "loss": 0.0773, + "odds_ratio_loss": 0.011154555715620518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011445715092122555, + "rewards/margins": 0.6057746410369873, + "rewards/rejected": -0.6172203421592712, + "sft_loss": 0.1144571453332901, + "step": 3263 + }, + { + "epoch": 4.720173535791757, + "grad_norm": 1.5994780503232926, + "learning_rate": 9.019327487878072e-07, + "logits/chosen": -0.7576562166213989, + "logits/rejected": -0.6976518630981445, + "logps/chosen": -0.05885721370577812, + "logps/rejected": -5.845735549926758, + "loss": 0.0676, + "odds_ratio_loss": 0.005953238345682621, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005885721184313297, + "rewards/margins": 0.5786879062652588, + "rewards/rejected": -0.5845736265182495, + "sft_loss": 0.05885721370577812, + "step": 3264 + }, + { + "epoch": 4.721619667389732, + "grad_norm": 1.8736737702028536, + "learning_rate": 8.999690512937195e-07, + "logits/chosen": -0.9201067686080933, + "logits/rejected": -0.7886765003204346, + "logps/chosen": -0.03042430989444256, + "logps/rejected": -2.8618273735046387, + "loss": 0.0658, + "odds_ratio_loss": 0.003192545147612691, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030424310825765133, + "rewards/margins": 0.2831403315067291, + "rewards/rejected": -0.2861827611923218, + "sft_loss": 0.03042430989444256, + "step": 3265 + }, + { + "epoch": 4.723065798987708, + "grad_norm": 1.5332286243868711, + "learning_rate": 8.980072228198374e-07, + "logits/chosen": -1.1953673362731934, + "logits/rejected": -0.7290738224983215, + "logps/chosen": -0.08467172086238861, + "logps/rejected": -4.777332305908203, + "loss": 0.0647, + "odds_ratio_loss": 0.003652008716017008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008467172272503376, + "rewards/margins": 0.4692661166191101, + "rewards/rejected": -0.4777332842350006, + "sft_loss": 0.08467172086238861, + "step": 3266 + }, + { + "epoch": 4.724511930585683, + "grad_norm": 1.421654983561279, + "learning_rate": 8.960472645489536e-07, + "logits/chosen": -0.9492431879043579, + "logits/rejected": -0.63211590051651, + "logps/chosen": -0.05600889027118683, + "logps/rejected": -6.399876594543457, + "loss": 0.0529, + "odds_ratio_loss": 0.0036964938044548035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005600889213383198, + "rewards/margins": 0.6343867778778076, + "rewards/rejected": -0.639987587928772, + "sft_loss": 0.05600889027118683, + "step": 3267 + }, + { + "epoch": 4.7259580621836585, + "grad_norm": 1.5880771854974947, + "learning_rate": 8.940891776627348e-07, + "logits/chosen": -0.8621605634689331, + "logits/rejected": -0.5667805671691895, + "logps/chosen": -0.09114475548267365, + "logps/rejected": -7.375045299530029, + "loss": 0.0932, + "odds_ratio_loss": 0.00746044609695673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00911447498947382, + "rewards/margins": 0.728390097618103, + "rewards/rejected": -0.7375046014785767, + "sft_loss": 0.09114475548267365, + "step": 3268 + }, + { + "epoch": 4.727404193781634, + "grad_norm": 1.5189323585430745, + "learning_rate": 8.921329633417172e-07, + "logits/chosen": -0.9254083633422852, + "logits/rejected": -0.7080655097961426, + "logps/chosen": -0.11067543923854828, + "logps/rejected": -3.79581356048584, + "loss": 0.0815, + "odds_ratio_loss": 0.026493411511182785, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011067543178796768, + "rewards/margins": 0.368513822555542, + "rewards/rejected": -0.37958139181137085, + "sft_loss": 0.11067543923854828, + "step": 3269 + }, + { + "epoch": 4.72885032537961, + "grad_norm": 1.7915520979090995, + "learning_rate": 8.901786227653119e-07, + "logits/chosen": -1.0431417226791382, + "logits/rejected": -0.6968197822570801, + "logps/chosen": -0.06255242228507996, + "logps/rejected": -5.142256259918213, + "loss": 0.1002, + "odds_ratio_loss": 0.004798182286322117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006255242042243481, + "rewards/margins": 0.50797039270401, + "rewards/rejected": -0.5142256617546082, + "sft_loss": 0.06255242228507996, + "step": 3270 + }, + { + "epoch": 4.730296456977585, + "grad_norm": 1.5051111918909226, + "learning_rate": 8.882261571117959e-07, + "logits/chosen": -0.9455627202987671, + "logits/rejected": -0.6978684663772583, + "logps/chosen": -0.058345332741737366, + "logps/rejected": -3.4079577922821045, + "loss": 0.0952, + "odds_ratio_loss": 0.0066518341191112995, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005834532901644707, + "rewards/margins": 0.33496126532554626, + "rewards/rejected": -0.3407958149909973, + "sft_loss": 0.058345332741737366, + "step": 3271 + }, + { + "epoch": 4.73174258857556, + "grad_norm": 1.6356443459061143, + "learning_rate": 8.862755675583207e-07, + "logits/chosen": -0.5706143379211426, + "logits/rejected": -0.42333659529685974, + "logps/chosen": -0.0708899050951004, + "logps/rejected": -6.153177261352539, + "loss": 0.0879, + "odds_ratio_loss": 0.0040434496477246284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007088990416377783, + "rewards/margins": 0.6082287430763245, + "rewards/rejected": -0.6153177618980408, + "sft_loss": 0.0708899050951004, + "step": 3272 + }, + { + "epoch": 4.733188720173536, + "grad_norm": 1.667441586060215, + "learning_rate": 8.843268552809009e-07, + "logits/chosen": -0.7408003211021423, + "logits/rejected": -0.5686386227607727, + "logps/chosen": -0.10141906142234802, + "logps/rejected": -4.278630256652832, + "loss": 0.0906, + "odds_ratio_loss": 0.010225032456219196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010141907259821892, + "rewards/margins": 0.417721152305603, + "rewards/rejected": -0.42786306142807007, + "sft_loss": 0.10141906142234802, + "step": 3273 + }, + { + "epoch": 4.734634851771511, + "grad_norm": 1.646184662641339, + "learning_rate": 8.823800214544257e-07, + "logits/chosen": -0.8300518989562988, + "logits/rejected": -0.771294116973877, + "logps/chosen": -0.06484097242355347, + "logps/rejected": -5.420320510864258, + "loss": 0.081, + "odds_ratio_loss": 0.00832346174865961, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006484096869826317, + "rewards/margins": 0.5355479121208191, + "rewards/rejected": -0.54203200340271, + "sft_loss": 0.06484097242355347, + "step": 3274 + }, + { + "epoch": 4.736080983369487, + "grad_norm": 1.6407642969954903, + "learning_rate": 8.804350672526469e-07, + "logits/chosen": -0.8406962156295776, + "logits/rejected": -0.6480029821395874, + "logps/chosen": -0.06862354278564453, + "logps/rejected": -4.057324409484863, + "loss": 0.0744, + "odds_ratio_loss": 0.017293782904744148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006862354464828968, + "rewards/margins": 0.398870050907135, + "rewards/rejected": -0.4057324230670929, + "sft_loss": 0.06862354278564453, + "step": 3275 + }, + { + "epoch": 4.737527114967462, + "grad_norm": 1.243947005110168, + "learning_rate": 8.784919938481832e-07, + "logits/chosen": -0.9880828261375427, + "logits/rejected": -0.694316029548645, + "logps/chosen": -0.04657265543937683, + "logps/rejected": -4.722322463989258, + "loss": 0.0611, + "odds_ratio_loss": 0.004112796392291784, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004657265730202198, + "rewards/margins": 0.46757495403289795, + "rewards/rejected": -0.4722321927547455, + "sft_loss": 0.04657265543937683, + "step": 3276 + }, + { + "epoch": 4.738973246565438, + "grad_norm": 1.4949852325477238, + "learning_rate": 8.76550802412523e-07, + "logits/chosen": -0.43887242674827576, + "logits/rejected": -0.36812540888786316, + "logps/chosen": -0.03462660312652588, + "logps/rejected": -5.46714973449707, + "loss": 0.0631, + "odds_ratio_loss": 0.013562695123255253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0034626605920493603, + "rewards/margins": 0.5432523488998413, + "rewards/rejected": -0.5467150211334229, + "sft_loss": 0.03462660312652588, + "step": 3277 + }, + { + "epoch": 4.740419378163413, + "grad_norm": 1.4479273066737757, + "learning_rate": 8.746114941160163e-07, + "logits/chosen": -0.691802978515625, + "logits/rejected": -0.5151437520980835, + "logps/chosen": -0.1676384061574936, + "logps/rejected": -5.6583147048950195, + "loss": 0.0748, + "odds_ratio_loss": 0.016385193914175034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01676384173333645, + "rewards/margins": 0.5490676760673523, + "rewards/rejected": -0.5658314824104309, + "sft_loss": 0.1676384061574936, + "step": 3278 + }, + { + "epoch": 4.741865509761388, + "grad_norm": 1.6250065519446988, + "learning_rate": 8.726740701278808e-07, + "logits/chosen": -0.9104323387145996, + "logits/rejected": -0.8115368485450745, + "logps/chosen": -0.02846909500658512, + "logps/rejected": -3.3297507762908936, + "loss": 0.0595, + "odds_ratio_loss": 0.0022020572796463966, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0028469092212617397, + "rewards/margins": 0.33012816309928894, + "rewards/rejected": -0.3329750597476959, + "sft_loss": 0.02846909500658512, + "step": 3279 + }, + { + "epoch": 4.743311641359364, + "grad_norm": 1.5518030908611087, + "learning_rate": 8.707385316161953e-07, + "logits/chosen": -0.8623483180999756, + "logits/rejected": -0.645122766494751, + "logps/chosen": -0.05945183336734772, + "logps/rejected": -4.296452045440674, + "loss": 0.0591, + "odds_ratio_loss": 0.005059407092630863, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005945183336734772, + "rewards/margins": 0.4237000346183777, + "rewards/rejected": -0.4296451807022095, + "sft_loss": 0.05945183336734772, + "step": 3280 + }, + { + "epoch": 4.744757772957339, + "grad_norm": 1.7926493160828691, + "learning_rate": 8.688048797479042e-07, + "logits/chosen": -1.0307143926620483, + "logits/rejected": -0.6179426908493042, + "logps/chosen": -0.06234199181199074, + "logps/rejected": -5.2709503173828125, + "loss": 0.0493, + "odds_ratio_loss": 0.003489615162834525, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0062341997399926186, + "rewards/margins": 0.5208608508110046, + "rewards/rejected": -0.5270950198173523, + "sft_loss": 0.06234199181199074, + "step": 3281 + }, + { + "epoch": 4.746203904555315, + "grad_norm": 1.658140114417297, + "learning_rate": 8.668731156888131e-07, + "logits/chosen": -0.9141590595245361, + "logits/rejected": -0.656516432762146, + "logps/chosen": -0.11738917231559753, + "logps/rejected": -3.92860746383667, + "loss": 0.063, + "odds_ratio_loss": 0.012846600264310837, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011738916859030724, + "rewards/margins": 0.38112184405326843, + "rewards/rejected": -0.3928607702255249, + "sft_loss": 0.11738917231559753, + "step": 3282 + }, + { + "epoch": 4.74765003615329, + "grad_norm": 2.812226274576573, + "learning_rate": 8.649432406035897e-07, + "logits/chosen": -0.6458684206008911, + "logits/rejected": -0.5954203009605408, + "logps/chosen": -0.14193274080753326, + "logps/rejected": -5.994907379150391, + "loss": 0.1098, + "odds_ratio_loss": 0.005412849597632885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014193275012075901, + "rewards/margins": 0.5852974653244019, + "rewards/rejected": -0.5994908213615417, + "sft_loss": 0.14193274080753326, + "step": 3283 + }, + { + "epoch": 4.749096167751265, + "grad_norm": 1.5820536753740884, + "learning_rate": 8.630152556557613e-07, + "logits/chosen": -0.6483851671218872, + "logits/rejected": -0.5950357913970947, + "logps/chosen": -0.04669715836644173, + "logps/rejected": -3.50252628326416, + "loss": 0.0653, + "odds_ratio_loss": 0.0048396410420536995, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004669716116040945, + "rewards/margins": 0.34558290243148804, + "rewards/rejected": -0.350252628326416, + "sft_loss": 0.04669715836644173, + "step": 3284 + }, + { + "epoch": 4.750542299349241, + "grad_norm": 1.691255261241321, + "learning_rate": 8.610891620077198e-07, + "logits/chosen": -0.6523389220237732, + "logits/rejected": -0.4981173872947693, + "logps/chosen": -0.11179995536804199, + "logps/rejected": -4.435649871826172, + "loss": 0.1057, + "odds_ratio_loss": 0.007845187559723854, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011179995723068714, + "rewards/margins": 0.4323849678039551, + "rewards/rejected": -0.4435649812221527, + "sft_loss": 0.11179995536804199, + "step": 3285 + }, + { + "epoch": 4.7519884309472165, + "grad_norm": 1.625313677026407, + "learning_rate": 8.59164960820712e-07, + "logits/chosen": -0.8503109216690063, + "logits/rejected": -0.7520562410354614, + "logps/chosen": -0.16012218594551086, + "logps/rejected": -3.7297375202178955, + "loss": 0.1028, + "odds_ratio_loss": 0.033738117665052414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016012219712138176, + "rewards/margins": 0.35696154832839966, + "rewards/rejected": -0.3729737401008606, + "sft_loss": 0.16012218594551086, + "step": 3286 + }, + { + "epoch": 4.753434562545191, + "grad_norm": 1.4980789666005607, + "learning_rate": 8.572426532548487e-07, + "logits/chosen": -0.6264578104019165, + "logits/rejected": -0.6370917558670044, + "logps/chosen": -0.09624272584915161, + "logps/rejected": -3.5333688259124756, + "loss": 0.0647, + "odds_ratio_loss": 0.013179374858736992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009624271653592587, + "rewards/margins": 0.34371256828308105, + "rewards/rejected": -0.3533368706703186, + "sft_loss": 0.09624272584915161, + "step": 3287 + }, + { + "epoch": 4.754880694143167, + "grad_norm": 1.7420127662395781, + "learning_rate": 8.553222404690928e-07, + "logits/chosen": -0.8942149877548218, + "logits/rejected": -0.7594558000564575, + "logps/chosen": -0.08043821156024933, + "logps/rejected": -4.053690433502197, + "loss": 0.0754, + "odds_ratio_loss": 0.005299334414303303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008043821901082993, + "rewards/margins": 0.39732521772384644, + "rewards/rejected": -0.4053690433502197, + "sft_loss": 0.08043821156024933, + "step": 3288 + }, + { + "epoch": 4.756326825741143, + "grad_norm": 1.6059975248009308, + "learning_rate": 8.534037236212715e-07, + "logits/chosen": -0.8111061453819275, + "logits/rejected": -0.6317873597145081, + "logps/chosen": -0.06460095942020416, + "logps/rejected": -5.87692928314209, + "loss": 0.0611, + "odds_ratio_loss": 0.00885202456265688, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006460095755755901, + "rewards/margins": 0.5812327861785889, + "rewards/rejected": -0.58769291639328, + "sft_loss": 0.06460095942020416, + "step": 3289 + }, + { + "epoch": 4.757772957339117, + "grad_norm": 1.8821353090731472, + "learning_rate": 8.514871038680644e-07, + "logits/chosen": -0.7282547950744629, + "logits/rejected": -0.5171594619750977, + "logps/chosen": -0.09477448463439941, + "logps/rejected": -6.691037654876709, + "loss": 0.1061, + "odds_ratio_loss": 0.00491682905703783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009477448649704456, + "rewards/margins": 0.659626305103302, + "rewards/rejected": -0.6691038012504578, + "sft_loss": 0.09477448463439941, + "step": 3290 + }, + { + "epoch": 4.759219088937093, + "grad_norm": 1.3869499519014088, + "learning_rate": 8.495723823650078e-07, + "logits/chosen": -0.8584036231040955, + "logits/rejected": -0.7203078866004944, + "logps/chosen": -0.06936081498861313, + "logps/rejected": -5.100736618041992, + "loss": 0.0654, + "odds_ratio_loss": 0.005390543024986982, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006936081685125828, + "rewards/margins": 0.5031375885009766, + "rewards/rejected": -0.5100736618041992, + "sft_loss": 0.06936081498861313, + "step": 3291 + }, + { + "epoch": 4.760665220535069, + "grad_norm": 1.572558923505267, + "learning_rate": 8.476595602664965e-07, + "logits/chosen": -0.7464091777801514, + "logits/rejected": -0.6228684186935425, + "logps/chosen": -0.07792852073907852, + "logps/rejected": -4.133586883544922, + "loss": 0.0911, + "odds_ratio_loss": 0.008083056658506393, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0077928523533046246, + "rewards/margins": 0.4055657982826233, + "rewards/rejected": -0.4133586883544922, + "sft_loss": 0.07792852073907852, + "step": 3292 + }, + { + "epoch": 4.7621113521330445, + "grad_norm": 1.8275520156329068, + "learning_rate": 8.457486387257753e-07, + "logits/chosen": -0.6914631128311157, + "logits/rejected": -0.5352708697319031, + "logps/chosen": -0.06538383662700653, + "logps/rejected": -4.57308292388916, + "loss": 0.0546, + "odds_ratio_loss": 0.005749408155679703, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006538383662700653, + "rewards/margins": 0.45076990127563477, + "rewards/rejected": -0.4573083519935608, + "sft_loss": 0.06538383662700653, + "step": 3293 + }, + { + "epoch": 4.763557483731019, + "grad_norm": 1.6277201198392217, + "learning_rate": 8.438396188949486e-07, + "logits/chosen": -0.6373045444488525, + "logits/rejected": -0.615994930267334, + "logps/chosen": -0.11351317912340164, + "logps/rejected": -6.261795997619629, + "loss": 0.0821, + "odds_ratio_loss": 0.02115788869559765, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011351317167282104, + "rewards/margins": 0.61482834815979, + "rewards/rejected": -0.6261796355247498, + "sft_loss": 0.11351317912340164, + "step": 3294 + }, + { + "epoch": 4.765003615328995, + "grad_norm": 1.771173716342708, + "learning_rate": 8.419325019249699e-07, + "logits/chosen": -0.8777015209197998, + "logits/rejected": -0.7351939082145691, + "logps/chosen": -0.09541021287441254, + "logps/rejected": -4.427221298217773, + "loss": 0.0785, + "odds_ratio_loss": 0.0014403918758034706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009541021659970284, + "rewards/margins": 0.43318116664886475, + "rewards/rejected": -0.4427221417427063, + "sft_loss": 0.09541021287441254, + "step": 3295 + }, + { + "epoch": 4.766449746926971, + "grad_norm": 1.4668075763215394, + "learning_rate": 8.400272889656484e-07, + "logits/chosen": -0.9218635559082031, + "logits/rejected": -0.8534926176071167, + "logps/chosen": -0.044725202023983, + "logps/rejected": -3.9485654830932617, + "loss": 0.0502, + "odds_ratio_loss": 0.007518763653934002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0044725202023983, + "rewards/margins": 0.3903840184211731, + "rewards/rejected": -0.3948565125465393, + "sft_loss": 0.044725202023983, + "step": 3296 + }, + { + "epoch": 4.7678958785249455, + "grad_norm": 1.754333085943087, + "learning_rate": 8.381239811656434e-07, + "logits/chosen": -0.5121525526046753, + "logits/rejected": -0.393355131149292, + "logps/chosen": -0.0673723891377449, + "logps/rejected": -4.3197021484375, + "loss": 0.0698, + "odds_ratio_loss": 0.006755652837455273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006737238727509975, + "rewards/margins": 0.42523300647735596, + "rewards/rejected": -0.4319702386856079, + "sft_loss": 0.0673723891377449, + "step": 3297 + }, + { + "epoch": 4.769342010122921, + "grad_norm": 1.429132077368146, + "learning_rate": 8.362225796724662e-07, + "logits/chosen": -0.8575242757797241, + "logits/rejected": -0.8655635714530945, + "logps/chosen": -0.10043197125196457, + "logps/rejected": -4.142751693725586, + "loss": 0.0738, + "odds_ratio_loss": 0.010751158930361271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010043196380138397, + "rewards/margins": 0.4042320251464844, + "rewards/rejected": -0.414275199174881, + "sft_loss": 0.10043197125196457, + "step": 3298 + }, + { + "epoch": 4.770788141720897, + "grad_norm": 2.0904057729420122, + "learning_rate": 8.343230856324779e-07, + "logits/chosen": -0.6189036965370178, + "logits/rejected": -0.5577619671821594, + "logps/chosen": -0.06153484806418419, + "logps/rejected": -5.164226531982422, + "loss": 0.0832, + "odds_ratio_loss": 0.007398117333650589, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006153484806418419, + "rewards/margins": 0.5102692246437073, + "rewards/rejected": -0.5164227485656738, + "sft_loss": 0.06153484806418419, + "step": 3299 + }, + { + "epoch": 4.7722342733188725, + "grad_norm": 1.809363032411696, + "learning_rate": 8.324255001908929e-07, + "logits/chosen": -0.6867760419845581, + "logits/rejected": -0.5739983916282654, + "logps/chosen": -0.08187718689441681, + "logps/rejected": -4.6687726974487305, + "loss": 0.0809, + "odds_ratio_loss": 0.01753270998597145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008187718689441681, + "rewards/margins": 0.4586895704269409, + "rewards/rejected": -0.4668773114681244, + "sft_loss": 0.08187718689441681, + "step": 3300 + }, + { + "epoch": 4.773680404916847, + "grad_norm": 1.6080700276039386, + "learning_rate": 8.305298244917698e-07, + "logits/chosen": -1.0285183191299438, + "logits/rejected": -0.8908055424690247, + "logps/chosen": -0.1339968740940094, + "logps/rejected": -2.8577942848205566, + "loss": 0.0732, + "odds_ratio_loss": 0.020126506686210632, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01339968666434288, + "rewards/margins": 0.2723797559738159, + "rewards/rejected": -0.2857794165611267, + "sft_loss": 0.1339968740940094, + "step": 3301 + }, + { + "epoch": 4.775126536514823, + "grad_norm": 1.5805169753771973, + "learning_rate": 8.286360596780197e-07, + "logits/chosen": -0.8294731378555298, + "logits/rejected": -0.64952552318573, + "logps/chosen": -0.061815451830625534, + "logps/rejected": -4.596172332763672, + "loss": 0.0699, + "odds_ratio_loss": 0.0055434140376746655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006181545555591583, + "rewards/margins": 0.4534357190132141, + "rewards/rejected": -0.4596172571182251, + "sft_loss": 0.061815451830625534, + "step": 3302 + }, + { + "epoch": 4.776572668112799, + "grad_norm": 1.7640904225463478, + "learning_rate": 8.267442068914019e-07, + "logits/chosen": -0.8276826739311218, + "logits/rejected": -0.5451090931892395, + "logps/chosen": -0.04526611045002937, + "logps/rejected": -5.430952072143555, + "loss": 0.0699, + "odds_ratio_loss": 0.0029891584999859333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00452661095187068, + "rewards/margins": 0.5385686159133911, + "rewards/rejected": -0.5430951714515686, + "sft_loss": 0.04526611045002937, + "step": 3303 + }, + { + "epoch": 4.7780187997107735, + "grad_norm": 1.4000226060925451, + "learning_rate": 8.248542672725189e-07, + "logits/chosen": -0.8920959830284119, + "logits/rejected": -0.6845582127571106, + "logps/chosen": -0.04449871927499771, + "logps/rejected": -3.2585527896881104, + "loss": 0.0678, + "odds_ratio_loss": 0.005256593693047762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004449872300028801, + "rewards/margins": 0.3214053809642792, + "rewards/rejected": -0.3258552551269531, + "sft_loss": 0.04449871927499771, + "step": 3304 + }, + { + "epoch": 4.779464931308749, + "grad_norm": 1.467193415411127, + "learning_rate": 8.229662419608252e-07, + "logits/chosen": -0.9364633560180664, + "logits/rejected": -0.799718976020813, + "logps/chosen": -0.09500187635421753, + "logps/rejected": -4.936524391174316, + "loss": 0.0788, + "odds_ratio_loss": 0.007705302909016609, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009500187821686268, + "rewards/margins": 0.48415225744247437, + "rewards/rejected": -0.49365243315696716, + "sft_loss": 0.09500187635421753, + "step": 3305 + }, + { + "epoch": 4.780911062906725, + "grad_norm": 1.4723060397784287, + "learning_rate": 8.210801320946163e-07, + "logits/chosen": -0.8033458590507507, + "logits/rejected": -0.514359712600708, + "logps/chosen": -0.07291413843631744, + "logps/rejected": -6.4514875411987305, + "loss": 0.0735, + "odds_ratio_loss": 0.004370196722447872, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007291413843631744, + "rewards/margins": 0.6378573179244995, + "rewards/rejected": -0.6451488137245178, + "sft_loss": 0.07291413843631744, + "step": 3306 + }, + { + "epoch": 4.7823571945047, + "grad_norm": 1.4452690512837696, + "learning_rate": 8.191959388110356e-07, + "logits/chosen": -0.767648458480835, + "logits/rejected": -0.6331923007965088, + "logps/chosen": -0.06505173444747925, + "logps/rejected": -3.9707190990448, + "loss": 0.0706, + "odds_ratio_loss": 0.0042672669515013695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006505173165351152, + "rewards/margins": 0.39056673645973206, + "rewards/rejected": -0.397071897983551, + "sft_loss": 0.06505173444747925, + "step": 3307 + }, + { + "epoch": 4.783803326102675, + "grad_norm": 1.4691145708870954, + "learning_rate": 8.173136632460687e-07, + "logits/chosen": -0.8610515594482422, + "logits/rejected": -0.6491256356239319, + "logps/chosen": -0.05403333902359009, + "logps/rejected": -3.766594648361206, + "loss": 0.0779, + "odds_ratio_loss": 0.004882722161710262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005403334274888039, + "rewards/margins": 0.37125617265701294, + "rewards/rejected": -0.3766595125198364, + "sft_loss": 0.05403333902359009, + "step": 3308 + }, + { + "epoch": 4.785249457700651, + "grad_norm": 2.0942384834532115, + "learning_rate": 8.154333065345489e-07, + "logits/chosen": -0.7129379510879517, + "logits/rejected": -0.49292656779289246, + "logps/chosen": -0.08610312640666962, + "logps/rejected": -4.657783031463623, + "loss": 0.1099, + "odds_ratio_loss": 0.007286392152309418, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008610313758254051, + "rewards/margins": 0.45716798305511475, + "rewards/rejected": -0.46577832102775574, + "sft_loss": 0.08610312640666962, + "step": 3309 + }, + { + "epoch": 4.786695589298626, + "grad_norm": 1.9279725748993217, + "learning_rate": 8.135548698101482e-07, + "logits/chosen": -0.6503070592880249, + "logits/rejected": -0.5543050765991211, + "logps/chosen": -0.0853266641497612, + "logps/rejected": -4.004879951477051, + "loss": 0.08, + "odds_ratio_loss": 0.019836550578475, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008532666601240635, + "rewards/margins": 0.39195531606674194, + "rewards/rejected": -0.4004879891872406, + "sft_loss": 0.0853266641497612, + "step": 3310 + }, + { + "epoch": 4.7881417208966015, + "grad_norm": 1.967430729296363, + "learning_rate": 8.116783542053855e-07, + "logits/chosen": -0.7817206382751465, + "logits/rejected": -0.6119512915611267, + "logps/chosen": -0.12662483751773834, + "logps/rejected": -4.243107795715332, + "loss": 0.1184, + "odds_ratio_loss": 0.009148865006864071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012662483379244804, + "rewards/margins": 0.41164830327033997, + "rewards/rejected": -0.4243107736110687, + "sft_loss": 0.12662483751773834, + "step": 3311 + }, + { + "epoch": 4.789587852494577, + "grad_norm": 1.6003137675765775, + "learning_rate": 8.09803760851616e-07, + "logits/chosen": -0.8021121621131897, + "logits/rejected": -0.5248170495033264, + "logps/chosen": -0.05905238538980484, + "logps/rejected": -4.108620643615723, + "loss": 0.0577, + "odds_ratio_loss": 0.007532968185842037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005905238911509514, + "rewards/margins": 0.4049568772315979, + "rewards/rejected": -0.4108620882034302, + "sft_loss": 0.05905238538980484, + "step": 3312 + }, + { + "epoch": 4.791033984092552, + "grad_norm": 1.5645076243387899, + "learning_rate": 8.079310908790419e-07, + "logits/chosen": -0.934636116027832, + "logits/rejected": -0.722909688949585, + "logps/chosen": -0.1410127729177475, + "logps/rejected": -5.098935127258301, + "loss": 0.1074, + "odds_ratio_loss": 0.0036122482270002365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01410127803683281, + "rewards/margins": 0.4957922697067261, + "rewards/rejected": -0.509893536567688, + "sft_loss": 0.1410127729177475, + "step": 3313 + }, + { + "epoch": 4.792480115690528, + "grad_norm": 1.3982813202070725, + "learning_rate": 8.060603454167019e-07, + "logits/chosen": -1.0576629638671875, + "logits/rejected": -0.6195839047431946, + "logps/chosen": -0.08455665409564972, + "logps/rejected": -5.276945114135742, + "loss": 0.084, + "odds_ratio_loss": 0.007596557028591633, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008455665782094002, + "rewards/margins": 0.5192388296127319, + "rewards/rejected": -0.5276945233345032, + "sft_loss": 0.08455665409564972, + "step": 3314 + }, + { + "epoch": 4.793926247288503, + "grad_norm": 1.6979981582653707, + "learning_rate": 8.041915255924747e-07, + "logits/chosen": -0.7179811596870422, + "logits/rejected": -0.5930829644203186, + "logps/chosen": -0.04754379764199257, + "logps/rejected": -4.652839183807373, + "loss": 0.0682, + "odds_ratio_loss": 0.0038252677768468857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004754380322992802, + "rewards/margins": 0.46052953600883484, + "rewards/rejected": -0.46528393030166626, + "sft_loss": 0.04754379764199257, + "step": 3315 + }, + { + "epoch": 4.795372378886479, + "grad_norm": 1.6512873697364088, + "learning_rate": 8.023246325330784e-07, + "logits/chosen": -0.9477057456970215, + "logits/rejected": -0.686681866645813, + "logps/chosen": -0.11634199321269989, + "logps/rejected": -5.4780354499816895, + "loss": 0.0937, + "odds_ratio_loss": 0.008887776173651218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01163419894874096, + "rewards/margins": 0.5361693501472473, + "rewards/rejected": -0.5478035807609558, + "sft_loss": 0.11634199321269989, + "step": 3316 + }, + { + "epoch": 4.796818510484454, + "grad_norm": 1.5667079079881334, + "learning_rate": 8.004596673640707e-07, + "logits/chosen": -0.7140193581581116, + "logits/rejected": -0.4710603654384613, + "logps/chosen": -0.08510614186525345, + "logps/rejected": -3.7450685501098633, + "loss": 0.0795, + "odds_ratio_loss": 0.016938410699367523, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008510613813996315, + "rewards/margins": 0.36599624156951904, + "rewards/rejected": -0.3745068609714508, + "sft_loss": 0.08510614186525345, + "step": 3317 + }, + { + "epoch": 4.7982646420824295, + "grad_norm": 1.6338412747663902, + "learning_rate": 7.985966312098469e-07, + "logits/chosen": -0.6758091449737549, + "logits/rejected": -0.4951193928718567, + "logps/chosen": -0.12785102427005768, + "logps/rejected": -6.292760848999023, + "loss": 0.0977, + "odds_ratio_loss": 0.012771306559443474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012785101309418678, + "rewards/margins": 0.6164909601211548, + "rewards/rejected": -0.6292760968208313, + "sft_loss": 0.12785102427005768, + "step": 3318 + }, + { + "epoch": 4.799710773680405, + "grad_norm": 1.496919749181908, + "learning_rate": 7.967355251936361e-07, + "logits/chosen": -0.7086659073829651, + "logits/rejected": -0.5454675555229187, + "logps/chosen": -0.10819420963525772, + "logps/rejected": -6.755625247955322, + "loss": 0.0749, + "odds_ratio_loss": 0.004403555765748024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010819422081112862, + "rewards/margins": 0.6647431254386902, + "rewards/rejected": -0.6755625009536743, + "sft_loss": 0.10819420963525772, + "step": 3319 + }, + { + "epoch": 4.80115690527838, + "grad_norm": 1.8413203272352143, + "learning_rate": 7.948763504375087e-07, + "logits/chosen": -0.9324480891227722, + "logits/rejected": -0.6381005048751831, + "logps/chosen": -0.06453929096460342, + "logps/rejected": -6.1513166427612305, + "loss": 0.0978, + "odds_ratio_loss": 0.0025580860674381256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006453928537666798, + "rewards/margins": 0.6086777448654175, + "rewards/rejected": -0.6151317358016968, + "sft_loss": 0.06453929096460342, + "step": 3320 + }, + { + "epoch": 4.802603036876356, + "grad_norm": 1.5277251212604894, + "learning_rate": 7.930191080623668e-07, + "logits/chosen": -0.6400019526481628, + "logits/rejected": -0.5209491848945618, + "logps/chosen": -0.07888957858085632, + "logps/rejected": -4.372533321380615, + "loss": 0.0621, + "odds_ratio_loss": 0.010060323402285576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007888957858085632, + "rewards/margins": 0.429364413022995, + "rewards/rejected": -0.4372533857822418, + "sft_loss": 0.07888957858085632, + "step": 3321 + }, + { + "epoch": 4.804049168474331, + "grad_norm": 2.2584557191850894, + "learning_rate": 7.911637991879483e-07, + "logits/chosen": -1.0325016975402832, + "logits/rejected": -0.7070356011390686, + "logps/chosen": -0.09169142693281174, + "logps/rejected": -4.366299152374268, + "loss": 0.1086, + "odds_ratio_loss": 0.012625223957002163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009169142693281174, + "rewards/margins": 0.42746075987815857, + "rewards/rejected": -0.43662989139556885, + "sft_loss": 0.09169142693281174, + "step": 3322 + }, + { + "epoch": 4.805495300072307, + "grad_norm": 1.64122497738142, + "learning_rate": 7.893104249328258e-07, + "logits/chosen": -0.9511623978614807, + "logits/rejected": -0.7628656625747681, + "logps/chosen": -0.09314146637916565, + "logps/rejected": -4.508124828338623, + "loss": 0.0803, + "odds_ratio_loss": 0.005370480008423328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009314147755503654, + "rewards/margins": 0.4414983093738556, + "rewards/rejected": -0.4508124887943268, + "sft_loss": 0.09314146637916565, + "step": 3323 + }, + { + "epoch": 4.806941431670282, + "grad_norm": 1.5377080828397056, + "learning_rate": 7.874589864144066e-07, + "logits/chosen": -0.8962692022323608, + "logits/rejected": -0.5798559784889221, + "logps/chosen": -0.06909617781639099, + "logps/rejected": -7.326718807220459, + "loss": 0.0594, + "odds_ratio_loss": 0.00794798880815506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006909618619829416, + "rewards/margins": 0.7257623076438904, + "rewards/rejected": -0.7326719164848328, + "sft_loss": 0.06909617781639099, + "step": 3324 + }, + { + "epoch": 4.808387563268258, + "grad_norm": 1.5937599520948071, + "learning_rate": 7.856094847489286e-07, + "logits/chosen": -0.6906980872154236, + "logits/rejected": -0.5133391618728638, + "logps/chosen": -0.09170494973659515, + "logps/rejected": -4.876096725463867, + "loss": 0.071, + "odds_ratio_loss": 0.00837009958922863, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00917049590498209, + "rewards/margins": 0.47843921184539795, + "rewards/rejected": -0.4876096844673157, + "sft_loss": 0.09170494973659515, + "step": 3325 + }, + { + "epoch": 4.809833694866233, + "grad_norm": 1.6849063604132934, + "learning_rate": 7.837619210514645e-07, + "logits/chosen": -0.8195112943649292, + "logits/rejected": -0.6542753577232361, + "logps/chosen": -0.08772353827953339, + "logps/rejected": -5.507904052734375, + "loss": 0.0818, + "odds_ratio_loss": 0.009588465094566345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008772353641688824, + "rewards/margins": 0.5420180559158325, + "rewards/rejected": -0.5507904291152954, + "sft_loss": 0.08772353827953339, + "step": 3326 + }, + { + "epoch": 4.811279826464208, + "grad_norm": 1.597280251006234, + "learning_rate": 7.819162964359161e-07, + "logits/chosen": -0.8942530155181885, + "logits/rejected": -0.6368886232376099, + "logps/chosen": -0.07599680870771408, + "logps/rejected": -4.553309440612793, + "loss": 0.0985, + "odds_ratio_loss": 0.005070159211754799, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007599681615829468, + "rewards/margins": 0.44773125648498535, + "rewards/rejected": -0.45533090829849243, + "sft_loss": 0.07599680870771408, + "step": 3327 + }, + { + "epoch": 4.812725958062184, + "grad_norm": 1.5081792858017415, + "learning_rate": 7.800726120150188e-07, + "logits/chosen": -0.9549357891082764, + "logits/rejected": -0.6117710471153259, + "logps/chosen": -0.08728785812854767, + "logps/rejected": -5.800983428955078, + "loss": 0.074, + "odds_ratio_loss": 0.008302710950374603, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008728786371648312, + "rewards/margins": 0.5713695883750916, + "rewards/rejected": -0.5800983905792236, + "sft_loss": 0.08728785812854767, + "step": 3328 + }, + { + "epoch": 4.814172089660159, + "grad_norm": 1.6857913081849396, + "learning_rate": 7.782308689003359e-07, + "logits/chosen": -0.9277090430259705, + "logits/rejected": -0.642306923866272, + "logps/chosen": -0.04577184468507767, + "logps/rejected": -4.958084583282471, + "loss": 0.0645, + "odds_ratio_loss": 0.0022096242755651474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004577184561640024, + "rewards/margins": 0.4912312626838684, + "rewards/rejected": -0.495808482170105, + "sft_loss": 0.04577184468507767, + "step": 3329 + }, + { + "epoch": 4.815618221258134, + "grad_norm": 2.1125135146669125, + "learning_rate": 7.763910682022606e-07, + "logits/chosen": -0.7076646089553833, + "logits/rejected": -0.5731692910194397, + "logps/chosen": -0.05527088791131973, + "logps/rejected": -6.649045944213867, + "loss": 0.0628, + "odds_ratio_loss": 0.005929501727223396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005527088418602943, + "rewards/margins": 0.6593775153160095, + "rewards/rejected": -0.6649045944213867, + "sft_loss": 0.05527088791131973, + "step": 3330 + }, + { + "epoch": 4.81706435285611, + "grad_norm": 1.5913484401509341, + "learning_rate": 7.74553211030017e-07, + "logits/chosen": -0.7423536777496338, + "logits/rejected": -0.6629868745803833, + "logps/chosen": -0.06886343657970428, + "logps/rejected": -3.8152801990509033, + "loss": 0.0913, + "odds_ratio_loss": 0.014335782267153263, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0068863434717059135, + "rewards/margins": 0.3746417164802551, + "rewards/rejected": -0.38152801990509033, + "sft_loss": 0.06886343657970428, + "step": 3331 + }, + { + "epoch": 4.818510484454086, + "grad_norm": 1.6210084217221785, + "learning_rate": 7.727172984916545e-07, + "logits/chosen": -0.8341798782348633, + "logits/rejected": -0.7047946453094482, + "logps/chosen": -0.1327381432056427, + "logps/rejected": -4.129507064819336, + "loss": 0.1181, + "odds_ratio_loss": 0.02535739727318287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013273816555738449, + "rewards/margins": 0.39967694878578186, + "rewards/rejected": -0.4129507541656494, + "sft_loss": 0.1327381432056427, + "step": 3332 + }, + { + "epoch": 4.81995661605206, + "grad_norm": 1.6399711939380055, + "learning_rate": 7.708833316940535e-07, + "logits/chosen": -0.8430829048156738, + "logits/rejected": -0.6608396768569946, + "logps/chosen": -0.05684221535921097, + "logps/rejected": -4.846701622009277, + "loss": 0.086, + "odds_ratio_loss": 0.0052174655720591545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005684222094714642, + "rewards/margins": 0.478985995054245, + "rewards/rejected": -0.4846702218055725, + "sft_loss": 0.05684221535921097, + "step": 3333 + }, + { + "epoch": 4.821402747650036, + "grad_norm": 1.5697641034160588, + "learning_rate": 7.690513117429169e-07, + "logits/chosen": -0.9114935398101807, + "logits/rejected": -0.7490615248680115, + "logps/chosen": -0.07970191538333893, + "logps/rejected": -3.7014589309692383, + "loss": 0.0887, + "odds_ratio_loss": 0.009289674460887909, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007970191538333893, + "rewards/margins": 0.36217570304870605, + "rewards/rejected": -0.37014591693878174, + "sft_loss": 0.07970191538333893, + "step": 3334 + }, + { + "epoch": 4.822848879248012, + "grad_norm": 1.5121408909888006, + "learning_rate": 7.672212397427795e-07, + "logits/chosen": -0.8596815466880798, + "logits/rejected": -0.5858790278434753, + "logps/chosen": -0.15773321688175201, + "logps/rejected": -4.535950660705566, + "loss": 0.1105, + "odds_ratio_loss": 0.011155569925904274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015773320570588112, + "rewards/margins": 0.43782174587249756, + "rewards/rejected": -0.4535950720310211, + "sft_loss": 0.15773321688175201, + "step": 3335 + }, + { + "epoch": 4.8242950108459866, + "grad_norm": 1.675437148395465, + "learning_rate": 7.653931167969965e-07, + "logits/chosen": -0.7641026973724365, + "logits/rejected": -0.5949983596801758, + "logps/chosen": -0.05824780464172363, + "logps/rejected": -5.220847129821777, + "loss": 0.0899, + "odds_ratio_loss": 0.00477550970390439, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005824780557304621, + "rewards/margins": 0.5162599086761475, + "rewards/rejected": -0.5220847129821777, + "sft_loss": 0.05824780464172363, + "step": 3336 + }, + { + "epoch": 4.825741142443962, + "grad_norm": 1.509678398919964, + "learning_rate": 7.635669440077502e-07, + "logits/chosen": -0.7890247106552124, + "logits/rejected": -0.4827120006084442, + "logps/chosen": -0.08683416247367859, + "logps/rejected": -5.226888179779053, + "loss": 0.0642, + "odds_ratio_loss": 0.0037988172844052315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008683416061103344, + "rewards/margins": 0.5140054225921631, + "rewards/rejected": -0.5226888656616211, + "sft_loss": 0.08683416247367859, + "step": 3337 + }, + { + "epoch": 4.827187274041938, + "grad_norm": 1.3819942621004409, + "learning_rate": 7.61742722476046e-07, + "logits/chosen": -0.8519781827926636, + "logits/rejected": -0.6214016675949097, + "logps/chosen": -0.0465102382004261, + "logps/rejected": -4.396292686462402, + "loss": 0.0694, + "odds_ratio_loss": 0.003573787398636341, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004651024006307125, + "rewards/margins": 0.43497827649116516, + "rewards/rejected": -0.43962928652763367, + "sft_loss": 0.0465102382004261, + "step": 3338 + }, + { + "epoch": 4.828633405639914, + "grad_norm": 1.6948199742458905, + "learning_rate": 7.599204533017163e-07, + "logits/chosen": -0.8539596796035767, + "logits/rejected": -0.6457651257514954, + "logps/chosen": -0.08881109207868576, + "logps/rejected": -4.225852012634277, + "loss": 0.0877, + "odds_ratio_loss": 0.015900075435638428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008881110697984695, + "rewards/margins": 0.41370415687561035, + "rewards/rejected": -0.42258524894714355, + "sft_loss": 0.08881109207868576, + "step": 3339 + }, + { + "epoch": 4.830079537237888, + "grad_norm": 1.597991480719571, + "learning_rate": 7.581001375834115e-07, + "logits/chosen": -0.7218050360679626, + "logits/rejected": -0.5693727731704712, + "logps/chosen": -0.04720648005604744, + "logps/rejected": -5.685175895690918, + "loss": 0.0757, + "odds_ratio_loss": 0.0023048685397952795, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004720647819340229, + "rewards/margins": 0.5637969374656677, + "rewards/rejected": -0.5685176253318787, + "sft_loss": 0.04720648005604744, + "step": 3340 + }, + { + "epoch": 4.831525668835864, + "grad_norm": 1.8196226662982362, + "learning_rate": 7.562817764186089e-07, + "logits/chosen": -0.9130833148956299, + "logits/rejected": -0.6541707515716553, + "logps/chosen": -0.03671087324619293, + "logps/rejected": -4.781536102294922, + "loss": 0.0823, + "odds_ratio_loss": 0.002584438305348158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0036710870917886496, + "rewards/margins": 0.4744824767112732, + "rewards/rejected": -0.47815364599227905, + "sft_loss": 0.03671087324619293, + "step": 3341 + }, + { + "epoch": 4.83297180043384, + "grad_norm": 1.3978966669157058, + "learning_rate": 7.544653709036031e-07, + "logits/chosen": -0.75586998462677, + "logits/rejected": -0.6667506694793701, + "logps/chosen": -0.02506261318922043, + "logps/rejected": -5.60970401763916, + "loss": 0.0442, + "odds_ratio_loss": 0.002245509997010231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0025062612257897854, + "rewards/margins": 0.5584641098976135, + "rewards/rejected": -0.5609703660011292, + "sft_loss": 0.02506261318922043, + "step": 3342 + }, + { + "epoch": 4.834417932031815, + "grad_norm": 1.6595972252761546, + "learning_rate": 7.52650922133514e-07, + "logits/chosen": -0.8522578477859497, + "logits/rejected": -0.6769514679908752, + "logps/chosen": -0.10625261068344116, + "logps/rejected": -5.128435134887695, + "loss": 0.0849, + "odds_ratio_loss": 0.00923290103673935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010625261813402176, + "rewards/margins": 0.5022182464599609, + "rewards/rejected": -0.5128434896469116, + "sft_loss": 0.10625261068344116, + "step": 3343 + }, + { + "epoch": 4.83586406362979, + "grad_norm": 4.07614397131639, + "learning_rate": 7.508384312022782e-07, + "logits/chosen": -0.6114479303359985, + "logits/rejected": -0.5096147656440735, + "logps/chosen": -0.20816218852996826, + "logps/rejected": -5.687783241271973, + "loss": 0.1081, + "odds_ratio_loss": 0.014260072261095047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020816218107938766, + "rewards/margins": 0.5479621291160583, + "rewards/rejected": -0.5687783360481262, + "sft_loss": 0.20816218852996826, + "step": 3344 + }, + { + "epoch": 4.837310195227766, + "grad_norm": 1.537217968234896, + "learning_rate": 7.490278992026527e-07, + "logits/chosen": -0.7360656261444092, + "logits/rejected": -0.5381304025650024, + "logps/chosen": -0.05491877347230911, + "logps/rejected": -6.194522380828857, + "loss": 0.0955, + "odds_ratio_loss": 0.006263894494622946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005491877440363169, + "rewards/margins": 0.613960325717926, + "rewards/rejected": -0.6194522380828857, + "sft_loss": 0.05491877347230911, + "step": 3345 + }, + { + "epoch": 4.838756326825742, + "grad_norm": 1.2221302872235238, + "learning_rate": 7.472193272262153e-07, + "logits/chosen": -0.7894086837768555, + "logits/rejected": -0.7410459518432617, + "logps/chosen": -0.08093753457069397, + "logps/rejected": -5.554490566253662, + "loss": 0.0606, + "odds_ratio_loss": 0.006017337553203106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008093753829598427, + "rewards/margins": 0.5473552942276001, + "rewards/rejected": -0.5554490685462952, + "sft_loss": 0.08093753457069397, + "step": 3346 + }, + { + "epoch": 4.840202458423716, + "grad_norm": 1.6006744327868236, + "learning_rate": 7.454127163633592e-07, + "logits/chosen": -0.7655268311500549, + "logits/rejected": -0.5101924538612366, + "logps/chosen": -0.07313164323568344, + "logps/rejected": -5.084259986877441, + "loss": 0.0724, + "odds_ratio_loss": 0.004324886482208967, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007313164416700602, + "rewards/margins": 0.5011128187179565, + "rewards/rejected": -0.5084260106086731, + "sft_loss": 0.07313164323568344, + "step": 3347 + }, + { + "epoch": 4.841648590021692, + "grad_norm": 1.791665583703619, + "learning_rate": 7.43608067703299e-07, + "logits/chosen": -0.7527843713760376, + "logits/rejected": -0.525779128074646, + "logps/chosen": -0.11555786430835724, + "logps/rejected": -5.336285591125488, + "loss": 0.1089, + "odds_ratio_loss": 0.012387518770992756, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011555787175893784, + "rewards/margins": 0.5220727920532227, + "rewards/rejected": -0.5336285829544067, + "sft_loss": 0.11555786430835724, + "step": 3348 + }, + { + "epoch": 4.843094721619668, + "grad_norm": 1.661398734647164, + "learning_rate": 7.418053823340619e-07, + "logits/chosen": -0.7001569271087646, + "logits/rejected": -0.6337046027183533, + "logps/chosen": -0.053295012563467026, + "logps/rejected": -3.712512493133545, + "loss": 0.0658, + "odds_ratio_loss": 0.00555766187608242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005329500883817673, + "rewards/margins": 0.3659217357635498, + "rewards/rejected": -0.3712512254714966, + "sft_loss": 0.053295012563467026, + "step": 3349 + }, + { + "epoch": 4.844540853217643, + "grad_norm": 1.7115225809925612, + "learning_rate": 7.400046613424953e-07, + "logits/chosen": -0.9585342407226562, + "logits/rejected": -0.7728517055511475, + "logps/chosen": -0.06088492274284363, + "logps/rejected": -4.558899879455566, + "loss": 0.0988, + "odds_ratio_loss": 0.009430285543203354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00608849199488759, + "rewards/margins": 0.449801504611969, + "rewards/rejected": -0.4558899998664856, + "sft_loss": 0.06088492274284363, + "step": 3350 + }, + { + "epoch": 4.845986984815618, + "grad_norm": 1.5834390739287287, + "learning_rate": 7.382059058142593e-07, + "logits/chosen": -0.6291943192481995, + "logits/rejected": -0.5301669239997864, + "logps/chosen": -0.10131149739027023, + "logps/rejected": -4.346987724304199, + "loss": 0.1024, + "odds_ratio_loss": 0.02162271924316883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010131150484085083, + "rewards/margins": 0.42456763982772827, + "rewards/rejected": -0.43469879031181335, + "sft_loss": 0.10131149739027023, + "step": 3351 + }, + { + "epoch": 4.847433116413594, + "grad_norm": 1.503712398335488, + "learning_rate": 7.364091168338316e-07, + "logits/chosen": -0.8372702598571777, + "logits/rejected": -0.4649083614349365, + "logps/chosen": -0.08138298988342285, + "logps/rejected": -5.166512489318848, + "loss": 0.0685, + "odds_ratio_loss": 0.00755068426951766, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008138298988342285, + "rewards/margins": 0.5085129737854004, + "rewards/rejected": -0.5166512727737427, + "sft_loss": 0.08138298988342285, + "step": 3352 + }, + { + "epoch": 4.848879248011569, + "grad_norm": 1.5724541370779448, + "learning_rate": 7.346142954845023e-07, + "logits/chosen": -0.8317492008209229, + "logits/rejected": -0.6612436771392822, + "logps/chosen": -0.08570529520511627, + "logps/rejected": -5.606926918029785, + "loss": 0.0873, + "odds_ratio_loss": 0.004792380146682262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008570530451834202, + "rewards/margins": 0.552122175693512, + "rewards/rejected": -0.5606927275657654, + "sft_loss": 0.08570529520511627, + "step": 3353 + }, + { + "epoch": 4.8503253796095445, + "grad_norm": 1.5864421670245932, + "learning_rate": 7.328214428483761e-07, + "logits/chosen": -0.8036106824874878, + "logits/rejected": -0.6917225122451782, + "logps/chosen": -0.03204449266195297, + "logps/rejected": -5.145187854766846, + "loss": 0.0606, + "odds_ratio_loss": 0.004465484991669655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003204449312761426, + "rewards/margins": 0.5113143920898438, + "rewards/rejected": -0.5145187973976135, + "sft_loss": 0.03204449266195297, + "step": 3354 + }, + { + "epoch": 4.85177151120752, + "grad_norm": 1.8370978992534, + "learning_rate": 7.310305600063689e-07, + "logits/chosen": -0.7530336380004883, + "logits/rejected": -0.5965844988822937, + "logps/chosen": -0.08221124112606049, + "logps/rejected": -5.269443988800049, + "loss": 0.0844, + "odds_ratio_loss": 0.006964895874261856, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008221123367547989, + "rewards/margins": 0.5187232494354248, + "rewards/rejected": -0.5269443988800049, + "sft_loss": 0.08221124112606049, + "step": 3355 + }, + { + "epoch": 4.853217642805495, + "grad_norm": 1.599729561734898, + "learning_rate": 7.292416480382124e-07, + "logits/chosen": -0.8736181259155273, + "logits/rejected": -0.61781907081604, + "logps/chosen": -0.118461973965168, + "logps/rejected": -4.439339637756348, + "loss": 0.0943, + "odds_ratio_loss": 0.006806999910622835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01184619776904583, + "rewards/margins": 0.432087779045105, + "rewards/rejected": -0.44393399357795715, + "sft_loss": 0.118461973965168, + "step": 3356 + }, + { + "epoch": 4.854663774403471, + "grad_norm": 1.41699904404708, + "learning_rate": 7.274547080224484e-07, + "logits/chosen": -0.9623618125915527, + "logits/rejected": -0.6903282403945923, + "logps/chosen": -0.09254752844572067, + "logps/rejected": -5.285045623779297, + "loss": 0.0765, + "odds_ratio_loss": 0.005762843415141106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009254752658307552, + "rewards/margins": 0.5192498564720154, + "rewards/rejected": -0.5285046100616455, + "sft_loss": 0.09254752844572067, + "step": 3357 + }, + { + "epoch": 4.856109906001446, + "grad_norm": 1.5705078152447065, + "learning_rate": 7.256697410364285e-07, + "logits/chosen": -1.076545000076294, + "logits/rejected": -0.740825891494751, + "logps/chosen": -0.09734838455915451, + "logps/rejected": -4.460480690002441, + "loss": 0.1055, + "odds_ratio_loss": 0.004817990120500326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009734838269650936, + "rewards/margins": 0.4363132119178772, + "rewards/rejected": -0.4460480809211731, + "sft_loss": 0.09734838455915451, + "step": 3358 + }, + { + "epoch": 4.857556037599421, + "grad_norm": 1.4897765346710856, + "learning_rate": 7.23886748156318e-07, + "logits/chosen": -0.7283959984779358, + "logits/rejected": -0.5863144397735596, + "logps/chosen": -0.09330400079488754, + "logps/rejected": -6.437313079833984, + "loss": 0.0829, + "odds_ratio_loss": 0.010229337960481644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00933040026575327, + "rewards/margins": 0.6344009637832642, + "rewards/rejected": -0.6437313556671143, + "sft_loss": 0.09330400079488754, + "step": 3359 + }, + { + "epoch": 4.859002169197397, + "grad_norm": 2.2661257282257825, + "learning_rate": 7.221057304570881e-07, + "logits/chosen": -0.5533820986747742, + "logits/rejected": -0.5214040279388428, + "logps/chosen": -0.12027622759342194, + "logps/rejected": -5.463490009307861, + "loss": 0.0807, + "odds_ratio_loss": 0.05367731302976608, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.012027623131871223, + "rewards/margins": 0.5343213677406311, + "rewards/rejected": -0.5463489890098572, + "sft_loss": 0.12027622759342194, + "step": 3360 + }, + { + "epoch": 4.8604483007953725, + "grad_norm": 1.5140500816134663, + "learning_rate": 7.203266890125217e-07, + "logits/chosen": -0.5703849792480469, + "logits/rejected": -0.5185967683792114, + "logps/chosen": -0.040762219578027725, + "logps/rejected": -6.055505275726318, + "loss": 0.0789, + "odds_ratio_loss": 0.004309056792408228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0040762219578027725, + "rewards/margins": 0.6014742851257324, + "rewards/rejected": -0.6055505275726318, + "sft_loss": 0.040762219578027725, + "step": 3361 + }, + { + "epoch": 4.861894432393348, + "grad_norm": 1.8219579547368454, + "learning_rate": 7.185496248952078e-07, + "logits/chosen": -0.507846474647522, + "logits/rejected": -0.41886287927627563, + "logps/chosen": -0.04386391118168831, + "logps/rejected": -5.788032054901123, + "loss": 0.0736, + "odds_ratio_loss": 0.004342243075370789, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004386391025036573, + "rewards/margins": 0.5744168758392334, + "rewards/rejected": -0.578803300857544, + "sft_loss": 0.04386391118168831, + "step": 3362 + }, + { + "epoch": 4.863340563991323, + "grad_norm": 1.6904218040386811, + "learning_rate": 7.167745391765483e-07, + "logits/chosen": -0.8306548595428467, + "logits/rejected": -0.6804887056350708, + "logps/chosen": -0.10468629747629166, + "logps/rejected": -4.65870475769043, + "loss": 0.0904, + "odds_ratio_loss": 0.008187741041183472, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010468630120158195, + "rewards/margins": 0.4554018974304199, + "rewards/rejected": -0.4658704996109009, + "sft_loss": 0.10468629747629166, + "step": 3363 + }, + { + "epoch": 4.864786695589299, + "grad_norm": 1.406382721212895, + "learning_rate": 7.150014329267456e-07, + "logits/chosen": -0.7490118741989136, + "logits/rejected": -0.5290810465812683, + "logps/chosen": -0.04611920937895775, + "logps/rejected": -5.029910564422607, + "loss": 0.0674, + "odds_ratio_loss": 0.004671413917094469, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004611921031028032, + "rewards/margins": 0.4983791708946228, + "rewards/rejected": -0.5029910802841187, + "sft_loss": 0.04611920937895775, + "step": 3364 + }, + { + "epoch": 4.866232827187274, + "grad_norm": 1.4718586864445116, + "learning_rate": 7.132303072148147e-07, + "logits/chosen": -0.6270997524261475, + "logits/rejected": -0.5362582802772522, + "logps/chosen": -0.14092980325222015, + "logps/rejected": -5.525834560394287, + "loss": 0.1005, + "odds_ratio_loss": 0.013515939004719257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014092981815338135, + "rewards/margins": 0.5384904742240906, + "rewards/rejected": -0.5525834560394287, + "sft_loss": 0.14092980325222015, + "step": 3365 + }, + { + "epoch": 4.867678958785249, + "grad_norm": 1.3171658711366296, + "learning_rate": 7.114611631085719e-07, + "logits/chosen": -0.8933150172233582, + "logits/rejected": -0.7245310544967651, + "logps/chosen": -0.06106524541974068, + "logps/rejected": -4.623565673828125, + "loss": 0.0743, + "odds_ratio_loss": 0.007010796573013067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00610652519389987, + "rewards/margins": 0.45624998211860657, + "rewards/rejected": -0.4623565375804901, + "sft_loss": 0.06106524541974068, + "step": 3366 + }, + { + "epoch": 4.869125090383225, + "grad_norm": 2.0077338029846494, + "learning_rate": 7.096940016746429e-07, + "logits/chosen": -0.762084424495697, + "logits/rejected": -0.6642420291900635, + "logps/chosen": -0.11195956915616989, + "logps/rejected": -3.8559727668762207, + "loss": 0.0896, + "odds_ratio_loss": 0.008951609954237938, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011195957660675049, + "rewards/margins": 0.3744013011455536, + "rewards/rejected": -0.385597288608551, + "sft_loss": 0.11195956915616989, + "step": 3367 + }, + { + "epoch": 4.8705712219812005, + "grad_norm": 1.5746236198419516, + "learning_rate": 7.079288239784542e-07, + "logits/chosen": -0.6728679537773132, + "logits/rejected": -0.5606586337089539, + "logps/chosen": -0.1795307844877243, + "logps/rejected": -5.230622291564941, + "loss": 0.1015, + "odds_ratio_loss": 0.023649588227272034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01795307919383049, + "rewards/margins": 0.5051091313362122, + "rewards/rejected": -0.5230622291564941, + "sft_loss": 0.1795307844877243, + "step": 3368 + }, + { + "epoch": 4.872017353579176, + "grad_norm": 1.4742014680311728, + "learning_rate": 7.061656310842381e-07, + "logits/chosen": -0.68035888671875, + "logits/rejected": -0.6797270774841309, + "logps/chosen": -0.08322112262248993, + "logps/rejected": -3.5652883052825928, + "loss": 0.0882, + "odds_ratio_loss": 0.012553151696920395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008322112262248993, + "rewards/margins": 0.3482067584991455, + "rewards/rejected": -0.3565288782119751, + "sft_loss": 0.08322112262248993, + "step": 3369 + }, + { + "epoch": 4.873463485177151, + "grad_norm": 1.4925291304976303, + "learning_rate": 7.044044240550313e-07, + "logits/chosen": -0.8202152252197266, + "logits/rejected": -0.563833475112915, + "logps/chosen": -0.06742648780345917, + "logps/rejected": -4.185554027557373, + "loss": 0.0591, + "odds_ratio_loss": 0.005834805779159069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006742648780345917, + "rewards/margins": 0.4118127226829529, + "rewards/rejected": -0.4185553789138794, + "sft_loss": 0.06742648780345917, + "step": 3370 + }, + { + "epoch": 4.874909616775127, + "grad_norm": 2.5154884893336478, + "learning_rate": 7.026452039526703e-07, + "logits/chosen": -0.8807682991027832, + "logits/rejected": -0.5594336986541748, + "logps/chosen": -0.05363871157169342, + "logps/rejected": -5.723832130432129, + "loss": 0.1029, + "odds_ratio_loss": 0.005470744799822569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005363871343433857, + "rewards/margins": 0.5670193433761597, + "rewards/rejected": -0.5723832249641418, + "sft_loss": 0.05363871157169342, + "step": 3371 + }, + { + "epoch": 4.876355748373102, + "grad_norm": 1.6423306994229212, + "learning_rate": 7.008879718377976e-07, + "logits/chosen": -0.6424853801727295, + "logits/rejected": -0.5146550536155701, + "logps/chosen": -0.07374648004770279, + "logps/rejected": -6.507882118225098, + "loss": 0.0858, + "odds_ratio_loss": 0.005046168342232704, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007374648004770279, + "rewards/margins": 0.6434135437011719, + "rewards/rejected": -0.6507881879806519, + "sft_loss": 0.07374648004770279, + "step": 3372 + }, + { + "epoch": 4.877801879971077, + "grad_norm": 1.6355065912046813, + "learning_rate": 6.991327287698525e-07, + "logits/chosen": -0.7128841876983643, + "logits/rejected": -0.6331945061683655, + "logps/chosen": -0.08015397936105728, + "logps/rejected": -3.10212779045105, + "loss": 0.0926, + "odds_ratio_loss": 0.008713113144040108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008015397936105728, + "rewards/margins": 0.3021973967552185, + "rewards/rejected": -0.31021279096603394, + "sft_loss": 0.08015397936105728, + "step": 3373 + }, + { + "epoch": 4.879248011569053, + "grad_norm": 1.7960911189141748, + "learning_rate": 6.973794758070806e-07, + "logits/chosen": -0.7306787967681885, + "logits/rejected": -0.6239840984344482, + "logps/chosen": -0.13009199500083923, + "logps/rejected": -3.731572389602661, + "loss": 0.0737, + "odds_ratio_loss": 0.007687950972467661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013009199872612953, + "rewards/margins": 0.3601480722427368, + "rewards/rejected": -0.373157262802124, + "sft_loss": 0.13009199500083923, + "step": 3374 + }, + { + "epoch": 4.8806941431670285, + "grad_norm": 1.6128639569714278, + "learning_rate": 6.956282140065224e-07, + "logits/chosen": -0.5749595165252686, + "logits/rejected": -0.48999878764152527, + "logps/chosen": -0.08304701745510101, + "logps/rejected": -6.037389755249023, + "loss": 0.0804, + "odds_ratio_loss": 0.0033578509464859962, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008304702118039131, + "rewards/margins": 0.595434308052063, + "rewards/rejected": -0.6037390232086182, + "sft_loss": 0.08304701745510101, + "step": 3375 + }, + { + "epoch": 4.882140274765003, + "grad_norm": 1.61063155767896, + "learning_rate": 6.93878944424021e-07, + "logits/chosen": -0.6542560458183289, + "logits/rejected": -0.710060715675354, + "logps/chosen": -0.10780586302280426, + "logps/rejected": -5.035168647766113, + "loss": 0.0817, + "odds_ratio_loss": 0.008214112371206284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01078058686107397, + "rewards/margins": 0.4927363097667694, + "rewards/rejected": -0.5035169124603271, + "sft_loss": 0.10780586302280426, + "step": 3376 + }, + { + "epoch": 4.883586406362979, + "grad_norm": 1.931393542363766, + "learning_rate": 6.921316681142167e-07, + "logits/chosen": -0.8681274056434631, + "logits/rejected": -0.6243171691894531, + "logps/chosen": -0.1537507176399231, + "logps/rejected": -5.715590000152588, + "loss": 0.114, + "odds_ratio_loss": 0.014606855809688568, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01537507213652134, + "rewards/margins": 0.5561839938163757, + "rewards/rejected": -0.5715590119361877, + "sft_loss": 0.1537507176399231, + "step": 3377 + }, + { + "epoch": 4.885032537960955, + "grad_norm": 1.6623591535238764, + "learning_rate": 6.903863861305498e-07, + "logits/chosen": -0.7357972264289856, + "logits/rejected": -0.553459882736206, + "logps/chosen": -0.06771313399076462, + "logps/rejected": -3.4469971656799316, + "loss": 0.1139, + "odds_ratio_loss": 0.01092704851180315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006771313492208719, + "rewards/margins": 0.3379283845424652, + "rewards/rejected": -0.3446996808052063, + "sft_loss": 0.06771313399076462, + "step": 3378 + }, + { + "epoch": 4.8864786695589295, + "grad_norm": 1.744530404431733, + "learning_rate": 6.886430995252564e-07, + "logits/chosen": -0.8139889240264893, + "logits/rejected": -0.6457674503326416, + "logps/chosen": -0.15386618673801422, + "logps/rejected": -4.273682117462158, + "loss": 0.0993, + "odds_ratio_loss": 0.01531638391315937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015386618673801422, + "rewards/margins": 0.4119815528392792, + "rewards/rejected": -0.4273681938648224, + "sft_loss": 0.15386618673801422, + "step": 3379 + }, + { + "epoch": 4.887924801156905, + "grad_norm": 1.3904580673138525, + "learning_rate": 6.869018093493721e-07, + "logits/chosen": -0.8261375427246094, + "logits/rejected": -0.7308131456375122, + "logps/chosen": -0.17565055191516876, + "logps/rejected": -3.2135813236236572, + "loss": 0.0742, + "odds_ratio_loss": 0.025811482220888138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017565056681632996, + "rewards/margins": 0.3037930727005005, + "rewards/rejected": -0.3213581442832947, + "sft_loss": 0.17565055191516876, + "step": 3380 + }, + { + "epoch": 4.889370932754881, + "grad_norm": 1.594133966204827, + "learning_rate": 6.851625166527255e-07, + "logits/chosen": -0.8713509440422058, + "logits/rejected": -0.6780038475990295, + "logps/chosen": -0.0992034375667572, + "logps/rejected": -4.472326755523682, + "loss": 0.0729, + "odds_ratio_loss": 0.015876563265919685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00992034375667572, + "rewards/margins": 0.4373123347759247, + "rewards/rejected": -0.4472326636314392, + "sft_loss": 0.0992034375667572, + "step": 3381 + }, + { + "epoch": 4.890817064352856, + "grad_norm": 1.3939413364635795, + "learning_rate": 6.834252224839438e-07, + "logits/chosen": -0.516639232635498, + "logits/rejected": -0.49215567111968994, + "logps/chosen": -0.06323473155498505, + "logps/rejected": -2.7271199226379395, + "loss": 0.0867, + "odds_ratio_loss": 0.01052488386631012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006323473993688822, + "rewards/margins": 0.26638853549957275, + "rewards/rejected": -0.27271202206611633, + "sft_loss": 0.06323473155498505, + "step": 3382 + }, + { + "epoch": 4.892263195950831, + "grad_norm": 2.4650325801464246, + "learning_rate": 6.81689927890448e-07, + "logits/chosen": -0.6514842510223389, + "logits/rejected": -0.4817507266998291, + "logps/chosen": -0.061137307435274124, + "logps/rejected": -6.235174179077148, + "loss": 0.0912, + "odds_ratio_loss": 0.0024059026036411524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00611373083665967, + "rewards/margins": 0.6174037456512451, + "rewards/rejected": -0.6235173940658569, + "sft_loss": 0.061137307435274124, + "step": 3383 + }, + { + "epoch": 4.893709327548807, + "grad_norm": 2.0299733895183385, + "learning_rate": 6.799566339184526e-07, + "logits/chosen": -0.8810280561447144, + "logits/rejected": -0.4609205722808838, + "logps/chosen": -0.0726509764790535, + "logps/rejected": -5.877103805541992, + "loss": 0.0678, + "odds_ratio_loss": 0.004596366081386805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007265097927302122, + "rewards/margins": 0.5804452896118164, + "rewards/rejected": -0.5877103805541992, + "sft_loss": 0.0726509764790535, + "step": 3384 + }, + { + "epoch": 4.895155459146783, + "grad_norm": 1.53183380817701, + "learning_rate": 6.782253416129684e-07, + "logits/chosen": -0.7259580492973328, + "logits/rejected": -0.6520754098892212, + "logps/chosen": -0.022825662046670914, + "logps/rejected": -4.62690544128418, + "loss": 0.0748, + "odds_ratio_loss": 0.003376226406544447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022825661581009626, + "rewards/margins": 0.46040797233581543, + "rewards/rejected": -0.462690532207489, + "sft_loss": 0.022825662046670914, + "step": 3385 + }, + { + "epoch": 4.8966015907447575, + "grad_norm": 1.3488999656463687, + "learning_rate": 6.764960520177965e-07, + "logits/chosen": -0.817007303237915, + "logits/rejected": -0.6087720990180969, + "logps/chosen": -0.05678170174360275, + "logps/rejected": -6.089994430541992, + "loss": 0.0592, + "odds_ratio_loss": 0.0028764172457158566, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00567817036062479, + "rewards/margins": 0.6033213138580322, + "rewards/rejected": -0.6089994311332703, + "sft_loss": 0.05678170174360275, + "step": 3386 + }, + { + "epoch": 4.898047722342733, + "grad_norm": 1.4230099428004324, + "learning_rate": 6.747687661755339e-07, + "logits/chosen": -0.747872531414032, + "logits/rejected": -0.6560572385787964, + "logps/chosen": -0.06658175587654114, + "logps/rejected": -5.573917865753174, + "loss": 0.0793, + "odds_ratio_loss": 0.01551305316388607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006658175960183144, + "rewards/margins": 0.5507336258888245, + "rewards/rejected": -0.5573917627334595, + "sft_loss": 0.06658175587654114, + "step": 3387 + }, + { + "epoch": 4.899493853940709, + "grad_norm": 1.7422732363316469, + "learning_rate": 6.73043485127566e-07, + "logits/chosen": -0.7491236329078674, + "logits/rejected": -0.6787482500076294, + "logps/chosen": -0.07064135372638702, + "logps/rejected": -5.146181106567383, + "loss": 0.0753, + "odds_ratio_loss": 0.00906374678015709, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007064135279506445, + "rewards/margins": 0.5075539946556091, + "rewards/rejected": -0.5146180987358093, + "sft_loss": 0.07064135372638702, + "step": 3388 + }, + { + "epoch": 4.900939985538684, + "grad_norm": 1.9246122301587167, + "learning_rate": 6.713202099140725e-07, + "logits/chosen": -1.0471328496932983, + "logits/rejected": -0.6187826991081238, + "logps/chosen": -0.1254141926765442, + "logps/rejected": -5.144631862640381, + "loss": 0.1055, + "odds_ratio_loss": 0.006312836892902851, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012541419826447964, + "rewards/margins": 0.5019217729568481, + "rewards/rejected": -0.5144631862640381, + "sft_loss": 0.1254141926765442, + "step": 3389 + }, + { + "epoch": 4.902386117136659, + "grad_norm": 2.6512806023231543, + "learning_rate": 6.695989415740215e-07, + "logits/chosen": -0.7667117118835449, + "logits/rejected": -0.5550485253334045, + "logps/chosen": -0.056425757706165314, + "logps/rejected": -4.555874347686768, + "loss": 0.0858, + "odds_ratio_loss": 0.0027704143431037664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005642575677484274, + "rewards/margins": 0.4499448835849762, + "rewards/rejected": -0.4555874764919281, + "sft_loss": 0.056425757706165314, + "step": 3390 + }, + { + "epoch": 4.903832248734635, + "grad_norm": 1.5656569412623513, + "learning_rate": 6.678796811451727e-07, + "logits/chosen": -0.761025071144104, + "logits/rejected": -0.5918843746185303, + "logps/chosen": -0.07612086087465286, + "logps/rejected": -5.696231365203857, + "loss": 0.0731, + "odds_ratio_loss": 0.003577734809368849, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007612085901200771, + "rewards/margins": 0.5620110630989075, + "rewards/rejected": -0.5696231722831726, + "sft_loss": 0.07612086087465286, + "step": 3391 + }, + { + "epoch": 4.905278380332611, + "grad_norm": 1.610031355891547, + "learning_rate": 6.661624296640731e-07, + "logits/chosen": -0.6648229956626892, + "logits/rejected": -0.6174643039703369, + "logps/chosen": -0.04702949523925781, + "logps/rejected": -5.392739772796631, + "loss": 0.0891, + "odds_ratio_loss": 0.0024822133127599955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004702950362116098, + "rewards/margins": 0.5345709919929504, + "rewards/rejected": -0.5392739176750183, + "sft_loss": 0.04702949523925781, + "step": 3392 + }, + { + "epoch": 4.906724511930586, + "grad_norm": 1.3906640545233522, + "learning_rate": 6.644471881660623e-07, + "logits/chosen": -0.7568591833114624, + "logits/rejected": -0.4995534420013428, + "logps/chosen": -0.0706809014081955, + "logps/rejected": -5.783414840698242, + "loss": 0.0842, + "odds_ratio_loss": 0.0026557797100394964, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007068090606480837, + "rewards/margins": 0.5712733864784241, + "rewards/rejected": -0.578341543674469, + "sft_loss": 0.0706809014081955, + "step": 3393 + }, + { + "epoch": 4.908170643528561, + "grad_norm": 1.5512790240610077, + "learning_rate": 6.627339576852637e-07, + "logits/chosen": -0.8462902307510376, + "logits/rejected": -0.7097457647323608, + "logps/chosen": -0.06939011067152023, + "logps/rejected": -2.7737913131713867, + "loss": 0.0757, + "odds_ratio_loss": 0.007057640701532364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006939011625945568, + "rewards/margins": 0.27044010162353516, + "rewards/rejected": -0.2773791253566742, + "sft_loss": 0.06939011067152023, + "step": 3394 + }, + { + "epoch": 4.909616775126537, + "grad_norm": 1.65898596137372, + "learning_rate": 6.610227392545922e-07, + "logits/chosen": -0.6548706293106079, + "logits/rejected": -0.4947068393230438, + "logps/chosen": -0.09346526116132736, + "logps/rejected": -5.793054580688477, + "loss": 0.0901, + "odds_ratio_loss": 0.013216537423431873, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00934652704745531, + "rewards/margins": 0.5699589252471924, + "rewards/rejected": -0.5793054699897766, + "sft_loss": 0.09346526116132736, + "step": 3395 + }, + { + "epoch": 4.911062906724512, + "grad_norm": 1.5699203400624258, + "learning_rate": 6.593135339057463e-07, + "logits/chosen": -0.8828110694885254, + "logits/rejected": -0.6490928530693054, + "logps/chosen": -0.03655397891998291, + "logps/rejected": -4.743061065673828, + "loss": 0.0744, + "odds_ratio_loss": 0.004315060563385487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003655398031696677, + "rewards/margins": 0.47065073251724243, + "rewards/rejected": -0.4743060767650604, + "sft_loss": 0.03655397891998291, + "step": 3396 + }, + { + "epoch": 4.912509038322487, + "grad_norm": 1.5275318305815881, + "learning_rate": 6.576063426692125e-07, + "logits/chosen": -0.6870374083518982, + "logits/rejected": -0.49240612983703613, + "logps/chosen": -0.09693014621734619, + "logps/rejected": -7.673630714416504, + "loss": 0.0751, + "odds_ratio_loss": 0.010385874658823013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009693015366792679, + "rewards/margins": 0.7576700448989868, + "rewards/rejected": -0.7673630714416504, + "sft_loss": 0.09693014621734619, + "step": 3397 + }, + { + "epoch": 4.913955169920463, + "grad_norm": 1.3408507320992828, + "learning_rate": 6.559011665742642e-07, + "logits/chosen": -0.7937183380126953, + "logits/rejected": -0.5760884284973145, + "logps/chosen": -0.12189510464668274, + "logps/rejected": -5.872251033782959, + "loss": 0.0693, + "odds_ratio_loss": 0.01099199429154396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012189511209726334, + "rewards/margins": 0.5750356316566467, + "rewards/rejected": -0.587225079536438, + "sft_loss": 0.12189510464668274, + "step": 3398 + }, + { + "epoch": 4.915401301518438, + "grad_norm": 1.6638328443107613, + "learning_rate": 6.541980066489569e-07, + "logits/chosen": -0.7404236793518066, + "logits/rejected": -0.638684093952179, + "logps/chosen": -0.15662673115730286, + "logps/rejected": -4.5496721267700195, + "loss": 0.1009, + "odds_ratio_loss": 0.010028908029198647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015662673860788345, + "rewards/margins": 0.43930453062057495, + "rewards/rejected": -0.454967200756073, + "sft_loss": 0.15662673115730286, + "step": 3399 + }, + { + "epoch": 4.916847433116414, + "grad_norm": 1.7502597021129793, + "learning_rate": 6.524968639201329e-07, + "logits/chosen": -0.920937180519104, + "logits/rejected": -0.6321260333061218, + "logps/chosen": -0.08427523076534271, + "logps/rejected": -6.011247158050537, + "loss": 0.091, + "odds_ratio_loss": 0.004902101121842861, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008427524007856846, + "rewards/margins": 0.5926971435546875, + "rewards/rejected": -0.60112464427948, + "sft_loss": 0.08427523076534271, + "step": 3400 + }, + { + "epoch": 4.918293564714389, + "grad_norm": 1.4833107024960808, + "learning_rate": 6.507977394134161e-07, + "logits/chosen": -0.5898668169975281, + "logits/rejected": -0.5003875494003296, + "logps/chosen": -0.13638347387313843, + "logps/rejected": -3.1561052799224854, + "loss": 0.1005, + "odds_ratio_loss": 0.013293557800352573, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013638347387313843, + "rewards/margins": 0.3019721806049347, + "rewards/rejected": -0.3156105577945709, + "sft_loss": 0.13638347387313843, + "step": 3401 + }, + { + "epoch": 4.919739696312364, + "grad_norm": 1.5630456229191343, + "learning_rate": 6.491006341532169e-07, + "logits/chosen": -0.7746043801307678, + "logits/rejected": -0.586871325969696, + "logps/chosen": -0.09608809649944305, + "logps/rejected": -4.432920455932617, + "loss": 0.0824, + "odds_ratio_loss": 0.0049471426755189896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009608810767531395, + "rewards/margins": 0.4336831867694855, + "rewards/rejected": -0.4432920217514038, + "sft_loss": 0.09608809649944305, + "step": 3402 + }, + { + "epoch": 4.92118582791034, + "grad_norm": 1.7588181040876307, + "learning_rate": 6.474055491627246e-07, + "logits/chosen": -0.7171893119812012, + "logits/rejected": -0.6028531789779663, + "logps/chosen": -0.13583296537399292, + "logps/rejected": -4.825489521026611, + "loss": 0.0862, + "odds_ratio_loss": 0.004513260908424854, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013583295047283173, + "rewards/margins": 0.46896564960479736, + "rewards/rejected": -0.48254895210266113, + "sft_loss": 0.13583296537399292, + "step": 3403 + }, + { + "epoch": 4.9226319595083154, + "grad_norm": 1.6924015889641442, + "learning_rate": 6.45712485463914e-07, + "logits/chosen": -0.89324951171875, + "logits/rejected": -0.666816771030426, + "logps/chosen": -0.1056736633181572, + "logps/rejected": -4.227684020996094, + "loss": 0.0801, + "odds_ratio_loss": 0.016112789511680603, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010567366145551205, + "rewards/margins": 0.41220101714134216, + "rewards/rejected": -0.42276835441589355, + "sft_loss": 0.1056736633181572, + "step": 3404 + }, + { + "epoch": 4.92407809110629, + "grad_norm": 1.5042913816203431, + "learning_rate": 6.440214440775374e-07, + "logits/chosen": -0.8448358774185181, + "logits/rejected": -0.6646611094474792, + "logps/chosen": -0.05981362611055374, + "logps/rejected": -4.67680549621582, + "loss": 0.0644, + "odds_ratio_loss": 0.006723261438310146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0059813628904521465, + "rewards/margins": 0.4616991877555847, + "rewards/rejected": -0.46768054366111755, + "sft_loss": 0.05981362611055374, + "step": 3405 + }, + { + "epoch": 4.925524222704266, + "grad_norm": 1.661160117863071, + "learning_rate": 6.423324260231324e-07, + "logits/chosen": -0.5030421614646912, + "logits/rejected": -0.4442903399467468, + "logps/chosen": -0.19611015915870667, + "logps/rejected": -4.853398323059082, + "loss": 0.1093, + "odds_ratio_loss": 0.03798852115869522, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019611012190580368, + "rewards/margins": 0.4657288193702698, + "rewards/rejected": -0.48533985018730164, + "sft_loss": 0.19611015915870667, + "step": 3406 + }, + { + "epoch": 4.926970354302242, + "grad_norm": 1.5571396653798601, + "learning_rate": 6.406454323190127e-07, + "logits/chosen": -0.722494900226593, + "logits/rejected": -0.6708986163139343, + "logps/chosen": -0.10587897151708603, + "logps/rejected": -5.6761274337768555, + "loss": 0.0664, + "odds_ratio_loss": 0.014984402805566788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010587898083031178, + "rewards/margins": 0.5570248365402222, + "rewards/rejected": -0.5676127672195435, + "sft_loss": 0.10587897151708603, + "step": 3407 + }, + { + "epoch": 4.928416485900217, + "grad_norm": 1.6336998087518904, + "learning_rate": 6.389604639822739e-07, + "logits/chosen": -0.8598955869674683, + "logits/rejected": -0.600988507270813, + "logps/chosen": -0.0970025435090065, + "logps/rejected": -3.2531440258026123, + "loss": 0.1, + "odds_ratio_loss": 0.0073122261092066765, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009700254537165165, + "rewards/margins": 0.31561416387557983, + "rewards/rejected": -0.32531440258026123, + "sft_loss": 0.0970025435090065, + "step": 3408 + }, + { + "epoch": 4.929862617498192, + "grad_norm": 1.8046682859163223, + "learning_rate": 6.372775220287878e-07, + "logits/chosen": -0.7766846418380737, + "logits/rejected": -0.5947623252868652, + "logps/chosen": -0.128305122256279, + "logps/rejected": -4.95772647857666, + "loss": 0.1133, + "odds_ratio_loss": 0.026372672989964485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012830512598156929, + "rewards/margins": 0.4829421639442444, + "rewards/rejected": -0.4957726001739502, + "sft_loss": 0.128305122256279, + "step": 3409 + }, + { + "epoch": 4.931308749096168, + "grad_norm": 1.8430904450067187, + "learning_rate": 6.355966074732082e-07, + "logits/chosen": -0.8618395924568176, + "logits/rejected": -0.628158688545227, + "logps/chosen": -0.04948505386710167, + "logps/rejected": -4.483854293823242, + "loss": 0.1178, + "odds_ratio_loss": 0.004379372112452984, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004948505666106939, + "rewards/margins": 0.4434368908405304, + "rewards/rejected": -0.44838541746139526, + "sft_loss": 0.04948505386710167, + "step": 3410 + }, + { + "epoch": 4.9327548806941435, + "grad_norm": 1.5954896703058488, + "learning_rate": 6.339177213289652e-07, + "logits/chosen": -0.5719497799873352, + "logits/rejected": -0.44910386204719543, + "logps/chosen": -0.052914202213287354, + "logps/rejected": -6.891765594482422, + "loss": 0.0525, + "odds_ratio_loss": 0.0075008077546954155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005291420966386795, + "rewards/margins": 0.6838852167129517, + "rewards/rejected": -0.689176619052887, + "sft_loss": 0.052914202213287354, + "step": 3411 + }, + { + "epoch": 4.934201012292118, + "grad_norm": 1.4200289969330797, + "learning_rate": 6.322408646082635e-07, + "logits/chosen": -0.8700981140136719, + "logits/rejected": -0.599973201751709, + "logps/chosen": -0.06421230733394623, + "logps/rejected": -5.4810638427734375, + "loss": 0.0889, + "odds_ratio_loss": 0.0070202648639678955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006421231664717197, + "rewards/margins": 0.541685163974762, + "rewards/rejected": -0.5481064319610596, + "sft_loss": 0.06421230733394623, + "step": 3412 + }, + { + "epoch": 4.935647143890094, + "grad_norm": 1.5988810203628434, + "learning_rate": 6.305660383220885e-07, + "logits/chosen": -0.8789199590682983, + "logits/rejected": -0.7220363616943359, + "logps/chosen": -0.09391533583402634, + "logps/rejected": -4.320672988891602, + "loss": 0.0727, + "odds_ratio_loss": 0.0037208469584584236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009391534142196178, + "rewards/margins": 0.4226757884025574, + "rewards/rejected": -0.43206727504730225, + "sft_loss": 0.09391533583402634, + "step": 3413 + }, + { + "epoch": 4.93709327548807, + "grad_norm": 2.964711941744502, + "learning_rate": 6.288932434801979e-07, + "logits/chosen": -0.8900729417800903, + "logits/rejected": -0.6892324090003967, + "logps/chosen": -0.0752728208899498, + "logps/rejected": -4.755692481994629, + "loss": 0.0925, + "odds_ratio_loss": 0.008502209559082985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007527281995862722, + "rewards/margins": 0.46804195642471313, + "rewards/rejected": -0.4755692481994629, + "sft_loss": 0.0752728208899498, + "step": 3414 + }, + { + "epoch": 4.938539407086044, + "grad_norm": 1.226582221767045, + "learning_rate": 6.272224810911262e-07, + "logits/chosen": -0.6443370580673218, + "logits/rejected": -0.46328115463256836, + "logps/chosen": -0.0617620050907135, + "logps/rejected": -6.892882347106934, + "loss": 0.0539, + "odds_ratio_loss": 0.007957663387060165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006176200229674578, + "rewards/margins": 0.6831120252609253, + "rewards/rejected": -0.6892882585525513, + "sft_loss": 0.0617620050907135, + "step": 3415 + }, + { + "epoch": 4.93998553868402, + "grad_norm": 1.7930112841480699, + "learning_rate": 6.255537521621814e-07, + "logits/chosen": -0.8234100341796875, + "logits/rejected": -0.7632617950439453, + "logps/chosen": -0.0765080451965332, + "logps/rejected": -5.386995315551758, + "loss": 0.1174, + "odds_ratio_loss": 0.006928831338882446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0076508051715791225, + "rewards/margins": 0.5310487747192383, + "rewards/rejected": -0.5386995673179626, + "sft_loss": 0.0765080451965332, + "step": 3416 + }, + { + "epoch": 4.941431670281996, + "grad_norm": 1.5497605374260457, + "learning_rate": 6.238870576994482e-07, + "logits/chosen": -0.9648762345314026, + "logits/rejected": -0.679747462272644, + "logps/chosen": -0.06289042532444, + "logps/rejected": -6.040827751159668, + "loss": 0.0695, + "odds_ratio_loss": 0.0020546133164316416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0062890429981052876, + "rewards/margins": 0.5977937579154968, + "rewards/rejected": -0.6040828227996826, + "sft_loss": 0.06289042532444, + "step": 3417 + }, + { + "epoch": 4.9428778018799715, + "grad_norm": 1.5782236984356215, + "learning_rate": 6.222223987077808e-07, + "logits/chosen": -0.9196997284889221, + "logits/rejected": -0.7327249646186829, + "logps/chosen": -0.02562006562948227, + "logps/rejected": -3.8854827880859375, + "loss": 0.0997, + "odds_ratio_loss": 0.0034806833136826754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002562006702646613, + "rewards/margins": 0.3859862685203552, + "rewards/rejected": -0.3885482847690582, + "sft_loss": 0.02562006562948227, + "step": 3418 + }, + { + "epoch": 4.944323933477946, + "grad_norm": 1.5751576479599592, + "learning_rate": 6.205597761908104e-07, + "logits/chosen": -0.5640643835067749, + "logits/rejected": -0.5260137319564819, + "logps/chosen": -0.15139015018939972, + "logps/rejected": -3.5538008213043213, + "loss": 0.1029, + "odds_ratio_loss": 0.015284059569239616, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015139015391469002, + "rewards/margins": 0.34024107456207275, + "rewards/rejected": -0.3553800880908966, + "sft_loss": 0.15139015018939972, + "step": 3419 + }, + { + "epoch": 4.945770065075922, + "grad_norm": 1.4752020186966457, + "learning_rate": 6.188991911509367e-07, + "logits/chosen": -1.0183162689208984, + "logits/rejected": -0.5773686170578003, + "logps/chosen": -0.04004896432161331, + "logps/rejected": -5.913060188293457, + "loss": 0.0642, + "odds_ratio_loss": 0.0022270409390330315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004004896618425846, + "rewards/margins": 0.5873011350631714, + "rewards/rejected": -0.5913060307502747, + "sft_loss": 0.04004896432161331, + "step": 3420 + }, + { + "epoch": 4.947216196673898, + "grad_norm": 1.4465994196175587, + "learning_rate": 6.172406445893337e-07, + "logits/chosen": -0.5818377137184143, + "logits/rejected": -0.6574020385742188, + "logps/chosen": -0.16453906893730164, + "logps/rejected": -4.426886558532715, + "loss": 0.091, + "odds_ratio_loss": 0.029384538531303406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016453908756375313, + "rewards/margins": 0.42623475193977356, + "rewards/rejected": -0.44268864393234253, + "sft_loss": 0.16453906893730164, + "step": 3421 + }, + { + "epoch": 4.9486623282718725, + "grad_norm": 1.447822137306616, + "learning_rate": 6.15584137505945e-07, + "logits/chosen": -0.8531981706619263, + "logits/rejected": -0.6357840895652771, + "logps/chosen": -0.07084062695503235, + "logps/rejected": -7.386335372924805, + "loss": 0.0794, + "odds_ratio_loss": 0.0017525034490972757, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00708406325429678, + "rewards/margins": 0.7315495014190674, + "rewards/rejected": -0.7386335134506226, + "sft_loss": 0.07084062695503235, + "step": 3422 + }, + { + "epoch": 4.950108459869848, + "grad_norm": 1.8491072383950693, + "learning_rate": 6.139296708994837e-07, + "logits/chosen": -0.747806966304779, + "logits/rejected": -0.6480367183685303, + "logps/chosen": -0.06436759978532791, + "logps/rejected": -5.5151543617248535, + "loss": 0.1032, + "odds_ratio_loss": 0.005401041358709335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006436760071665049, + "rewards/margins": 0.545078694820404, + "rewards/rejected": -0.5515154600143433, + "sft_loss": 0.06436759978532791, + "step": 3423 + }, + { + "epoch": 4.951554591467824, + "grad_norm": 1.4690335807293746, + "learning_rate": 6.122772457674359e-07, + "logits/chosen": -0.5605374574661255, + "logits/rejected": -0.46818655729293823, + "logps/chosen": -0.09475595504045486, + "logps/rejected": -4.236270904541016, + "loss": 0.0888, + "odds_ratio_loss": 0.01986742950975895, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009475595317780972, + "rewards/margins": 0.41415154933929443, + "rewards/rejected": -0.4236271381378174, + "sft_loss": 0.09475595504045486, + "step": 3424 + }, + { + "epoch": 4.953000723065799, + "grad_norm": 1.1868102449763724, + "learning_rate": 6.106268631060527e-07, + "logits/chosen": -0.8078406453132629, + "logits/rejected": -0.5437741875648499, + "logps/chosen": -0.03639750927686691, + "logps/rejected": -5.105257034301758, + "loss": 0.0472, + "odds_ratio_loss": 0.00621524965390563, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003639751113951206, + "rewards/margins": 0.5068859457969666, + "rewards/rejected": -0.5105257034301758, + "sft_loss": 0.03639750927686691, + "step": 3425 + }, + { + "epoch": 4.954446854663774, + "grad_norm": 1.5291294485877678, + "learning_rate": 6.089785239103582e-07, + "logits/chosen": -0.9439482688903809, + "logits/rejected": -0.6107262372970581, + "logps/chosen": -0.03788119927048683, + "logps/rejected": -3.4490232467651367, + "loss": 0.0818, + "odds_ratio_loss": 0.00338040036149323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003788120113313198, + "rewards/margins": 0.34111422300338745, + "rewards/rejected": -0.3449023365974426, + "sft_loss": 0.03788119927048683, + "step": 3426 + }, + { + "epoch": 4.95589298626175, + "grad_norm": 1.5203093371616432, + "learning_rate": 6.073322291741405e-07, + "logits/chosen": -0.7352548241615295, + "logits/rejected": -0.6759071946144104, + "logps/chosen": -0.04905076324939728, + "logps/rejected": -3.5433459281921387, + "loss": 0.056, + "odds_ratio_loss": 0.0025512652937322855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004905076697468758, + "rewards/margins": 0.34942954778671265, + "rewards/rejected": -0.35433462262153625, + "sft_loss": 0.04905076324939728, + "step": 3427 + }, + { + "epoch": 4.957339117859725, + "grad_norm": 1.337702504072495, + "learning_rate": 6.056879798899581e-07, + "logits/chosen": -0.8775549530982971, + "logits/rejected": -0.8563462495803833, + "logps/chosen": -0.036653611809015274, + "logps/rejected": -4.592785835266113, + "loss": 0.0516, + "odds_ratio_loss": 0.005046031437814236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0036653613205999136, + "rewards/margins": 0.4556131958961487, + "rewards/rejected": -0.45927858352661133, + "sft_loss": 0.036653611809015274, + "step": 3428 + }, + { + "epoch": 4.9587852494577005, + "grad_norm": 1.7224191092429069, + "learning_rate": 6.040457770491345e-07, + "logits/chosen": -0.9732665419578552, + "logits/rejected": -0.621749222278595, + "logps/chosen": -0.07956325262784958, + "logps/rejected": -4.399288177490234, + "loss": 0.0908, + "odds_ratio_loss": 0.005940181203186512, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007956326007843018, + "rewards/margins": 0.4319724440574646, + "rewards/rejected": -0.43992879986763, + "sft_loss": 0.07956325262784958, + "step": 3429 + }, + { + "epoch": 4.960231381055676, + "grad_norm": 1.254116051896288, + "learning_rate": 6.024056216417595e-07, + "logits/chosen": -0.9306346774101257, + "logits/rejected": -0.6040125489234924, + "logps/chosen": -0.029323115944862366, + "logps/rejected": -6.736618995666504, + "loss": 0.037, + "odds_ratio_loss": 0.002192482352256775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002932311501353979, + "rewards/margins": 0.6707295179367065, + "rewards/rejected": -0.6736618280410767, + "sft_loss": 0.029323115944862366, + "step": 3430 + }, + { + "epoch": 4.961677512653652, + "grad_norm": 1.293836056578048, + "learning_rate": 6.007675146566886e-07, + "logits/chosen": -0.7885620594024658, + "logits/rejected": -0.5741199851036072, + "logps/chosen": -0.05781673640012741, + "logps/rejected": -6.402325630187988, + "loss": 0.0713, + "odds_ratio_loss": 0.007596355862915516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005781673826277256, + "rewards/margins": 0.6344509720802307, + "rewards/rejected": -0.6402326822280884, + "sft_loss": 0.05781673640012741, + "step": 3431 + }, + { + "epoch": 4.963123644251627, + "grad_norm": 1.5357216450846016, + "learning_rate": 5.99131457081544e-07, + "logits/chosen": -0.645007848739624, + "logits/rejected": -0.5896191596984863, + "logps/chosen": -0.06823316216468811, + "logps/rejected": -5.339206218719482, + "loss": 0.0845, + "odds_ratio_loss": 0.00628221919760108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006823315750807524, + "rewards/margins": 0.5270972847938538, + "rewards/rejected": -0.5339206457138062, + "sft_loss": 0.06823316216468811, + "step": 3432 + }, + { + "epoch": 4.964569775849602, + "grad_norm": 1.6104289227539481, + "learning_rate": 5.974974499027094e-07, + "logits/chosen": -0.868780791759491, + "logits/rejected": -0.577593207359314, + "logps/chosen": -0.05608178302645683, + "logps/rejected": -6.6864471435546875, + "loss": 0.108, + "odds_ratio_loss": 0.002942313440144062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005608178209513426, + "rewards/margins": 0.6630365252494812, + "rewards/rejected": -0.6686447858810425, + "sft_loss": 0.05608178302645683, + "step": 3433 + }, + { + "epoch": 4.966015907447578, + "grad_norm": 1.6332927485572049, + "learning_rate": 5.958654941053352e-07, + "logits/chosen": -0.7936917543411255, + "logits/rejected": -0.6169689893722534, + "logps/chosen": -0.1516617238521576, + "logps/rejected": -3.740570306777954, + "loss": 0.0813, + "odds_ratio_loss": 0.027436548843979836, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015166172757744789, + "rewards/margins": 0.3588908612728119, + "rewards/rejected": -0.37405702471733093, + "sft_loss": 0.1516617238521576, + "step": 3434 + }, + { + "epoch": 4.967462039045553, + "grad_norm": 2.132685715380471, + "learning_rate": 5.942355906733318e-07, + "logits/chosen": -0.77684485912323, + "logits/rejected": -0.5762640833854675, + "logps/chosen": -0.026029953733086586, + "logps/rejected": -5.362104415893555, + "loss": 0.053, + "odds_ratio_loss": 0.003081801813095808, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0026029953733086586, + "rewards/margins": 0.5336074233055115, + "rewards/rejected": -0.5362104177474976, + "sft_loss": 0.026029953733086586, + "step": 3435 + }, + { + "epoch": 4.9689081706435285, + "grad_norm": 1.81155485785451, + "learning_rate": 5.926077405893766e-07, + "logits/chosen": -0.743396520614624, + "logits/rejected": -0.6381196975708008, + "logps/chosen": -0.06945671886205673, + "logps/rejected": -4.84320068359375, + "loss": 0.0892, + "odds_ratio_loss": 0.005398800130933523, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006945671979337931, + "rewards/margins": 0.477374404668808, + "rewards/rejected": -0.4843200743198395, + "sft_loss": 0.06945671886205673, + "step": 3436 + }, + { + "epoch": 4.970354302241504, + "grad_norm": 1.5592717402220886, + "learning_rate": 5.909819448349051e-07, + "logits/chosen": -0.8907531499862671, + "logits/rejected": -0.6397050023078918, + "logps/chosen": -0.1254955232143402, + "logps/rejected": -4.674949645996094, + "loss": 0.0766, + "odds_ratio_loss": 0.005577145144343376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01254955306649208, + "rewards/margins": 0.45494544506073, + "rewards/rejected": -0.46749502420425415, + "sft_loss": 0.1254955232143402, + "step": 3437 + }, + { + "epoch": 4.971800433839479, + "grad_norm": 2.65297190852868, + "learning_rate": 5.893582043901144e-07, + "logits/chosen": -0.7786065340042114, + "logits/rejected": -0.6070635318756104, + "logps/chosen": -0.08002370595932007, + "logps/rejected": -3.568556070327759, + "loss": 0.0902, + "odds_ratio_loss": 0.007114575244486332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008002370595932007, + "rewards/margins": 0.3488532304763794, + "rewards/rejected": -0.3568556308746338, + "sft_loss": 0.08002370595932007, + "step": 3438 + }, + { + "epoch": 4.973246565437455, + "grad_norm": 1.6585001087257703, + "learning_rate": 5.877365202339657e-07, + "logits/chosen": -1.0660187005996704, + "logits/rejected": -0.6834717392921448, + "logps/chosen": -0.05151912942528725, + "logps/rejected": -7.605033874511719, + "loss": 0.0773, + "odds_ratio_loss": 0.0027478046249598265, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005151913035660982, + "rewards/margins": 0.7553514838218689, + "rewards/rejected": -0.7605034112930298, + "sft_loss": 0.05151912942528725, + "step": 3439 + }, + { + "epoch": 4.97469269703543, + "grad_norm": 1.525514052881944, + "learning_rate": 5.861168933441769e-07, + "logits/chosen": -0.7787050008773804, + "logits/rejected": -0.6245294809341431, + "logps/chosen": -0.10993112623691559, + "logps/rejected": -5.3284454345703125, + "loss": 0.0818, + "odds_ratio_loss": 0.012700337916612625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0109931118786335, + "rewards/margins": 0.5218514800071716, + "rewards/rejected": -0.5328445434570312, + "sft_loss": 0.10993112623691559, + "step": 3440 + }, + { + "epoch": 4.976138828633406, + "grad_norm": 1.7760939346248212, + "learning_rate": 5.844993246972288e-07, + "logits/chosen": -1.0633209943771362, + "logits/rejected": -0.9356187582015991, + "logps/chosen": -0.04119650647044182, + "logps/rejected": -3.5258665084838867, + "loss": 0.083, + "odds_ratio_loss": 0.00574019318446517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004119650926440954, + "rewards/margins": 0.34846699237823486, + "rewards/rejected": -0.35258665680885315, + "sft_loss": 0.04119650647044182, + "step": 3441 + }, + { + "epoch": 4.977584960231381, + "grad_norm": 1.7749277326694144, + "learning_rate": 5.828838152683575e-07, + "logits/chosen": -0.8471897840499878, + "logits/rejected": -0.6608887314796448, + "logps/chosen": -0.06945478171110153, + "logps/rejected": -3.0917301177978516, + "loss": 0.0627, + "odds_ratio_loss": 0.008055892772972584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006945478729903698, + "rewards/margins": 0.30222752690315247, + "rewards/rejected": -0.30917298793792725, + "sft_loss": 0.06945478171110153, + "step": 3442 + }, + { + "epoch": 4.9790310918293565, + "grad_norm": 1.4906959459399594, + "learning_rate": 5.812703660315614e-07, + "logits/chosen": -1.0138330459594727, + "logits/rejected": -0.7253773212432861, + "logps/chosen": -0.10903214663267136, + "logps/rejected": -3.4463255405426025, + "loss": 0.0798, + "odds_ratio_loss": 0.0062078689225018024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010903215035796165, + "rewards/margins": 0.3337293863296509, + "rewards/rejected": -0.3446325957775116, + "sft_loss": 0.10903214663267136, + "step": 3443 + }, + { + "epoch": 4.980477223427332, + "grad_norm": 1.4923880084454153, + "learning_rate": 5.796589779595936e-07, + "logits/chosen": -0.8511115312576294, + "logits/rejected": -0.6175816059112549, + "logps/chosen": -0.09215164184570312, + "logps/rejected": -5.326424598693848, + "loss": 0.0994, + "odds_ratio_loss": 0.0075257387943565845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009215164929628372, + "rewards/margins": 0.5234273076057434, + "rewards/rejected": -0.5326424241065979, + "sft_loss": 0.09215164184570312, + "step": 3444 + }, + { + "epoch": 4.981923355025307, + "grad_norm": 1.9255239760331049, + "learning_rate": 5.780496520239672e-07, + "logits/chosen": -0.6826792359352112, + "logits/rejected": -0.6077350974082947, + "logps/chosen": -0.06906363368034363, + "logps/rejected": -3.452920436859131, + "loss": 0.0731, + "odds_ratio_loss": 0.0063808392733335495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006906363181769848, + "rewards/margins": 0.3383857011795044, + "rewards/rejected": -0.34529203176498413, + "sft_loss": 0.06906363368034363, + "step": 3445 + }, + { + "epoch": 4.983369486623283, + "grad_norm": 1.657251305881418, + "learning_rate": 5.764423891949506e-07, + "logits/chosen": -0.7294119000434875, + "logits/rejected": -0.5812273621559143, + "logps/chosen": -0.07187902182340622, + "logps/rejected": -4.1396989822387695, + "loss": 0.0809, + "odds_ratio_loss": 0.003367446595802903, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007187901996076107, + "rewards/margins": 0.40678200125694275, + "rewards/rejected": -0.41396987438201904, + "sft_loss": 0.07187902182340622, + "step": 3446 + }, + { + "epoch": 4.984815618221258, + "grad_norm": 1.4641805442906786, + "learning_rate": 5.748371904415683e-07, + "logits/chosen": -0.7924904823303223, + "logits/rejected": -0.6872941255569458, + "logps/chosen": -0.15684622526168823, + "logps/rejected": -3.0761542320251465, + "loss": 0.0963, + "odds_ratio_loss": 0.02701270952820778, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015684621408581734, + "rewards/margins": 0.29193079471588135, + "rewards/rejected": -0.30761539936065674, + "sft_loss": 0.15684622526168823, + "step": 3447 + }, + { + "epoch": 4.986261749819233, + "grad_norm": 1.6232973011439962, + "learning_rate": 5.732340567315997e-07, + "logits/chosen": -0.8529285192489624, + "logits/rejected": -0.7546834349632263, + "logps/chosen": -0.08991379290819168, + "logps/rejected": -5.203977584838867, + "loss": 0.0853, + "odds_ratio_loss": 0.005872908979654312, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008991379290819168, + "rewards/margins": 0.511406421661377, + "rewards/rejected": -0.5203977823257446, + "sft_loss": 0.08991379290819168, + "step": 3448 + }, + { + "epoch": 4.987707881417209, + "grad_norm": 1.609326930688802, + "learning_rate": 5.716329890315816e-07, + "logits/chosen": -0.7413277626037598, + "logits/rejected": -0.6836074590682983, + "logps/chosen": -0.13507165014743805, + "logps/rejected": -3.857135534286499, + "loss": 0.0762, + "odds_ratio_loss": 0.00941159576177597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01350716594606638, + "rewards/margins": 0.3722064197063446, + "rewards/rejected": -0.3857135474681854, + "sft_loss": 0.13507165014743805, + "step": 3449 + }, + { + "epoch": 4.989154013015185, + "grad_norm": 1.9671365529158957, + "learning_rate": 5.700339883068026e-07, + "logits/chosen": -0.844429075717926, + "logits/rejected": -0.6580359935760498, + "logps/chosen": -0.07697506248950958, + "logps/rejected": -3.4200170040130615, + "loss": 0.0736, + "odds_ratio_loss": 0.011311711743474007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007697506807744503, + "rewards/margins": 0.33430421352386475, + "rewards/rejected": -0.34200170636177063, + "sft_loss": 0.07697506248950958, + "step": 3450 + }, + { + "epoch": 4.990600144613159, + "grad_norm": 1.676385478801375, + "learning_rate": 5.684370555213061e-07, + "logits/chosen": -0.6407653093338013, + "logits/rejected": -0.5257874727249146, + "logps/chosen": -0.05387239158153534, + "logps/rejected": -4.147794246673584, + "loss": 0.0712, + "odds_ratio_loss": 0.0045845406129956245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005387239158153534, + "rewards/margins": 0.40939223766326904, + "rewards/rejected": -0.4147794544696808, + "sft_loss": 0.05387239158153534, + "step": 3451 + }, + { + "epoch": 4.992046276211135, + "grad_norm": 1.5851457373820437, + "learning_rate": 5.668421916378907e-07, + "logits/chosen": -0.9896453022956848, + "logits/rejected": -0.6115586757659912, + "logps/chosen": -0.03699421137571335, + "logps/rejected": -3.2008819580078125, + "loss": 0.0717, + "odds_ratio_loss": 0.007299942895770073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0036994214169681072, + "rewards/margins": 0.3163887560367584, + "rewards/rejected": -0.3200881779193878, + "sft_loss": 0.03699421137571335, + "step": 3452 + }, + { + "epoch": 4.993492407809111, + "grad_norm": 1.6443996781685672, + "learning_rate": 5.652493976181039e-07, + "logits/chosen": -0.7869142293930054, + "logits/rejected": -0.5275483727455139, + "logps/chosen": -0.05778158828616142, + "logps/rejected": -4.564939498901367, + "loss": 0.067, + "odds_ratio_loss": 0.003800423815846443, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005778159014880657, + "rewards/margins": 0.4507158100605011, + "rewards/rejected": -0.45649394392967224, + "sft_loss": 0.05778158828616142, + "step": 3453 + }, + { + "epoch": 4.994938539407086, + "grad_norm": 1.6418530507761049, + "learning_rate": 5.636586744222481e-07, + "logits/chosen": -0.7569884061813354, + "logits/rejected": -0.5459706783294678, + "logps/chosen": -0.059836842119693756, + "logps/rejected": -4.32498836517334, + "loss": 0.0911, + "odds_ratio_loss": 0.0062176999635994434, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005983684211969376, + "rewards/margins": 0.4265151619911194, + "rewards/rejected": -0.43249887228012085, + "sft_loss": 0.059836842119693756, + "step": 3454 + }, + { + "epoch": 4.996384671005061, + "grad_norm": 1.7891231577091458, + "learning_rate": 5.620700230093742e-07, + "logits/chosen": -0.7332042455673218, + "logits/rejected": -0.626262366771698, + "logps/chosen": -0.11887304484844208, + "logps/rejected": -4.1333417892456055, + "loss": 0.077, + "odds_ratio_loss": 0.008874907158315182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011887304484844208, + "rewards/margins": 0.4014468789100647, + "rewards/rejected": -0.4133341908454895, + "sft_loss": 0.11887304484844208, + "step": 3455 + }, + { + "epoch": 4.997830802603037, + "grad_norm": 1.4062046371133936, + "learning_rate": 5.604834443372892e-07, + "logits/chosen": -1.033819317817688, + "logits/rejected": -0.594970703125, + "logps/chosen": -0.10729336738586426, + "logps/rejected": -5.250903606414795, + "loss": 0.063, + "odds_ratio_loss": 0.007175394333899021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010729337111115456, + "rewards/margins": 0.5143610239028931, + "rewards/rejected": -0.5250903367996216, + "sft_loss": 0.10729336738586426, + "step": 3456 + }, + { + "epoch": 4.999276934201013, + "grad_norm": 1.8521632428679058, + "learning_rate": 5.588989393625447e-07, + "logits/chosen": -0.8611646294593811, + "logits/rejected": -0.5696526765823364, + "logps/chosen": -0.0587548166513443, + "logps/rejected": -4.783111572265625, + "loss": 0.0781, + "odds_ratio_loss": 0.00351850432343781, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005875481758266687, + "rewards/margins": 0.47243571281433105, + "rewards/rejected": -0.4783111810684204, + "sft_loss": 0.0587548166513443, + "step": 3457 + }, + { + "epoch": 5.000723065798987, + "grad_norm": 1.4265437130237972, + "learning_rate": 5.573165090404464e-07, + "logits/chosen": -0.709952175617218, + "logits/rejected": -0.5098187327384949, + "logps/chosen": -0.047502551227808, + "logps/rejected": -4.869006156921387, + "loss": 0.0423, + "odds_ratio_loss": 0.002838584128767252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004750255029648542, + "rewards/margins": 0.4821503758430481, + "rewards/rejected": -0.48690059781074524, + "sft_loss": 0.047502551227808, + "step": 3458 + }, + { + "epoch": 5.002169197396963, + "grad_norm": 1.085421565449066, + "learning_rate": 5.55736154325046e-07, + "logits/chosen": -0.7732264399528503, + "logits/rejected": -0.5995284914970398, + "logps/chosen": -0.039276450872421265, + "logps/rejected": -4.488852024078369, + "loss": 0.0527, + "odds_ratio_loss": 0.010519957169890404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003927645273506641, + "rewards/margins": 0.44495758414268494, + "rewards/rejected": -0.4488852322101593, + "sft_loss": 0.039276450872421265, + "step": 3459 + }, + { + "epoch": 5.003615328994939, + "grad_norm": 1.0464265932381913, + "learning_rate": 5.54157876169147e-07, + "logits/chosen": -0.8605638742446899, + "logits/rejected": -0.6857526302337646, + "logps/chosen": -0.009190384298563004, + "logps/rejected": -5.061098575592041, + "loss": 0.0324, + "odds_ratio_loss": 0.0008861377718858421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0009190384298563004, + "rewards/margins": 0.5051908493041992, + "rewards/rejected": -0.506109893321991, + "sft_loss": 0.009190384298563004, + "step": 3460 + }, + { + "epoch": 5.005061460592914, + "grad_norm": 1.1326646780174328, + "learning_rate": 5.525816755242978e-07, + "logits/chosen": -0.7338491678237915, + "logits/rejected": -0.5906820297241211, + "logps/chosen": -0.05195004120469093, + "logps/rejected": -4.651167392730713, + "loss": 0.0347, + "odds_ratio_loss": 0.0044695246033370495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005195004865527153, + "rewards/margins": 0.4599217176437378, + "rewards/rejected": -0.4651166796684265, + "sft_loss": 0.05195004120469093, + "step": 3461 + }, + { + "epoch": 5.006507592190889, + "grad_norm": 0.9565385507003581, + "learning_rate": 5.510075533407961e-07, + "logits/chosen": -0.8748555183410645, + "logits/rejected": -0.730570912361145, + "logps/chosen": -0.02134561724960804, + "logps/rejected": -3.8099608421325684, + "loss": 0.044, + "odds_ratio_loss": 0.002423565834760666, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002134561538696289, + "rewards/margins": 0.37886154651641846, + "rewards/rejected": -0.38099610805511475, + "sft_loss": 0.02134561724960804, + "step": 3462 + }, + { + "epoch": 5.007953723788865, + "grad_norm": 0.9112409719292605, + "learning_rate": 5.494355105676853e-07, + "logits/chosen": -1.012371301651001, + "logits/rejected": -0.8462579250335693, + "logps/chosen": -0.03380393236875534, + "logps/rejected": -3.5552430152893066, + "loss": 0.0256, + "odds_ratio_loss": 0.004212790168821812, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0033803931437432766, + "rewards/margins": 0.35214394330978394, + "rewards/rejected": -0.35552430152893066, + "sft_loss": 0.03380393236875534, + "step": 3463 + }, + { + "epoch": 5.009399855386841, + "grad_norm": 1.1716206047793647, + "learning_rate": 5.478655481527559e-07, + "logits/chosen": -0.7203569412231445, + "logits/rejected": -0.5603138208389282, + "logps/chosen": -0.036938704550266266, + "logps/rejected": -5.195960998535156, + "loss": 0.0375, + "odds_ratio_loss": 0.003028081264346838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0036938705015927553, + "rewards/margins": 0.5159022212028503, + "rewards/rejected": -0.5195960402488708, + "sft_loss": 0.036938704550266266, + "step": 3464 + }, + { + "epoch": 5.010845986984815, + "grad_norm": 0.8655848449570359, + "learning_rate": 5.462976670425461e-07, + "logits/chosen": -0.8955216407775879, + "logits/rejected": -0.7210637927055359, + "logps/chosen": -0.013777623884379864, + "logps/rejected": -6.22860860824585, + "loss": 0.0343, + "odds_ratio_loss": 0.0025187258142977953, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013777624117210507, + "rewards/margins": 0.6214830875396729, + "rewards/rejected": -0.622860848903656, + "sft_loss": 0.013777623884379864, + "step": 3465 + }, + { + "epoch": 5.012292118582791, + "grad_norm": 1.009337477557826, + "learning_rate": 5.447318681823346e-07, + "logits/chosen": -1.2388375997543335, + "logits/rejected": -0.7586451172828674, + "logps/chosen": -0.07897603511810303, + "logps/rejected": -4.735720634460449, + "loss": 0.0448, + "odds_ratio_loss": 0.003169738221913576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007897603325545788, + "rewards/margins": 0.46567443013191223, + "rewards/rejected": -0.4735720157623291, + "sft_loss": 0.07897603511810303, + "step": 3466 + }, + { + "epoch": 5.013738250180767, + "grad_norm": 1.121337861751337, + "learning_rate": 5.431681525161495e-07, + "logits/chosen": -0.7842767834663391, + "logits/rejected": -0.6963058710098267, + "logps/chosen": -0.10743860900402069, + "logps/rejected": -3.431588649749756, + "loss": 0.0624, + "odds_ratio_loss": 0.012040664441883564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010743859224021435, + "rewards/margins": 0.33241501450538635, + "rewards/rejected": -0.3431588411331177, + "sft_loss": 0.10743860900402069, + "step": 3467 + }, + { + "epoch": 5.015184381778742, + "grad_norm": 1.0922147755297598, + "learning_rate": 5.416065209867598e-07, + "logits/chosen": -0.7255322337150574, + "logits/rejected": -0.561846137046814, + "logps/chosen": -0.05708377808332443, + "logps/rejected": -4.307957649230957, + "loss": 0.0513, + "odds_ratio_loss": 0.003463734406977892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005708377808332443, + "rewards/margins": 0.4250873625278473, + "rewards/rejected": -0.43079572916030884, + "sft_loss": 0.05708377808332443, + "step": 3468 + }, + { + "epoch": 5.016630513376717, + "grad_norm": 0.9156674198366694, + "learning_rate": 5.40046974535679e-07, + "logits/chosen": -0.8717478513717651, + "logits/rejected": -0.8460903167724609, + "logps/chosen": -0.05093216896057129, + "logps/rejected": -2.9140570163726807, + "loss": 0.0363, + "odds_ratio_loss": 0.009572173468768597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005093216896057129, + "rewards/margins": 0.2863124907016754, + "rewards/rejected": -0.29140573740005493, + "sft_loss": 0.05093216896057129, + "step": 3469 + }, + { + "epoch": 5.018076644974693, + "grad_norm": 1.2516930398057569, + "learning_rate": 5.384895141031629e-07, + "logits/chosen": -0.9424750804901123, + "logits/rejected": -0.701157808303833, + "logps/chosen": -0.027423281222581863, + "logps/rejected": -7.161192417144775, + "loss": 0.0475, + "odds_ratio_loss": 0.0023427875712513924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002742328215390444, + "rewards/margins": 0.7133768796920776, + "rewards/rejected": -0.7161192893981934, + "sft_loss": 0.027423281222581863, + "step": 3470 + }, + { + "epoch": 5.019522776572668, + "grad_norm": 1.446751345973255, + "learning_rate": 5.369341406282113e-07, + "logits/chosen": -0.9990890026092529, + "logits/rejected": -0.6447659730911255, + "logps/chosen": -0.019773146137595177, + "logps/rejected": -5.728797435760498, + "loss": 0.0269, + "odds_ratio_loss": 0.0009713853942230344, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001977314706891775, + "rewards/margins": 0.570902407169342, + "rewards/rejected": -0.5728797316551208, + "sft_loss": 0.019773146137595177, + "step": 3471 + }, + { + "epoch": 5.0209689081706435, + "grad_norm": 1.2865825800855881, + "learning_rate": 5.353808550485635e-07, + "logits/chosen": -0.9069744348526001, + "logits/rejected": -0.7498742938041687, + "logps/chosen": -0.017203805968165398, + "logps/rejected": -6.09996223449707, + "loss": 0.0346, + "odds_ratio_loss": 0.0010495948372408748, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001720380736514926, + "rewards/margins": 0.6082758903503418, + "rewards/rejected": -0.6099961996078491, + "sft_loss": 0.017203805968165398, + "step": 3472 + }, + { + "epoch": 5.022415039768619, + "grad_norm": 1.5006156541725042, + "learning_rate": 5.338296583007027e-07, + "logits/chosen": -0.9074760675430298, + "logits/rejected": -0.6146718263626099, + "logps/chosen": -0.06319965422153473, + "logps/rejected": -5.602901458740234, + "loss": 0.0575, + "odds_ratio_loss": 0.0013854659628123045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00631996663287282, + "rewards/margins": 0.553970217704773, + "rewards/rejected": -0.5602901577949524, + "sft_loss": 0.06319965422153473, + "step": 3473 + }, + { + "epoch": 5.023861171366594, + "grad_norm": 1.5874623391961369, + "learning_rate": 5.322805513198494e-07, + "logits/chosen": -0.9391703605651855, + "logits/rejected": -0.777937650680542, + "logps/chosen": -0.08251549303531647, + "logps/rejected": -4.623871803283691, + "loss": 0.0606, + "odds_ratio_loss": 0.0030107577331364155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008251549676060677, + "rewards/margins": 0.4541356563568115, + "rewards/rejected": -0.46238720417022705, + "sft_loss": 0.08251549303531647, + "step": 3474 + }, + { + "epoch": 5.02530730296457, + "grad_norm": 1.4057184124461983, + "learning_rate": 5.307335350399675e-07, + "logits/chosen": -0.8874735832214355, + "logits/rejected": -0.7688844799995422, + "logps/chosen": -0.10471326112747192, + "logps/rejected": -3.2333931922912598, + "loss": 0.0434, + "odds_ratio_loss": 0.010410060174763203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010471326299011707, + "rewards/margins": 0.31286799907684326, + "rewards/rejected": -0.3233392834663391, + "sft_loss": 0.10471326112747192, + "step": 3475 + }, + { + "epoch": 5.026753434562545, + "grad_norm": 1.2902962205289839, + "learning_rate": 5.291886103937586e-07, + "logits/chosen": -0.9923503398895264, + "logits/rejected": -0.6792709231376648, + "logps/chosen": -0.015890207141637802, + "logps/rejected": -3.8003733158111572, + "loss": 0.0294, + "odds_ratio_loss": 0.0025172270834445953, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0015890207141637802, + "rewards/margins": 0.3784483075141907, + "rewards/rejected": -0.3800373375415802, + "sft_loss": 0.015890207141637802, + "step": 3476 + }, + { + "epoch": 5.028199566160521, + "grad_norm": 1.2875767952399477, + "learning_rate": 5.276457783126624e-07, + "logits/chosen": -0.8897219300270081, + "logits/rejected": -0.6773457527160645, + "logps/chosen": -0.049017373472452164, + "logps/rejected": -4.137425899505615, + "loss": 0.0361, + "odds_ratio_loss": 0.0024960683658719063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004901737906038761, + "rewards/margins": 0.4088408648967743, + "rewards/rejected": -0.41374263167381287, + "sft_loss": 0.049017373472452164, + "step": 3477 + }, + { + "epoch": 5.029645697758496, + "grad_norm": 1.372847049337572, + "learning_rate": 5.261050397268594e-07, + "logits/chosen": -0.9416077733039856, + "logits/rejected": -0.664757251739502, + "logps/chosen": -0.05168641358613968, + "logps/rejected": -4.555634498596191, + "loss": 0.0561, + "odds_ratio_loss": 0.002042082604020834, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005168641917407513, + "rewards/margins": 0.4503948390483856, + "rewards/rejected": -0.45556342601776123, + "sft_loss": 0.05168641358613968, + "step": 3478 + }, + { + "epoch": 5.0310918293564715, + "grad_norm": 1.2067829756850041, + "learning_rate": 5.245663955652655e-07, + "logits/chosen": -0.9000402688980103, + "logits/rejected": -0.79487544298172, + "logps/chosen": -0.028296932578086853, + "logps/rejected": -4.450551986694336, + "loss": 0.0379, + "odds_ratio_loss": 0.00259765493683517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0028296932578086853, + "rewards/margins": 0.44222545623779297, + "rewards/rejected": -0.44505518674850464, + "sft_loss": 0.028296932578086853, + "step": 3479 + }, + { + "epoch": 5.032537960954447, + "grad_norm": 1.0215153751903814, + "learning_rate": 5.230298467555361e-07, + "logits/chosen": -0.7747162580490112, + "logits/rejected": -0.48691344261169434, + "logps/chosen": -0.02550322189927101, + "logps/rejected": -8.04299259185791, + "loss": 0.0276, + "odds_ratio_loss": 0.0014245238853618503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0025503220967948437, + "rewards/margins": 0.8017489910125732, + "rewards/rejected": -0.8042992353439331, + "sft_loss": 0.02550322189927101, + "step": 3480 + }, + { + "epoch": 5.033984092552422, + "grad_norm": 1.1753442740039661, + "learning_rate": 5.214953942240612e-07, + "logits/chosen": -0.9011898040771484, + "logits/rejected": -0.7257996201515198, + "logps/chosen": -0.007925175130367279, + "logps/rejected": -5.687167644500732, + "loss": 0.0415, + "odds_ratio_loss": 0.00017209735233336687, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0007925175596028566, + "rewards/margins": 0.5679242610931396, + "rewards/rejected": -0.5687167644500732, + "sft_loss": 0.007925175130367279, + "step": 3481 + }, + { + "epoch": 5.035430224150398, + "grad_norm": 1.0682223299406943, + "learning_rate": 5.199630388959693e-07, + "logits/chosen": -1.0496717691421509, + "logits/rejected": -0.7140789031982422, + "logps/chosen": -0.019513538107275963, + "logps/rejected": -4.503504276275635, + "loss": 0.0383, + "odds_ratio_loss": 0.0018820172408595681, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001951353857293725, + "rewards/margins": 0.44839906692504883, + "rewards/rejected": -0.45035040378570557, + "sft_loss": 0.019513538107275963, + "step": 3482 + }, + { + "epoch": 5.036876355748373, + "grad_norm": 0.9663346190107585, + "learning_rate": 5.184327816951221e-07, + "logits/chosen": -0.9517099857330322, + "logits/rejected": -0.5351824164390564, + "logps/chosen": -0.047696616500616074, + "logps/rejected": -4.995662689208984, + "loss": 0.0298, + "odds_ratio_loss": 0.0005317270988598466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00476966192945838, + "rewards/margins": 0.49479660391807556, + "rewards/rejected": -0.49956628680229187, + "sft_loss": 0.047696616500616074, + "step": 3483 + }, + { + "epoch": 5.038322487346348, + "grad_norm": 0.99346643446749, + "learning_rate": 5.169046235441175e-07, + "logits/chosen": -0.755199670791626, + "logits/rejected": -0.641110897064209, + "logps/chosen": -0.022800451144576073, + "logps/rejected": -3.9687485694885254, + "loss": 0.0388, + "odds_ratio_loss": 0.001115166931413114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022800450678914785, + "rewards/margins": 0.39459484815597534, + "rewards/rejected": -0.3968748450279236, + "sft_loss": 0.022800451144576073, + "step": 3484 + }, + { + "epoch": 5.039768618944324, + "grad_norm": 0.8647052648937972, + "learning_rate": 5.153785653642875e-07, + "logits/chosen": -0.9964678287506104, + "logits/rejected": -0.8342572450637817, + "logps/chosen": -0.030764533206820488, + "logps/rejected": -5.155585765838623, + "loss": 0.0248, + "odds_ratio_loss": 0.002030643867328763, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030764532275497913, + "rewards/margins": 0.5124821662902832, + "rewards/rejected": -0.5155586004257202, + "sft_loss": 0.030764533206820488, + "step": 3485 + }, + { + "epoch": 5.0412147505422995, + "grad_norm": 1.0490770833472225, + "learning_rate": 5.13854608075699e-07, + "logits/chosen": -0.8880283236503601, + "logits/rejected": -0.8042649626731873, + "logps/chosen": -0.032138124108314514, + "logps/rejected": -4.834699630737305, + "loss": 0.0413, + "odds_ratio_loss": 0.0069372812286019325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0032138123642653227, + "rewards/margins": 0.48025619983673096, + "rewards/rejected": -0.48347002267837524, + "sft_loss": 0.032138124108314514, + "step": 3486 + }, + { + "epoch": 5.042660882140275, + "grad_norm": 1.3260914708649687, + "learning_rate": 5.123327525971501e-07, + "logits/chosen": -0.9982175230979919, + "logits/rejected": -0.6111714839935303, + "logps/chosen": -0.08715112507343292, + "logps/rejected": -6.953100204467773, + "loss": 0.0725, + "odds_ratio_loss": 0.008616614155471325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008715112693607807, + "rewards/margins": 0.6865949034690857, + "rewards/rejected": -0.6953100562095642, + "sft_loss": 0.08715112507343292, + "step": 3487 + }, + { + "epoch": 5.04410701373825, + "grad_norm": 1.2545860974001273, + "learning_rate": 5.108129998461752e-07, + "logits/chosen": -0.9156689643859863, + "logits/rejected": -0.7918872237205505, + "logps/chosen": -0.1049668937921524, + "logps/rejected": -3.2424416542053223, + "loss": 0.0559, + "odds_ratio_loss": 0.013586277142167091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01049669086933136, + "rewards/margins": 0.31374746561050415, + "rewards/rejected": -0.3242441713809967, + "sft_loss": 0.1049668937921524, + "step": 3488 + }, + { + "epoch": 5.045553145336226, + "grad_norm": 0.9690430135015121, + "learning_rate": 5.092953507390368e-07, + "logits/chosen": -0.8073430061340332, + "logits/rejected": -0.7314937710762024, + "logps/chosen": -0.028009576722979546, + "logps/rejected": -4.2882981300354, + "loss": 0.0272, + "odds_ratio_loss": 0.0006923983455635607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002800957765430212, + "rewards/margins": 0.426028847694397, + "rewards/rejected": -0.4288298189640045, + "sft_loss": 0.028009576722979546, + "step": 3489 + }, + { + "epoch": 5.046999276934201, + "grad_norm": 0.9797258107602022, + "learning_rate": 5.077798061907322e-07, + "logits/chosen": -0.8194557428359985, + "logits/rejected": -0.6159572601318359, + "logps/chosen": -0.030368125066161156, + "logps/rejected": -4.142151832580566, + "loss": 0.0403, + "odds_ratio_loss": 0.0020896908827126026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030368126463145018, + "rewards/margins": 0.4111783504486084, + "rewards/rejected": -0.4142151474952698, + "sft_loss": 0.030368125066161156, + "step": 3490 + }, + { + "epoch": 5.048445408532176, + "grad_norm": 1.1084600799985203, + "learning_rate": 5.062663671149896e-07, + "logits/chosen": -0.7641061544418335, + "logits/rejected": -0.5759693384170532, + "logps/chosen": -0.07876641303300858, + "logps/rejected": -4.516148567199707, + "loss": 0.0499, + "odds_ratio_loss": 0.004298293497413397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007876641117036343, + "rewards/margins": 0.4437382221221924, + "rewards/rejected": -0.4516148567199707, + "sft_loss": 0.07876641303300858, + "step": 3491 + }, + { + "epoch": 5.049891540130152, + "grad_norm": 0.9494340453804003, + "learning_rate": 5.047550344242668e-07, + "logits/chosen": -0.8096336722373962, + "logits/rejected": -0.5875648856163025, + "logps/chosen": -0.020555946975946426, + "logps/rejected": -4.530761241912842, + "loss": 0.0319, + "odds_ratio_loss": 0.0013419195311143994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002055594464763999, + "rewards/margins": 0.4510205388069153, + "rewards/rejected": -0.4530760645866394, + "sft_loss": 0.020555946975946426, + "step": 3492 + }, + { + "epoch": 5.0513376717281275, + "grad_norm": 1.1190637140374868, + "learning_rate": 5.032458090297509e-07, + "logits/chosen": -0.7927988767623901, + "logits/rejected": -0.6353293657302856, + "logps/chosen": -0.047286052256822586, + "logps/rejected": -3.4200520515441895, + "loss": 0.0454, + "odds_ratio_loss": 0.0031753459479659796, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004728605039417744, + "rewards/margins": 0.3372766077518463, + "rewards/rejected": -0.34200519323349, + "sft_loss": 0.047286052256822586, + "step": 3493 + }, + { + "epoch": 5.052783803326102, + "grad_norm": 0.9510278762097967, + "learning_rate": 5.017386918413598e-07, + "logits/chosen": -0.6925092935562134, + "logits/rejected": -0.5789450407028198, + "logps/chosen": -0.0461755096912384, + "logps/rejected": -3.9706852436065674, + "loss": 0.038, + "odds_ratio_loss": 0.0035718618892133236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00461755134165287, + "rewards/margins": 0.3924509286880493, + "rewards/rejected": -0.39706850051879883, + "sft_loss": 0.0461755096912384, + "step": 3494 + }, + { + "epoch": 5.054229934924078, + "grad_norm": 0.8347912311398593, + "learning_rate": 5.002336837677408e-07, + "logits/chosen": -1.0878486633300781, + "logits/rejected": -0.8472960591316223, + "logps/chosen": -0.07545731961727142, + "logps/rejected": -5.063971996307373, + "loss": 0.0312, + "odds_ratio_loss": 0.004800775554031134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007545732893049717, + "rewards/margins": 0.4988514482975006, + "rewards/rejected": -0.5063971877098083, + "sft_loss": 0.07545731961727142, + "step": 3495 + }, + { + "epoch": 5.055676066522054, + "grad_norm": 1.1106826128697525, + "learning_rate": 4.987307857162672e-07, + "logits/chosen": -1.1164376735687256, + "logits/rejected": -0.9372444748878479, + "logps/chosen": -0.07102254778146744, + "logps/rejected": -3.9332127571105957, + "loss": 0.0463, + "odds_ratio_loss": 0.005403660237789154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007102255243808031, + "rewards/margins": 0.3862190842628479, + "rewards/rejected": -0.39332127571105957, + "sft_loss": 0.07102254778146744, + "step": 3496 + }, + { + "epoch": 5.0571221981200285, + "grad_norm": 0.9065018234326692, + "learning_rate": 4.972299985930441e-07, + "logits/chosen": -0.7647346258163452, + "logits/rejected": -0.568095862865448, + "logps/chosen": -0.047225866466760635, + "logps/rejected": -4.031482696533203, + "loss": 0.0332, + "odds_ratio_loss": 0.0016099303029477596, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0047225868329405785, + "rewards/margins": 0.3984256982803345, + "rewards/rejected": -0.4031482934951782, + "sft_loss": 0.047225866466760635, + "step": 3497 + }, + { + "epoch": 5.058568329718004, + "grad_norm": 1.235864047380871, + "learning_rate": 4.957313233029001e-07, + "logits/chosen": -0.9007717967033386, + "logits/rejected": -0.6701961159706116, + "logps/chosen": -0.04815499857068062, + "logps/rejected": -4.450354099273682, + "loss": 0.0414, + "odds_ratio_loss": 0.002228476107120514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004815499763935804, + "rewards/margins": 0.4402199387550354, + "rewards/rejected": -0.4450354278087616, + "sft_loss": 0.04815499857068062, + "step": 3498 + }, + { + "epoch": 5.06001446131598, + "grad_norm": 1.3037735453615267, + "learning_rate": 4.942347607493929e-07, + "logits/chosen": -0.7589642405509949, + "logits/rejected": -0.6731579303741455, + "logps/chosen": -0.07891285419464111, + "logps/rejected": -4.627119064331055, + "loss": 0.0565, + "odds_ratio_loss": 0.006445009261369705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007891286164522171, + "rewards/margins": 0.4548206329345703, + "rewards/rejected": -0.462711900472641, + "sft_loss": 0.07891285419464111, + "step": 3499 + }, + { + "epoch": 5.061460592913956, + "grad_norm": 1.0390461211048085, + "learning_rate": 4.927403118348055e-07, + "logits/chosen": -0.8114687204360962, + "logits/rejected": -0.5995222330093384, + "logps/chosen": -0.023380516096949577, + "logps/rejected": -5.17085075378418, + "loss": 0.04, + "odds_ratio_loss": 0.0011913108173757792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0023380513302981853, + "rewards/margins": 0.5147470235824585, + "rewards/rejected": -0.517085075378418, + "sft_loss": 0.023380516096949577, + "step": 3500 + }, + { + "epoch": 5.06290672451193, + "grad_norm": 1.1592534723874754, + "learning_rate": 4.912479774601465e-07, + "logits/chosen": -0.7811102271080017, + "logits/rejected": -0.6491488814353943, + "logps/chosen": -0.035685863345861435, + "logps/rejected": -4.719926834106445, + "loss": 0.0449, + "odds_ratio_loss": 0.0020906818099319935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0035685861948877573, + "rewards/margins": 0.46842408180236816, + "rewards/rejected": -0.4719926714897156, + "sft_loss": 0.035685863345861435, + "step": 3501 + }, + { + "epoch": 5.064352856109906, + "grad_norm": 0.9900282846292379, + "learning_rate": 4.897577585251493e-07, + "logits/chosen": -0.9106247425079346, + "logits/rejected": -0.506430447101593, + "logps/chosen": -0.034568313509225845, + "logps/rejected": -5.847599506378174, + "loss": 0.0336, + "odds_ratio_loss": 0.0007620738469995558, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003456831444054842, + "rewards/margins": 0.5813031792640686, + "rewards/rejected": -0.5847600698471069, + "sft_loss": 0.034568313509225845, + "step": 3502 + }, + { + "epoch": 5.065798987707882, + "grad_norm": 1.3192284096007874, + "learning_rate": 4.882696559282728e-07, + "logits/chosen": -0.7228418588638306, + "logits/rejected": -0.6639438271522522, + "logps/chosen": -0.05023326724767685, + "logps/rejected": -4.81099796295166, + "loss": 0.0544, + "odds_ratio_loss": 0.003748027142137289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005023326724767685, + "rewards/margins": 0.47607648372650146, + "rewards/rejected": -0.48109978437423706, + "sft_loss": 0.05023326724767685, + "step": 3503 + }, + { + "epoch": 5.0672451193058565, + "grad_norm": 1.4630431883421127, + "learning_rate": 4.867836705667008e-07, + "logits/chosen": -0.5559595227241516, + "logits/rejected": -0.4254433512687683, + "logps/chosen": -0.03188515082001686, + "logps/rejected": -5.043299674987793, + "loss": 0.0303, + "odds_ratio_loss": 0.00941612757742405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003188515082001686, + "rewards/margins": 0.5011414289474487, + "rewards/rejected": -0.5043299794197083, + "sft_loss": 0.03188515082001686, + "step": 3504 + }, + { + "epoch": 5.068691250903832, + "grad_norm": 1.1457513160225274, + "learning_rate": 4.852998033363374e-07, + "logits/chosen": -0.8476303815841675, + "logits/rejected": -0.5761409401893616, + "logps/chosen": -0.05392240732908249, + "logps/rejected": -6.117643356323242, + "loss": 0.0486, + "odds_ratio_loss": 0.0024544643238186836, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005392240826040506, + "rewards/margins": 0.6063721179962158, + "rewards/rejected": -0.6117643713951111, + "sft_loss": 0.05392240732908249, + "step": 3505 + }, + { + "epoch": 5.070137382501808, + "grad_norm": 0.9423254883145569, + "learning_rate": 4.838180551318137e-07, + "logits/chosen": -0.9022258520126343, + "logits/rejected": -0.6234160661697388, + "logps/chosen": -0.047306958585977554, + "logps/rejected": -6.651636123657227, + "loss": 0.033, + "odds_ratio_loss": 0.000815044913906604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004730695858597755, + "rewards/margins": 0.6604328751564026, + "rewards/rejected": -0.6651636362075806, + "sft_loss": 0.047306958585977554, + "step": 3506 + }, + { + "epoch": 5.071583514099783, + "grad_norm": 0.8296553426657953, + "learning_rate": 4.823384268464798e-07, + "logits/chosen": -1.0280296802520752, + "logits/rejected": -0.623708963394165, + "logps/chosen": -0.007112645544111729, + "logps/rejected": -5.498015403747559, + "loss": 0.0226, + "odds_ratio_loss": 0.0005501247942447662, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.000711264496203512, + "rewards/margins": 0.5490902662277222, + "rewards/rejected": -0.5498015284538269, + "sft_loss": 0.007112645544111729, + "step": 3507 + }, + { + "epoch": 5.073029645697758, + "grad_norm": 0.9846452216260417, + "learning_rate": 4.8086091937241e-07, + "logits/chosen": -0.853293776512146, + "logits/rejected": -0.660023033618927, + "logps/chosen": -0.04029952734708786, + "logps/rejected": -4.744614601135254, + "loss": 0.0329, + "odds_ratio_loss": 0.0031669042073190212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004029952920973301, + "rewards/margins": 0.47043153643608093, + "rewards/rejected": -0.47446149587631226, + "sft_loss": 0.04029952734708786, + "step": 3508 + }, + { + "epoch": 5.074475777295734, + "grad_norm": 1.1616560808192198, + "learning_rate": 4.793855336003973e-07, + "logits/chosen": -0.8065732717514038, + "logits/rejected": -0.7198399901390076, + "logps/chosen": -0.030910378322005272, + "logps/rejected": -4.870575904846191, + "loss": 0.0367, + "odds_ratio_loss": 0.000926865846849978, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030910377390682697, + "rewards/margins": 0.483966588973999, + "rewards/rejected": -0.487057626247406, + "sft_loss": 0.030910378322005272, + "step": 3509 + }, + { + "epoch": 5.07592190889371, + "grad_norm": 1.460148493444669, + "learning_rate": 4.7791227041996e-07, + "logits/chosen": -1.0374577045440674, + "logits/rejected": -0.7249501347541809, + "logps/chosen": -0.035487934947013855, + "logps/rejected": -6.014105796813965, + "loss": 0.0256, + "odds_ratio_loss": 0.0014334238367155194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0035487934947013855, + "rewards/margins": 0.5978617668151855, + "rewards/rejected": -0.6014105677604675, + "sft_loss": 0.035487934947013855, + "step": 3510 + }, + { + "epoch": 5.0773680404916846, + "grad_norm": 1.24139935366732, + "learning_rate": 4.764411307193312e-07, + "logits/chosen": -1.1357358694076538, + "logits/rejected": -0.7631771564483643, + "logps/chosen": -0.051570966839790344, + "logps/rejected": -4.790763854980469, + "loss": 0.0336, + "odds_ratio_loss": 0.0028275088407099247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005157096311450005, + "rewards/margins": 0.4739193320274353, + "rewards/rejected": -0.47907641530036926, + "sft_loss": 0.051570966839790344, + "step": 3511 + }, + { + "epoch": 5.07881417208966, + "grad_norm": 1.2128643594847868, + "learning_rate": 4.749721153854689e-07, + "logits/chosen": -0.8864186406135559, + "logits/rejected": -0.7438483238220215, + "logps/chosen": -0.05787164717912674, + "logps/rejected": -3.919307231903076, + "loss": 0.0467, + "odds_ratio_loss": 0.012838841415941715, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005787164904177189, + "rewards/margins": 0.38614362478256226, + "rewards/rejected": -0.3919307589530945, + "sft_loss": 0.05787164717912674, + "step": 3512 + }, + { + "epoch": 5.080260303687636, + "grad_norm": 0.8749759144457411, + "learning_rate": 4.735052253040459e-07, + "logits/chosen": -0.8959901332855225, + "logits/rejected": -0.7548617124557495, + "logps/chosen": -0.03229469805955887, + "logps/rejected": -3.567509651184082, + "loss": 0.0229, + "odds_ratio_loss": 0.005712658166885376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0032294695265591145, + "rewards/margins": 0.353521466255188, + "rewards/rejected": -0.3567509353160858, + "sft_loss": 0.03229469805955887, + "step": 3513 + }, + { + "epoch": 5.081706435285611, + "grad_norm": 1.123889356229595, + "learning_rate": 4.720404613594575e-07, + "logits/chosen": -0.8648489713668823, + "logits/rejected": -0.6787019968032837, + "logps/chosen": -0.07309830188751221, + "logps/rejected": -5.374810695648193, + "loss": 0.0453, + "odds_ratio_loss": 0.002101986203342676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007309830281883478, + "rewards/margins": 0.5301712155342102, + "rewards/rejected": -0.5374810099601746, + "sft_loss": 0.07309830188751221, + "step": 3514 + }, + { + "epoch": 5.083152566883586, + "grad_norm": 0.9471250243679993, + "learning_rate": 4.7057782443481464e-07, + "logits/chosen": -0.8933843970298767, + "logits/rejected": -0.5827321410179138, + "logps/chosen": -0.00811697170138359, + "logps/rejected": -5.076469421386719, + "loss": 0.0269, + "odds_ratio_loss": 0.000255201623076573, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0008116972167044878, + "rewards/margins": 0.5068352222442627, + "rewards/rejected": -0.507646918296814, + "sft_loss": 0.00811697170138359, + "step": 3515 + }, + { + "epoch": 5.084598698481562, + "grad_norm": 1.1133186028853113, + "learning_rate": 4.691173154119461e-07, + "logits/chosen": -1.0953632593154907, + "logits/rejected": -0.741931140422821, + "logps/chosen": -0.04556608200073242, + "logps/rejected": -5.745540618896484, + "loss": 0.0409, + "odds_ratio_loss": 0.0018734941259026527, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004556608386337757, + "rewards/margins": 0.5699974298477173, + "rewards/rejected": -0.5745540857315063, + "sft_loss": 0.04556608200073242, + "step": 3516 + }, + { + "epoch": 5.086044830079537, + "grad_norm": 0.9562504605078648, + "learning_rate": 4.6765893517139775e-07, + "logits/chosen": -0.7702620625495911, + "logits/rejected": -0.5316519141197205, + "logps/chosen": -0.0347968190908432, + "logps/rejected": -5.1375651359558105, + "loss": 0.031, + "odds_ratio_loss": 0.004826520103961229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00347968190908432, + "rewards/margins": 0.5102768540382385, + "rewards/rejected": -0.5137565732002258, + "sft_loss": 0.0347968190908432, + "step": 3517 + }, + { + "epoch": 5.087490961677513, + "grad_norm": 0.9150585192597411, + "learning_rate": 4.662026845924334e-07, + "logits/chosen": -0.9138441681861877, + "logits/rejected": -0.624638020992279, + "logps/chosen": -0.019125226885080338, + "logps/rejected": -5.921594142913818, + "loss": 0.0292, + "odds_ratio_loss": 0.0004167947336100042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00191252282820642, + "rewards/margins": 0.5902469158172607, + "rewards/rejected": -0.5921594500541687, + "sft_loss": 0.019125226885080338, + "step": 3518 + }, + { + "epoch": 5.088937093275488, + "grad_norm": 0.9595462720990704, + "learning_rate": 4.647485645530325e-07, + "logits/chosen": -0.6247076988220215, + "logits/rejected": -0.4782922565937042, + "logps/chosen": -0.024594586342573166, + "logps/rejected": -3.9685606956481934, + "loss": 0.0443, + "odds_ratio_loss": 0.0030578388832509518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002459458541125059, + "rewards/margins": 0.3943966031074524, + "rewards/rejected": -0.39685600996017456, + "sft_loss": 0.024594586342573166, + "step": 3519 + }, + { + "epoch": 5.090383224873463, + "grad_norm": 1.2927196416567392, + "learning_rate": 4.632965759298879e-07, + "logits/chosen": -0.9268531799316406, + "logits/rejected": -0.7796077728271484, + "logps/chosen": -0.11328568309545517, + "logps/rejected": -3.251885414123535, + "loss": 0.0521, + "odds_ratio_loss": 0.05676887556910515, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.011328568682074547, + "rewards/margins": 0.3138599693775177, + "rewards/rejected": -0.325188547372818, + "sft_loss": 0.11328568309545517, + "step": 3520 + }, + { + "epoch": 5.091829356471439, + "grad_norm": 1.1535490074379513, + "learning_rate": 4.618467195984106e-07, + "logits/chosen": -1.0983455181121826, + "logits/rejected": -0.9028714895248413, + "logps/chosen": -0.04674210399389267, + "logps/rejected": -4.079455375671387, + "loss": 0.0532, + "odds_ratio_loss": 0.003504629246890545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004674210678786039, + "rewards/margins": 0.403271347284317, + "rewards/rejected": -0.40794551372528076, + "sft_loss": 0.04674210399389267, + "step": 3521 + }, + { + "epoch": 5.093275488069414, + "grad_norm": 1.0675836746198444, + "learning_rate": 4.603989964327235e-07, + "logits/chosen": -1.1998631954193115, + "logits/rejected": -0.9866847395896912, + "logps/chosen": -0.03705126419663429, + "logps/rejected": -4.348075866699219, + "loss": 0.0375, + "odds_ratio_loss": 0.0019462001509964466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003705126466229558, + "rewards/margins": 0.431102454662323, + "rewards/rejected": -0.43480759859085083, + "sft_loss": 0.03705126419663429, + "step": 3522 + }, + { + "epoch": 5.09472161966739, + "grad_norm": 0.9915894254744622, + "learning_rate": 4.58953407305664e-07, + "logits/chosen": -1.121100664138794, + "logits/rejected": -0.7228765487670898, + "logps/chosen": -0.026232311502099037, + "logps/rejected": -6.338787078857422, + "loss": 0.0325, + "odds_ratio_loss": 0.0014822124503552914, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0026232311502099037, + "rewards/margins": 0.6312555074691772, + "rewards/rejected": -0.633878767490387, + "sft_loss": 0.026232311502099037, + "step": 3523 + }, + { + "epoch": 5.096167751265365, + "grad_norm": 1.0533120114898256, + "learning_rate": 4.5750995308878336e-07, + "logits/chosen": -1.0108604431152344, + "logits/rejected": -0.7267636656761169, + "logps/chosen": -0.06367263197898865, + "logps/rejected": -4.689202308654785, + "loss": 0.0441, + "odds_ratio_loss": 0.0063222860917449, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006367263849824667, + "rewards/margins": 0.4625529646873474, + "rewards/rejected": -0.46892020106315613, + "sft_loss": 0.06367263197898865, + "step": 3524 + }, + { + "epoch": 5.097613882863341, + "grad_norm": 1.0608449766673291, + "learning_rate": 4.560686346523459e-07, + "logits/chosen": -0.8225513696670532, + "logits/rejected": -0.6037327647209167, + "logps/chosen": -0.01524380873888731, + "logps/rejected": -5.777705669403076, + "loss": 0.0403, + "odds_ratio_loss": 0.0009167609387077391, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001524380873888731, + "rewards/margins": 0.5762461423873901, + "rewards/rejected": -0.5777705907821655, + "sft_loss": 0.01524380873888731, + "step": 3525 + }, + { + "epoch": 5.099060014461316, + "grad_norm": 0.801826400322166, + "learning_rate": 4.546294528653272e-07, + "logits/chosen": -0.8241301774978638, + "logits/rejected": -0.6081610918045044, + "logps/chosen": -0.017750507220625877, + "logps/rejected": -5.126501560211182, + "loss": 0.0228, + "odds_ratio_loss": 0.00040757720125839114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017750507686287165, + "rewards/margins": 0.5108751058578491, + "rewards/rejected": -0.5126501321792603, + "sft_loss": 0.017750507220625877, + "step": 3526 + }, + { + "epoch": 5.100506146059291, + "grad_norm": 1.2157011286624781, + "learning_rate": 4.531924085954162e-07, + "logits/chosen": -0.6494585275650024, + "logits/rejected": -0.5403815507888794, + "logps/chosen": -0.00851333886384964, + "logps/rejected": -7.799459457397461, + "loss": 0.0363, + "odds_ratio_loss": 0.00040093838470056653, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0008513339562341571, + "rewards/margins": 0.7790946960449219, + "rewards/rejected": -0.779945969581604, + "sft_loss": 0.00851333886384964, + "step": 3527 + }, + { + "epoch": 5.101952277657267, + "grad_norm": 1.141295091187854, + "learning_rate": 4.5175750270901105e-07, + "logits/chosen": -0.8939746022224426, + "logits/rejected": -0.5252866744995117, + "logps/chosen": -0.01969527266919613, + "logps/rejected": -7.3741254806518555, + "loss": 0.0357, + "odds_ratio_loss": 0.0005816838820464909, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001969527220353484, + "rewards/margins": 0.735443115234375, + "rewards/rejected": -0.7374125719070435, + "sft_loss": 0.01969527266919613, + "step": 3528 + }, + { + "epoch": 5.1033984092552425, + "grad_norm": 1.4337925233330169, + "learning_rate": 4.5032473607122366e-07, + "logits/chosen": -0.78791344165802, + "logits/rejected": -0.6291660070419312, + "logps/chosen": -0.03807862848043442, + "logps/rejected": -4.798315048217773, + "loss": 0.0534, + "odds_ratio_loss": 0.0023045032285153866, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003807862987741828, + "rewards/margins": 0.4760235846042633, + "rewards/rejected": -0.4798315167427063, + "sft_loss": 0.03807862848043442, + "step": 3529 + }, + { + "epoch": 5.104844540853217, + "grad_norm": 1.0117496069143304, + "learning_rate": 4.4889410954587294e-07, + "logits/chosen": -0.8519380688667297, + "logits/rejected": -0.6767643690109253, + "logps/chosen": -0.06508355587720871, + "logps/rejected": -5.1234869956970215, + "loss": 0.0355, + "odds_ratio_loss": 0.007920399308204651, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0065083554945886135, + "rewards/margins": 0.5058403015136719, + "rewards/rejected": -0.5123487114906311, + "sft_loss": 0.06508355587720871, + "step": 3530 + }, + { + "epoch": 5.106290672451193, + "grad_norm": 0.9701742964861159, + "learning_rate": 4.4746562399548884e-07, + "logits/chosen": -0.9526634812355042, + "logits/rejected": -0.6120060086250305, + "logps/chosen": -0.0644385814666748, + "logps/rejected": -6.016313552856445, + "loss": 0.0499, + "odds_ratio_loss": 0.002297525992617011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006443857681006193, + "rewards/margins": 0.5951874852180481, + "rewards/rejected": -0.6016313433647156, + "sft_loss": 0.0644385814666748, + "step": 3531 + }, + { + "epoch": 5.107736804049169, + "grad_norm": 1.1593006104315133, + "learning_rate": 4.460392802813118e-07, + "logits/chosen": -0.837909996509552, + "logits/rejected": -0.5113778710365295, + "logps/chosen": -0.06349142640829086, + "logps/rejected": -6.190591812133789, + "loss": 0.044, + "odds_ratio_loss": 0.0006093709962442517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006349142640829086, + "rewards/margins": 0.6127101182937622, + "rewards/rejected": -0.6190592646598816, + "sft_loss": 0.06349142640829086, + "step": 3532 + }, + { + "epoch": 5.109182935647144, + "grad_norm": 1.134106514295472, + "learning_rate": 4.4461507926328813e-07, + "logits/chosen": -1.0936976671218872, + "logits/rejected": -0.8021297454833984, + "logps/chosen": -0.03575780615210533, + "logps/rejected": -4.95504093170166, + "loss": 0.0414, + "odds_ratio_loss": 0.001436459249816835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0035757804289460182, + "rewards/margins": 0.4919283390045166, + "rewards/rejected": -0.49550408124923706, + "sft_loss": 0.03575780615210533, + "step": 3533 + }, + { + "epoch": 5.110629067245119, + "grad_norm": 1.3570453478023297, + "learning_rate": 4.4319302180007544e-07, + "logits/chosen": -1.0296474695205688, + "logits/rejected": -0.7075653076171875, + "logps/chosen": -0.046218566596508026, + "logps/rejected": -4.962112903594971, + "loss": 0.0569, + "odds_ratio_loss": 0.0019982897210866213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00462185638025403, + "rewards/margins": 0.4915894865989685, + "rewards/rejected": -0.49621132016181946, + "sft_loss": 0.046218566596508026, + "step": 3534 + }, + { + "epoch": 5.112075198843095, + "grad_norm": 1.0757747569105476, + "learning_rate": 4.417731087490364e-07, + "logits/chosen": -0.9343793392181396, + "logits/rejected": -0.631847620010376, + "logps/chosen": -0.07113965600728989, + "logps/rejected": -4.919393539428711, + "loss": 0.0413, + "odds_ratio_loss": 0.001878237002529204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007113965693861246, + "rewards/margins": 0.48482537269592285, + "rewards/rejected": -0.4919393062591553, + "sft_loss": 0.07113965600728989, + "step": 3535 + }, + { + "epoch": 5.1135213304410705, + "grad_norm": 1.112657526372105, + "learning_rate": 4.4035534096624303e-07, + "logits/chosen": -0.8756897449493408, + "logits/rejected": -0.7892172932624817, + "logps/chosen": -0.04378293454647064, + "logps/rejected": -3.8384764194488525, + "loss": 0.0336, + "odds_ratio_loss": 0.003223699051886797, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004378293640911579, + "rewards/margins": 0.3794693648815155, + "rewards/rejected": -0.3838476538658142, + "sft_loss": 0.04378293454647064, + "step": 3536 + }, + { + "epoch": 5.114967462039045, + "grad_norm": 1.0098252973514328, + "learning_rate": 4.389397193064717e-07, + "logits/chosen": -0.8488441705703735, + "logits/rejected": -0.7493760585784912, + "logps/chosen": -0.01889643445611, + "logps/rejected": -4.077645301818848, + "loss": 0.0225, + "odds_ratio_loss": 0.0024902906734496355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018896434921771288, + "rewards/margins": 0.4058748781681061, + "rewards/rejected": -0.4077645540237427, + "sft_loss": 0.01889643445611, + "step": 3537 + }, + { + "epoch": 5.116413593637021, + "grad_norm": 1.1998715837416576, + "learning_rate": 4.375262446232066e-07, + "logits/chosen": -0.6818573474884033, + "logits/rejected": -0.4833465814590454, + "logps/chosen": -0.15603819489479065, + "logps/rejected": -6.571020603179932, + "loss": 0.0731, + "odds_ratio_loss": 0.002038759645074606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015603819862008095, + "rewards/margins": 0.641498327255249, + "rewards/rejected": -0.657102108001709, + "sft_loss": 0.15603819489479065, + "step": 3538 + }, + { + "epoch": 5.117859725234997, + "grad_norm": 1.399862952925075, + "learning_rate": 4.36114917768637e-07, + "logits/chosen": -0.9014131426811218, + "logits/rejected": -0.7489269971847534, + "logps/chosen": -0.020976906642317772, + "logps/rejected": -4.785622596740723, + "loss": 0.0412, + "odds_ratio_loss": 0.0008768899133428931, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002097690710797906, + "rewards/margins": 0.47646456956863403, + "rewards/rejected": -0.47856229543685913, + "sft_loss": 0.020976906642317772, + "step": 3539 + }, + { + "epoch": 5.1193058568329715, + "grad_norm": 1.3979035193631253, + "learning_rate": 4.3470573959365665e-07, + "logits/chosen": -0.9969203472137451, + "logits/rejected": -0.704779863357544, + "logps/chosen": -0.022899752482771873, + "logps/rejected": -3.8194432258605957, + "loss": 0.0423, + "odds_ratio_loss": 0.0018510606605559587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022899750620126724, + "rewards/margins": 0.3796543478965759, + "rewards/rejected": -0.38194429874420166, + "sft_loss": 0.022899752482771873, + "step": 3540 + }, + { + "epoch": 5.120751988430947, + "grad_norm": 1.1232720454542073, + "learning_rate": 4.3329871094786383e-07, + "logits/chosen": -0.9428566694259644, + "logits/rejected": -0.6366056799888611, + "logps/chosen": -0.02941741608083248, + "logps/rejected": -7.422673225402832, + "loss": 0.0437, + "odds_ratio_loss": 0.0006806895835325122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002941741608083248, + "rewards/margins": 0.7393256425857544, + "rewards/rejected": -0.742267370223999, + "sft_loss": 0.02941741608083248, + "step": 3541 + }, + { + "epoch": 5.122198120028923, + "grad_norm": 1.0411339399045239, + "learning_rate": 4.318938326795627e-07, + "logits/chosen": -0.8791346549987793, + "logits/rejected": -0.6777920722961426, + "logps/chosen": -0.022412490099668503, + "logps/rejected": -5.245683670043945, + "loss": 0.0359, + "odds_ratio_loss": 0.0014050828758627176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022412489634007215, + "rewards/margins": 0.5223270654678345, + "rewards/rejected": -0.5245683193206787, + "sft_loss": 0.022412490099668503, + "step": 3542 + }, + { + "epoch": 5.123644251626898, + "grad_norm": 1.07489111007517, + "learning_rate": 4.304911056357583e-07, + "logits/chosen": -0.7371894121170044, + "logits/rejected": -0.5516063570976257, + "logps/chosen": -0.010756559669971466, + "logps/rejected": -6.3359551429748535, + "loss": 0.0307, + "odds_ratio_loss": 0.0009100245079025626, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0010756559204310179, + "rewards/margins": 0.6325198411941528, + "rewards/rejected": -0.6335955262184143, + "sft_loss": 0.010756559669971466, + "step": 3543 + }, + { + "epoch": 5.125090383224873, + "grad_norm": 1.0239088458657568, + "learning_rate": 4.290905306621604e-07, + "logits/chosen": -0.9055813550949097, + "logits/rejected": -0.6601625680923462, + "logps/chosen": -0.014378732070326805, + "logps/rejected": -6.063301086425781, + "loss": 0.0239, + "odds_ratio_loss": 0.0005538854748010635, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014378732303157449, + "rewards/margins": 0.6048922538757324, + "rewards/rejected": -0.606330156326294, + "sft_loss": 0.014378732070326805, + "step": 3544 + }, + { + "epoch": 5.126536514822849, + "grad_norm": 1.2712624476901007, + "learning_rate": 4.276921086031815e-07, + "logits/chosen": -0.9251024127006531, + "logits/rejected": -0.7072352170944214, + "logps/chosen": -0.057558659464120865, + "logps/rejected": -4.377816677093506, + "loss": 0.0389, + "odds_ratio_loss": 0.0034858500584959984, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005755866877734661, + "rewards/margins": 0.4320257604122162, + "rewards/rejected": -0.4377816617488861, + "sft_loss": 0.057558659464120865, + "step": 3545 + }, + { + "epoch": 5.127982646420825, + "grad_norm": 0.983345093814679, + "learning_rate": 4.2629584030193564e-07, + "logits/chosen": -0.8956122994422913, + "logits/rejected": -0.6748299598693848, + "logps/chosen": -0.07077537477016449, + "logps/rejected": -4.568532943725586, + "loss": 0.0295, + "odds_ratio_loss": 0.000756526249460876, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007077537477016449, + "rewards/margins": 0.4497757852077484, + "rewards/rejected": -0.45685333013534546, + "sft_loss": 0.07077537477016449, + "step": 3546 + }, + { + "epoch": 5.1294287780187995, + "grad_norm": 1.0892353016146112, + "learning_rate": 4.2490172660023705e-07, + "logits/chosen": -0.8571529984474182, + "logits/rejected": -0.6739146709442139, + "logps/chosen": -0.014833889901638031, + "logps/rejected": -6.780934810638428, + "loss": 0.0406, + "odds_ratio_loss": 0.0011146971955895424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014833889435976744, + "rewards/margins": 0.6766101121902466, + "rewards/rejected": -0.6780935525894165, + "sft_loss": 0.014833889901638031, + "step": 3547 + }, + { + "epoch": 5.130874909616775, + "grad_norm": 1.1610728467235858, + "learning_rate": 4.235097683386022e-07, + "logits/chosen": -0.913325309753418, + "logits/rejected": -0.6378690004348755, + "logps/chosen": -0.011191330850124359, + "logps/rejected": -4.821349143981934, + "loss": 0.0471, + "odds_ratio_loss": 0.0005949364276602864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0011191332014277577, + "rewards/margins": 0.48101580142974854, + "rewards/rejected": -0.4821348786354065, + "sft_loss": 0.011191330850124359, + "step": 3548 + }, + { + "epoch": 5.132321041214751, + "grad_norm": 1.1498882814890075, + "learning_rate": 4.2211996635624867e-07, + "logits/chosen": -0.8509616255760193, + "logits/rejected": -0.7862538695335388, + "logps/chosen": -0.027645738795399666, + "logps/rejected": -5.407499313354492, + "loss": 0.0304, + "odds_ratio_loss": 0.0012020114809274673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0027645740192383528, + "rewards/margins": 0.5379853844642639, + "rewards/rejected": -0.5407499670982361, + "sft_loss": 0.027645738795399666, + "step": 3549 + }, + { + "epoch": 5.133767172812726, + "grad_norm": 1.045508537733952, + "learning_rate": 4.207323214910925e-07, + "logits/chosen": -0.8098764419555664, + "logits/rejected": -0.673413872718811, + "logps/chosen": -0.019348647445440292, + "logps/rejected": -4.263792037963867, + "loss": 0.0281, + "odds_ratio_loss": 0.0014103625435382128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019348645582795143, + "rewards/margins": 0.4244443476200104, + "rewards/rejected": -0.4263792037963867, + "sft_loss": 0.019348647445440292, + "step": 3550 + }, + { + "epoch": 5.135213304410701, + "grad_norm": 0.9118889895034206, + "learning_rate": 4.193468345797511e-07, + "logits/chosen": -0.6067082285881042, + "logits/rejected": -0.3711710274219513, + "logps/chosen": -0.019313864409923553, + "logps/rejected": -6.416814804077148, + "loss": 0.0337, + "odds_ratio_loss": 0.0010191251058131456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019313863012939692, + "rewards/margins": 0.6397501230239868, + "rewards/rejected": -0.6416815519332886, + "sft_loss": 0.019313864409923553, + "step": 3551 + }, + { + "epoch": 5.136659436008677, + "grad_norm": 1.3245124770343912, + "learning_rate": 4.1796350645753795e-07, + "logits/chosen": -0.93780517578125, + "logits/rejected": -0.7108378410339355, + "logps/chosen": -0.04983704164624214, + "logps/rejected": -3.885666608810425, + "loss": 0.0479, + "odds_ratio_loss": 0.0015096311690285802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004983704537153244, + "rewards/margins": 0.3835829496383667, + "rewards/rejected": -0.38856664299964905, + "sft_loss": 0.04983704164624214, + "step": 3552 + }, + { + "epoch": 5.138105567606652, + "grad_norm": 1.1490700715634683, + "learning_rate": 4.1658233795846833e-07, + "logits/chosen": -0.8055789470672607, + "logits/rejected": -0.6550903916358948, + "logps/chosen": -0.017309710383415222, + "logps/rejected": -4.0452656745910645, + "loss": 0.0241, + "odds_ratio_loss": 0.013794164173305035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017309710383415222, + "rewards/margins": 0.40279555320739746, + "rewards/rejected": -0.40452659130096436, + "sft_loss": 0.017309710383415222, + "step": 3553 + }, + { + "epoch": 5.1395516992046275, + "grad_norm": 1.0066328630819037, + "learning_rate": 4.152033299152533e-07, + "logits/chosen": -1.1695951223373413, + "logits/rejected": -0.7323697805404663, + "logps/chosen": -0.01908089965581894, + "logps/rejected": -6.578709125518799, + "loss": 0.0284, + "odds_ratio_loss": 0.004583199508488178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001908089965581894, + "rewards/margins": 0.6559628844261169, + "rewards/rejected": -0.657870888710022, + "sft_loss": 0.01908089965581894, + "step": 3554 + }, + { + "epoch": 5.140997830802603, + "grad_norm": 1.2003316460791233, + "learning_rate": 4.138264831593021e-07, + "logits/chosen": -0.8967068791389465, + "logits/rejected": -0.6410534381866455, + "logps/chosen": -0.05355698615312576, + "logps/rejected": -5.4609832763671875, + "loss": 0.0489, + "odds_ratio_loss": 0.045562319457530975, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.005355698522180319, + "rewards/margins": 0.5407426357269287, + "rewards/rejected": -0.5460983514785767, + "sft_loss": 0.05355698615312576, + "step": 3555 + }, + { + "epoch": 5.142443962400579, + "grad_norm": 1.2024952735105818, + "learning_rate": 4.1245179852071967e-07, + "logits/chosen": -0.9525247812271118, + "logits/rejected": -0.7273567914962769, + "logps/chosen": -0.0484442338347435, + "logps/rejected": -5.277726173400879, + "loss": 0.0394, + "odds_ratio_loss": 0.004583648405969143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00484442338347435, + "rewards/margins": 0.5229281783103943, + "rewards/rejected": -0.5277726054191589, + "sft_loss": 0.0484442338347435, + "step": 3556 + }, + { + "epoch": 5.143890093998554, + "grad_norm": 1.248422159406862, + "learning_rate": 4.110792768283091e-07, + "logits/chosen": -0.9277870059013367, + "logits/rejected": -0.5297996401786804, + "logps/chosen": -0.03848704323172569, + "logps/rejected": -6.8542633056640625, + "loss": 0.0542, + "odds_ratio_loss": 0.002286599949002266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003848704043775797, + "rewards/margins": 0.6815776228904724, + "rewards/rejected": -0.6854263544082642, + "sft_loss": 0.03848704323172569, + "step": 3557 + }, + { + "epoch": 5.145336225596529, + "grad_norm": 0.9575974433447748, + "learning_rate": 4.0970891890956995e-07, + "logits/chosen": -1.039902925491333, + "logits/rejected": -0.7400610446929932, + "logps/chosen": -0.012508687563240528, + "logps/rejected": -6.0867815017700195, + "loss": 0.0329, + "odds_ratio_loss": 0.0008413865580223501, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0012508687796071172, + "rewards/margins": 0.6074272990226746, + "rewards/rejected": -0.6086782217025757, + "sft_loss": 0.012508687563240528, + "step": 3558 + }, + { + "epoch": 5.146782357194505, + "grad_norm": 1.514298510265316, + "learning_rate": 4.0834072559069457e-07, + "logits/chosen": -0.8292897939682007, + "logits/rejected": -0.5984725952148438, + "logps/chosen": -0.034817490726709366, + "logps/rejected": -5.678520202636719, + "loss": 0.0387, + "odds_ratio_loss": 0.006148543208837509, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003481748979538679, + "rewards/margins": 0.5643702745437622, + "rewards/rejected": -0.5678520202636719, + "sft_loss": 0.034817490726709366, + "step": 3559 + }, + { + "epoch": 5.14822848879248, + "grad_norm": 1.236658751957511, + "learning_rate": 4.069746976965733e-07, + "logits/chosen": -0.9042907953262329, + "logits/rejected": -0.6579476594924927, + "logps/chosen": -0.05839492008090019, + "logps/rejected": -5.013850688934326, + "loss": 0.0544, + "odds_ratio_loss": 0.0038068797439336777, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005839492194354534, + "rewards/margins": 0.4955455958843231, + "rewards/rejected": -0.5013850927352905, + "sft_loss": 0.05839492008090019, + "step": 3560 + }, + { + "epoch": 5.1496746203904555, + "grad_norm": 1.566794146100296, + "learning_rate": 4.0561083605078884e-07, + "logits/chosen": -0.7548511028289795, + "logits/rejected": -0.6357748508453369, + "logps/chosen": -0.06090143322944641, + "logps/rejected": -3.7928695678710938, + "loss": 0.044, + "odds_ratio_loss": 0.009436688385903835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006090143695473671, + "rewards/margins": 0.3731968402862549, + "rewards/rejected": -0.3792869448661804, + "sft_loss": 0.06090143322944641, + "step": 3561 + }, + { + "epoch": 5.151120751988431, + "grad_norm": 0.815848357915086, + "learning_rate": 4.0424914147561794e-07, + "logits/chosen": -0.8279675245285034, + "logits/rejected": -0.7303451299667358, + "logps/chosen": -0.014521737582981586, + "logps/rejected": -6.461040019989014, + "loss": 0.024, + "odds_ratio_loss": 0.0017956249648705125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014521738048642874, + "rewards/margins": 0.6446517705917358, + "rewards/rejected": -0.6461040377616882, + "sft_loss": 0.014521737582981586, + "step": 3562 + }, + { + "epoch": 5.152566883586406, + "grad_norm": 1.71662469859775, + "learning_rate": 4.028896147920311e-07, + "logits/chosen": -0.964228630065918, + "logits/rejected": -0.7277406454086304, + "logps/chosen": -0.05546468123793602, + "logps/rejected": -4.663585186004639, + "loss": 0.043, + "odds_ratio_loss": 0.0061240424402058125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005546468310058117, + "rewards/margins": 0.4608120024204254, + "rewards/rejected": -0.466358482837677, + "sft_loss": 0.05546468123793602, + "step": 3563 + }, + { + "epoch": 5.154013015184382, + "grad_norm": 1.041441761654688, + "learning_rate": 4.0153225681969305e-07, + "logits/chosen": -0.9566247463226318, + "logits/rejected": -0.5977626442909241, + "logps/chosen": -0.04568367078900337, + "logps/rejected": -4.601743698120117, + "loss": 0.0402, + "odds_ratio_loss": 0.002992587396875024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004568367265164852, + "rewards/margins": 0.45560598373413086, + "rewards/rejected": -0.4601743519306183, + "sft_loss": 0.04568367078900337, + "step": 3564 + }, + { + "epoch": 5.155459146782357, + "grad_norm": 1.1027106642119917, + "learning_rate": 4.0017706837695897e-07, + "logits/chosen": -0.8168602585792542, + "logits/rejected": -0.6069326400756836, + "logps/chosen": -0.019194474443793297, + "logps/rejected": -4.3833513259887695, + "loss": 0.0617, + "odds_ratio_loss": 0.0007897147443145514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001919447211548686, + "rewards/margins": 0.4364157021045685, + "rewards/rejected": -0.4383351504802704, + "sft_loss": 0.019194474443793297, + "step": 3565 + }, + { + "epoch": 5.156905278380332, + "grad_norm": 1.1202583156584103, + "learning_rate": 3.988240502808784e-07, + "logits/chosen": -0.8883881568908691, + "logits/rejected": -0.6389689445495605, + "logps/chosen": -0.11527302116155624, + "logps/rejected": -4.131171226501465, + "loss": 0.0563, + "odds_ratio_loss": 0.004591973032802343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011527301743626595, + "rewards/margins": 0.4015898108482361, + "rewards/rejected": -0.41311711072921753, + "sft_loss": 0.11527302116155624, + "step": 3566 + }, + { + "epoch": 5.158351409978308, + "grad_norm": 1.0016267585153877, + "learning_rate": 3.97473203347189e-07, + "logits/chosen": -1.000503420829773, + "logits/rejected": -0.7691615223884583, + "logps/chosen": -0.02070147544145584, + "logps/rejected": -6.152464866638184, + "loss": 0.026, + "odds_ratio_loss": 0.0008875165949575603, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002070147544145584, + "rewards/margins": 0.6131762862205505, + "rewards/rejected": -0.6152464151382446, + "sft_loss": 0.02070147544145584, + "step": 3567 + }, + { + "epoch": 5.159797541576284, + "grad_norm": 0.7536591073806905, + "learning_rate": 3.9612452839032384e-07, + "logits/chosen": -0.7870176434516907, + "logits/rejected": -0.5479042530059814, + "logps/chosen": -0.018979590386152267, + "logps/rejected": -5.483916282653809, + "loss": 0.0241, + "odds_ratio_loss": 0.0007888816762715578, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018979592714458704, + "rewards/margins": 0.5464937090873718, + "rewards/rejected": -0.5483916401863098, + "sft_loss": 0.018979590386152267, + "step": 3568 + }, + { + "epoch": 5.161243673174259, + "grad_norm": 0.9994488183017014, + "learning_rate": 3.9477802622340217e-07, + "logits/chosen": -0.8722624778747559, + "logits/rejected": -0.7047505974769592, + "logps/chosen": -0.030218884348869324, + "logps/rejected": -5.5768723487854, + "loss": 0.0393, + "odds_ratio_loss": 0.0025864059571176767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030218884348869324, + "rewards/margins": 0.5546653270721436, + "rewards/rejected": -0.5576872229576111, + "sft_loss": 0.030218884348869324, + "step": 3569 + }, + { + "epoch": 5.162689804772234, + "grad_norm": 1.0533039507181938, + "learning_rate": 3.934336976582355e-07, + "logits/chosen": -1.0874637365341187, + "logits/rejected": -0.7242406010627747, + "logps/chosen": -0.022548483684659004, + "logps/rejected": -5.560819625854492, + "loss": 0.0218, + "odds_ratio_loss": 0.0016226425068452954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022548483684659004, + "rewards/margins": 0.5538271069526672, + "rewards/rejected": -0.5560819506645203, + "sft_loss": 0.022548483684659004, + "step": 3570 + }, + { + "epoch": 5.16413593637021, + "grad_norm": 1.0401194910356404, + "learning_rate": 3.9209154350532535e-07, + "logits/chosen": -0.6649314165115356, + "logits/rejected": -0.5735057592391968, + "logps/chosen": -0.0342964343726635, + "logps/rejected": -4.208144664764404, + "loss": 0.0313, + "odds_ratio_loss": 0.005412380211055279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0034296438097953796, + "rewards/margins": 0.4173848330974579, + "rewards/rejected": -0.4208144545555115, + "sft_loss": 0.0342964343726635, + "step": 3571 + }, + { + "epoch": 5.165582067968185, + "grad_norm": 1.12759993077361, + "learning_rate": 3.9075156457385994e-07, + "logits/chosen": -0.8029493093490601, + "logits/rejected": -0.6737925410270691, + "logps/chosen": -0.03257250413298607, + "logps/rejected": -4.218453407287598, + "loss": 0.0459, + "odds_ratio_loss": 0.0021414184011518955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003257250413298607, + "rewards/margins": 0.41858813166618347, + "rewards/rejected": -0.42184534668922424, + "sft_loss": 0.03257250413298607, + "step": 3572 + }, + { + "epoch": 5.16702819956616, + "grad_norm": 1.228504842476863, + "learning_rate": 3.894137616717197e-07, + "logits/chosen": -0.9006476402282715, + "logits/rejected": -0.9127538204193115, + "logps/chosen": -0.02118997648358345, + "logps/rejected": -5.368014335632324, + "loss": 0.0517, + "odds_ratio_loss": 0.0006409134948626161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0021189977414906025, + "rewards/margins": 0.5346824526786804, + "rewards/rejected": -0.5368014574050903, + "sft_loss": 0.02118997648358345, + "step": 3573 + }, + { + "epoch": 5.168474331164136, + "grad_norm": 1.081205055296991, + "learning_rate": 3.8807813560546876e-07, + "logits/chosen": -0.8775331377983093, + "logits/rejected": -0.7043969035148621, + "logps/chosen": -0.021752668544650078, + "logps/rejected": -3.4318575859069824, + "loss": 0.0464, + "odds_ratio_loss": 0.004134073853492737, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0021752668544650078, + "rewards/margins": 0.34101051092147827, + "rewards/rejected": -0.34318578243255615, + "sft_loss": 0.021752668544650078, + "step": 3574 + }, + { + "epoch": 5.169920462762112, + "grad_norm": 1.0730844091789573, + "learning_rate": 3.86744687180363e-07, + "logits/chosen": -0.725942850112915, + "logits/rejected": -0.6615370512008667, + "logps/chosen": -0.022522861137986183, + "logps/rejected": -5.256943702697754, + "loss": 0.0366, + "odds_ratio_loss": 0.002798852976411581, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022522863000631332, + "rewards/margins": 0.5234421491622925, + "rewards/rejected": -0.5256944298744202, + "sft_loss": 0.022522861137986183, + "step": 3575 + }, + { + "epoch": 5.171366594360086, + "grad_norm": 0.9166018136013868, + "learning_rate": 3.8541341720034247e-07, + "logits/chosen": -0.8853042721748352, + "logits/rejected": -0.6347314715385437, + "logps/chosen": -0.03648467734456062, + "logps/rejected": -6.572963237762451, + "loss": 0.033, + "odds_ratio_loss": 0.003740268060937524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003648467594757676, + "rewards/margins": 0.6536478400230408, + "rewards/rejected": -0.6572964191436768, + "sft_loss": 0.03648467734456062, + "step": 3576 + }, + { + "epoch": 5.172812725958062, + "grad_norm": 0.9377469551604307, + "learning_rate": 3.840843264680349e-07, + "logits/chosen": -0.8311636447906494, + "logits/rejected": -0.6866295337677002, + "logps/chosen": -0.06475996226072311, + "logps/rejected": -5.250019550323486, + "loss": 0.0387, + "odds_ratio_loss": 0.005017520859837532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006475997157394886, + "rewards/margins": 0.518526017665863, + "rewards/rejected": -0.5250020027160645, + "sft_loss": 0.06475996226072311, + "step": 3577 + }, + { + "epoch": 5.174258857556038, + "grad_norm": 1.2834851376400325, + "learning_rate": 3.8275741578475306e-07, + "logits/chosen": -0.8156872987747192, + "logits/rejected": -0.6633107662200928, + "logps/chosen": -0.06075110286474228, + "logps/rejected": -5.4929046630859375, + "loss": 0.074, + "odds_ratio_loss": 0.0036945268511772156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006075110752135515, + "rewards/margins": 0.5432153940200806, + "rewards/rejected": -0.5492904782295227, + "sft_loss": 0.06075110286474228, + "step": 3578 + }, + { + "epoch": 5.1757049891540134, + "grad_norm": 1.1359955274660638, + "learning_rate": 3.814326859504984e-07, + "logits/chosen": -0.9458404183387756, + "logits/rejected": -0.8643139600753784, + "logps/chosen": -0.04484132304787636, + "logps/rejected": -3.9378392696380615, + "loss": 0.0407, + "odds_ratio_loss": 0.0018446637550368905, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004484132397919893, + "rewards/margins": 0.38929980993270874, + "rewards/rejected": -0.39378395676612854, + "sft_loss": 0.04484132304787636, + "step": 3579 + }, + { + "epoch": 5.177151120751988, + "grad_norm": 1.1248681812196126, + "learning_rate": 3.801101377639533e-07, + "logits/chosen": -0.885506272315979, + "logits/rejected": -0.6628941893577576, + "logps/chosen": -0.05343920737504959, + "logps/rejected": -6.036564826965332, + "loss": 0.0496, + "odds_ratio_loss": 0.0014431718736886978, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005343920085579157, + "rewards/margins": 0.5983126163482666, + "rewards/rejected": -0.603656530380249, + "sft_loss": 0.05343920737504959, + "step": 3580 + }, + { + "epoch": 5.178597252349964, + "grad_norm": 2.1787270917311545, + "learning_rate": 3.7878977202248887e-07, + "logits/chosen": -1.0105148553848267, + "logits/rejected": -0.766710102558136, + "logps/chosen": -0.028319966048002243, + "logps/rejected": -5.47403621673584, + "loss": 0.0372, + "odds_ratio_loss": 0.0020666704513132572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002831996651366353, + "rewards/margins": 0.5445716381072998, + "rewards/rejected": -0.5474036335945129, + "sft_loss": 0.028319966048002243, + "step": 3581 + }, + { + "epoch": 5.18004338394794, + "grad_norm": 1.0220245059137052, + "learning_rate": 3.7747158952215716e-07, + "logits/chosen": -0.7910235524177551, + "logits/rejected": -0.7252139449119568, + "logps/chosen": -0.027546580880880356, + "logps/rejected": -4.437729835510254, + "loss": 0.0296, + "odds_ratio_loss": 0.003080027410760522, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0027546582277864218, + "rewards/margins": 0.44101834297180176, + "rewards/rejected": -0.4437730312347412, + "sft_loss": 0.027546580880880356, + "step": 3582 + }, + { + "epoch": 5.181489515545914, + "grad_norm": 1.0665922920451818, + "learning_rate": 3.7615559105769633e-07, + "logits/chosen": -0.9450021982192993, + "logits/rejected": -0.7053030729293823, + "logps/chosen": -0.05629965662956238, + "logps/rejected": -5.696890354156494, + "loss": 0.0455, + "odds_ratio_loss": 0.0044265990145504475, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00562996556982398, + "rewards/margins": 0.5640590786933899, + "rewards/rejected": -0.5696890354156494, + "sft_loss": 0.05629965662956238, + "step": 3583 + }, + { + "epoch": 5.18293564714389, + "grad_norm": 1.1682569222454637, + "learning_rate": 3.748417774225259e-07, + "logits/chosen": -1.0742039680480957, + "logits/rejected": -0.7987134456634521, + "logps/chosen": -0.023768005892634392, + "logps/rejected": -5.302433490753174, + "loss": 0.0289, + "odds_ratio_loss": 0.0009636090835556388, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002376800635829568, + "rewards/margins": 0.5278666019439697, + "rewards/rejected": -0.5302433967590332, + "sft_loss": 0.023768005892634392, + "step": 3584 + }, + { + "epoch": 5.184381778741866, + "grad_norm": 1.1421657305611848, + "learning_rate": 3.7353014940874993e-07, + "logits/chosen": -0.9357766509056091, + "logits/rejected": -0.648613452911377, + "logps/chosen": -0.06362772732973099, + "logps/rejected": -6.172553539276123, + "loss": 0.047, + "odds_ratio_loss": 0.00930915866047144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006362773012369871, + "rewards/margins": 0.6108925938606262, + "rewards/rejected": -0.6172553300857544, + "sft_loss": 0.06362772732973099, + "step": 3585 + }, + { + "epoch": 5.185827910339841, + "grad_norm": 0.8659853390467411, + "learning_rate": 3.722207078071533e-07, + "logits/chosen": -1.0251359939575195, + "logits/rejected": -0.760617733001709, + "logps/chosen": -0.05932139605283737, + "logps/rejected": -5.85208797454834, + "loss": 0.0315, + "odds_ratio_loss": 0.00122246949467808, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005932139698415995, + "rewards/margins": 0.5792766809463501, + "rewards/rejected": -0.5852087736129761, + "sft_loss": 0.05932139605283737, + "step": 3586 + }, + { + "epoch": 5.187274041937816, + "grad_norm": 0.7003907957478049, + "learning_rate": 3.7091345340720226e-07, + "logits/chosen": -0.8258812427520752, + "logits/rejected": -0.7379279136657715, + "logps/chosen": -0.006125980988144875, + "logps/rejected": -5.172117233276367, + "loss": 0.0164, + "odds_ratio_loss": 0.0004759537405334413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0006125981453806162, + "rewards/margins": 0.516599178314209, + "rewards/rejected": -0.5172117352485657, + "sft_loss": 0.006125980988144875, + "step": 3587 + }, + { + "epoch": 5.188720173535792, + "grad_norm": 0.9972301335770085, + "learning_rate": 3.696083869970472e-07, + "logits/chosen": -0.8752817511558533, + "logits/rejected": -0.8511292934417725, + "logps/chosen": -0.010707555338740349, + "logps/rejected": -6.42230749130249, + "loss": 0.0448, + "odds_ratio_loss": 0.0006266254931688309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0010707555338740349, + "rewards/margins": 0.6411600112915039, + "rewards/rejected": -0.642230749130249, + "sft_loss": 0.010707555338740349, + "step": 3588 + }, + { + "epoch": 5.190166305133767, + "grad_norm": 1.0174824776073386, + "learning_rate": 3.683055093635161e-07, + "logits/chosen": -0.9354603290557861, + "logits/rejected": -0.7528461217880249, + "logps/chosen": -0.04549839720129967, + "logps/rejected": -4.9645256996154785, + "loss": 0.0321, + "odds_ratio_loss": 0.003526362357661128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004549840465188026, + "rewards/margins": 0.4919027090072632, + "rewards/rejected": -0.49645254015922546, + "sft_loss": 0.04549839720129967, + "step": 3589 + }, + { + "epoch": 5.191612436731742, + "grad_norm": 1.044001053692902, + "learning_rate": 3.670048212921202e-07, + "logits/chosen": -0.8328427672386169, + "logits/rejected": -0.7321763038635254, + "logps/chosen": -0.06193680316209793, + "logps/rejected": -5.603020668029785, + "loss": 0.0464, + "odds_ratio_loss": 0.0021486992482095957, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006193680688738823, + "rewards/margins": 0.5541083812713623, + "rewards/rejected": -0.5603021383285522, + "sft_loss": 0.06193680316209793, + "step": 3590 + }, + { + "epoch": 5.193058568329718, + "grad_norm": 1.0798462980834291, + "learning_rate": 3.657063235670468e-07, + "logits/chosen": -0.8687400221824646, + "logits/rejected": -0.6818861961364746, + "logps/chosen": -0.014150983653962612, + "logps/rejected": -5.715865135192871, + "loss": 0.0237, + "odds_ratio_loss": 0.0009401044226251543, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014150984352454543, + "rewards/margins": 0.5701714158058167, + "rewards/rejected": -0.5715864896774292, + "sft_loss": 0.014150983653962612, + "step": 3591 + }, + { + "epoch": 5.194504699927694, + "grad_norm": 1.15719150666348, + "learning_rate": 3.644100169711679e-07, + "logits/chosen": -1.0285594463348389, + "logits/rejected": -0.725699245929718, + "logps/chosen": -0.03213905915617943, + "logps/rejected": -4.655622482299805, + "loss": 0.0505, + "odds_ratio_loss": 0.001043865573592484, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0032139059621840715, + "rewards/margins": 0.4623483717441559, + "rewards/rejected": -0.46556228399276733, + "sft_loss": 0.03213905915617943, + "step": 3592 + }, + { + "epoch": 5.195950831525669, + "grad_norm": 1.2436932447416695, + "learning_rate": 3.6311590228602995e-07, + "logits/chosen": -0.9513838887214661, + "logits/rejected": -0.8044277429580688, + "logps/chosen": -0.06614846736192703, + "logps/rejected": -3.153280735015869, + "loss": 0.0561, + "odds_ratio_loss": 0.04593777656555176, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.006614846643060446, + "rewards/margins": 0.30871322751045227, + "rewards/rejected": -0.31532809138298035, + "sft_loss": 0.06614846736192703, + "step": 3593 + }, + { + "epoch": 5.197396963123644, + "grad_norm": 1.119076034023042, + "learning_rate": 3.618239802918595e-07, + "logits/chosen": -0.8513356447219849, + "logits/rejected": -0.6646866202354431, + "logps/chosen": -0.05874083191156387, + "logps/rejected": -5.149399757385254, + "loss": 0.0623, + "odds_ratio_loss": 0.0020238799042999744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005874083377420902, + "rewards/margins": 0.5090658664703369, + "rewards/rejected": -0.5149399638175964, + "sft_loss": 0.05874083191156387, + "step": 3594 + }, + { + "epoch": 5.19884309472162, + "grad_norm": 1.283803578447028, + "learning_rate": 3.605342517675609e-07, + "logits/chosen": -1.0322444438934326, + "logits/rejected": -0.7828861474990845, + "logps/chosen": -0.03251064568758011, + "logps/rejected": -4.867115497589111, + "loss": 0.0395, + "odds_ratio_loss": 0.0020458081271499395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003251064568758011, + "rewards/margins": 0.48346051573753357, + "rewards/rejected": -0.4867115318775177, + "sft_loss": 0.03251064568758011, + "step": 3595 + }, + { + "epoch": 5.200289226319595, + "grad_norm": 1.0396061625618158, + "learning_rate": 3.592467174907172e-07, + "logits/chosen": -0.6187576651573181, + "logits/rejected": -0.5112270712852478, + "logps/chosen": -0.022534213960170746, + "logps/rejected": -5.44777774810791, + "loss": 0.0381, + "odds_ratio_loss": 0.002426896011456847, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022534215822815895, + "rewards/margins": 0.5425243973731995, + "rewards/rejected": -0.5447777509689331, + "sft_loss": 0.022534213960170746, + "step": 3596 + }, + { + "epoch": 5.2017353579175705, + "grad_norm": 0.8656320470579015, + "learning_rate": 3.5796137823758653e-07, + "logits/chosen": -0.8526214957237244, + "logits/rejected": -0.6017423272132874, + "logps/chosen": -0.02538936212658882, + "logps/rejected": -4.2573161125183105, + "loss": 0.028, + "odds_ratio_loss": 0.0021512117236852646, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002538936212658882, + "rewards/margins": 0.42319270968437195, + "rewards/rejected": -0.4257315993309021, + "sft_loss": 0.02538936212658882, + "step": 3597 + }, + { + "epoch": 5.203181489515546, + "grad_norm": 1.1406021212993034, + "learning_rate": 3.5667823478310545e-07, + "logits/chosen": -1.1594440937042236, + "logits/rejected": -0.7526164650917053, + "logps/chosen": -0.01821194216609001, + "logps/rejected": -5.545048236846924, + "loss": 0.0433, + "odds_ratio_loss": 0.0004932095180265605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018211941933259368, + "rewards/margins": 0.5526836514472961, + "rewards/rejected": -0.5545048713684082, + "sft_loss": 0.01821194216609001, + "step": 3598 + }, + { + "epoch": 5.204627621113521, + "grad_norm": 1.1263883171005709, + "learning_rate": 3.553972879008862e-07, + "logits/chosen": -0.9884305000305176, + "logits/rejected": -0.7626382112503052, + "logps/chosen": -0.04639644920825958, + "logps/rejected": -5.2319722175598145, + "loss": 0.036, + "odds_ratio_loss": 0.004956886637955904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004639644641429186, + "rewards/margins": 0.518557608127594, + "rewards/rejected": -0.5231972336769104, + "sft_loss": 0.04639644920825958, + "step": 3599 + }, + { + "epoch": 5.206073752711497, + "grad_norm": 0.9227492998883203, + "learning_rate": 3.5411853836321634e-07, + "logits/chosen": -0.9538089036941528, + "logits/rejected": -0.7468767166137695, + "logps/chosen": -0.02350049838423729, + "logps/rejected": -4.7276201248168945, + "loss": 0.0261, + "odds_ratio_loss": 0.0009906530613079667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002350050024688244, + "rewards/margins": 0.4704119563102722, + "rewards/rejected": -0.47276201844215393, + "sft_loss": 0.02350049838423729, + "step": 3600 + }, + { + "epoch": 5.207519884309472, + "grad_norm": 0.8800865521581338, + "learning_rate": 3.528419869410584e-07, + "logits/chosen": -0.9478657245635986, + "logits/rejected": -0.6646949648857117, + "logps/chosen": -0.0317431278526783, + "logps/rejected": -4.364032745361328, + "loss": 0.026, + "odds_ratio_loss": 0.0042336005717515945, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0031743128784000874, + "rewards/margins": 0.4332289695739746, + "rewards/rejected": -0.4364032745361328, + "sft_loss": 0.0317431278526783, + "step": 3601 + }, + { + "epoch": 5.208966015907448, + "grad_norm": 0.6992216686421211, + "learning_rate": 3.51567634404049e-07, + "logits/chosen": -0.8134947419166565, + "logits/rejected": -0.8427659869194031, + "logps/chosen": -0.010465124621987343, + "logps/rejected": -3.6230411529541016, + "loss": 0.0108, + "odds_ratio_loss": 0.0008967835456132889, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001046512508764863, + "rewards/margins": 0.3612575829029083, + "rewards/rejected": -0.36230409145355225, + "sft_loss": 0.010465124621987343, + "step": 3602 + }, + { + "epoch": 5.210412147505423, + "grad_norm": 1.2174837052780079, + "learning_rate": 3.5029548152050214e-07, + "logits/chosen": -0.8519737124443054, + "logits/rejected": -0.7378709316253662, + "logps/chosen": -0.02950909174978733, + "logps/rejected": -4.193276405334473, + "loss": 0.0394, + "odds_ratio_loss": 0.00271158991381526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0029509093146771193, + "rewards/margins": 0.4163767695426941, + "rewards/rejected": -0.41932767629623413, + "sft_loss": 0.02950909174978733, + "step": 3603 + }, + { + "epoch": 5.2118582791033985, + "grad_norm": 1.1976004631004957, + "learning_rate": 3.490255290574011e-07, + "logits/chosen": -0.6854555606842041, + "logits/rejected": -0.5996547937393188, + "logps/chosen": -0.057780683040618896, + "logps/rejected": -5.569599628448486, + "loss": 0.0346, + "odds_ratio_loss": 0.010212712921202183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005778068210929632, + "rewards/margins": 0.5511819124221802, + "rewards/rejected": -0.5569599270820618, + "sft_loss": 0.057780683040618896, + "step": 3604 + }, + { + "epoch": 5.213304410701374, + "grad_norm": 0.9238247408528008, + "learning_rate": 3.4775777778040774e-07, + "logits/chosen": -1.1148799657821655, + "logits/rejected": -0.7859690189361572, + "logps/chosen": -0.03792307525873184, + "logps/rejected": -5.748502731323242, + "loss": 0.0281, + "odds_ratio_loss": 0.001623988151550293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003792307572439313, + "rewards/margins": 0.5710579752922058, + "rewards/rejected": -0.5748502612113953, + "sft_loss": 0.03792307525873184, + "step": 3605 + }, + { + "epoch": 5.214750542299349, + "grad_norm": 1.077370433581713, + "learning_rate": 3.464922284538514e-07, + "logits/chosen": -1.1062169075012207, + "logits/rejected": -0.7723128795623779, + "logps/chosen": -0.013403604738414288, + "logps/rejected": -7.84562873840332, + "loss": 0.0353, + "odds_ratio_loss": 0.00027603365015238523, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013403603807091713, + "rewards/margins": 0.783222496509552, + "rewards/rejected": -0.784562885761261, + "sft_loss": 0.013403604738414288, + "step": 3606 + }, + { + "epoch": 5.216196673897325, + "grad_norm": 1.0207445296410194, + "learning_rate": 3.4522888184073827e-07, + "logits/chosen": -0.8922250270843506, + "logits/rejected": -0.7089736461639404, + "logps/chosen": -0.02101828157901764, + "logps/rejected": -5.1013641357421875, + "loss": 0.0418, + "odds_ratio_loss": 0.002262045629322529, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002101828111335635, + "rewards/margins": 0.5080346465110779, + "rewards/rejected": -0.5101364850997925, + "sft_loss": 0.02101828157901764, + "step": 3607 + }, + { + "epoch": 5.2176428054953, + "grad_norm": 0.9616877093511352, + "learning_rate": 3.439677387027444e-07, + "logits/chosen": -0.8647952675819397, + "logits/rejected": -0.7060336470603943, + "logps/chosen": -0.03401113301515579, + "logps/rejected": -4.374388694763184, + "loss": 0.0328, + "odds_ratio_loss": 0.002195565262809396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0034011139068752527, + "rewards/margins": 0.4340377449989319, + "rewards/rejected": -0.4374389052391052, + "sft_loss": 0.03401113301515579, + "step": 3608 + }, + { + "epoch": 5.219088937093275, + "grad_norm": 1.5959179216953558, + "learning_rate": 3.427087998002172e-07, + "logits/chosen": -0.7751748561859131, + "logits/rejected": -0.7734540104866028, + "logps/chosen": -0.02873246558010578, + "logps/rejected": -4.718321800231934, + "loss": 0.0603, + "odds_ratio_loss": 0.001208885689266026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0028732463251799345, + "rewards/margins": 0.46895891427993774, + "rewards/rejected": -0.47183215618133545, + "sft_loss": 0.02873246558010578, + "step": 3609 + }, + { + "epoch": 5.220535068691251, + "grad_norm": 1.0541829578203996, + "learning_rate": 3.4145206589217515e-07, + "logits/chosen": -0.8666414022445679, + "logits/rejected": -0.6371941566467285, + "logps/chosen": -0.010170397348701954, + "logps/rejected": -6.458046913146973, + "loss": 0.0202, + "odds_ratio_loss": 0.00042813006439246237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0010170398745685816, + "rewards/margins": 0.6447876691818237, + "rewards/rejected": -0.6458047032356262, + "sft_loss": 0.010170397348701954, + "step": 3610 + }, + { + "epoch": 5.2219812002892265, + "grad_norm": 1.146404624092723, + "learning_rate": 3.401975377363082e-07, + "logits/chosen": -0.8071302175521851, + "logits/rejected": -0.5387327075004578, + "logps/chosen": -0.012531624175608158, + "logps/rejected": -6.042186737060547, + "loss": 0.0357, + "odds_ratio_loss": 0.0007533429889008403, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0012531625106930733, + "rewards/margins": 0.6029655933380127, + "rewards/rejected": -0.6042186617851257, + "sft_loss": 0.012531624175608158, + "step": 3611 + }, + { + "epoch": 5.223427331887201, + "grad_norm": 1.0744901126786583, + "learning_rate": 3.3894521608897765e-07, + "logits/chosen": -0.7540593147277832, + "logits/rejected": -0.7037791609764099, + "logps/chosen": -0.059880174696445465, + "logps/rejected": -4.644082069396973, + "loss": 0.0566, + "odds_ratio_loss": 0.0015735100023448467, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005988018121570349, + "rewards/margins": 0.4584202468395233, + "rewards/rejected": -0.4644082486629486, + "sft_loss": 0.059880174696445465, + "step": 3612 + }, + { + "epoch": 5.224873463485177, + "grad_norm": 0.9823145699084468, + "learning_rate": 3.376951017052101e-07, + "logits/chosen": -0.85802161693573, + "logits/rejected": -0.617484986782074, + "logps/chosen": -0.03595628961920738, + "logps/rejected": -6.122747421264648, + "loss": 0.0285, + "odds_ratio_loss": 0.0011885353596881032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0035956294741481543, + "rewards/margins": 0.6086791753768921, + "rewards/rejected": -0.6122748255729675, + "sft_loss": 0.03595628961920738, + "step": 3613 + }, + { + "epoch": 5.226319595083153, + "grad_norm": 1.017079576772903, + "learning_rate": 3.364471953387067e-07, + "logits/chosen": -0.7847793102264404, + "logits/rejected": -0.6176234483718872, + "logps/chosen": -0.008673002012073994, + "logps/rejected": -7.5913262367248535, + "loss": 0.0459, + "odds_ratio_loss": 0.0004855759325437248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0008673002012073994, + "rewards/margins": 0.7582653760910034, + "rewards/rejected": -0.7591326236724854, + "sft_loss": 0.008673002012073994, + "step": 3614 + }, + { + "epoch": 5.227765726681128, + "grad_norm": 0.9512168280789924, + "learning_rate": 3.3520149774183406e-07, + "logits/chosen": -0.9897427558898926, + "logits/rejected": -0.7149899005889893, + "logps/chosen": -0.02276892587542534, + "logps/rejected": -6.676823616027832, + "loss": 0.0346, + "odds_ratio_loss": 0.0011359690688550472, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002276892773807049, + "rewards/margins": 0.6654054522514343, + "rewards/rejected": -0.6676823496818542, + "sft_loss": 0.02276892587542534, + "step": 3615 + }, + { + "epoch": 5.229211858279103, + "grad_norm": 0.9427603088962667, + "learning_rate": 3.339580096656269e-07, + "logits/chosen": -0.9399840235710144, + "logits/rejected": -0.733213484287262, + "logps/chosen": -0.03748312592506409, + "logps/rejected": -3.965913772583008, + "loss": 0.0268, + "odds_ratio_loss": 0.0014005664270371199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0037483125925064087, + "rewards/margins": 0.3928430676460266, + "rewards/rejected": -0.3965914249420166, + "sft_loss": 0.03748312592506409, + "step": 3616 + }, + { + "epoch": 5.230657989877079, + "grad_norm": 1.1554821711969416, + "learning_rate": 3.327167318597892e-07, + "logits/chosen": -0.9230865836143494, + "logits/rejected": -0.6254661679267883, + "logps/chosen": -0.03255566582083702, + "logps/rejected": -6.558465480804443, + "loss": 0.0366, + "odds_ratio_loss": 0.0009877877309918404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0032555670477449894, + "rewards/margins": 0.6525909900665283, + "rewards/rejected": -0.6558465957641602, + "sft_loss": 0.03255566582083702, + "step": 3617 + }, + { + "epoch": 5.2321041214750545, + "grad_norm": 0.9584221143268201, + "learning_rate": 3.3147766507269295e-07, + "logits/chosen": -0.9985564947128296, + "logits/rejected": -0.804498553276062, + "logps/chosen": -0.05001387745141983, + "logps/rejected": -4.83536434173584, + "loss": 0.0304, + "odds_ratio_loss": 0.0040518310852348804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005001388024538755, + "rewards/margins": 0.4785350561141968, + "rewards/rejected": -0.4835364520549774, + "sft_loss": 0.05001387745141983, + "step": 3618 + }, + { + "epoch": 5.233550253073029, + "grad_norm": 1.2707849962144895, + "learning_rate": 3.3024081005137514e-07, + "logits/chosen": -0.9007452726364136, + "logits/rejected": -0.6459848284721375, + "logps/chosen": -0.014544611796736717, + "logps/rejected": -6.230293273925781, + "loss": 0.0342, + "odds_ratio_loss": 0.0008624520851299167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014544612495228648, + "rewards/margins": 0.621574878692627, + "rewards/rejected": -0.623029351234436, + "sft_loss": 0.014544611796736717, + "step": 3619 + }, + { + "epoch": 5.234996384671005, + "grad_norm": 0.9682404550627938, + "learning_rate": 3.2900616754154075e-07, + "logits/chosen": -0.8395979404449463, + "logits/rejected": -0.6162151098251343, + "logps/chosen": -0.08151617646217346, + "logps/rejected": -5.730838775634766, + "loss": 0.0429, + "odds_ratio_loss": 0.0006240714574232697, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008151616901159286, + "rewards/margins": 0.5649322271347046, + "rewards/rejected": -0.5730838775634766, + "sft_loss": 0.08151617646217346, + "step": 3620 + }, + { + "epoch": 5.236442516268981, + "grad_norm": 1.2687445581896963, + "learning_rate": 3.277737382875596e-07, + "logits/chosen": -0.9045553803443909, + "logits/rejected": -0.7344595193862915, + "logps/chosen": -0.058114439249038696, + "logps/rejected": -5.224374771118164, + "loss": 0.0321, + "odds_ratio_loss": 0.0007322908495552838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005811444483697414, + "rewards/margins": 0.5166260004043579, + "rewards/rejected": -0.5224374532699585, + "sft_loss": 0.058114439249038696, + "step": 3621 + }, + { + "epoch": 5.2378886478669555, + "grad_norm": 1.3522088419469593, + "learning_rate": 3.2654352303246935e-07, + "logits/chosen": -0.895087718963623, + "logits/rejected": -0.734992265701294, + "logps/chosen": -0.04217066988348961, + "logps/rejected": -4.8282928466796875, + "loss": 0.0498, + "odds_ratio_loss": 0.0015827922616153955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004217067267745733, + "rewards/margins": 0.4786122441291809, + "rewards/rejected": -0.4828292727470398, + "sft_loss": 0.04217066988348961, + "step": 3622 + }, + { + "epoch": 5.239334779464931, + "grad_norm": 1.0023821915021813, + "learning_rate": 3.2531552251797045e-07, + "logits/chosen": -0.8689554333686829, + "logits/rejected": -0.6638770699501038, + "logps/chosen": -0.0453006774187088, + "logps/rejected": -6.682604789733887, + "loss": 0.0355, + "odds_ratio_loss": 0.00197001826018095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004530067555606365, + "rewards/margins": 0.6637303829193115, + "rewards/rejected": -0.6682605147361755, + "sft_loss": 0.0453006774187088, + "step": 3623 + }, + { + "epoch": 5.240780911062907, + "grad_norm": 1.1418476339361996, + "learning_rate": 3.2408973748442803e-07, + "logits/chosen": -0.880037248134613, + "logits/rejected": -0.5957615375518799, + "logps/chosen": -0.060221537947654724, + "logps/rejected": -7.038527488708496, + "loss": 0.0551, + "odds_ratio_loss": 0.0034473263658583164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006022154353559017, + "rewards/margins": 0.6978305578231812, + "rewards/rejected": -0.7038527727127075, + "sft_loss": 0.060221537947654724, + "step": 3624 + }, + { + "epoch": 5.242227042660883, + "grad_norm": 1.073359967589388, + "learning_rate": 3.2286616867087445e-07, + "logits/chosen": -0.9191368818283081, + "logits/rejected": -0.6997087001800537, + "logps/chosen": -0.016141196712851524, + "logps/rejected": -4.670000076293945, + "loss": 0.03, + "odds_ratio_loss": 0.0009297472424805164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016141196247190237, + "rewards/margins": 0.46538591384887695, + "rewards/rejected": -0.4670000374317169, + "sft_loss": 0.016141196712851524, + "step": 3625 + }, + { + "epoch": 5.243673174258857, + "grad_norm": 0.9690415562551309, + "learning_rate": 3.216448168150019e-07, + "logits/chosen": -0.9207147359848022, + "logits/rejected": -0.6570227742195129, + "logps/chosen": -0.03261332958936691, + "logps/rejected": -7.021213531494141, + "loss": 0.0324, + "odds_ratio_loss": 0.003072987077757716, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003261332865804434, + "rewards/margins": 0.6988599896430969, + "rewards/rejected": -0.702121376991272, + "sft_loss": 0.03261332958936691, + "step": 3626 + }, + { + "epoch": 5.245119305856833, + "grad_norm": 0.9459142870812083, + "learning_rate": 3.2042568265316974e-07, + "logits/chosen": -0.8446757793426514, + "logits/rejected": -0.6071657538414001, + "logps/chosen": -0.04911842197179794, + "logps/rejected": -6.459288597106934, + "loss": 0.0422, + "odds_ratio_loss": 0.0015342968981713057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004911842290312052, + "rewards/margins": 0.6410170197486877, + "rewards/rejected": -0.6459288597106934, + "sft_loss": 0.04911842197179794, + "step": 3627 + }, + { + "epoch": 5.246565437454809, + "grad_norm": 1.0470928306134115, + "learning_rate": 3.192087669203971e-07, + "logits/chosen": -1.0162910223007202, + "logits/rejected": -0.8110986948013306, + "logps/chosen": -0.03871820867061615, + "logps/rejected": -4.199558258056641, + "loss": 0.0339, + "odds_ratio_loss": 0.0035957153886556625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003871820867061615, + "rewards/margins": 0.4160839915275574, + "rewards/rejected": -0.4199557900428772, + "sft_loss": 0.03871820867061615, + "step": 3628 + }, + { + "epoch": 5.2480115690527835, + "grad_norm": 1.2074291502339676, + "learning_rate": 3.179940703503683e-07, + "logits/chosen": -0.9160524606704712, + "logits/rejected": -0.7021023035049438, + "logps/chosen": -0.08635597676038742, + "logps/rejected": -4.336616039276123, + "loss": 0.0801, + "odds_ratio_loss": 0.004517072346061468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008635598234832287, + "rewards/margins": 0.425025999546051, + "rewards/rejected": -0.4336616098880768, + "sft_loss": 0.08635597676038742, + "step": 3629 + }, + { + "epoch": 5.249457700650759, + "grad_norm": 1.0665810428402844, + "learning_rate": 3.167815936754272e-07, + "logits/chosen": -0.9472986459732056, + "logits/rejected": -0.7375108003616333, + "logps/chosen": -0.04879705607891083, + "logps/rejected": -4.624138832092285, + "loss": 0.037, + "odds_ratio_loss": 0.002154088346287608, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0048797051422297955, + "rewards/margins": 0.45753422379493713, + "rewards/rejected": -0.4624139070510864, + "sft_loss": 0.04879705607891083, + "step": 3630 + }, + { + "epoch": 5.250903832248735, + "grad_norm": 0.9773223496389528, + "learning_rate": 3.155713376265816e-07, + "logits/chosen": -0.8467633724212646, + "logits/rejected": -0.5423392653465271, + "logps/chosen": -0.018480662256479263, + "logps/rejected": -5.556922912597656, + "loss": 0.031, + "odds_ratio_loss": 0.0009833640651777387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018480663420632482, + "rewards/margins": 0.5538442730903625, + "rewards/rejected": -0.5556923151016235, + "sft_loss": 0.018480662256479263, + "step": 3631 + }, + { + "epoch": 5.25234996384671, + "grad_norm": 1.0079363653872178, + "learning_rate": 3.143633029334989e-07, + "logits/chosen": -0.7981322407722473, + "logits/rejected": -0.7674664258956909, + "logps/chosen": -0.03736051917076111, + "logps/rejected": -4.720569610595703, + "loss": 0.0279, + "odds_ratio_loss": 0.0012767021544277668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0037360521964728832, + "rewards/margins": 0.46832096576690674, + "rewards/rejected": -0.4720570147037506, + "sft_loss": 0.03736051917076111, + "step": 3632 + }, + { + "epoch": 5.253796095444685, + "grad_norm": 1.768817806827234, + "learning_rate": 3.131574903245071e-07, + "logits/chosen": -1.0161972045898438, + "logits/rejected": -0.7614036798477173, + "logps/chosen": -0.10368499159812927, + "logps/rejected": -4.6447930335998535, + "loss": 0.0543, + "odds_ratio_loss": 0.008217780850827694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010368499904870987, + "rewards/margins": 0.4541108310222626, + "rewards/rejected": -0.4644792973995209, + "sft_loss": 0.10368499159812927, + "step": 3633 + }, + { + "epoch": 5.255242227042661, + "grad_norm": 1.276381303497112, + "learning_rate": 3.119539005265954e-07, + "logits/chosen": -1.0342979431152344, + "logits/rejected": -0.7442997694015503, + "logps/chosen": -0.01700776070356369, + "logps/rejected": -5.295718193054199, + "loss": 0.0303, + "odds_ratio_loss": 0.0007272001821547747, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017007759306579828, + "rewards/margins": 0.5278711318969727, + "rewards/rejected": -0.5295718908309937, + "sft_loss": 0.01700776070356369, + "step": 3634 + }, + { + "epoch": 5.256688358640636, + "grad_norm": 1.1448473614666281, + "learning_rate": 3.1075253426541357e-07, + "logits/chosen": -1.0042964220046997, + "logits/rejected": -0.7400972247123718, + "logps/chosen": -0.024408066645264626, + "logps/rejected": -7.579883575439453, + "loss": 0.0317, + "odds_ratio_loss": 0.00141650321893394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0024408064782619476, + "rewards/margins": 0.7555475831031799, + "rewards/rejected": -0.7579883337020874, + "sft_loss": 0.024408066645264626, + "step": 3635 + }, + { + "epoch": 5.258134490238612, + "grad_norm": 1.3010295902650386, + "learning_rate": 3.095533922652684e-07, + "logits/chosen": -0.7892809510231018, + "logits/rejected": -0.6047863364219666, + "logps/chosen": -0.015120243653655052, + "logps/rejected": -6.354447364807129, + "loss": 0.0472, + "odds_ratio_loss": 0.00048150643124245107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0015120243187993765, + "rewards/margins": 0.6339326500892639, + "rewards/rejected": -0.635444700717926, + "sft_loss": 0.015120243653655052, + "step": 3636 + }, + { + "epoch": 5.259580621836587, + "grad_norm": 1.068873809854963, + "learning_rate": 3.0835647524912744e-07, + "logits/chosen": -0.9303781986236572, + "logits/rejected": -0.7163772583007812, + "logps/chosen": -0.04354140907526016, + "logps/rejected": -5.129241943359375, + "loss": 0.029, + "odds_ratio_loss": 0.0013979937648400664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004354140721261501, + "rewards/margins": 0.5085700750350952, + "rewards/rejected": -0.5129241943359375, + "sft_loss": 0.04354140907526016, + "step": 3637 + }, + { + "epoch": 5.261026753434563, + "grad_norm": 0.8962780000482491, + "learning_rate": 3.071617839386178e-07, + "logits/chosen": -0.8089204430580139, + "logits/rejected": -0.4361693561077118, + "logps/chosen": -0.04192004352807999, + "logps/rejected": -4.811159610748291, + "loss": 0.0228, + "odds_ratio_loss": 0.0021075494587421417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004192003980278969, + "rewards/margins": 0.47692394256591797, + "rewards/rejected": -0.48111599683761597, + "sft_loss": 0.04192004352807999, + "step": 3638 + }, + { + "epoch": 5.262472885032538, + "grad_norm": 0.8829913670211695, + "learning_rate": 3.05969319054022e-07, + "logits/chosen": -0.8424152135848999, + "logits/rejected": -0.5214813351631165, + "logps/chosen": -0.05059170350432396, + "logps/rejected": -3.3692402839660645, + "loss": 0.0291, + "odds_ratio_loss": 0.005258472170680761, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005059171002358198, + "rewards/margins": 0.3318648636341095, + "rewards/rejected": -0.3369240164756775, + "sft_loss": 0.05059170350432396, + "step": 3639 + }, + { + "epoch": 5.263919016630513, + "grad_norm": 1.1489498228376966, + "learning_rate": 3.047790813142819e-07, + "logits/chosen": -0.8711774349212646, + "logits/rejected": -0.6267974972724915, + "logps/chosen": -0.02280341647565365, + "logps/rejected": -6.154178619384766, + "loss": 0.0389, + "odds_ratio_loss": 0.0012691746233031154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022803416941314936, + "rewards/margins": 0.6131375432014465, + "rewards/rejected": -0.6154178380966187, + "sft_loss": 0.02280341647565365, + "step": 3640 + }, + { + "epoch": 5.265365148228489, + "grad_norm": 0.9965937815243027, + "learning_rate": 3.0359107143699536e-07, + "logits/chosen": -0.7954994440078735, + "logits/rejected": -0.726338803768158, + "logps/chosen": -0.027501408010721207, + "logps/rejected": -5.424743175506592, + "loss": 0.0283, + "odds_ratio_loss": 0.006880844011902809, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002750141080468893, + "rewards/margins": 0.5397241711616516, + "rewards/rejected": -0.5424743294715881, + "sft_loss": 0.027501408010721207, + "step": 3641 + }, + { + "epoch": 5.266811279826464, + "grad_norm": 1.1401410273538075, + "learning_rate": 3.024052901384193e-07, + "logits/chosen": -0.8828942179679871, + "logits/rejected": -0.6594923734664917, + "logps/chosen": -0.08100616931915283, + "logps/rejected": -4.891511917114258, + "loss": 0.0521, + "odds_ratio_loss": 0.0096984738484025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008100616745650768, + "rewards/margins": 0.481050580739975, + "rewards/rejected": -0.4891512393951416, + "sft_loss": 0.08100616931915283, + "step": 3642 + }, + { + "epoch": 5.26825741142444, + "grad_norm": 1.0742692127897753, + "learning_rate": 3.0122173813346454e-07, + "logits/chosen": -0.662639856338501, + "logits/rejected": -0.5347196459770203, + "logps/chosen": -0.035757362842559814, + "logps/rejected": -5.963934421539307, + "loss": 0.0396, + "odds_ratio_loss": 0.0011856274213641882, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0035757366567850113, + "rewards/margins": 0.5928177237510681, + "rewards/rejected": -0.5963934659957886, + "sft_loss": 0.035757362842559814, + "step": 3643 + }, + { + "epoch": 5.269703543022415, + "grad_norm": 1.0375189083630167, + "learning_rate": 3.000404161357002e-07, + "logits/chosen": -0.9065849184989929, + "logits/rejected": -0.7200076580047607, + "logps/chosen": -0.02265940047800541, + "logps/rejected": -5.422698020935059, + "loss": 0.0291, + "odds_ratio_loss": 0.0008585632895119488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022659399546682835, + "rewards/margins": 0.5400038957595825, + "rewards/rejected": -0.5422698259353638, + "sft_loss": 0.02265940047800541, + "step": 3644 + }, + { + "epoch": 5.27114967462039, + "grad_norm": 1.1315767384118531, + "learning_rate": 2.988613248573486e-07, + "logits/chosen": -0.824489951133728, + "logits/rejected": -0.6248018741607666, + "logps/chosen": -0.058798667043447495, + "logps/rejected": -5.063660621643066, + "loss": 0.0388, + "odds_ratio_loss": 0.04671594500541687, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.005879866890609264, + "rewards/margins": 0.5004861950874329, + "rewards/rejected": -0.5063660740852356, + "sft_loss": 0.058798667043447495, + "step": 3645 + }, + { + "epoch": 5.272595806218366, + "grad_norm": 1.2204605587673572, + "learning_rate": 2.9768446500928915e-07, + "logits/chosen": -1.0131525993347168, + "logits/rejected": -0.9568882584571838, + "logps/chosen": -0.04865047708153725, + "logps/rejected": -5.075432777404785, + "loss": 0.0305, + "odds_ratio_loss": 0.0028830140363425016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004865047987550497, + "rewards/margins": 0.5026782751083374, + "rewards/rejected": -0.5075433254241943, + "sft_loss": 0.04865047708153725, + "step": 3646 + }, + { + "epoch": 5.2740419378163415, + "grad_norm": 0.9682725426804464, + "learning_rate": 2.9650983730105503e-07, + "logits/chosen": -0.8504335880279541, + "logits/rejected": -0.846091091632843, + "logps/chosen": -0.029791761189699173, + "logps/rejected": -4.313238143920898, + "loss": 0.0311, + "odds_ratio_loss": 0.0022929785773158073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0029791758861392736, + "rewards/margins": 0.4283446669578552, + "rewards/rejected": -0.4313238561153412, + "sft_loss": 0.029791761189699173, + "step": 3647 + }, + { + "epoch": 5.275488069414317, + "grad_norm": 1.0233502761452942, + "learning_rate": 2.953374424408328e-07, + "logits/chosen": -0.6208338737487793, + "logits/rejected": -0.6304566860198975, + "logps/chosen": -0.027606675401329994, + "logps/rejected": -4.565278053283691, + "loss": 0.0385, + "odds_ratio_loss": 0.0008353454759344459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0027606673538684845, + "rewards/margins": 0.4537671208381653, + "rewards/rejected": -0.45652779936790466, + "sft_loss": 0.027606675401329994, + "step": 3648 + }, + { + "epoch": 5.276934201012292, + "grad_norm": 0.9556294865840138, + "learning_rate": 2.9416728113546363e-07, + "logits/chosen": -0.974734902381897, + "logits/rejected": -0.6098443865776062, + "logps/chosen": -0.039620328694581985, + "logps/rejected": -7.678589820861816, + "loss": 0.0374, + "odds_ratio_loss": 0.0007251370116136968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003962032496929169, + "rewards/margins": 0.7638970017433167, + "rewards/rejected": -0.7678591012954712, + "sft_loss": 0.039620328694581985, + "step": 3649 + }, + { + "epoch": 5.278380332610268, + "grad_norm": 1.1251246650139406, + "learning_rate": 2.929993540904436e-07, + "logits/chosen": -0.9035037755966187, + "logits/rejected": -0.7772727608680725, + "logps/chosen": -0.020724035799503326, + "logps/rejected": -4.813797473907471, + "loss": 0.0343, + "odds_ratio_loss": 0.0009286232525482774, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00207240367308259, + "rewards/margins": 0.4793073534965515, + "rewards/rejected": -0.4813797175884247, + "sft_loss": 0.020724035799503326, + "step": 3650 + }, + { + "epoch": 5.279826464208243, + "grad_norm": 0.8900567501477809, + "learning_rate": 2.918336620099184e-07, + "logits/chosen": -0.7675405740737915, + "logits/rejected": -0.6586223840713501, + "logps/chosen": -0.01650574989616871, + "logps/rejected": -6.721004009246826, + "loss": 0.0315, + "odds_ratio_loss": 0.0005637137801386416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016505750827491283, + "rewards/margins": 0.6704498529434204, + "rewards/rejected": -0.6721004247665405, + "sft_loss": 0.01650574989616871, + "step": 3651 + }, + { + "epoch": 5.281272595806218, + "grad_norm": 1.1633737173097518, + "learning_rate": 2.9067020559668945e-07, + "logits/chosen": -0.8274843692779541, + "logits/rejected": -0.6375514268875122, + "logps/chosen": -0.053619079291820526, + "logps/rejected": -6.285444259643555, + "loss": 0.0341, + "odds_ratio_loss": 0.003317378694191575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0053619081154465675, + "rewards/margins": 0.6231825351715088, + "rewards/rejected": -0.6285443902015686, + "sft_loss": 0.053619079291820526, + "step": 3652 + }, + { + "epoch": 5.282718727404194, + "grad_norm": 1.157606261307862, + "learning_rate": 2.895089855522088e-07, + "logits/chosen": -0.8747952580451965, + "logits/rejected": -0.7444599866867065, + "logps/chosen": -0.04207966476678848, + "logps/rejected": -5.985525131225586, + "loss": 0.0419, + "odds_ratio_loss": 0.004523225128650665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004207966383546591, + "rewards/margins": 0.5943444967269897, + "rewards/rejected": -0.5985524654388428, + "sft_loss": 0.04207966476678848, + "step": 3653 + }, + { + "epoch": 5.2841648590021695, + "grad_norm": 1.4338523494202897, + "learning_rate": 2.8835000257658016e-07, + "logits/chosen": -0.9471051692962646, + "logits/rejected": -0.5582077503204346, + "logps/chosen": -0.01802823692560196, + "logps/rejected": -4.929285526275635, + "loss": 0.0317, + "odds_ratio_loss": 0.0007497454062104225, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018028237391263247, + "rewards/margins": 0.49112576246261597, + "rewards/rejected": -0.49292856454849243, + "sft_loss": 0.01802823692560196, + "step": 3654 + }, + { + "epoch": 5.285610990600144, + "grad_norm": 1.2172564226035754, + "learning_rate": 2.8719325736855873e-07, + "logits/chosen": -0.8020838499069214, + "logits/rejected": -0.7115020155906677, + "logps/chosen": -0.04903501272201538, + "logps/rejected": -4.307126045227051, + "loss": 0.0442, + "odds_ratio_loss": 0.0023487398866564035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0049035013653337955, + "rewards/margins": 0.4258091449737549, + "rewards/rejected": -0.43071264028549194, + "sft_loss": 0.04903501272201538, + "step": 3655 + }, + { + "epoch": 5.28705712219812, + "grad_norm": 1.2232310635801635, + "learning_rate": 2.860387506255497e-07, + "logits/chosen": -0.7558038234710693, + "logits/rejected": -0.5430575609207153, + "logps/chosen": -0.031046954914927483, + "logps/rejected": -6.496105194091797, + "loss": 0.0503, + "odds_ratio_loss": 0.0024577996227890253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003104696050286293, + "rewards/margins": 0.6465058326721191, + "rewards/rejected": -0.6496106386184692, + "sft_loss": 0.031046954914927483, + "step": 3656 + }, + { + "epoch": 5.288503253796096, + "grad_norm": 1.2243215854146698, + "learning_rate": 2.848864830436111e-07, + "logits/chosen": -0.9211279153823853, + "logits/rejected": -0.5959931015968323, + "logps/chosen": -0.09139852970838547, + "logps/rejected": -4.205726623535156, + "loss": 0.0597, + "odds_ratio_loss": 0.005354328081011772, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009139853529632092, + "rewards/margins": 0.41143280267715454, + "rewards/rejected": -0.4205726683139801, + "sft_loss": 0.09139852970838547, + "step": 3657 + }, + { + "epoch": 5.2899493853940704, + "grad_norm": 1.121340360401462, + "learning_rate": 2.837364553174475e-07, + "logits/chosen": -1.0993698835372925, + "logits/rejected": -0.5834642052650452, + "logps/chosen": -0.05646635964512825, + "logps/rejected": -6.199383735656738, + "loss": 0.0398, + "odds_ratio_loss": 0.001794778392650187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00564663577824831, + "rewards/margins": 0.6142917275428772, + "rewards/rejected": -0.6199383735656738, + "sft_loss": 0.05646635964512825, + "step": 3658 + }, + { + "epoch": 5.291395516992046, + "grad_norm": 1.3170381422047455, + "learning_rate": 2.825886681404164e-07, + "logits/chosen": -0.7939042448997498, + "logits/rejected": -0.5887212157249451, + "logps/chosen": -0.03460104390978813, + "logps/rejected": -5.4101104736328125, + "loss": 0.0285, + "odds_ratio_loss": 0.0012456791009753942, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0034601043444126844, + "rewards/margins": 0.5375509262084961, + "rewards/rejected": -0.5410110354423523, + "sft_loss": 0.03460104390978813, + "step": 3659 + }, + { + "epoch": 5.292841648590022, + "grad_norm": 0.919145469791782, + "learning_rate": 2.8144312220452194e-07, + "logits/chosen": -0.6260144710540771, + "logits/rejected": -0.6858643293380737, + "logps/chosen": -0.01188691146671772, + "logps/rejected": -4.332086086273193, + "loss": 0.028, + "odds_ratio_loss": 0.0014277580194175243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0011886911233887076, + "rewards/margins": 0.4320199489593506, + "rewards/rejected": -0.4332086145877838, + "sft_loss": 0.01188691146671772, + "step": 3660 + }, + { + "epoch": 5.2942877801879975, + "grad_norm": 1.1722799731959772, + "learning_rate": 2.802998182004188e-07, + "logits/chosen": -0.865659236907959, + "logits/rejected": -0.5501079559326172, + "logps/chosen": -0.058488842099905014, + "logps/rejected": -5.399849891662598, + "loss": 0.0557, + "odds_ratio_loss": 0.002198958769440651, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005848885513842106, + "rewards/margins": 0.5341361165046692, + "rewards/rejected": -0.5399850010871887, + "sft_loss": 0.058488842099905014, + "step": 3661 + }, + { + "epoch": 5.295733911785972, + "grad_norm": 1.0278872351784067, + "learning_rate": 2.791587568174094e-07, + "logits/chosen": -0.8193449974060059, + "logits/rejected": -0.7792457342147827, + "logps/chosen": -0.04936240613460541, + "logps/rejected": -4.964071273803711, + "loss": 0.025, + "odds_ratio_loss": 0.0040309252217411995, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004936240613460541, + "rewards/margins": 0.49147090315818787, + "rewards/rejected": -0.496407151222229, + "sft_loss": 0.04936240613460541, + "step": 3662 + }, + { + "epoch": 5.297180043383948, + "grad_norm": 1.203031731438833, + "learning_rate": 2.7801993874344297e-07, + "logits/chosen": -0.8412383198738098, + "logits/rejected": -0.7706151008605957, + "logps/chosen": -0.027266865596175194, + "logps/rejected": -5.0899739265441895, + "loss": 0.0517, + "odds_ratio_loss": 0.001991531578823924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002726686652749777, + "rewards/margins": 0.5062706470489502, + "rewards/rejected": -0.50899738073349, + "sft_loss": 0.027266865596175194, + "step": 3663 + }, + { + "epoch": 5.298626174981924, + "grad_norm": 1.023947653615631, + "learning_rate": 2.7688336466511743e-07, + "logits/chosen": -0.9421272873878479, + "logits/rejected": -0.6665881872177124, + "logps/chosen": -0.036513958126306534, + "logps/rejected": -5.57096529006958, + "loss": 0.0252, + "odds_ratio_loss": 0.0018653357401490211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003651396371424198, + "rewards/margins": 0.5534451007843018, + "rewards/rejected": -0.557096540927887, + "sft_loss": 0.036513958126306534, + "step": 3664 + }, + { + "epoch": 5.3000723065798985, + "grad_norm": 1.110676709749025, + "learning_rate": 2.7574903526767746e-07, + "logits/chosen": -0.6372553110122681, + "logits/rejected": -0.5765061378479004, + "logps/chosen": -0.02319909632205963, + "logps/rejected": -4.816458702087402, + "loss": 0.0382, + "odds_ratio_loss": 0.0015413042856380343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002319909632205963, + "rewards/margins": 0.47932595014572144, + "rewards/rejected": -0.4816458225250244, + "sft_loss": 0.02319909632205963, + "step": 3665 + }, + { + "epoch": 5.301518438177874, + "grad_norm": 1.0978912010109163, + "learning_rate": 2.746169512350152e-07, + "logits/chosen": -0.7656617164611816, + "logits/rejected": -0.5390478372573853, + "logps/chosen": -0.04759720712900162, + "logps/rejected": -4.945181846618652, + "loss": 0.0338, + "odds_ratio_loss": 0.001722943503409624, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004759720992296934, + "rewards/margins": 0.4897584915161133, + "rewards/rejected": -0.4945182204246521, + "sft_loss": 0.04759720712900162, + "step": 3666 + }, + { + "epoch": 5.30296456977585, + "grad_norm": 1.012794228374218, + "learning_rate": 2.734871132496672e-07, + "logits/chosen": -1.0013247728347778, + "logits/rejected": -0.7348878383636475, + "logps/chosen": -0.023679528385400772, + "logps/rejected": -4.578136920928955, + "loss": 0.0206, + "odds_ratio_loss": 0.0015165224904194474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002367952838540077, + "rewards/margins": 0.4554457664489746, + "rewards/rejected": -0.45781373977661133, + "sft_loss": 0.023679528385400772, + "step": 3667 + }, + { + "epoch": 5.304410701373825, + "grad_norm": 1.4279003426255663, + "learning_rate": 2.7235952199281854e-07, + "logits/chosen": -0.9840003252029419, + "logits/rejected": -0.8327428102493286, + "logps/chosen": -0.041510019451379776, + "logps/rejected": -4.400811195373535, + "loss": 0.0386, + "odds_ratio_loss": 0.0024359440430998802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004151002038270235, + "rewards/margins": 0.43593013286590576, + "rewards/rejected": -0.4400811195373535, + "sft_loss": 0.041510019451379776, + "step": 3668 + }, + { + "epoch": 5.3058568329718, + "grad_norm": 1.1038023610062029, + "learning_rate": 2.712341781442973e-07, + "logits/chosen": -0.8254796266555786, + "logits/rejected": -0.7522153854370117, + "logps/chosen": -0.022977057844400406, + "logps/rejected": -5.291687488555908, + "loss": 0.0455, + "odds_ratio_loss": 0.001104579190723598, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022977059707045555, + "rewards/margins": 0.5268710851669312, + "rewards/rejected": -0.5291687846183777, + "sft_loss": 0.022977057844400406, + "step": 3669 + }, + { + "epoch": 5.307302964569776, + "grad_norm": 1.072757689228915, + "learning_rate": 2.701110823825772e-07, + "logits/chosen": -0.9172158241271973, + "logits/rejected": -0.7597464323043823, + "logps/chosen": -0.03024320863187313, + "logps/rejected": -4.548722743988037, + "loss": 0.0425, + "odds_ratio_loss": 0.0015936695272102952, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030243208166211843, + "rewards/margins": 0.45184794068336487, + "rewards/rejected": -0.4548722505569458, + "sft_loss": 0.03024320863187313, + "step": 3670 + }, + { + "epoch": 5.308749096167752, + "grad_norm": 1.3097028126399897, + "learning_rate": 2.689902353847766e-07, + "logits/chosen": -1.211554765701294, + "logits/rejected": -0.7927272915840149, + "logps/chosen": -0.04887429624795914, + "logps/rejected": -4.841168403625488, + "loss": 0.041, + "odds_ratio_loss": 0.001834406517446041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004887429066002369, + "rewards/margins": 0.4792294502258301, + "rewards/rejected": -0.4841168522834778, + "sft_loss": 0.04887429624795914, + "step": 3671 + }, + { + "epoch": 5.3101952277657265, + "grad_norm": 1.107848445275236, + "learning_rate": 2.678716378266599e-07, + "logits/chosen": -0.8874702453613281, + "logits/rejected": -0.7778452634811401, + "logps/chosen": -0.013926111161708832, + "logps/rejected": -6.384116172790527, + "loss": 0.0593, + "odds_ratio_loss": 0.0006129151443019509, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013926110696047544, + "rewards/margins": 0.6370189785957336, + "rewards/rejected": -0.6384116411209106, + "sft_loss": 0.013926111161708832, + "step": 3672 + }, + { + "epoch": 5.311641359363702, + "grad_norm": 1.086942198501488, + "learning_rate": 2.6675529038263157e-07, + "logits/chosen": -0.855125904083252, + "logits/rejected": -0.7381969690322876, + "logps/chosen": -0.018707137554883957, + "logps/rejected": -3.6854591369628906, + "loss": 0.0419, + "odds_ratio_loss": 0.0006958214216865599, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00187071377877146, + "rewards/margins": 0.36667516827583313, + "rewards/rejected": -0.36854591965675354, + "sft_loss": 0.018707137554883957, + "step": 3673 + }, + { + "epoch": 5.313087490961678, + "grad_norm": 1.1795744742774177, + "learning_rate": 2.6564119372574347e-07, + "logits/chosen": -0.8077712059020996, + "logits/rejected": -0.6191185116767883, + "logps/chosen": -0.06234436109662056, + "logps/rejected": -6.990647315979004, + "loss": 0.0462, + "odds_ratio_loss": 0.007911072112619877, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006234435830265284, + "rewards/margins": 0.6928303241729736, + "rewards/rejected": -0.6990647315979004, + "sft_loss": 0.06234436109662056, + "step": 3674 + }, + { + "epoch": 5.314533622559653, + "grad_norm": 0.968028462978872, + "learning_rate": 2.6452934852768714e-07, + "logits/chosen": -0.7257063388824463, + "logits/rejected": -0.6271748542785645, + "logps/chosen": -0.016601260751485825, + "logps/rejected": -4.678224563598633, + "loss": 0.0308, + "odds_ratio_loss": 0.0008440697565674782, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016601260285824537, + "rewards/margins": 0.4661623239517212, + "rewards/rejected": -0.46782243251800537, + "sft_loss": 0.016601260751485825, + "step": 3675 + }, + { + "epoch": 5.315979754157628, + "grad_norm": 1.1731861474929364, + "learning_rate": 2.634197554587998e-07, + "logits/chosen": -1.1070533990859985, + "logits/rejected": -0.7468483448028564, + "logps/chosen": -0.05605795979499817, + "logps/rejected": -6.352745532989502, + "loss": 0.037, + "odds_ratio_loss": 0.0028356327675282955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005605795886367559, + "rewards/margins": 0.6296687722206116, + "rewards/rejected": -0.6352745890617371, + "sft_loss": 0.05605795979499817, + "step": 3676 + }, + { + "epoch": 5.317425885755604, + "grad_norm": 1.077797013393816, + "learning_rate": 2.623124151880578e-07, + "logits/chosen": -0.8413434028625488, + "logits/rejected": -0.6536193490028381, + "logps/chosen": -0.04118379205465317, + "logps/rejected": -5.763056755065918, + "loss": 0.042, + "odds_ratio_loss": 0.004730723798274994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004118379671126604, + "rewards/margins": 0.5721873641014099, + "rewards/rejected": -0.5763057470321655, + "sft_loss": 0.04118379205465317, + "step": 3677 + }, + { + "epoch": 5.318872017353579, + "grad_norm": 1.3456733360766195, + "learning_rate": 2.612073283830818e-07, + "logits/chosen": -0.9398325681686401, + "logits/rejected": -0.6609033346176147, + "logps/chosen": -0.02295401319861412, + "logps/rejected": -6.218092441558838, + "loss": 0.0263, + "odds_ratio_loss": 0.0007425962830893695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022954014129936695, + "rewards/margins": 0.6195138692855835, + "rewards/rejected": -0.6218092441558838, + "sft_loss": 0.02295401319861412, + "step": 3678 + }, + { + "epoch": 5.3203181489515545, + "grad_norm": 1.3016144488878012, + "learning_rate": 2.6010449571013215e-07, + "logits/chosen": -0.8513684868812561, + "logits/rejected": -0.6279063820838928, + "logps/chosen": -0.07777070999145508, + "logps/rejected": -6.661202907562256, + "loss": 0.0392, + "odds_ratio_loss": 0.0036332891322672367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007777070626616478, + "rewards/margins": 0.6583431959152222, + "rewards/rejected": -0.6661202907562256, + "sft_loss": 0.07777070999145508, + "step": 3679 + }, + { + "epoch": 5.32176428054953, + "grad_norm": 1.0491356786995216, + "learning_rate": 2.5900391783411035e-07, + "logits/chosen": -1.0663862228393555, + "logits/rejected": -0.8108906149864197, + "logps/chosen": -0.03312306851148605, + "logps/rejected": -4.926994323730469, + "loss": 0.0266, + "odds_ratio_loss": 0.0015353760682046413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003312306944280863, + "rewards/margins": 0.4893871545791626, + "rewards/rejected": -0.49269944429397583, + "sft_loss": 0.03312306851148605, + "step": 3680 + }, + { + "epoch": 5.323210412147505, + "grad_norm": 1.5135907197437433, + "learning_rate": 2.579055954185603e-07, + "logits/chosen": -1.0617835521697998, + "logits/rejected": -0.8939220905303955, + "logps/chosen": -0.05323568731546402, + "logps/rejected": -4.0288519859313965, + "loss": 0.0397, + "odds_ratio_loss": 0.0026793223805725574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0053235688246786594, + "rewards/margins": 0.39756157994270325, + "rewards/rejected": -0.40288519859313965, + "sft_loss": 0.05323568731546402, + "step": 3681 + }, + { + "epoch": 5.324656543745481, + "grad_norm": 1.1680256177062494, + "learning_rate": 2.5680952912566334e-07, + "logits/chosen": -0.8926557302474976, + "logits/rejected": -0.7793102264404297, + "logps/chosen": -0.05652204155921936, + "logps/rejected": -4.692435264587402, + "loss": 0.039, + "odds_ratio_loss": 0.006315143778920174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005652204155921936, + "rewards/margins": 0.4635912775993347, + "rewards/rejected": -0.46924352645874023, + "sft_loss": 0.05652204155921936, + "step": 3682 + }, + { + "epoch": 5.326102675343456, + "grad_norm": 1.495092313640595, + "learning_rate": 2.557157196162425e-07, + "logits/chosen": -1.017024040222168, + "logits/rejected": -0.7390504479408264, + "logps/chosen": -0.046689312905073166, + "logps/rejected": -6.402212142944336, + "loss": 0.0614, + "odds_ratio_loss": 0.00260357023216784, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004668931011110544, + "rewards/margins": 0.6355522871017456, + "rewards/rejected": -0.6402212381362915, + "sft_loss": 0.046689312905073166, + "step": 3683 + }, + { + "epoch": 5.327548806941432, + "grad_norm": 0.9797352539134356, + "learning_rate": 2.546241675497591e-07, + "logits/chosen": -0.9606846570968628, + "logits/rejected": -0.7178159952163696, + "logps/chosen": -0.05462236702442169, + "logps/rejected": -7.01438570022583, + "loss": 0.0277, + "odds_ratio_loss": 0.0016510548302903771, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005462236702442169, + "rewards/margins": 0.6959763169288635, + "rewards/rejected": -0.7014386057853699, + "sft_loss": 0.05462236702442169, + "step": 3684 + }, + { + "epoch": 5.328994938539407, + "grad_norm": 0.9631824763305563, + "learning_rate": 2.5353487358431527e-07, + "logits/chosen": -0.9838446378707886, + "logits/rejected": -0.7861350774765015, + "logps/chosen": -0.035723727196455, + "logps/rejected": -4.819340229034424, + "loss": 0.0255, + "odds_ratio_loss": 0.002584266010671854, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003572372952476144, + "rewards/margins": 0.47836166620254517, + "rewards/rejected": -0.4819340407848358, + "sft_loss": 0.035723727196455, + "step": 3685 + }, + { + "epoch": 5.3304410701373826, + "grad_norm": 1.2332280571344059, + "learning_rate": 2.524478383766491e-07, + "logits/chosen": -0.9059146642684937, + "logits/rejected": -0.7595691680908203, + "logps/chosen": -0.02609408274292946, + "logps/rejected": -4.347700119018555, + "loss": 0.0646, + "odds_ratio_loss": 0.002554179634898901, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002609408460557461, + "rewards/margins": 0.4321606159210205, + "rewards/rejected": -0.43476998805999756, + "sft_loss": 0.02609408274292946, + "step": 3686 + }, + { + "epoch": 5.331887201735358, + "grad_norm": 0.8983449394324652, + "learning_rate": 2.5136306258213857e-07, + "logits/chosen": -0.9400614500045776, + "logits/rejected": -0.7500208616256714, + "logps/chosen": -0.028386441990733147, + "logps/rejected": -4.063958644866943, + "loss": 0.0234, + "odds_ratio_loss": 0.0013379440642893314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0028386441990733147, + "rewards/margins": 0.4035572111606598, + "rewards/rejected": -0.4063958525657654, + "sft_loss": 0.028386441990733147, + "step": 3687 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.9751487290928306, + "learning_rate": 2.502805468547984e-07, + "logits/chosen": -1.0151832103729248, + "logits/rejected": -0.7417925596237183, + "logps/chosen": -0.03261277452111244, + "logps/rejected": -5.35139274597168, + "loss": 0.0328, + "odds_ratio_loss": 0.001738175400532782, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003261277452111244, + "rewards/margins": 0.5318779945373535, + "rewards/rejected": -0.535139262676239, + "sft_loss": 0.03261277452111244, + "step": 3688 + }, + { + "epoch": 5.334779464931309, + "grad_norm": 1.1841885288210054, + "learning_rate": 2.4920029184728285e-07, + "logits/chosen": -0.8535258173942566, + "logits/rejected": -0.711528480052948, + "logps/chosen": -0.04814765602350235, + "logps/rejected": -4.029658317565918, + "loss": 0.0384, + "odds_ratio_loss": 0.0027197981253266335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00481476541608572, + "rewards/margins": 0.3981510400772095, + "rewards/rejected": -0.40296584367752075, + "sft_loss": 0.04814765602350235, + "step": 3689 + }, + { + "epoch": 5.336225596529284, + "grad_norm": 1.0863611869122904, + "learning_rate": 2.481222982108799e-07, + "logits/chosen": -0.7337764501571655, + "logits/rejected": -0.6401863694190979, + "logps/chosen": -0.02598814107477665, + "logps/rejected": -5.963187217712402, + "loss": 0.0348, + "odds_ratio_loss": 0.0013941468205302954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0025988139677792788, + "rewards/margins": 0.5937198996543884, + "rewards/rejected": -0.5963187217712402, + "sft_loss": 0.02598814107477665, + "step": 3690 + }, + { + "epoch": 5.337671728127259, + "grad_norm": 1.112598630427125, + "learning_rate": 2.470465665955173e-07, + "logits/chosen": -1.239977240562439, + "logits/rejected": -0.7182906866073608, + "logps/chosen": -0.03799346834421158, + "logps/rejected": -7.847053527832031, + "loss": 0.0339, + "odds_ratio_loss": 0.002206320408731699, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0037993467412889004, + "rewards/margins": 0.7809059619903564, + "rewards/rejected": -0.784705400466919, + "sft_loss": 0.03799346834421158, + "step": 3691 + }, + { + "epoch": 5.339117859725235, + "grad_norm": 0.8147289146984105, + "learning_rate": 2.4597309764975737e-07, + "logits/chosen": -0.7928810119628906, + "logits/rejected": -0.7506893873214722, + "logps/chosen": -0.005609842017292976, + "logps/rejected": -7.077286243438721, + "loss": 0.0131, + "odds_ratio_loss": 0.0003021466836798936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0005609841900877655, + "rewards/margins": 0.7071676254272461, + "rewards/rejected": -0.7077286243438721, + "sft_loss": 0.005609842017292976, + "step": 3692 + }, + { + "epoch": 5.340563991323211, + "grad_norm": 1.3433904863941526, + "learning_rate": 2.449018920207986e-07, + "logits/chosen": -0.9152747988700867, + "logits/rejected": -0.6203851699829102, + "logps/chosen": -0.05753957852721214, + "logps/rejected": -6.203818321228027, + "loss": 0.0708, + "odds_ratio_loss": 0.0015491548692807555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005753958132117987, + "rewards/margins": 0.6146278381347656, + "rewards/rejected": -0.6203818321228027, + "sft_loss": 0.05753957852721214, + "step": 3693 + }, + { + "epoch": 5.342010122921186, + "grad_norm": 1.3976027188444067, + "learning_rate": 2.438329503544745e-07, + "logits/chosen": -0.9144161939620972, + "logits/rejected": -0.783090353012085, + "logps/chosen": -0.04463702812790871, + "logps/rejected": -4.492361068725586, + "loss": 0.0524, + "odds_ratio_loss": 0.002019796520471573, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004463702440261841, + "rewards/margins": 0.4447723627090454, + "rewards/rejected": -0.44923609495162964, + "sft_loss": 0.04463702812790871, + "step": 3694 + }, + { + "epoch": 5.343456254519161, + "grad_norm": 0.9595421865106609, + "learning_rate": 2.427662732952531e-07, + "logits/chosen": -0.891353189945221, + "logits/rejected": -0.7665066719055176, + "logps/chosen": -0.017603110522031784, + "logps/rejected": -5.0525360107421875, + "loss": 0.0355, + "odds_ratio_loss": 0.0015732902102172375, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017603111919015646, + "rewards/margins": 0.5034933090209961, + "rewards/rejected": -0.5052536129951477, + "sft_loss": 0.017603110522031784, + "step": 3695 + }, + { + "epoch": 5.344902386117137, + "grad_norm": 0.9786549040478829, + "learning_rate": 2.4170186148624003e-07, + "logits/chosen": -0.8309451341629028, + "logits/rejected": -0.7362473011016846, + "logps/chosen": -0.02056121453642845, + "logps/rejected": -4.408735752105713, + "loss": 0.0377, + "odds_ratio_loss": 0.0009791934862732887, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002056121826171875, + "rewards/margins": 0.43881747126579285, + "rewards/rejected": -0.44087356328964233, + "sft_loss": 0.02056121453642845, + "step": 3696 + }, + { + "epoch": 5.346348517715112, + "grad_norm": 0.9266218741647381, + "learning_rate": 2.406397155691713e-07, + "logits/chosen": -1.0537879467010498, + "logits/rejected": -0.9023940563201904, + "logps/chosen": -0.04712200164794922, + "logps/rejected": -3.5155301094055176, + "loss": 0.0293, + "odds_ratio_loss": 0.003726676106452942, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004712200257927179, + "rewards/margins": 0.34684082865715027, + "rewards/rejected": -0.3515530228614807, + "sft_loss": 0.04712200164794922, + "step": 3697 + }, + { + "epoch": 5.347794649313087, + "grad_norm": 0.8161323163215715, + "learning_rate": 2.3957983618442037e-07, + "logits/chosen": -0.8924893140792847, + "logits/rejected": -0.6488431692123413, + "logps/chosen": -0.019642913714051247, + "logps/rejected": -5.622814178466797, + "loss": 0.0344, + "odds_ratio_loss": 0.0009415658423677087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019642915576696396, + "rewards/margins": 0.5603170990943909, + "rewards/rejected": -0.5622814297676086, + "sft_loss": 0.019642913714051247, + "step": 3698 + }, + { + "epoch": 5.349240780911063, + "grad_norm": 1.0073435267680912, + "learning_rate": 2.385222239709903e-07, + "logits/chosen": -0.939033031463623, + "logits/rejected": -0.7678767442703247, + "logps/chosen": -0.023407379165291786, + "logps/rejected": -5.834734916687012, + "loss": 0.0317, + "odds_ratio_loss": 0.001715653808787465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0023407377302646637, + "rewards/margins": 0.5811327695846558, + "rewards/rejected": -0.5834734439849854, + "sft_loss": 0.023407379165291786, + "step": 3699 + }, + { + "epoch": 5.350686912509039, + "grad_norm": 1.3694533433954266, + "learning_rate": 2.374668795665218e-07, + "logits/chosen": -0.9073183536529541, + "logits/rejected": -0.8298658132553101, + "logps/chosen": -0.01922656036913395, + "logps/rejected": -4.1897687911987305, + "loss": 0.0341, + "odds_ratio_loss": 0.0013735933462157845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019226560834795237, + "rewards/margins": 0.4170542359352112, + "rewards/rejected": -0.41897690296173096, + "sft_loss": 0.01922656036913395, + "step": 3700 + }, + { + "epoch": 5.352133044107013, + "grad_norm": 0.8168201385481627, + "learning_rate": 2.3641380360728447e-07, + "logits/chosen": -0.9471131563186646, + "logits/rejected": -0.6616654396057129, + "logps/chosen": -0.019169360399246216, + "logps/rejected": -6.367465972900391, + "loss": 0.0228, + "odds_ratio_loss": 0.0011902657570317388, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019169360166415572, + "rewards/margins": 0.6348296403884888, + "rewards/rejected": -0.6367465853691101, + "sft_loss": 0.019169360399246216, + "step": 3701 + }, + { + "epoch": 5.353579175704989, + "grad_norm": 0.9598137915603296, + "learning_rate": 2.3536299672818205e-07, + "logits/chosen": -0.6729775071144104, + "logits/rejected": -0.6349964737892151, + "logps/chosen": -0.05535079538822174, + "logps/rejected": -4.376411437988281, + "loss": 0.0344, + "odds_ratio_loss": 0.0037783747538924217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005535079166293144, + "rewards/margins": 0.432106077671051, + "rewards/rejected": -0.4376411736011505, + "sft_loss": 0.05535079538822174, + "step": 3702 + }, + { + "epoch": 5.355025307302965, + "grad_norm": 1.06738741943502, + "learning_rate": 2.3431445956274954e-07, + "logits/chosen": -1.015805721282959, + "logits/rejected": -0.7145316004753113, + "logps/chosen": -0.027575181797146797, + "logps/rejected": -4.448596000671387, + "loss": 0.0476, + "odds_ratio_loss": 0.0016211337642744184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002757518319413066, + "rewards/margins": 0.4421020746231079, + "rewards/rejected": -0.4448596239089966, + "sft_loss": 0.027575181797146797, + "step": 3703 + }, + { + "epoch": 5.35647143890094, + "grad_norm": 1.1261370264518713, + "learning_rate": 2.3326819274315368e-07, + "logits/chosen": -0.7991698980331421, + "logits/rejected": -0.5997868776321411, + "logps/chosen": -0.008198452182114124, + "logps/rejected": -5.460474967956543, + "loss": 0.036, + "odds_ratio_loss": 0.00043488398659974337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0008198452414944768, + "rewards/margins": 0.5452276468276978, + "rewards/rejected": -0.5460475087165833, + "sft_loss": 0.008198452182114124, + "step": 3704 + }, + { + "epoch": 5.357917570498915, + "grad_norm": 1.0861893135806593, + "learning_rate": 2.3222419690019435e-07, + "logits/chosen": -0.8181073069572449, + "logits/rejected": -0.6101012229919434, + "logps/chosen": -0.044420160353183746, + "logps/rejected": -5.721585750579834, + "loss": 0.0363, + "odds_ratio_loss": 0.0018686356488615274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004442016128450632, + "rewards/margins": 0.5677164793014526, + "rewards/rejected": -0.5721585154533386, + "sft_loss": 0.044420160353183746, + "step": 3705 + }, + { + "epoch": 5.359363702096891, + "grad_norm": 0.9995579308978673, + "learning_rate": 2.3118247266329872e-07, + "logits/chosen": -0.7560127973556519, + "logits/rejected": -0.5476009845733643, + "logps/chosen": -0.04683084785938263, + "logps/rejected": -7.408351421356201, + "loss": 0.0357, + "odds_ratio_loss": 0.0008836622582748532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004683084320276976, + "rewards/margins": 0.7361520528793335, + "rewards/rejected": -0.7408351302146912, + "sft_loss": 0.04683084785938263, + "step": 3706 + }, + { + "epoch": 5.360809833694867, + "grad_norm": 0.8843187410781583, + "learning_rate": 2.3014302066052748e-07, + "logits/chosen": -0.827779233455658, + "logits/rejected": -0.6347323656082153, + "logps/chosen": -0.010669386014342308, + "logps/rejected": -3.1830084323883057, + "loss": 0.0342, + "odds_ratio_loss": 0.0007001932244747877, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001066938741132617, + "rewards/margins": 0.3172339200973511, + "rewards/rejected": -0.31830084323883057, + "sft_loss": 0.010669386014342308, + "step": 3707 + }, + { + "epoch": 5.362255965292841, + "grad_norm": 0.9632499279044018, + "learning_rate": 2.291058415185696e-07, + "logits/chosen": -0.9718351364135742, + "logits/rejected": -0.7804862260818481, + "logps/chosen": -0.02261853590607643, + "logps/rejected": -3.541126012802124, + "loss": 0.0345, + "odds_ratio_loss": 0.044430263340473175, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.002261853776872158, + "rewards/margins": 0.3518507480621338, + "rewards/rejected": -0.3541126251220703, + "sft_loss": 0.02261853590607643, + "step": 3708 + }, + { + "epoch": 5.363702096890817, + "grad_norm": 1.055009170351824, + "learning_rate": 2.2807093586274396e-07, + "logits/chosen": -0.840499758720398, + "logits/rejected": -0.7038973569869995, + "logps/chosen": -0.04808530956506729, + "logps/rejected": -5.187693119049072, + "loss": 0.0315, + "odds_ratio_loss": 0.0014545705635100603, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004808531142771244, + "rewards/margins": 0.5139607787132263, + "rewards/rejected": -0.5187693238258362, + "sft_loss": 0.04808530956506729, + "step": 3709 + }, + { + "epoch": 5.365148228488793, + "grad_norm": 1.1656407978335859, + "learning_rate": 2.27038304316999e-07, + "logits/chosen": -0.6774870157241821, + "logits/rejected": -0.48925572633743286, + "logps/chosen": -0.009755231440067291, + "logps/rejected": -6.098196983337402, + "loss": 0.0354, + "odds_ratio_loss": 0.0006270483136177063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00097552320221439, + "rewards/margins": 0.6088441610336304, + "rewards/rejected": -0.6098197102546692, + "sft_loss": 0.009755231440067291, + "step": 3710 + }, + { + "epoch": 5.366594360086768, + "grad_norm": 1.0338534343629824, + "learning_rate": 2.2600794750391273e-07, + "logits/chosen": -0.7158972024917603, + "logits/rejected": -0.6297093629837036, + "logps/chosen": -0.022365085780620575, + "logps/rejected": -4.423712730407715, + "loss": 0.0354, + "odds_ratio_loss": 0.0009890659712255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022365087643265724, + "rewards/margins": 0.4401347041130066, + "rewards/rejected": -0.4423712491989136, + "sft_loss": 0.022365085780620575, + "step": 3711 + }, + { + "epoch": 5.368040491684743, + "grad_norm": 1.1168479872493293, + "learning_rate": 2.2497986604469e-07, + "logits/chosen": -0.7737710475921631, + "logits/rejected": -0.6601718068122864, + "logps/chosen": -0.008890267461538315, + "logps/rejected": -5.52163028717041, + "loss": 0.0382, + "odds_ratio_loss": 0.002172160428017378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0008890267345122993, + "rewards/margins": 0.5512740612030029, + "rewards/rejected": -0.5521630644798279, + "sft_loss": 0.008890267461538315, + "step": 3712 + }, + { + "epoch": 5.369486623282719, + "grad_norm": 1.1080662987117444, + "learning_rate": 2.2395406055916655e-07, + "logits/chosen": -0.8222386837005615, + "logits/rejected": -0.668541431427002, + "logps/chosen": -0.0324178971350193, + "logps/rejected": -5.162585735321045, + "loss": 0.037, + "odds_ratio_loss": 0.0027628212701529264, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0032417899928987026, + "rewards/margins": 0.5130168199539185, + "rewards/rejected": -0.5162585973739624, + "sft_loss": 0.0324178971350193, + "step": 3713 + }, + { + "epoch": 5.370932754880694, + "grad_norm": 1.4577184250784672, + "learning_rate": 2.2293053166580278e-07, + "logits/chosen": -0.9864585995674133, + "logits/rejected": -0.5810337066650391, + "logps/chosen": -0.02716916799545288, + "logps/rejected": -5.216684341430664, + "loss": 0.0637, + "odds_ratio_loss": 0.002008678624406457, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002716916613280773, + "rewards/margins": 0.5189515352249146, + "rewards/rejected": -0.5216684341430664, + "sft_loss": 0.02716916799545288, + "step": 3714 + }, + { + "epoch": 5.3723788864786695, + "grad_norm": 1.471600650771753, + "learning_rate": 2.2190927998168952e-07, + "logits/chosen": -0.794467568397522, + "logits/rejected": -0.6563206315040588, + "logps/chosen": -0.056384190917015076, + "logps/rejected": -5.826344966888428, + "loss": 0.0401, + "odds_ratio_loss": 0.005085381213575602, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005638418719172478, + "rewards/margins": 0.5769960880279541, + "rewards/rejected": -0.5826345086097717, + "sft_loss": 0.056384190917015076, + "step": 3715 + }, + { + "epoch": 5.373825018076645, + "grad_norm": 1.1637947808860327, + "learning_rate": 2.2089030612254223e-07, + "logits/chosen": -0.579232394695282, + "logits/rejected": -0.5474807024002075, + "logps/chosen": -0.016251739114522934, + "logps/rejected": -4.210132122039795, + "loss": 0.0401, + "odds_ratio_loss": 0.0010360369924455881, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016251739580184221, + "rewards/margins": 0.4193880558013916, + "rewards/rejected": -0.4210132360458374, + "sft_loss": 0.016251739114522934, + "step": 3716 + }, + { + "epoch": 5.375271149674621, + "grad_norm": 1.4630343781840194, + "learning_rate": 2.198736107027046e-07, + "logits/chosen": -1.1237835884094238, + "logits/rejected": -0.7257318496704102, + "logps/chosen": -0.008886278606951237, + "logps/rejected": -7.131657123565674, + "loss": 0.0322, + "odds_ratio_loss": 0.00011410063598304987, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0008886277792043984, + "rewards/margins": 0.7122771143913269, + "rewards/rejected": -0.7131657004356384, + "sft_loss": 0.008886278606951237, + "step": 3717 + }, + { + "epoch": 5.376717281272596, + "grad_norm": 1.022017673541387, + "learning_rate": 2.18859194335145e-07, + "logits/chosen": -0.9754039645195007, + "logits/rejected": -0.7429482340812683, + "logps/chosen": -0.024609964340925217, + "logps/rejected": -5.291652679443359, + "loss": 0.0202, + "odds_ratio_loss": 0.0017671944806352258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002460996387526393, + "rewards/margins": 0.5267042517662048, + "rewards/rejected": -0.5291653275489807, + "sft_loss": 0.024609964340925217, + "step": 3718 + }, + { + "epoch": 5.378163412870571, + "grad_norm": 1.1961288331402615, + "learning_rate": 2.178470576314595e-07, + "logits/chosen": -0.8683935403823853, + "logits/rejected": -0.6840660572052002, + "logps/chosen": -0.021254369989037514, + "logps/rejected": -6.397770881652832, + "loss": 0.0351, + "odds_ratio_loss": 0.002227251650765538, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002125436905771494, + "rewards/margins": 0.6376516819000244, + "rewards/rejected": -0.6397770643234253, + "sft_loss": 0.021254369989037514, + "step": 3719 + }, + { + "epoch": 5.379609544468547, + "grad_norm": 1.1777353978631238, + "learning_rate": 2.1683720120186977e-07, + "logits/chosen": -0.7626063823699951, + "logits/rejected": -0.6675928831100464, + "logps/chosen": -0.006277444772422314, + "logps/rejected": -4.882328987121582, + "loss": 0.0248, + "odds_ratio_loss": 0.0005971640930511057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0006277445354498923, + "rewards/margins": 0.48760515451431274, + "rewards/rejected": -0.48823288083076477, + "sft_loss": 0.006277444772422314, + "step": 3720 + }, + { + "epoch": 5.381055676066522, + "grad_norm": 0.9532634639236007, + "learning_rate": 2.1582962565522124e-07, + "logits/chosen": -0.9451834559440613, + "logits/rejected": -0.7273751497268677, + "logps/chosen": -0.011347649618983269, + "logps/rejected": -4.3883376121521, + "loss": 0.0325, + "odds_ratio_loss": 0.0006653146119788289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001134765101596713, + "rewards/margins": 0.43769901990890503, + "rewards/rejected": -0.4388337731361389, + "sft_loss": 0.011347649618983269, + "step": 3721 + }, + { + "epoch": 5.3825018076644975, + "grad_norm": 0.9895790538702688, + "learning_rate": 2.1482433159898528e-07, + "logits/chosen": -1.0272815227508545, + "logits/rejected": -0.8672689199447632, + "logps/chosen": -0.016958389431238174, + "logps/rejected": -4.27727746963501, + "loss": 0.0295, + "odds_ratio_loss": 0.0013183187693357468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001695839106105268, + "rewards/margins": 0.4260319173336029, + "rewards/rejected": -0.4277278184890747, + "sft_loss": 0.016958389431238174, + "step": 3722 + }, + { + "epoch": 5.383947939262473, + "grad_norm": 1.1363864606811938, + "learning_rate": 2.138213196392571e-07, + "logits/chosen": -1.0011087656021118, + "logits/rejected": -0.7626456022262573, + "logps/chosen": -0.041216447949409485, + "logps/rejected": -5.496888160705566, + "loss": 0.0293, + "odds_ratio_loss": 0.0034582449588924646, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004121644888073206, + "rewards/margins": 0.5455672144889832, + "rewards/rejected": -0.5496888160705566, + "sft_loss": 0.041216447949409485, + "step": 3723 + }, + { + "epoch": 5.385394070860448, + "grad_norm": 1.236532097092624, + "learning_rate": 2.128205903807574e-07, + "logits/chosen": -0.9743160605430603, + "logits/rejected": -0.8304657936096191, + "logps/chosen": -0.013960372656583786, + "logps/rejected": -3.3339335918426514, + "loss": 0.043, + "odds_ratio_loss": 0.001198888523504138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013960371725261211, + "rewards/margins": 0.3319973349571228, + "rewards/rejected": -0.3333933651447296, + "sft_loss": 0.013960372656583786, + "step": 3724 + }, + { + "epoch": 5.386840202458424, + "grad_norm": 1.1081513188918342, + "learning_rate": 2.1182214442682755e-07, + "logits/chosen": -0.8094361424446106, + "logits/rejected": -0.5704917311668396, + "logps/chosen": -0.05938174948096275, + "logps/rejected": -6.215296268463135, + "loss": 0.0384, + "odds_ratio_loss": 0.004498139023780823, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00593817513436079, + "rewards/margins": 0.6155914068222046, + "rewards/rejected": -0.6215296387672424, + "sft_loss": 0.05938174948096275, + "step": 3725 + }, + { + "epoch": 5.388286334056399, + "grad_norm": 1.3102679271672462, + "learning_rate": 2.1082598237943627e-07, + "logits/chosen": -0.904535174369812, + "logits/rejected": -0.7191603183746338, + "logps/chosen": -0.10017338395118713, + "logps/rejected": -4.470028877258301, + "loss": 0.0492, + "odds_ratio_loss": 0.0035221197176724672, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010017338208854198, + "rewards/margins": 0.4369855523109436, + "rewards/rejected": -0.4470028877258301, + "sft_loss": 0.10017338395118713, + "step": 3726 + }, + { + "epoch": 5.389732465654374, + "grad_norm": 1.1096478004364896, + "learning_rate": 2.09832104839172e-07, + "logits/chosen": -0.9260188341140747, + "logits/rejected": -0.6095644235610962, + "logps/chosen": -0.04054839536547661, + "logps/rejected": -5.756593704223633, + "loss": 0.0386, + "odds_ratio_loss": 0.0023188358172774315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004054839722812176, + "rewards/margins": 0.5716045498847961, + "rewards/rejected": -0.5756593942642212, + "sft_loss": 0.04054839536547661, + "step": 3727 + }, + { + "epoch": 5.39117859725235, + "grad_norm": 1.0116071233532076, + "learning_rate": 2.0884051240524837e-07, + "logits/chosen": -0.8574241399765015, + "logits/rejected": -0.7022998332977295, + "logps/chosen": -0.011260045692324638, + "logps/rejected": -5.040678977966309, + "loss": 0.0398, + "odds_ratio_loss": 0.0007709235651418567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0011260046157985926, + "rewards/margins": 0.502941906452179, + "rewards/rejected": -0.5040678977966309, + "sft_loss": 0.011260045692324638, + "step": 3728 + }, + { + "epoch": 5.3926247288503255, + "grad_norm": 1.1338471650039585, + "learning_rate": 2.0785120567549906e-07, + "logits/chosen": -1.079079031944275, + "logits/rejected": -0.7133069038391113, + "logps/chosen": -0.047831885516643524, + "logps/rejected": -5.607325077056885, + "loss": 0.0412, + "odds_ratio_loss": 0.0006497707217931747, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004783188924193382, + "rewards/margins": 0.5559492707252502, + "rewards/rejected": -0.5607325434684753, + "sft_loss": 0.047831885516643524, + "step": 3729 + }, + { + "epoch": 5.394070860448301, + "grad_norm": 1.257604801635166, + "learning_rate": 2.0686418524638172e-07, + "logits/chosen": -1.0921964645385742, + "logits/rejected": -0.5764094591140747, + "logps/chosen": -0.06157654896378517, + "logps/rejected": -5.977487564086914, + "loss": 0.0485, + "odds_ratio_loss": 0.003265151521191001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006157655268907547, + "rewards/margins": 0.5915911197662354, + "rewards/rejected": -0.5977488160133362, + "sft_loss": 0.06157654896378517, + "step": 3730 + }, + { + "epoch": 5.395516992046276, + "grad_norm": 1.1152765120174375, + "learning_rate": 2.058794517129736e-07, + "logits/chosen": -1.0628501176834106, + "logits/rejected": -0.7114624977111816, + "logps/chosen": -0.017897220328450203, + "logps/rejected": -4.574059963226318, + "loss": 0.0478, + "odds_ratio_loss": 0.0005718549946323037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017897221259772778, + "rewards/margins": 0.455616295337677, + "rewards/rejected": -0.4574059844017029, + "sft_loss": 0.017897220328450203, + "step": 3731 + }, + { + "epoch": 5.396963123644252, + "grad_norm": 1.43232482896934, + "learning_rate": 2.0489700566897583e-07, + "logits/chosen": -0.8116693496704102, + "logits/rejected": -0.612543523311615, + "logps/chosen": -0.05777638033032417, + "logps/rejected": -3.095944404602051, + "loss": 0.0409, + "odds_ratio_loss": 0.004115620162338018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005777638405561447, + "rewards/margins": 0.3038167953491211, + "rewards/rejected": -0.30959442257881165, + "sft_loss": 0.05777638033032417, + "step": 3732 + }, + { + "epoch": 5.398409255242227, + "grad_norm": 1.2261328303484362, + "learning_rate": 2.0391684770670747e-07, + "logits/chosen": -1.0025750398635864, + "logits/rejected": -0.8340614438056946, + "logps/chosen": -0.03172256425023079, + "logps/rejected": -4.1400065422058105, + "loss": 0.0499, + "odds_ratio_loss": 0.0006993028800934553, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003172256052494049, + "rewards/margins": 0.4108284115791321, + "rewards/rejected": -0.4140006899833679, + "sft_loss": 0.03172256425023079, + "step": 3733 + }, + { + "epoch": 5.399855386840202, + "grad_norm": 1.2380490360257563, + "learning_rate": 2.029389784171096e-07, + "logits/chosen": -0.9819266200065613, + "logits/rejected": -0.7181769609451294, + "logps/chosen": -0.027787383645772934, + "logps/rejected": -5.828107833862305, + "loss": 0.0399, + "odds_ratio_loss": 0.002232284052297473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0027787385042756796, + "rewards/margins": 0.5800320506095886, + "rewards/rejected": -0.5828108191490173, + "sft_loss": 0.027787383645772934, + "step": 3734 + }, + { + "epoch": 5.401301518438178, + "grad_norm": 1.1825096755119007, + "learning_rate": 2.0196339838974353e-07, + "logits/chosen": -0.796126663684845, + "logits/rejected": -0.738174557685852, + "logps/chosen": -0.01109451986849308, + "logps/rejected": -5.02370548248291, + "loss": 0.0318, + "odds_ratio_loss": 0.001242693280801177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001109451986849308, + "rewards/margins": 0.5012611150741577, + "rewards/rejected": -0.5023705959320068, + "sft_loss": 0.01109451986849308, + "step": 3735 + }, + { + "epoch": 5.4027476500361535, + "grad_norm": 1.286684498068449, + "learning_rate": 2.009901082127894e-07, + "logits/chosen": -0.9500235915184021, + "logits/rejected": -0.5735840201377869, + "logps/chosen": -0.03152807801961899, + "logps/rejected": -4.867311477661133, + "loss": 0.0536, + "odds_ratio_loss": 0.001049485756084323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0031528077088296413, + "rewards/margins": 0.4835783541202545, + "rewards/rejected": -0.4867311418056488, + "sft_loss": 0.03152807801961899, + "step": 3736 + }, + { + "epoch": 5.404193781634128, + "grad_norm": 1.0746790175253347, + "learning_rate": 2.0001910847304893e-07, + "logits/chosen": -1.0120949745178223, + "logits/rejected": -0.7268694639205933, + "logps/chosen": -0.017900846898555756, + "logps/rejected": -7.724081039428711, + "loss": 0.0353, + "odds_ratio_loss": 0.00031501834746450186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017900847597047687, + "rewards/margins": 0.7706180810928345, + "rewards/rejected": -0.7724080681800842, + "sft_loss": 0.017900846898555756, + "step": 3737 + }, + { + "epoch": 5.405639913232104, + "grad_norm": 1.0475786833357763, + "learning_rate": 1.9905039975594008e-07, + "logits/chosen": -1.074700117111206, + "logits/rejected": -0.692499577999115, + "logps/chosen": -0.04430760070681572, + "logps/rejected": -5.837507247924805, + "loss": 0.0468, + "odds_ratio_loss": 0.0013199535897001624, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004430760163813829, + "rewards/margins": 0.579319953918457, + "rewards/rejected": -0.5837507247924805, + "sft_loss": 0.04430760070681572, + "step": 3738 + }, + { + "epoch": 5.40708604483008, + "grad_norm": 1.0466069420380553, + "learning_rate": 1.9808398264550142e-07, + "logits/chosen": -0.8342309594154358, + "logits/rejected": -0.5902801752090454, + "logps/chosen": -0.01617673598229885, + "logps/rejected": -6.466917037963867, + "loss": 0.0397, + "odds_ratio_loss": 0.0010513067245483398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016176735516637564, + "rewards/margins": 0.6450740098953247, + "rewards/rejected": -0.6466916799545288, + "sft_loss": 0.01617673598229885, + "step": 3739 + }, + { + "epoch": 5.408532176428055, + "grad_norm": 1.1221433314828366, + "learning_rate": 1.9711985772438998e-07, + "logits/chosen": -0.9502919912338257, + "logits/rejected": -0.8300921320915222, + "logps/chosen": -0.03330773860216141, + "logps/rejected": -3.3701939582824707, + "loss": 0.0392, + "odds_ratio_loss": 0.003339210757985711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003330774139612913, + "rewards/margins": 0.3336886465549469, + "rewards/rejected": -0.3370194435119629, + "sft_loss": 0.03330773860216141, + "step": 3740 + }, + { + "epoch": 5.40997830802603, + "grad_norm": 0.82956288869114, + "learning_rate": 1.9615802557387995e-07, + "logits/chosen": -0.7121748924255371, + "logits/rejected": -0.6548438668251038, + "logps/chosen": -0.015078652650117874, + "logps/rejected": -6.196207523345947, + "loss": 0.0187, + "odds_ratio_loss": 0.002625955268740654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0015078652650117874, + "rewards/margins": 0.6181129217147827, + "rewards/rejected": -0.6196207404136658, + "sft_loss": 0.015078652650117874, + "step": 3741 + }, + { + "epoch": 5.411424439624006, + "grad_norm": 1.1509802573765, + "learning_rate": 1.9519848677386207e-07, + "logits/chosen": -1.094519019126892, + "logits/rejected": -0.7549199461936951, + "logps/chosen": -0.03511492535471916, + "logps/rejected": -5.343645095825195, + "loss": 0.0348, + "odds_ratio_loss": 0.0017946298466995358, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0035114926286041737, + "rewards/margins": 0.5308530330657959, + "rewards/rejected": -0.5343644618988037, + "sft_loss": 0.03511492535471916, + "step": 3742 + }, + { + "epoch": 5.412870571221982, + "grad_norm": 1.1964810085365847, + "learning_rate": 1.942412419028483e-07, + "logits/chosen": -0.9508788585662842, + "logits/rejected": -0.7533563375473022, + "logps/chosen": -0.03399357944726944, + "logps/rejected": -5.815676212310791, + "loss": 0.051, + "odds_ratio_loss": 0.0037674754858016968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0033993578981608152, + "rewards/margins": 0.5781682729721069, + "rewards/rejected": -0.581567645072937, + "sft_loss": 0.03399357944726944, + "step": 3743 + }, + { + "epoch": 5.414316702819956, + "grad_norm": 1.0946416488877777, + "learning_rate": 1.9328629153796317e-07, + "logits/chosen": -1.065413236618042, + "logits/rejected": -0.7837972640991211, + "logps/chosen": -0.025034120306372643, + "logps/rejected": -5.018168926239014, + "loss": 0.0377, + "odds_ratio_loss": 0.0013940563658252358, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002503412077203393, + "rewards/margins": 0.49931347370147705, + "rewards/rejected": -0.5018168687820435, + "sft_loss": 0.025034120306372643, + "step": 3744 + }, + { + "epoch": 5.415762834417932, + "grad_norm": 0.916282563166462, + "learning_rate": 1.9233363625495057e-07, + "logits/chosen": -0.8925954699516296, + "logits/rejected": -0.6070317625999451, + "logps/chosen": -0.018731582909822464, + "logps/rejected": -3.630098342895508, + "loss": 0.0252, + "odds_ratio_loss": 0.0011892381589859724, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018731581512838602, + "rewards/margins": 0.36113670468330383, + "rewards/rejected": -0.36300989985466003, + "sft_loss": 0.018731582909822464, + "step": 3745 + }, + { + "epoch": 5.417208966015908, + "grad_norm": 0.9606132598371431, + "learning_rate": 1.9138327662817065e-07, + "logits/chosen": -0.8515411615371704, + "logits/rejected": -0.6688830852508545, + "logps/chosen": -0.01788986660540104, + "logps/rejected": -5.797184944152832, + "loss": 0.0263, + "odds_ratio_loss": 0.0017000783700495958, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017889868468046188, + "rewards/margins": 0.5779294967651367, + "rewards/rejected": -0.5797185301780701, + "sft_loss": 0.01788986660540104, + "step": 3746 + }, + { + "epoch": 5.4186550976138825, + "grad_norm": 1.0628812282654512, + "learning_rate": 1.9043521323059752e-07, + "logits/chosen": -0.8878743052482605, + "logits/rejected": -0.6467165946960449, + "logps/chosen": -0.061739467084407806, + "logps/rejected": -6.201615810394287, + "loss": 0.0443, + "odds_ratio_loss": 0.001052954001352191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006173947360366583, + "rewards/margins": 0.6139876842498779, + "rewards/rejected": -0.6201615929603577, + "sft_loss": 0.061739467084407806, + "step": 3747 + }, + { + "epoch": 5.420101229211858, + "grad_norm": 1.0019449779058243, + "learning_rate": 1.8948944663382328e-07, + "logits/chosen": -0.8111994862556458, + "logits/rejected": -0.5611193180084229, + "logps/chosen": -0.023934748023748398, + "logps/rejected": -6.190435409545898, + "loss": 0.0274, + "odds_ratio_loss": 0.0013642843114212155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0023934748023748398, + "rewards/margins": 0.6166501045227051, + "rewards/rejected": -0.6190435290336609, + "sft_loss": 0.023934748023748398, + "step": 3748 + }, + { + "epoch": 5.421547360809834, + "grad_norm": 3.0322591142715507, + "learning_rate": 1.8854597740805267e-07, + "logits/chosen": -0.8009194731712341, + "logits/rejected": -0.5865261554718018, + "logps/chosen": -0.047983746975660324, + "logps/rejected": -4.174554824829102, + "loss": 0.0246, + "odds_ratio_loss": 0.001906828721985221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004798375070095062, + "rewards/margins": 0.41265714168548584, + "rewards/rejected": -0.4174554944038391, + "sft_loss": 0.047983746975660324, + "step": 3749 + }, + { + "epoch": 5.422993492407809, + "grad_norm": 1.2038166729132256, + "learning_rate": 1.8760480612210848e-07, + "logits/chosen": -0.7556824088096619, + "logits/rejected": -0.704839289188385, + "logps/chosen": -0.046191371977329254, + "logps/rejected": -3.388821601867676, + "loss": 0.0348, + "odds_ratio_loss": 0.004520849324762821, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00461913738399744, + "rewards/margins": 0.33426302671432495, + "rewards/rejected": -0.338882178068161, + "sft_loss": 0.046191371977329254, + "step": 3750 + }, + { + "epoch": 5.424439624005784, + "grad_norm": 1.366842333937244, + "learning_rate": 1.866659333434244e-07, + "logits/chosen": -0.8494126796722412, + "logits/rejected": -0.5990194082260132, + "logps/chosen": -0.06559625267982483, + "logps/rejected": -5.316402912139893, + "loss": 0.0417, + "odds_ratio_loss": 0.005418956745415926, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006559625267982483, + "rewards/margins": 0.525080680847168, + "rewards/rejected": -0.531640350818634, + "sft_loss": 0.06559625267982483, + "step": 3751 + }, + { + "epoch": 5.42588575560376, + "grad_norm": 1.0710079807891604, + "learning_rate": 1.8572935963805246e-07, + "logits/chosen": -0.991873562335968, + "logits/rejected": -0.8343634605407715, + "logps/chosen": -0.026091061532497406, + "logps/rejected": -5.874682426452637, + "loss": 0.0445, + "odds_ratio_loss": 0.0013955554459244013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002609106246381998, + "rewards/margins": 0.5848591327667236, + "rewards/rejected": -0.5874682664871216, + "sft_loss": 0.026091061532497406, + "step": 3752 + }, + { + "epoch": 5.427331887201736, + "grad_norm": 1.127959440254014, + "learning_rate": 1.8479508557065525e-07, + "logits/chosen": -0.8945366740226746, + "logits/rejected": -0.5752571821212769, + "logps/chosen": -0.012399179860949516, + "logps/rejected": -8.56202507019043, + "loss": 0.0271, + "odds_ratio_loss": 0.0005926304729655385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001239917939528823, + "rewards/margins": 0.8549625873565674, + "rewards/rejected": -0.8562024235725403, + "sft_loss": 0.012399179860949516, + "step": 3753 + }, + { + "epoch": 5.428778018799711, + "grad_norm": 1.0389549793471313, + "learning_rate": 1.838631117045102e-07, + "logits/chosen": -0.8418703079223633, + "logits/rejected": -0.6339346766471863, + "logps/chosen": -0.013523176312446594, + "logps/rejected": -6.4567999839782715, + "loss": 0.0346, + "odds_ratio_loss": 0.0007304397877305746, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013523175148293376, + "rewards/margins": 0.6443277597427368, + "rewards/rejected": -0.6456800103187561, + "sft_loss": 0.013523176312446594, + "step": 3754 + }, + { + "epoch": 5.430224150397686, + "grad_norm": 1.2868339892184242, + "learning_rate": 1.8293343860150823e-07, + "logits/chosen": -0.9397022128105164, + "logits/rejected": -0.6365588903427124, + "logps/chosen": -0.04777473583817482, + "logps/rejected": -4.0239644050598145, + "loss": 0.0416, + "odds_ratio_loss": 0.0024959403090178967, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004777473863214254, + "rewards/margins": 0.39761897921562195, + "rewards/rejected": -0.40239647030830383, + "sft_loss": 0.04777473583817482, + "step": 3755 + }, + { + "epoch": 5.431670281995662, + "grad_norm": 0.9056856128260714, + "learning_rate": 1.8200606682215215e-07, + "logits/chosen": -0.9845834374427795, + "logits/rejected": -0.6842622756958008, + "logps/chosen": -0.07160784304141998, + "logps/rejected": -5.863983631134033, + "loss": 0.0313, + "odds_ratio_loss": 0.005471718031913042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007160785607993603, + "rewards/margins": 0.5792375802993774, + "rewards/rejected": -0.5863983631134033, + "sft_loss": 0.07160784304141998, + "step": 3756 + }, + { + "epoch": 5.433116413593637, + "grad_norm": 1.1195527461945456, + "learning_rate": 1.810809969255578e-07, + "logits/chosen": -0.8662546873092651, + "logits/rejected": -0.6356720924377441, + "logps/chosen": -0.047709230333566666, + "logps/rejected": -5.315559387207031, + "loss": 0.0366, + "odds_ratio_loss": 0.0005675168940797448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004770922940224409, + "rewards/margins": 0.5267850160598755, + "rewards/rejected": -0.5315558910369873, + "sft_loss": 0.047709230333566666, + "step": 3757 + }, + { + "epoch": 5.434562545191612, + "grad_norm": 1.0522209896848032, + "learning_rate": 1.801582294694537e-07, + "logits/chosen": -0.690969705581665, + "logits/rejected": -0.4468667805194855, + "logps/chosen": -0.01992463506758213, + "logps/rejected": -5.407593727111816, + "loss": 0.0331, + "odds_ratio_loss": 0.0010238731047138572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019924635998904705, + "rewards/margins": 0.5387669205665588, + "rewards/rejected": -0.5407594442367554, + "sft_loss": 0.01992463506758213, + "step": 3758 + }, + { + "epoch": 5.436008676789588, + "grad_norm": 1.1908143743026758, + "learning_rate": 1.7923776501018017e-07, + "logits/chosen": -0.8162471055984497, + "logits/rejected": -0.5732954740524292, + "logps/chosen": -0.04143443703651428, + "logps/rejected": -6.230381011962891, + "loss": 0.0379, + "odds_ratio_loss": 0.00850686989724636, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004143443889915943, + "rewards/margins": 0.618894636631012, + "rewards/rejected": -0.623038113117218, + "sft_loss": 0.04143443703651428, + "step": 3759 + }, + { + "epoch": 5.437454808387563, + "grad_norm": 1.1326155288447952, + "learning_rate": 1.783196041026871e-07, + "logits/chosen": -1.0182111263275146, + "logits/rejected": -0.8080993294715881, + "logps/chosen": -0.028092101216316223, + "logps/rejected": -3.2441256046295166, + "loss": 0.0268, + "odds_ratio_loss": 0.0010873202700167894, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0028092102147638798, + "rewards/margins": 0.32160335779190063, + "rewards/rejected": -0.3244125545024872, + "sft_loss": 0.028092101216316223, + "step": 3760 + }, + { + "epoch": 5.438900939985539, + "grad_norm": 1.0215706085996055, + "learning_rate": 1.774037473005392e-07, + "logits/chosen": -1.03084135055542, + "logits/rejected": -0.6289142370223999, + "logps/chosen": -0.04927384853363037, + "logps/rejected": -6.17075252532959, + "loss": 0.0429, + "odds_ratio_loss": 0.0005610902444459498, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004927385598421097, + "rewards/margins": 0.612147867679596, + "rewards/rejected": -0.6170752048492432, + "sft_loss": 0.04927384853363037, + "step": 3761 + }, + { + "epoch": 5.440347071583514, + "grad_norm": 0.7157495419275686, + "learning_rate": 1.7649019515590902e-07, + "logits/chosen": -0.9550158381462097, + "logits/rejected": -0.5582857131958008, + "logps/chosen": -0.010214051231741905, + "logps/rejected": -5.4750213623046875, + "loss": 0.0143, + "odds_ratio_loss": 0.000640549638774246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0010214050998911262, + "rewards/margins": 0.5464807152748108, + "rewards/rejected": -0.5475021600723267, + "sft_loss": 0.010214051231741905, + "step": 3762 + }, + { + "epoch": 5.44179320318149, + "grad_norm": 1.2906910872242197, + "learning_rate": 1.7557894821957996e-07, + "logits/chosen": -0.8110982179641724, + "logits/rejected": -0.6344393491744995, + "logps/chosen": -0.032719340175390244, + "logps/rejected": -4.284968376159668, + "loss": 0.0422, + "odds_ratio_loss": 0.0016023035859689116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0032719343435019255, + "rewards/margins": 0.4252249002456665, + "rewards/rejected": -0.4284968376159668, + "sft_loss": 0.032719340175390244, + "step": 3763 + }, + { + "epoch": 5.443239334779465, + "grad_norm": 1.20926488363375, + "learning_rate": 1.7467000704094635e-07, + "logits/chosen": -1.0415936708450317, + "logits/rejected": -0.7036805748939514, + "logps/chosen": -0.07510319352149963, + "logps/rejected": -5.472287178039551, + "loss": 0.0323, + "odds_ratio_loss": 0.004779131151735783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007510320283472538, + "rewards/margins": 0.5397183895111084, + "rewards/rejected": -0.5472287535667419, + "sft_loss": 0.07510319352149963, + "step": 3764 + }, + { + "epoch": 5.44468546637744, + "grad_norm": 1.705118382870525, + "learning_rate": 1.737633721680134e-07, + "logits/chosen": -0.8406180739402771, + "logits/rejected": -0.6496351957321167, + "logps/chosen": -0.037446528673172, + "logps/rejected": -6.6907958984375, + "loss": 0.0355, + "odds_ratio_loss": 0.002988762455061078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003744652960449457, + "rewards/margins": 0.665334939956665, + "rewards/rejected": -0.669079601764679, + "sft_loss": 0.037446528673172, + "step": 3765 + }, + { + "epoch": 5.446131597975416, + "grad_norm": 1.0836249668191964, + "learning_rate": 1.7285904414739316e-07, + "logits/chosen": -0.6858553886413574, + "logits/rejected": -0.6467568278312683, + "logps/chosen": -0.033721573650836945, + "logps/rejected": -4.718149185180664, + "loss": 0.032, + "odds_ratio_loss": 0.002487786579877138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0033721576910465956, + "rewards/margins": 0.46844279766082764, + "rewards/rejected": -0.47181493043899536, + "sft_loss": 0.033721573650836945, + "step": 3766 + }, + { + "epoch": 5.447577729573391, + "grad_norm": 1.2582651528740414, + "learning_rate": 1.7195702352430907e-07, + "logits/chosen": -0.7453847527503967, + "logits/rejected": -0.525572657585144, + "logps/chosen": -0.05578567460179329, + "logps/rejected": -5.3713698387146, + "loss": 0.0575, + "odds_ratio_loss": 0.0022328821942210197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005578567273914814, + "rewards/margins": 0.5315583944320679, + "rewards/rejected": -0.537136971950531, + "sft_loss": 0.05578567460179329, + "step": 3767 + }, + { + "epoch": 5.449023861171367, + "grad_norm": 1.1303720815241651, + "learning_rate": 1.7105731084259278e-07, + "logits/chosen": -0.7345448732376099, + "logits/rejected": -0.5634473562240601, + "logps/chosen": -0.04168213903903961, + "logps/rejected": -5.113981246948242, + "loss": 0.0329, + "odds_ratio_loss": 0.005578060168772936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004168213810771704, + "rewards/margins": 0.5072299242019653, + "rewards/rejected": -0.5113981366157532, + "sft_loss": 0.04168213903903961, + "step": 3768 + }, + { + "epoch": 5.450469992769342, + "grad_norm": 1.005911181986314, + "learning_rate": 1.7015990664468415e-07, + "logits/chosen": -1.0068395137786865, + "logits/rejected": -0.6776146292686462, + "logps/chosen": -0.04365962743759155, + "logps/rejected": -4.5361223220825195, + "loss": 0.0292, + "odds_ratio_loss": 0.003663485636934638, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00436596293002367, + "rewards/margins": 0.44924625754356384, + "rewards/rejected": -0.45361220836639404, + "sft_loss": 0.04365962743759155, + "step": 3769 + }, + { + "epoch": 5.451916124367317, + "grad_norm": 1.4564721360998285, + "learning_rate": 1.6926481147163173e-07, + "logits/chosen": -0.907903790473938, + "logits/rejected": -0.6354748010635376, + "logps/chosen": -0.03070155158638954, + "logps/rejected": -5.978352069854736, + "loss": 0.0857, + "odds_ratio_loss": 0.0009719114750623703, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003070155158638954, + "rewards/margins": 0.5947650671005249, + "rewards/rejected": -0.5978351831436157, + "sft_loss": 0.03070155158638954, + "step": 3770 + }, + { + "epoch": 5.453362255965293, + "grad_norm": 1.0511769484125821, + "learning_rate": 1.6837202586309185e-07, + "logits/chosen": -0.9518678188323975, + "logits/rejected": -0.609727144241333, + "logps/chosen": -0.014927854761481285, + "logps/rejected": -7.192644119262695, + "loss": 0.0293, + "odds_ratio_loss": 0.0006604095688089728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014927855227142572, + "rewards/margins": 0.7177716493606567, + "rewards/rejected": -0.7192643880844116, + "sft_loss": 0.014927854761481285, + "step": 3771 + }, + { + "epoch": 5.4548083875632685, + "grad_norm": 1.347646014723408, + "learning_rate": 1.6748155035732813e-07, + "logits/chosen": -1.0877472162246704, + "logits/rejected": -0.6998102068901062, + "logps/chosen": -0.053270746022462845, + "logps/rejected": -6.024081707000732, + "loss": 0.0624, + "odds_ratio_loss": 0.0028402383904904127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005327074788510799, + "rewards/margins": 0.5970811247825623, + "rewards/rejected": -0.6024081707000732, + "sft_loss": 0.053270746022462845, + "step": 3772 + }, + { + "epoch": 5.456254519161243, + "grad_norm": 0.7840991919195984, + "learning_rate": 1.6659338549121117e-07, + "logits/chosen": -0.6631830334663391, + "logits/rejected": -0.6605867743492126, + "logps/chosen": -0.004134493414312601, + "logps/rejected": -4.816030502319336, + "loss": 0.0235, + "odds_ratio_loss": 0.0005341377691365778, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.000413449335610494, + "rewards/margins": 0.4811896085739136, + "rewards/rejected": -0.4816030263900757, + "sft_loss": 0.004134493414312601, + "step": 3773 + }, + { + "epoch": 5.457700650759219, + "grad_norm": 1.1194943030611022, + "learning_rate": 1.6570753180021925e-07, + "logits/chosen": -0.8060338497161865, + "logits/rejected": -0.675667941570282, + "logps/chosen": -0.06142951920628548, + "logps/rejected": -4.382281303405762, + "loss": 0.0487, + "odds_ratio_loss": 0.008763323538005352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006142952013760805, + "rewards/margins": 0.43208521604537964, + "rewards/rejected": -0.43822813034057617, + "sft_loss": 0.06142951920628548, + "step": 3774 + }, + { + "epoch": 5.459146782357195, + "grad_norm": 1.258915842060084, + "learning_rate": 1.648239898184367e-07, + "logits/chosen": -0.7317820191383362, + "logits/rejected": -0.609990119934082, + "logps/chosen": -0.047425124794244766, + "logps/rejected": -4.409567832946777, + "loss": 0.046, + "odds_ratio_loss": 0.006635497324168682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004742512945085764, + "rewards/margins": 0.43621429800987244, + "rewards/rejected": -0.4409567713737488, + "sft_loss": 0.047425124794244766, + "step": 3775 + }, + { + "epoch": 5.46059291395517, + "grad_norm": 1.3023881499886933, + "learning_rate": 1.639427600785548e-07, + "logits/chosen": -0.6300375461578369, + "logits/rejected": -0.5426366329193115, + "logps/chosen": -0.01603570207953453, + "logps/rejected": -6.023974418640137, + "loss": 0.0337, + "odds_ratio_loss": 0.00035005330573767424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016035701846703887, + "rewards/margins": 0.6007938385009766, + "rewards/rejected": -0.6023973822593689, + "sft_loss": 0.01603570207953453, + "step": 3776 + }, + { + "epoch": 5.462039045553145, + "grad_norm": 1.1724540495948623, + "learning_rate": 1.6306384311186938e-07, + "logits/chosen": -0.6645495891571045, + "logits/rejected": -0.45771369338035583, + "logps/chosen": -0.013555881567299366, + "logps/rejected": -5.649051666259766, + "loss": 0.038, + "odds_ratio_loss": 0.0011070972541347146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013555882032960653, + "rewards/margins": 0.5635495781898499, + "rewards/rejected": -0.5649051666259766, + "sft_loss": 0.013555881567299366, + "step": 3777 + }, + { + "epoch": 5.463485177151121, + "grad_norm": 1.0593367688075082, + "learning_rate": 1.6218723944828416e-07, + "logits/chosen": -0.9113006591796875, + "logits/rejected": -0.8548449873924255, + "logps/chosen": -0.052851203829050064, + "logps/rejected": -3.4363300800323486, + "loss": 0.0516, + "odds_ratio_loss": 0.01034949254244566, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005285120103508234, + "rewards/margins": 0.3383478820323944, + "rewards/rejected": -0.3436330258846283, + "sft_loss": 0.052851203829050064, + "step": 3778 + }, + { + "epoch": 5.4649313087490965, + "grad_norm": 0.8107933568205687, + "learning_rate": 1.6131294961630526e-07, + "logits/chosen": -0.8878468871116638, + "logits/rejected": -0.6497321724891663, + "logps/chosen": -0.013816386461257935, + "logps/rejected": -6.9916181564331055, + "loss": 0.0238, + "odds_ratio_loss": 0.0004862607456743717, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013816386926919222, + "rewards/margins": 0.697780191898346, + "rewards/rejected": -0.6991618871688843, + "sft_loss": 0.013816386461257935, + "step": 3779 + }, + { + "epoch": 5.466377440347071, + "grad_norm": 1.1183439742376857, + "learning_rate": 1.6044097414304614e-07, + "logits/chosen": -1.0266106128692627, + "logits/rejected": -0.7022008895874023, + "logps/chosen": -0.0061122276820242405, + "logps/rejected": -7.002909183502197, + "loss": 0.0498, + "odds_ratio_loss": 0.0008495476795360446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0006112227565608919, + "rewards/margins": 0.6996797323226929, + "rewards/rejected": -0.7002909183502197, + "sft_loss": 0.0061122276820242405, + "step": 3780 + }, + { + "epoch": 5.467823571945047, + "grad_norm": 1.049467888638357, + "learning_rate": 1.5957131355422315e-07, + "logits/chosen": -0.73927241563797, + "logits/rejected": -0.6167184114456177, + "logps/chosen": -0.026645377278327942, + "logps/rejected": -3.968277931213379, + "loss": 0.0297, + "odds_ratio_loss": 0.0038094024639576674, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0026645378675311804, + "rewards/margins": 0.39416325092315674, + "rewards/rejected": -0.3968278169631958, + "sft_loss": 0.026645377278327942, + "step": 3781 + }, + { + "epoch": 5.469269703543023, + "grad_norm": 0.9677568438433538, + "learning_rate": 1.5870396837415868e-07, + "logits/chosen": -0.8513559699058533, + "logits/rejected": -0.7282320261001587, + "logps/chosen": -0.008808178827166557, + "logps/rejected": -4.3206939697265625, + "loss": 0.0331, + "odds_ratio_loss": 0.0005816483753733337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0008808178827166557, + "rewards/margins": 0.4311886429786682, + "rewards/rejected": -0.43206942081451416, + "sft_loss": 0.008808178827166557, + "step": 3782 + }, + { + "epoch": 5.4707158351409975, + "grad_norm": 1.0715870165143846, + "learning_rate": 1.5783893912577794e-07, + "logits/chosen": -0.9287527203559875, + "logits/rejected": -0.66707444190979, + "logps/chosen": -0.06083793565630913, + "logps/rejected": -4.999859809875488, + "loss": 0.0463, + "odds_ratio_loss": 0.0012267936253920197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006083793938159943, + "rewards/margins": 0.49390220642089844, + "rewards/rejected": -0.4999859929084778, + "sft_loss": 0.06083793565630913, + "step": 3783 + }, + { + "epoch": 5.472161966738973, + "grad_norm": 1.064733100710332, + "learning_rate": 1.5697622633061137e-07, + "logits/chosen": -1.0185866355895996, + "logits/rejected": -0.7323493957519531, + "logps/chosen": -0.02041870355606079, + "logps/rejected": -5.43452787399292, + "loss": 0.0333, + "odds_ratio_loss": 0.0006175260059535503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0020418702624738216, + "rewards/margins": 0.5414108633995056, + "rewards/rejected": -0.5434527397155762, + "sft_loss": 0.02041870355606079, + "step": 3784 + }, + { + "epoch": 5.473608098336949, + "grad_norm": 1.0432839302721317, + "learning_rate": 1.5611583050878997e-07, + "logits/chosen": -0.8604859113693237, + "logits/rejected": -0.6487876176834106, + "logps/chosen": -0.0667792409658432, + "logps/rejected": -6.021456718444824, + "loss": 0.0364, + "odds_ratio_loss": 0.0029010814614593983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006677924655377865, + "rewards/margins": 0.5954678058624268, + "rewards/rejected": -0.602145791053772, + "sft_loss": 0.0667792409658432, + "step": 3785 + }, + { + "epoch": 5.4750542299349245, + "grad_norm": 1.1140883737750038, + "learning_rate": 1.5525775217905125e-07, + "logits/chosen": -0.7549240589141846, + "logits/rejected": -0.5772415399551392, + "logps/chosen": -0.04872463271021843, + "logps/rejected": -5.993658065795898, + "loss": 0.0415, + "odds_ratio_loss": 0.003810073481872678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0048724631778895855, + "rewards/margins": 0.5944933891296387, + "rewards/rejected": -0.5993658304214478, + "sft_loss": 0.04872463271021843, + "step": 3786 + }, + { + "epoch": 5.476500361532899, + "grad_norm": 1.323116620766381, + "learning_rate": 1.5440199185873294e-07, + "logits/chosen": -0.9334354996681213, + "logits/rejected": -0.9096828103065491, + "logps/chosen": -0.049010857939720154, + "logps/rejected": -4.4386420249938965, + "loss": 0.0353, + "odds_ratio_loss": 0.0016335193067789078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00490108598023653, + "rewards/margins": 0.4389631450176239, + "rewards/rejected": -0.44386419653892517, + "sft_loss": 0.049010857939720154, + "step": 3787 + }, + { + "epoch": 5.477946493130875, + "grad_norm": 1.030544588663756, + "learning_rate": 1.5354855006377565e-07, + "logits/chosen": -1.0335475206375122, + "logits/rejected": -0.6511580944061279, + "logps/chosen": -0.041097480803728104, + "logps/rejected": -5.674369812011719, + "loss": 0.0245, + "odds_ratio_loss": 0.00024172822304535657, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004109748173505068, + "rewards/margins": 0.5633272528648376, + "rewards/rejected": -0.5674369931221008, + "sft_loss": 0.041097480803728104, + "step": 3788 + }, + { + "epoch": 5.479392624728851, + "grad_norm": 1.1010779889775346, + "learning_rate": 1.526974273087238e-07, + "logits/chosen": -0.7723639011383057, + "logits/rejected": -0.5067367553710938, + "logps/chosen": -0.026965174823999405, + "logps/rejected": -5.289927959442139, + "loss": 0.0328, + "odds_ratio_loss": 0.00020186560868751258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0026965176220983267, + "rewards/margins": 0.5262963175773621, + "rewards/rejected": -0.5289928317070007, + "sft_loss": 0.026965174823999405, + "step": 3789 + }, + { + "epoch": 5.4808387563268255, + "grad_norm": 1.0022160795883452, + "learning_rate": 1.518486241067216e-07, + "logits/chosen": -0.8188205361366272, + "logits/rejected": -0.5412254333496094, + "logps/chosen": -0.030381806194782257, + "logps/rejected": -5.232437610626221, + "loss": 0.0318, + "odds_ratio_loss": 0.0006156917079351842, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030381809920072556, + "rewards/margins": 0.520205557346344, + "rewards/rejected": -0.52324378490448, + "sft_loss": 0.030381806194782257, + "step": 3790 + }, + { + "epoch": 5.482284887924801, + "grad_norm": 0.8690394603358813, + "learning_rate": 1.5100214096951658e-07, + "logits/chosen": -0.923912525177002, + "logits/rejected": -0.7971416115760803, + "logps/chosen": -0.01047974918037653, + "logps/rejected": -5.119403839111328, + "loss": 0.011, + "odds_ratio_loss": 0.0005628624348901212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0010479751508682966, + "rewards/margins": 0.510892391204834, + "rewards/rejected": -0.5119403600692749, + "sft_loss": 0.01047974918037653, + "step": 3791 + }, + { + "epoch": 5.483731019522777, + "grad_norm": 1.065241764993738, + "learning_rate": 1.5015797840745515e-07, + "logits/chosen": -1.1814494132995605, + "logits/rejected": -0.6983795166015625, + "logps/chosen": -0.02755448967218399, + "logps/rejected": -5.53033971786499, + "loss": 0.0342, + "odds_ratio_loss": 0.00043123989598825574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0027554489206522703, + "rewards/margins": 0.5502785444259644, + "rewards/rejected": -0.5530340075492859, + "sft_loss": 0.02755448967218399, + "step": 3792 + }, + { + "epoch": 5.485177151120752, + "grad_norm": 0.9422856535843215, + "learning_rate": 1.4931613692948753e-07, + "logits/chosen": -0.9157315492630005, + "logits/rejected": -0.8082464933395386, + "logps/chosen": -0.010187807492911816, + "logps/rejected": -4.607365608215332, + "loss": 0.0246, + "odds_ratio_loss": 0.001327059231698513, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0010187807492911816, + "rewards/margins": 0.4597177505493164, + "rewards/rejected": -0.4607365131378174, + "sft_loss": 0.010187807492911816, + "step": 3793 + }, + { + "epoch": 5.486623282718727, + "grad_norm": 1.2102618901021103, + "learning_rate": 1.484766170431624e-07, + "logits/chosen": -0.7651352286338806, + "logits/rejected": -0.6443876028060913, + "logps/chosen": -0.050633467733860016, + "logps/rejected": -5.098237991333008, + "loss": 0.0429, + "odds_ratio_loss": 0.001227756729349494, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005063347052782774, + "rewards/margins": 0.5047605037689209, + "rewards/rejected": -0.5098237991333008, + "sft_loss": 0.050633467733860016, + "step": 3794 + }, + { + "epoch": 5.488069414316703, + "grad_norm": 1.1833110758817549, + "learning_rate": 1.4763941925462954e-07, + "logits/chosen": -0.8539189100265503, + "logits/rejected": -0.7393923401832581, + "logps/chosen": -0.05777610465884209, + "logps/rejected": -3.9655282497406006, + "loss": 0.0452, + "odds_ratio_loss": 0.00516713410615921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005777610465884209, + "rewards/margins": 0.390775203704834, + "rewards/rejected": -0.39655283093452454, + "sft_loss": 0.05777610465884209, + "step": 3795 + }, + { + "epoch": 5.489515545914678, + "grad_norm": 1.214943791765227, + "learning_rate": 1.4680454406863763e-07, + "logits/chosen": -0.6452954411506653, + "logits/rejected": -0.5753892660140991, + "logps/chosen": -0.015782607719302177, + "logps/rejected": -5.62052059173584, + "loss": 0.0454, + "odds_ratio_loss": 0.00034531878191046417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001578260911628604, + "rewards/margins": 0.5604738593101501, + "rewards/rejected": -0.5620521306991577, + "sft_loss": 0.015782607719302177, + "step": 3796 + }, + { + "epoch": 5.4909616775126535, + "grad_norm": 1.4352775561501738, + "learning_rate": 1.4597199198853782e-07, + "logits/chosen": -1.0027751922607422, + "logits/rejected": -0.7584199905395508, + "logps/chosen": -0.02233309857547283, + "logps/rejected": -6.617671489715576, + "loss": 0.0524, + "odds_ratio_loss": 0.0024742737878113985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022333101369440556, + "rewards/margins": 0.6595338582992554, + "rewards/rejected": -0.6617671847343445, + "sft_loss": 0.02233309857547283, + "step": 3797 + }, + { + "epoch": 5.492407809110629, + "grad_norm": 1.23422188608422, + "learning_rate": 1.451417635162775e-07, + "logits/chosen": -0.8085861802101135, + "logits/rejected": -0.6014904975891113, + "logps/chosen": -0.0513208769261837, + "logps/rejected": -5.220034599304199, + "loss": 0.0471, + "odds_ratio_loss": 0.0029555868823081255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005132087506353855, + "rewards/margins": 0.5168713331222534, + "rewards/rejected": -0.5220034122467041, + "sft_loss": 0.0513208769261837, + "step": 3798 + }, + { + "epoch": 5.493853940708605, + "grad_norm": 1.1457292152366192, + "learning_rate": 1.4431385915240513e-07, + "logits/chosen": -0.862919270992279, + "logits/rejected": -0.627816379070282, + "logps/chosen": -0.03642456978559494, + "logps/rejected": -5.0828752517700195, + "loss": 0.0481, + "odds_ratio_loss": 0.003557011019438505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0036424570716917515, + "rewards/margins": 0.5046451091766357, + "rewards/rejected": -0.5082875490188599, + "sft_loss": 0.03642456978559494, + "step": 3799 + }, + { + "epoch": 5.49530007230658, + "grad_norm": 0.9247872005994776, + "learning_rate": 1.4348827939606723e-07, + "logits/chosen": -0.8906946182250977, + "logits/rejected": -0.6677629351615906, + "logps/chosen": -0.01218663901090622, + "logps/rejected": -5.341521739959717, + "loss": 0.034, + "odds_ratio_loss": 0.0004317264538258314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0012186639942228794, + "rewards/margins": 0.5329335331916809, + "rewards/rejected": -0.5341522097587585, + "sft_loss": 0.01218663901090622, + "step": 3800 + }, + { + "epoch": 5.496746203904555, + "grad_norm": 0.9295214085597644, + "learning_rate": 1.426650247450092e-07, + "logits/chosen": -0.8590705990791321, + "logits/rejected": -0.7467610836029053, + "logps/chosen": -0.012503638863563538, + "logps/rejected": -5.067593097686768, + "loss": 0.0382, + "odds_ratio_loss": 0.0009685508557595313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0012503638863563538, + "rewards/margins": 0.5055089592933655, + "rewards/rejected": -0.5067592859268188, + "sft_loss": 0.012503638863563538, + "step": 3801 + }, + { + "epoch": 5.498192335502531, + "grad_norm": 0.9849191492306798, + "learning_rate": 1.41844095695574e-07, + "logits/chosen": -1.0076994895935059, + "logits/rejected": -0.4970240294933319, + "logps/chosen": -0.03561976179480553, + "logps/rejected": -7.417174339294434, + "loss": 0.0355, + "odds_ratio_loss": 0.0002764179080259055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0035619763657450676, + "rewards/margins": 0.7381554841995239, + "rewards/rejected": -0.7417174577713013, + "sft_loss": 0.03561976179480553, + "step": 3802 + }, + { + "epoch": 5.499638467100506, + "grad_norm": 1.3091784075782222, + "learning_rate": 1.4102549274270214e-07, + "logits/chosen": -0.9715660214424133, + "logits/rejected": -0.8041377067565918, + "logps/chosen": -0.05225411802530289, + "logps/rejected": -5.449763298034668, + "loss": 0.0405, + "odds_ratio_loss": 0.004342366475611925, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005225412547588348, + "rewards/margins": 0.5397509336471558, + "rewards/rejected": -0.5449763536453247, + "sft_loss": 0.05225411802530289, + "step": 3803 + }, + { + "epoch": 5.5010845986984815, + "grad_norm": 1.0966034645413594, + "learning_rate": 1.4020921637993356e-07, + "logits/chosen": -1.114017367362976, + "logits/rejected": -0.7877224683761597, + "logps/chosen": -0.016746368259191513, + "logps/rejected": -5.36176061630249, + "loss": 0.0411, + "odds_ratio_loss": 0.00036486214958131313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016746367327868938, + "rewards/margins": 0.5345014333724976, + "rewards/rejected": -0.5361760854721069, + "sft_loss": 0.016746368259191513, + "step": 3804 + }, + { + "epoch": 5.502530730296457, + "grad_norm": 1.1649674357871873, + "learning_rate": 1.3939526709940342e-07, + "logits/chosen": -0.9118725061416626, + "logits/rejected": -0.7113963961601257, + "logps/chosen": -0.04102957993745804, + "logps/rejected": -4.159704208374023, + "loss": 0.0406, + "odds_ratio_loss": 0.0027073349338024855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004102957900613546, + "rewards/margins": 0.4118674397468567, + "rewards/rejected": -0.41597044467926025, + "sft_loss": 0.04102957993745804, + "step": 3805 + }, + { + "epoch": 5.503976861894432, + "grad_norm": 1.1222388477018694, + "learning_rate": 1.385836453918454e-07, + "logits/chosen": -0.8803320527076721, + "logits/rejected": -0.738538920879364, + "logps/chosen": -0.03088982217013836, + "logps/rejected": -3.4778146743774414, + "loss": 0.0437, + "odds_ratio_loss": 0.0015149106038734317, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030889820773154497, + "rewards/margins": 0.3446924686431885, + "rewards/rejected": -0.3477814793586731, + "sft_loss": 0.03088982217013836, + "step": 3806 + }, + { + "epoch": 5.505422993492408, + "grad_norm": 0.8878311657170987, + "learning_rate": 1.3777435174658903e-07, + "logits/chosen": -1.147339105606079, + "logits/rejected": -0.8466560244560242, + "logps/chosen": -0.02538106217980385, + "logps/rejected": -4.762484550476074, + "loss": 0.0388, + "odds_ratio_loss": 0.001554648159071803, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0025381064042448997, + "rewards/margins": 0.4737103581428528, + "rewards/rejected": -0.47624844312667847, + "sft_loss": 0.02538106217980385, + "step": 3807 + }, + { + "epoch": 5.506869125090383, + "grad_norm": 1.1476764572136273, + "learning_rate": 1.3696738665156038e-07, + "logits/chosen": -0.9432680010795593, + "logits/rejected": -0.7418482303619385, + "logps/chosen": -0.062033966183662415, + "logps/rejected": -4.3220930099487305, + "loss": 0.0452, + "odds_ratio_loss": 0.0031421349849551916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006203396245837212, + "rewards/margins": 0.42600592970848083, + "rewards/rejected": -0.432209312915802, + "sft_loss": 0.062033966183662415, + "step": 3808 + }, + { + "epoch": 5.508315256688359, + "grad_norm": 0.946154333369156, + "learning_rate": 1.361627505932823e-07, + "logits/chosen": -1.051152229309082, + "logits/rejected": -0.6539809703826904, + "logps/chosen": -0.022800182923674583, + "logps/rejected": -4.930760383605957, + "loss": 0.0418, + "odds_ratio_loss": 0.0010601935209706426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022800182923674583, + "rewards/margins": 0.4907959997653961, + "rewards/rejected": -0.49307602643966675, + "sft_loss": 0.022800182923674583, + "step": 3809 + }, + { + "epoch": 5.509761388286334, + "grad_norm": 1.1866627966271115, + "learning_rate": 1.3536044405687208e-07, + "logits/chosen": -0.6882243156433105, + "logits/rejected": -0.4875558018684387, + "logps/chosen": -0.035662174224853516, + "logps/rejected": -5.658996105194092, + "loss": 0.0525, + "odds_ratio_loss": 0.0008063373970799148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0035662176087498665, + "rewards/margins": 0.5623334646224976, + "rewards/rejected": -0.565899670124054, + "sft_loss": 0.035662174224853516, + "step": 3810 + }, + { + "epoch": 5.51120751988431, + "grad_norm": 0.9004650888610211, + "learning_rate": 1.3456046752604323e-07, + "logits/chosen": -1.0605201721191406, + "logits/rejected": -0.5877838134765625, + "logps/chosen": -0.010158369317650795, + "logps/rejected": -6.685586929321289, + "loss": 0.0362, + "odds_ratio_loss": 0.0003088012454099953, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0010158369550481439, + "rewards/margins": 0.6675429344177246, + "rewards/rejected": -0.6685587167739868, + "sft_loss": 0.010158369317650795, + "step": 3811 + }, + { + "epoch": 5.512653651482285, + "grad_norm": 1.1943879093949175, + "learning_rate": 1.3376282148310457e-07, + "logits/chosen": -0.8050583600997925, + "logits/rejected": -0.7147977948188782, + "logps/chosen": -0.01518384087830782, + "logps/rejected": -6.681069374084473, + "loss": 0.0403, + "odds_ratio_loss": 0.0011472441256046295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001518384087830782, + "rewards/margins": 0.666588544845581, + "rewards/rejected": -0.6681069731712341, + "sft_loss": 0.01518384087830782, + "step": 3812 + }, + { + "epoch": 5.51409978308026, + "grad_norm": 1.362313364484537, + "learning_rate": 1.3296750640896126e-07, + "logits/chosen": -1.0888941287994385, + "logits/rejected": -0.8052299618721008, + "logps/chosen": -0.03261774033308029, + "logps/rejected": -5.1450724601745605, + "loss": 0.0468, + "odds_ratio_loss": 0.0014005877310410142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0032617738470435143, + "rewards/margins": 0.5112454295158386, + "rewards/rejected": -0.5145072937011719, + "sft_loss": 0.03261774033308029, + "step": 3813 + }, + { + "epoch": 5.515545914678236, + "grad_norm": 1.286476278575021, + "learning_rate": 1.3217452278311014e-07, + "logits/chosen": -0.7903501391410828, + "logits/rejected": -0.6983763575553894, + "logps/chosen": -0.01914118230342865, + "logps/rejected": -5.037604331970215, + "loss": 0.0488, + "odds_ratio_loss": 0.0035344092175364494, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019141181837767363, + "rewards/margins": 0.5018463730812073, + "rewards/rejected": -0.5037604570388794, + "sft_loss": 0.01914118230342865, + "step": 3814 + }, + { + "epoch": 5.516992046276211, + "grad_norm": 1.2606595706769816, + "learning_rate": 1.3138387108364478e-07, + "logits/chosen": -1.197737216949463, + "logits/rejected": -0.8296566009521484, + "logps/chosen": -0.052802085876464844, + "logps/rejected": -4.593600749969482, + "loss": 0.0515, + "odds_ratio_loss": 0.003228149376809597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005280209239572287, + "rewards/margins": 0.45407986640930176, + "rewards/rejected": -0.4593600630760193, + "sft_loss": 0.052802085876464844, + "step": 3815 + }, + { + "epoch": 5.518438177874186, + "grad_norm": 1.322864716916177, + "learning_rate": 1.3059555178725145e-07, + "logits/chosen": -0.8976123929023743, + "logits/rejected": -0.626125156879425, + "logps/chosen": -0.020292531698942184, + "logps/rejected": -5.278232574462891, + "loss": 0.0306, + "odds_ratio_loss": 0.0016008391976356506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002029252937063575, + "rewards/margins": 0.5257940292358398, + "rewards/rejected": -0.527823269367218, + "sft_loss": 0.020292531698942184, + "step": 3816 + }, + { + "epoch": 5.519884309472162, + "grad_norm": 1.1411805522730452, + "learning_rate": 1.2980956536921217e-07, + "logits/chosen": -0.8470653295516968, + "logits/rejected": -0.7692535519599915, + "logps/chosen": -0.036463767290115356, + "logps/rejected": -5.199902534484863, + "loss": 0.0409, + "odds_ratio_loss": 0.0013247218448668718, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0036463765427470207, + "rewards/margins": 0.516343891620636, + "rewards/rejected": -0.5199902653694153, + "sft_loss": 0.036463767290115356, + "step": 3817 + }, + { + "epoch": 5.521330441070138, + "grad_norm": 1.1491703460727254, + "learning_rate": 1.2902591230339897e-07, + "logits/chosen": -0.9239856600761414, + "logits/rejected": -0.7358173727989197, + "logps/chosen": -0.014124940149486065, + "logps/rejected": -5.4396443367004395, + "loss": 0.0579, + "odds_ratio_loss": 0.0010359040461480618, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001412494108080864, + "rewards/margins": 0.5425519347190857, + "rewards/rejected": -0.5439644455909729, + "sft_loss": 0.014124940149486065, + "step": 3818 + }, + { + "epoch": 5.522776572668112, + "grad_norm": 1.3151003884044896, + "learning_rate": 1.2824459306228064e-07, + "logits/chosen": -0.6562463045120239, + "logits/rejected": -0.5251370072364807, + "logps/chosen": -0.04931947961449623, + "logps/rejected": -4.668588638305664, + "loss": 0.0453, + "odds_ratio_loss": 0.002963886596262455, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004931948147714138, + "rewards/margins": 0.4619269371032715, + "rewards/rejected": -0.4668588638305664, + "sft_loss": 0.04931947961449623, + "step": 3819 + }, + { + "epoch": 5.524222704266088, + "grad_norm": 0.9885430023773936, + "learning_rate": 1.2746560811691674e-07, + "logits/chosen": -1.072084903717041, + "logits/rejected": -0.89268958568573, + "logps/chosen": -0.03535031899809837, + "logps/rejected": -6.679300785064697, + "loss": 0.04, + "odds_ratio_loss": 0.0026512041222304106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003535032505169511, + "rewards/margins": 0.6643950939178467, + "rewards/rejected": -0.6679300665855408, + "sft_loss": 0.03535031899809837, + "step": 3820 + }, + { + "epoch": 5.525668835864064, + "grad_norm": 1.061112092644839, + "learning_rate": 1.2668895793696144e-07, + "logits/chosen": -1.0231108665466309, + "logits/rejected": -0.6869786977767944, + "logps/chosen": -0.02733795717358589, + "logps/rejected": -6.306743621826172, + "loss": 0.0235, + "odds_ratio_loss": 0.000864526373334229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002733795903623104, + "rewards/margins": 0.6279405355453491, + "rewards/rejected": -0.6306743621826172, + "sft_loss": 0.02733795717358589, + "step": 3821 + }, + { + "epoch": 5.527114967462039, + "grad_norm": 1.308480386878777, + "learning_rate": 1.2591464299065834e-07, + "logits/chosen": -0.8179947137832642, + "logits/rejected": -0.6588776111602783, + "logps/chosen": -0.06917057186365128, + "logps/rejected": -5.604111671447754, + "loss": 0.0651, + "odds_ratio_loss": 0.0042171357199549675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006917057558894157, + "rewards/margins": 0.5534940958023071, + "rewards/rejected": -0.5604111552238464, + "sft_loss": 0.06917057186365128, + "step": 3822 + }, + { + "epoch": 5.528561099060014, + "grad_norm": 1.155715414947543, + "learning_rate": 1.2514266374484606e-07, + "logits/chosen": -0.9295008182525635, + "logits/rejected": -0.6454451084136963, + "logps/chosen": -0.03257459029555321, + "logps/rejected": -6.534205436706543, + "loss": 0.0209, + "odds_ratio_loss": 0.0021665338426828384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0032574590295553207, + "rewards/margins": 0.6501630544662476, + "rewards/rejected": -0.6534205079078674, + "sft_loss": 0.03257459029555321, + "step": 3823 + }, + { + "epoch": 5.53000723065799, + "grad_norm": 0.9684929471760406, + "learning_rate": 1.243730206649527e-07, + "logits/chosen": -1.0152008533477783, + "logits/rejected": -0.8463742733001709, + "logps/chosen": -0.026521919295191765, + "logps/rejected": -4.2680816650390625, + "loss": 0.0446, + "odds_ratio_loss": 0.002520361915230751, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002652192022651434, + "rewards/margins": 0.42415598034858704, + "rewards/rejected": -0.4268081784248352, + "sft_loss": 0.026521919295191765, + "step": 3824 + }, + { + "epoch": 5.531453362255966, + "grad_norm": 1.1373144870061254, + "learning_rate": 1.2360571421500044e-07, + "logits/chosen": -0.9026045799255371, + "logits/rejected": -0.739682674407959, + "logps/chosen": -0.033317435532808304, + "logps/rejected": -4.749547481536865, + "loss": 0.039, + "odds_ratio_loss": 0.0017841738881543279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003331744112074375, + "rewards/margins": 0.471623033285141, + "rewards/rejected": -0.4749547839164734, + "sft_loss": 0.033317435532808304, + "step": 3825 + }, + { + "epoch": 5.53289949385394, + "grad_norm": 0.9615381215348964, + "learning_rate": 1.2284074485760009e-07, + "logits/chosen": -0.8409866094589233, + "logits/rejected": -0.6542403697967529, + "logps/chosen": -0.0341576412320137, + "logps/rejected": -4.307783126831055, + "loss": 0.0338, + "odds_ratio_loss": 0.00148858898319304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003415764309465885, + "rewards/margins": 0.4273625612258911, + "rewards/rejected": -0.4307783246040344, + "sft_loss": 0.0341576412320137, + "step": 3826 + }, + { + "epoch": 5.534345625451916, + "grad_norm": 1.1509488481197494, + "learning_rate": 1.2207811305395388e-07, + "logits/chosen": -0.8256188035011292, + "logits/rejected": -0.8999884128570557, + "logps/chosen": -0.03968646377325058, + "logps/rejected": -4.411059379577637, + "loss": 0.0258, + "odds_ratio_loss": 0.0018667414551600814, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0039686462841928005, + "rewards/margins": 0.437137246131897, + "rewards/rejected": -0.4411059319972992, + "sft_loss": 0.03968646377325058, + "step": 3827 + }, + { + "epoch": 5.535791757049892, + "grad_norm": 0.9584637218026467, + "learning_rate": 1.213178192638571e-07, + "logits/chosen": -0.7055195569992065, + "logits/rejected": -0.49205851554870605, + "logps/chosen": -0.04943579062819481, + "logps/rejected": -6.112897872924805, + "loss": 0.0412, + "odds_ratio_loss": 0.0011792955920100212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004943579435348511, + "rewards/margins": 0.6063462495803833, + "rewards/rejected": -0.6112898588180542, + "sft_loss": 0.04943579062819481, + "step": 3828 + }, + { + "epoch": 5.537237888647867, + "grad_norm": 1.1558628450281507, + "learning_rate": 1.205598639456924e-07, + "logits/chosen": -1.1022794246673584, + "logits/rejected": -0.8147293329238892, + "logps/chosen": -0.05759043246507645, + "logps/rejected": -5.212984085083008, + "loss": 0.0574, + "odds_ratio_loss": 0.0012822567950934172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005759043153375387, + "rewards/margins": 0.5155394077301025, + "rewards/rejected": -0.5212984085083008, + "sft_loss": 0.05759043246507645, + "step": 3829 + }, + { + "epoch": 5.538684020245842, + "grad_norm": 1.1812263848411928, + "learning_rate": 1.198042475564347e-07, + "logits/chosen": -0.9783557653427124, + "logits/rejected": -0.8160287141799927, + "logps/chosen": -0.09639565646648407, + "logps/rejected": -7.522600173950195, + "loss": 0.0447, + "odds_ratio_loss": 0.008017424494028091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009639564901590347, + "rewards/margins": 0.7426204681396484, + "rewards/rejected": -0.7522600293159485, + "sft_loss": 0.09639565646648407, + "step": 3830 + }, + { + "epoch": 5.540130151843818, + "grad_norm": 1.2066995822125324, + "learning_rate": 1.1905097055164714e-07, + "logits/chosen": -0.8420087099075317, + "logits/rejected": -0.6522636413574219, + "logps/chosen": -0.03687209263443947, + "logps/rejected": -3.5625946521759033, + "loss": 0.0414, + "odds_ratio_loss": 0.0023647851776331663, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0036872094497084618, + "rewards/margins": 0.3525722324848175, + "rewards/rejected": -0.3562594950199127, + "sft_loss": 0.03687209263443947, + "step": 3831 + }, + { + "epoch": 5.541576283441794, + "grad_norm": 0.8654303496060788, + "learning_rate": 1.1830003338548423e-07, + "logits/chosen": -1.0921556949615479, + "logits/rejected": -0.850192129611969, + "logps/chosen": -0.013794208876788616, + "logps/rejected": -6.353579521179199, + "loss": 0.027, + "odds_ratio_loss": 0.001722605200484395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001379420980811119, + "rewards/margins": 0.6339784860610962, + "rewards/rejected": -0.6353579163551331, + "sft_loss": 0.013794208876788616, + "step": 3832 + }, + { + "epoch": 5.5430224150397684, + "grad_norm": 0.977867382746297, + "learning_rate": 1.1755143651068822e-07, + "logits/chosen": -0.7232560515403748, + "logits/rejected": -0.6421326398849487, + "logps/chosen": -0.03204415738582611, + "logps/rejected": -4.54522705078125, + "loss": 0.0241, + "odds_ratio_loss": 0.005562347825616598, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003204415552318096, + "rewards/margins": 0.45131826400756836, + "rewards/rejected": -0.45452266931533813, + "sft_loss": 0.03204415738582611, + "step": 3833 + }, + { + "epoch": 5.544468546637744, + "grad_norm": 1.0993207812461236, + "learning_rate": 1.1680518037859054e-07, + "logits/chosen": -0.9139794111251831, + "logits/rejected": -0.622349202632904, + "logps/chosen": -0.04837263375520706, + "logps/rejected": -7.495334625244141, + "loss": 0.0411, + "odds_ratio_loss": 0.005845530424267054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004837263375520706, + "rewards/margins": 0.7446962594985962, + "rewards/rejected": -0.7495335340499878, + "sft_loss": 0.04837263375520706, + "step": 3834 + }, + { + "epoch": 5.54591467823572, + "grad_norm": 1.089833611551456, + "learning_rate": 1.1606126543911177e-07, + "logits/chosen": -0.9266623854637146, + "logits/rejected": -0.5840288400650024, + "logps/chosen": -0.028001118451356888, + "logps/rejected": -7.177366733551025, + "loss": 0.0306, + "odds_ratio_loss": 0.0006162969511933625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0028001118917018175, + "rewards/margins": 0.7149365544319153, + "rewards/rejected": -0.7177366018295288, + "sft_loss": 0.028001118451356888, + "step": 3835 + }, + { + "epoch": 5.547360809833695, + "grad_norm": 1.204977731108127, + "learning_rate": 1.1531969214076198e-07, + "logits/chosen": -0.8889631628990173, + "logits/rejected": -0.7671388387680054, + "logps/chosen": -0.04466234892606735, + "logps/rejected": -3.038712501525879, + "loss": 0.0505, + "odds_ratio_loss": 0.0071504805237054825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00446623470634222, + "rewards/margins": 0.299405038356781, + "rewards/rejected": -0.3038712739944458, + "sft_loss": 0.04466234892606735, + "step": 3836 + }, + { + "epoch": 5.54880694143167, + "grad_norm": 1.7867119071684434, + "learning_rate": 1.1458046093063733e-07, + "logits/chosen": -1.1271165609359741, + "logits/rejected": -0.827365517616272, + "logps/chosen": -0.00812723208218813, + "logps/rejected": -5.488351821899414, + "loss": 0.0466, + "odds_ratio_loss": 0.0012041174340993166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0008127232431434095, + "rewards/margins": 0.548022449016571, + "rewards/rejected": -0.5488352179527283, + "sft_loss": 0.00812723208218813, + "step": 3837 + }, + { + "epoch": 5.550253073029646, + "grad_norm": 1.1182672956514397, + "learning_rate": 1.1384357225442398e-07, + "logits/chosen": -0.706390380859375, + "logits/rejected": -0.6600760817527771, + "logps/chosen": -0.03536364808678627, + "logps/rejected": -6.150212287902832, + "loss": 0.0308, + "odds_ratio_loss": 0.001387185649946332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003536364994943142, + "rewards/margins": 0.611484944820404, + "rewards/rejected": -0.615021288394928, + "sft_loss": 0.03536364808678627, + "step": 3838 + }, + { + "epoch": 5.551699204627621, + "grad_norm": 1.0491739599778058, + "learning_rate": 1.1310902655639454e-07, + "logits/chosen": -0.7480611801147461, + "logits/rejected": -0.7505444288253784, + "logps/chosen": -0.03836183249950409, + "logps/rejected": -4.293073654174805, + "loss": 0.0509, + "odds_ratio_loss": 0.0020930215250700712, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0038361833430826664, + "rewards/margins": 0.4254711866378784, + "rewards/rejected": -0.42930734157562256, + "sft_loss": 0.03836183249950409, + "step": 3839 + }, + { + "epoch": 5.5531453362255965, + "grad_norm": 1.2106392513909316, + "learning_rate": 1.1237682427940942e-07, + "logits/chosen": -0.8354946374893188, + "logits/rejected": -0.564354658126831, + "logps/chosen": -0.042547404766082764, + "logps/rejected": -5.4656572341918945, + "loss": 0.0358, + "odds_ratio_loss": 0.0024263339582830667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004254741128534079, + "rewards/margins": 0.5423109531402588, + "rewards/rejected": -0.5465657114982605, + "sft_loss": 0.042547404766082764, + "step": 3840 + }, + { + "epoch": 5.554591467823572, + "grad_norm": 1.115882395839301, + "learning_rate": 1.1164696586491639e-07, + "logits/chosen": -0.7860689163208008, + "logits/rejected": -0.60820472240448, + "logps/chosen": -0.038941726088523865, + "logps/rejected": -5.316815376281738, + "loss": 0.0391, + "odds_ratio_loss": 0.005218683276325464, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00389417284168303, + "rewards/margins": 0.5277873873710632, + "rewards/rejected": -0.5316815376281738, + "sft_loss": 0.038941726088523865, + "step": 3841 + }, + { + "epoch": 5.556037599421547, + "grad_norm": 0.8934107860559392, + "learning_rate": 1.1091945175294836e-07, + "logits/chosen": -0.8913710713386536, + "logits/rejected": -0.8013601303100586, + "logps/chosen": -0.009748661890625954, + "logps/rejected": -4.847626686096191, + "loss": 0.0175, + "odds_ratio_loss": 0.0006343543063849211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0009748662123456597, + "rewards/margins": 0.4837878346443176, + "rewards/rejected": -0.48476266860961914, + "sft_loss": 0.009748661890625954, + "step": 3842 + }, + { + "epoch": 5.557483731019523, + "grad_norm": 1.15693764012382, + "learning_rate": 1.1019428238212825e-07, + "logits/chosen": -0.779668927192688, + "logits/rejected": -0.6887148022651672, + "logps/chosen": -0.05625636503100395, + "logps/rejected": -5.346826076507568, + "loss": 0.0428, + "odds_ratio_loss": 0.007378511130809784, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005625636782497168, + "rewards/margins": 0.5290569067001343, + "rewards/rejected": -0.53468257188797, + "sft_loss": 0.05625636503100395, + "step": 3843 + }, + { + "epoch": 5.558929862617498, + "grad_norm": 1.0916272154464464, + "learning_rate": 1.0947145818966186e-07, + "logits/chosen": -0.8776683807373047, + "logits/rejected": -0.6630545854568481, + "logps/chosen": -0.01515410840511322, + "logps/rejected": -6.905978679656982, + "loss": 0.0341, + "odds_ratio_loss": 0.0017984689911827445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001515410840511322, + "rewards/margins": 0.6890825033187866, + "rewards/rejected": -0.6905978918075562, + "sft_loss": 0.01515410840511322, + "step": 3844 + }, + { + "epoch": 5.560375994215473, + "grad_norm": 1.26311105122681, + "learning_rate": 1.0875097961134372e-07, + "logits/chosen": -0.8992950916290283, + "logits/rejected": -0.7164445519447327, + "logps/chosen": -0.03995127975940704, + "logps/rejected": -3.731813430786133, + "loss": 0.0421, + "odds_ratio_loss": 0.004833152983337641, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003995127510279417, + "rewards/margins": 0.3691861927509308, + "rewards/rejected": -0.3731813430786133, + "sft_loss": 0.03995127975940704, + "step": 3845 + }, + { + "epoch": 5.561822125813449, + "grad_norm": 0.9533168885486478, + "learning_rate": 1.0803284708155213e-07, + "logits/chosen": -0.8431378602981567, + "logits/rejected": -0.6289638876914978, + "logps/chosen": -0.050756603479385376, + "logps/rejected": -4.342966079711914, + "loss": 0.0278, + "odds_ratio_loss": 0.004125285893678665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0050756605342030525, + "rewards/margins": 0.42922091484069824, + "rewards/rejected": -0.4342966079711914, + "sft_loss": 0.050756603479385376, + "step": 3846 + }, + { + "epoch": 5.5632682574114245, + "grad_norm": 1.1297521726341693, + "learning_rate": 1.0731706103325233e-07, + "logits/chosen": -1.105302095413208, + "logits/rejected": -0.6120625138282776, + "logps/chosen": -0.022785475477576256, + "logps/rejected": -6.2301530838012695, + "loss": 0.036, + "odds_ratio_loss": 0.0008173306123353541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022785477340221405, + "rewards/margins": 0.6207367777824402, + "rewards/rejected": -0.623015284538269, + "sft_loss": 0.022785475477576256, + "step": 3847 + }, + { + "epoch": 5.5647143890094, + "grad_norm": 1.0183290147990447, + "learning_rate": 1.0660362189799465e-07, + "logits/chosen": -0.8919758796691895, + "logits/rejected": -0.7195218801498413, + "logps/chosen": -0.03661835193634033, + "logps/rejected": -5.3943705558776855, + "loss": 0.0272, + "odds_ratio_loss": 0.0016591616440564394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003661835566163063, + "rewards/margins": 0.5357752442359924, + "rewards/rejected": -0.5394370555877686, + "sft_loss": 0.03661835193634033, + "step": 3848 + }, + { + "epoch": 5.566160520607375, + "grad_norm": 1.0784389721266068, + "learning_rate": 1.058925301059137e-07, + "logits/chosen": -0.8064311742782593, + "logits/rejected": -0.6183796525001526, + "logps/chosen": -0.039033301174640656, + "logps/rejected": -4.795423984527588, + "loss": 0.0287, + "odds_ratio_loss": 0.0012832069769501686, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0039033303037285805, + "rewards/margins": 0.47563910484313965, + "rewards/rejected": -0.47954240441322327, + "sft_loss": 0.039033301174640656, + "step": 3849 + }, + { + "epoch": 5.567606652205351, + "grad_norm": 0.8967336618921115, + "learning_rate": 1.0518378608572964e-07, + "logits/chosen": -1.0078152418136597, + "logits/rejected": -0.7482145428657532, + "logps/chosen": -0.015405582264065742, + "logps/rejected": -3.899319648742676, + "loss": 0.0197, + "odds_ratio_loss": 0.0012769848108291626, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0015405581798404455, + "rewards/margins": 0.3883914053440094, + "rewards/rejected": -0.38993197679519653, + "sft_loss": 0.015405582264065742, + "step": 3850 + }, + { + "epoch": 5.569052783803326, + "grad_norm": 1.1995841884111325, + "learning_rate": 1.0447739026474645e-07, + "logits/chosen": -0.6040602922439575, + "logits/rejected": -0.5330982208251953, + "logps/chosen": -0.04164326936006546, + "logps/rejected": -5.938526153564453, + "loss": 0.0619, + "odds_ratio_loss": 0.0008817025227472186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004164327401667833, + "rewards/margins": 0.5896883606910706, + "rewards/rejected": -0.593852698802948, + "sft_loss": 0.04164326936006546, + "step": 3851 + }, + { + "epoch": 5.570498915401301, + "grad_norm": 0.9183906800328296, + "learning_rate": 1.0377334306885322e-07, + "logits/chosen": -0.6867368817329407, + "logits/rejected": -0.7344608902931213, + "logps/chosen": -0.058505572378635406, + "logps/rejected": -5.076149940490723, + "loss": 0.0417, + "odds_ratio_loss": 0.013662266544997692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005850557237863541, + "rewards/margins": 0.5017644762992859, + "rewards/rejected": -0.5076150298118591, + "sft_loss": 0.058505572378635406, + "step": 3852 + }, + { + "epoch": 5.571945046999277, + "grad_norm": 1.1114949751042007, + "learning_rate": 1.030716449225224e-07, + "logits/chosen": -0.8221277594566345, + "logits/rejected": -0.6434429287910461, + "logps/chosen": -0.027723146602511406, + "logps/rejected": -4.452295303344727, + "loss": 0.0497, + "odds_ratio_loss": 0.045040663331747055, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0027723144739866257, + "rewards/margins": 0.4424571990966797, + "rewards/rejected": -0.44522953033447266, + "sft_loss": 0.027723146602511406, + "step": 3853 + }, + { + "epoch": 5.5733911785972525, + "grad_norm": 1.0600027055877603, + "learning_rate": 1.0237229624881116e-07, + "logits/chosen": -0.8413448929786682, + "logits/rejected": -0.5777408480644226, + "logps/chosen": -0.023316072300076485, + "logps/rejected": -5.208468437194824, + "loss": 0.0336, + "odds_ratio_loss": 0.001969376113265753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002331607509404421, + "rewards/margins": 0.5185152292251587, + "rewards/rejected": -0.5208468437194824, + "sft_loss": 0.023316072300076485, + "step": 3854 + }, + { + "epoch": 5.574837310195228, + "grad_norm": 0.8143625402129068, + "learning_rate": 1.0167529746935866e-07, + "logits/chosen": -0.8654884099960327, + "logits/rejected": -0.5924917459487915, + "logps/chosen": -0.011484618298709393, + "logps/rejected": -8.372770309448242, + "loss": 0.0213, + "odds_ratio_loss": 0.001312417909502983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0011484617134556174, + "rewards/margins": 0.8361285924911499, + "rewards/rejected": -0.8372770547866821, + "sft_loss": 0.011484618298709393, + "step": 3855 + }, + { + "epoch": 5.576283441793203, + "grad_norm": 0.9756613799933704, + "learning_rate": 1.009806490043883e-07, + "logits/chosen": -0.7985130548477173, + "logits/rejected": -0.7025433778762817, + "logps/chosen": -0.027194581925868988, + "logps/rejected": -4.829598426818848, + "loss": 0.0327, + "odds_ratio_loss": 0.0021613496355712414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002719458192586899, + "rewards/margins": 0.4802403748035431, + "rewards/rejected": -0.4829598069190979, + "sft_loss": 0.027194581925868988, + "step": 3856 + }, + { + "epoch": 5.577729573391179, + "grad_norm": 1.3785283373997754, + "learning_rate": 1.0028835127270552e-07, + "logits/chosen": -1.0098246335983276, + "logits/rejected": -0.864977240562439, + "logps/chosen": -0.018646353855729103, + "logps/rejected": -4.30742073059082, + "loss": 0.0339, + "odds_ratio_loss": 0.0010222363052889705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018646355019882321, + "rewards/margins": 0.42887741327285767, + "rewards/rejected": -0.4307420551776886, + "sft_loss": 0.018646353855729103, + "step": 3857 + }, + { + "epoch": 5.579175704989154, + "grad_norm": 0.9558186473549585, + "learning_rate": 9.959840469170044e-08, + "logits/chosen": -0.9542050957679749, + "logits/rejected": -0.6801411509513855, + "logps/chosen": -0.017612973228096962, + "logps/rejected": -5.700991630554199, + "loss": 0.0347, + "odds_ratio_loss": 0.001063553267158568, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017612973460927606, + "rewards/margins": 0.5683378577232361, + "rewards/rejected": -0.5700991153717041, + "sft_loss": 0.017612973228096962, + "step": 3858 + }, + { + "epoch": 5.580621836587129, + "grad_norm": 1.1468823365770084, + "learning_rate": 9.891080967734345e-08, + "logits/chosen": -1.0696161985397339, + "logits/rejected": -0.7148972153663635, + "logps/chosen": -0.03979545831680298, + "logps/rejected": -6.093506813049316, + "loss": 0.0352, + "odds_ratio_loss": 0.0021976944990456104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003979546017944813, + "rewards/margins": 0.6053711771965027, + "rewards/rejected": -0.6093507409095764, + "sft_loss": 0.03979545831680298, + "step": 3859 + }, + { + "epoch": 5.582067968185105, + "grad_norm": 1.0516361506414187, + "learning_rate": 9.822556664418913e-08, + "logits/chosen": -0.9046825766563416, + "logits/rejected": -0.6393083333969116, + "logps/chosen": -0.059828951954841614, + "logps/rejected": -4.932526111602783, + "loss": 0.0438, + "odds_ratio_loss": 0.004070539027452469, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005982895381748676, + "rewards/margins": 0.4872697591781616, + "rewards/rejected": -0.49325263500213623, + "sft_loss": 0.059828951954841614, + "step": 3860 + }, + { + "epoch": 5.5835140997830806, + "grad_norm": 1.1184859002794636, + "learning_rate": 9.754267600537148e-08, + "logits/chosen": -1.1668930053710938, + "logits/rejected": -0.8483262658119202, + "logps/chosen": -0.07912689447402954, + "logps/rejected": -4.509005546569824, + "loss": 0.0481, + "odds_ratio_loss": 0.003705930430442095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00791268888860941, + "rewards/margins": 0.44298791885375977, + "rewards/rejected": -0.4509005546569824, + "sft_loss": 0.07912689447402954, + "step": 3861 + }, + { + "epoch": 5.584960231381055, + "grad_norm": 1.0245326939879393, + "learning_rate": 9.686213817260957e-08, + "logits/chosen": -0.8007428646087646, + "logits/rejected": -0.7032575011253357, + "logps/chosen": -0.024395223706960678, + "logps/rejected": -5.995718002319336, + "loss": 0.0308, + "odds_ratio_loss": 0.0045672086998820305, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0024395224172621965, + "rewards/margins": 0.5971323251724243, + "rewards/rejected": -0.5995718240737915, + "sft_loss": 0.024395223706960678, + "step": 3862 + }, + { + "epoch": 5.586406362979031, + "grad_norm": 0.792686060002036, + "learning_rate": 9.618395355620146e-08, + "logits/chosen": -0.9847527742385864, + "logits/rejected": -0.9884693622589111, + "logps/chosen": -0.016595035791397095, + "logps/rejected": -4.327853202819824, + "loss": 0.0219, + "odds_ratio_loss": 0.001219768775627017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001659503672271967, + "rewards/margins": 0.43112584948539734, + "rewards/rejected": -0.4327853322029114, + "sft_loss": 0.016595035791397095, + "step": 3863 + }, + { + "epoch": 5.587852494577007, + "grad_norm": 1.0262122915001675, + "learning_rate": 9.550812256502671e-08, + "logits/chosen": -0.7402524948120117, + "logits/rejected": -0.650458037853241, + "logps/chosen": -0.05518195778131485, + "logps/rejected": -6.806795120239258, + "loss": 0.0316, + "odds_ratio_loss": 0.005077338311821222, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0055181956849992275, + "rewards/margins": 0.6751613020896912, + "rewards/rejected": -0.6806795597076416, + "sft_loss": 0.05518195778131485, + "step": 3864 + }, + { + "epoch": 5.5892986261749815, + "grad_norm": 0.9213171627134767, + "learning_rate": 9.483464560654653e-08, + "logits/chosen": -0.7630059719085693, + "logits/rejected": -0.5672922730445862, + "logps/chosen": -0.02237100899219513, + "logps/rejected": -5.245082855224609, + "loss": 0.0263, + "odds_ratio_loss": 0.0012939394218847156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002237100852653384, + "rewards/margins": 0.5222712159156799, + "rewards/rejected": -0.5245083570480347, + "sft_loss": 0.02237100899219513, + "step": 3865 + }, + { + "epoch": 5.590744757772957, + "grad_norm": 0.9947073685141449, + "learning_rate": 9.41635230868032e-08, + "logits/chosen": -0.8050935864448547, + "logits/rejected": -0.6179883480072021, + "logps/chosen": -0.02250969037413597, + "logps/rejected": -5.135504722595215, + "loss": 0.0357, + "odds_ratio_loss": 0.0018040683353319764, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022509689442813396, + "rewards/margins": 0.5112994909286499, + "rewards/rejected": -0.5135504603385925, + "sft_loss": 0.02250969037413597, + "step": 3866 + }, + { + "epoch": 5.592190889370933, + "grad_norm": 1.0721662892607096, + "learning_rate": 9.349475541041885e-08, + "logits/chosen": -1.047861099243164, + "logits/rejected": -0.7601584196090698, + "logps/chosen": -0.07620863616466522, + "logps/rejected": -5.097728252410889, + "loss": 0.0393, + "odds_ratio_loss": 0.003793524345383048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007620863616466522, + "rewards/margins": 0.5021519660949707, + "rewards/rejected": -0.5097728371620178, + "sft_loss": 0.07620863616466522, + "step": 3867 + }, + { + "epoch": 5.593637020968908, + "grad_norm": 1.2066394169606451, + "learning_rate": 9.282834298059539e-08, + "logits/chosen": -0.8130267858505249, + "logits/rejected": -0.7708932161331177, + "logps/chosen": -0.016214922070503235, + "logps/rejected": -5.153808116912842, + "loss": 0.0443, + "odds_ratio_loss": 0.0012468149652704597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016214922070503235, + "rewards/margins": 0.5137593150138855, + "rewards/rejected": -0.5153807997703552, + "sft_loss": 0.016214922070503235, + "step": 3868 + }, + { + "epoch": 5.595083152566883, + "grad_norm": 1.157611327012197, + "learning_rate": 9.216428619911587e-08, + "logits/chosen": -0.827752411365509, + "logits/rejected": -0.6157627701759338, + "logps/chosen": -0.031848832964897156, + "logps/rejected": -5.636829376220703, + "loss": 0.04, + "odds_ratio_loss": 0.0006597494357265532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003184883389621973, + "rewards/margins": 0.5604981184005737, + "rewards/rejected": -0.5636829733848572, + "sft_loss": 0.031848832964897156, + "step": 3869 + }, + { + "epoch": 5.596529284164859, + "grad_norm": 1.440751755555854, + "learning_rate": 9.150258546634271e-08, + "logits/chosen": -0.6490340232849121, + "logits/rejected": -0.48528796434402466, + "logps/chosen": -0.02013351395726204, + "logps/rejected": -6.363707542419434, + "loss": 0.0612, + "odds_ratio_loss": 0.0006388231995515525, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0020133513025939465, + "rewards/margins": 0.6343573927879333, + "rewards/rejected": -0.6363707780838013, + "sft_loss": 0.02013351395726204, + "step": 3870 + }, + { + "epoch": 5.597975415762835, + "grad_norm": 0.9224358342142104, + "learning_rate": 9.084324118121767e-08, + "logits/chosen": -0.8425424695014954, + "logits/rejected": -0.6517443060874939, + "logps/chosen": -0.012473606504499912, + "logps/rejected": -5.4249186515808105, + "loss": 0.0261, + "odds_ratio_loss": 0.0011486727744340897, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0012473606038838625, + "rewards/margins": 0.5412445068359375, + "rewards/rejected": -0.5424919128417969, + "sft_loss": 0.012473606504499912, + "step": 3871 + }, + { + "epoch": 5.5994215473608095, + "grad_norm": 1.047728691464437, + "learning_rate": 9.018625374126188e-08, + "logits/chosen": -0.787708044052124, + "logits/rejected": -0.6361563801765442, + "logps/chosen": -0.040176477283239365, + "logps/rejected": -5.250421524047852, + "loss": 0.0409, + "odds_ratio_loss": 0.0012823616852983832, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004017648287117481, + "rewards/margins": 0.5210245251655579, + "rewards/rejected": -0.5250421166419983, + "sft_loss": 0.040176477283239365, + "step": 3872 + }, + { + "epoch": 5.600867678958785, + "grad_norm": 1.005615697820088, + "learning_rate": 8.953162354257538e-08, + "logits/chosen": -0.9109957218170166, + "logits/rejected": -0.7220777273178101, + "logps/chosen": -0.017372364178299904, + "logps/rejected": -5.085519313812256, + "loss": 0.0377, + "odds_ratio_loss": 0.0010335225379094481, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001737236394546926, + "rewards/margins": 0.50681471824646, + "rewards/rejected": -0.5085519552230835, + "sft_loss": 0.017372364178299904, + "step": 3873 + }, + { + "epoch": 5.602313810556761, + "grad_norm": 0.859332407138211, + "learning_rate": 8.887935097983712e-08, + "logits/chosen": -0.8807803392410278, + "logits/rejected": -0.7796051502227783, + "logps/chosen": -0.039813119918107986, + "logps/rejected": -5.857760906219482, + "loss": 0.0247, + "odds_ratio_loss": 0.004182462580502033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003981312271207571, + "rewards/margins": 0.581794798374176, + "rewards/rejected": -0.585776150226593, + "sft_loss": 0.039813119918107986, + "step": 3874 + }, + { + "epoch": 5.603759942154736, + "grad_norm": 0.8777433120960881, + "learning_rate": 8.822943644630454e-08, + "logits/chosen": -0.9151740074157715, + "logits/rejected": -0.6227646470069885, + "logps/chosen": -0.06858855485916138, + "logps/rejected": -5.1172966957092285, + "loss": 0.0345, + "odds_ratio_loss": 0.001364107825793326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006858856417238712, + "rewards/margins": 0.5048707723617554, + "rewards/rejected": -0.5117296576499939, + "sft_loss": 0.06858855485916138, + "step": 3875 + }, + { + "epoch": 5.605206073752711, + "grad_norm": 1.226504746702496, + "learning_rate": 8.758188033381353e-08, + "logits/chosen": -0.7460124492645264, + "logits/rejected": -0.5613774061203003, + "logps/chosen": -0.05687641724944115, + "logps/rejected": -5.019678115844727, + "loss": 0.0342, + "odds_ratio_loss": 0.003194852964952588, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005687640979886055, + "rewards/margins": 0.49628016352653503, + "rewards/rejected": -0.5019677877426147, + "sft_loss": 0.05687641724944115, + "step": 3876 + }, + { + "epoch": 5.606652205350687, + "grad_norm": 1.1254751656230593, + "learning_rate": 8.69366830327789e-08, + "logits/chosen": -0.8707925081253052, + "logits/rejected": -0.6477944850921631, + "logps/chosen": -0.12705475091934204, + "logps/rejected": -3.9010202884674072, + "loss": 0.0603, + "odds_ratio_loss": 0.006006310693919659, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012705476023256779, + "rewards/margins": 0.3773965835571289, + "rewards/rejected": -0.3901020586490631, + "sft_loss": 0.12705475091934204, + "step": 3877 + }, + { + "epoch": 5.608098336948663, + "grad_norm": 0.8131598612601004, + "learning_rate": 8.629384493219128e-08, + "logits/chosen": -0.8724921345710754, + "logits/rejected": -0.7644715309143066, + "logps/chosen": -0.05918511375784874, + "logps/rejected": -4.924114227294922, + "loss": 0.0357, + "odds_ratio_loss": 0.00036744706449098885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005918511189520359, + "rewards/margins": 0.48649293184280396, + "rewards/rejected": -0.49241143465042114, + "sft_loss": 0.05918511375784874, + "step": 3878 + }, + { + "epoch": 5.609544468546638, + "grad_norm": 1.1354278291872029, + "learning_rate": 8.565336641962106e-08, + "logits/chosen": -0.7309091687202454, + "logits/rejected": -0.5149399638175964, + "logps/chosen": -0.017144229263067245, + "logps/rejected": -5.465782642364502, + "loss": 0.0358, + "odds_ratio_loss": 0.000733592314645648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001714422949589789, + "rewards/margins": 0.5448638200759888, + "rewards/rejected": -0.5465782284736633, + "sft_loss": 0.017144229263067245, + "step": 3879 + }, + { + "epoch": 5.610990600144613, + "grad_norm": 0.907687038565481, + "learning_rate": 8.501524788121494e-08, + "logits/chosen": -0.8225124478340149, + "logits/rejected": -0.456741601228714, + "logps/chosen": -0.012008003890514374, + "logps/rejected": -6.019430160522461, + "loss": 0.0395, + "odds_ratio_loss": 0.0003659829089883715, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00120080029591918, + "rewards/margins": 0.6007422208786011, + "rewards/rejected": -0.6019430160522461, + "sft_loss": 0.012008003890514374, + "step": 3880 + }, + { + "epoch": 5.612436731742589, + "grad_norm": 0.8500819053650278, + "learning_rate": 8.437948970169629e-08, + "logits/chosen": -0.8494061231613159, + "logits/rejected": -0.6348491311073303, + "logps/chosen": -0.01760895922780037, + "logps/rejected": -5.886898040771484, + "loss": 0.0273, + "odds_ratio_loss": 0.0010809004306793213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017608960624784231, + "rewards/margins": 0.5869288444519043, + "rewards/rejected": -0.5886898040771484, + "sft_loss": 0.01760895922780037, + "step": 3881 + }, + { + "epoch": 5.613882863340564, + "grad_norm": 1.107555678550244, + "learning_rate": 8.374609226436735e-08, + "logits/chosen": -1.0346620082855225, + "logits/rejected": -0.6543772220611572, + "logps/chosen": -0.04019223153591156, + "logps/rejected": -6.5853447914123535, + "loss": 0.0296, + "odds_ratio_loss": 0.0007978305802680552, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004019223153591156, + "rewards/margins": 0.654515266418457, + "rewards/rejected": -0.6585345268249512, + "sft_loss": 0.04019223153591156, + "step": 3882 + }, + { + "epoch": 5.615328994938539, + "grad_norm": 1.6594008368972768, + "learning_rate": 8.311505595110446e-08, + "logits/chosen": -0.9576680064201355, + "logits/rejected": -0.7107992172241211, + "logps/chosen": -0.03443386033177376, + "logps/rejected": -4.302804946899414, + "loss": 0.0296, + "odds_ratio_loss": 0.0024423853028565645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003443386172875762, + "rewards/margins": 0.42683711647987366, + "rewards/rejected": -0.43028050661087036, + "sft_loss": 0.03443386033177376, + "step": 3883 + }, + { + "epoch": 5.616775126536515, + "grad_norm": 0.8736460272012623, + "learning_rate": 8.248638114236283e-08, + "logits/chosen": -0.8958337306976318, + "logits/rejected": -0.6962901949882507, + "logps/chosen": -0.035607580095529556, + "logps/rejected": -4.740180015563965, + "loss": 0.0246, + "odds_ratio_loss": 0.0013252833159640431, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003560757962986827, + "rewards/margins": 0.4704572558403015, + "rewards/rejected": -0.4740179777145386, + "sft_loss": 0.035607580095529556, + "step": 3884 + }, + { + "epoch": 5.61822125813449, + "grad_norm": 1.1111572783157635, + "learning_rate": 8.186006821717173e-08, + "logits/chosen": -0.9697292447090149, + "logits/rejected": -0.8010947704315186, + "logps/chosen": -0.08695336431264877, + "logps/rejected": -5.5294318199157715, + "loss": 0.0365, + "odds_ratio_loss": 0.000782729999627918, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008695336058735847, + "rewards/margins": 0.5442478656768799, + "rewards/rejected": -0.552943229675293, + "sft_loss": 0.08695336431264877, + "step": 3885 + }, + { + "epoch": 5.619667389732466, + "grad_norm": 1.0281330550146877, + "learning_rate": 8.123611755313887e-08, + "logits/chosen": -1.1211961507797241, + "logits/rejected": -0.7908611297607422, + "logps/chosen": -0.022736605256795883, + "logps/rejected": -6.762401580810547, + "loss": 0.0396, + "odds_ratio_loss": 0.001896974048577249, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022736606188118458, + "rewards/margins": 0.6739665269851685, + "rewards/rejected": -0.6762401461601257, + "sft_loss": 0.022736605256795883, + "step": 3886 + }, + { + "epoch": 5.621113521330441, + "grad_norm": 1.2903301444306965, + "learning_rate": 8.061452952644598e-08, + "logits/chosen": -0.7981445789337158, + "logits/rejected": -0.6683976054191589, + "logps/chosen": -0.051232676953077316, + "logps/rejected": -4.502172470092773, + "loss": 0.0434, + "odds_ratio_loss": 0.0008279865724034607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005123267415910959, + "rewards/margins": 0.4450939893722534, + "rewards/rejected": -0.45021724700927734, + "sft_loss": 0.051232676953077316, + "step": 3887 + }, + { + "epoch": 5.622559652928416, + "grad_norm": 1.4460602872131916, + "learning_rate": 7.999530451185022e-08, + "logits/chosen": -0.9000306129455566, + "logits/rejected": -0.6654245853424072, + "logps/chosen": -0.06389784812927246, + "logps/rejected": -5.316483497619629, + "loss": 0.0643, + "odds_ratio_loss": 0.006937685422599316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006389784626662731, + "rewards/margins": 0.5252585411071777, + "rewards/rejected": -0.5316482782363892, + "sft_loss": 0.06389784812927246, + "step": 3888 + }, + { + "epoch": 5.624005784526392, + "grad_norm": 1.245773734697951, + "learning_rate": 7.937844288268447e-08, + "logits/chosen": -1.00962495803833, + "logits/rejected": -0.862888514995575, + "logps/chosen": -0.06227367743849754, + "logps/rejected": -5.003812789916992, + "loss": 0.0382, + "odds_ratio_loss": 0.0036341436207294464, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006227367557585239, + "rewards/margins": 0.4941539466381073, + "rewards/rejected": -0.5003812909126282, + "sft_loss": 0.06227367743849754, + "step": 3889 + }, + { + "epoch": 5.6254519161243675, + "grad_norm": 1.0056455308192838, + "learning_rate": 7.876394501085837e-08, + "logits/chosen": -0.9098703861236572, + "logits/rejected": -0.7144231796264648, + "logps/chosen": -0.07516808062791824, + "logps/rejected": -4.229522705078125, + "loss": 0.0351, + "odds_ratio_loss": 0.0012169769033789635, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007516808342188597, + "rewards/margins": 0.41543543338775635, + "rewards/rejected": -0.42295223474502563, + "sft_loss": 0.07516808062791824, + "step": 3890 + }, + { + "epoch": 5.626898047722342, + "grad_norm": 1.070424100517713, + "learning_rate": 7.815181126685332e-08, + "logits/chosen": -0.8877123594284058, + "logits/rejected": -0.5519871115684509, + "logps/chosen": -0.024476561695337296, + "logps/rejected": -5.705207824707031, + "loss": 0.0375, + "odds_ratio_loss": 0.0009460779256187379, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002447655890136957, + "rewards/margins": 0.5680731534957886, + "rewards/rejected": -0.5705208778381348, + "sft_loss": 0.024476561695337296, + "step": 3891 + }, + { + "epoch": 5.628344179320318, + "grad_norm": 1.002367017711253, + "learning_rate": 7.754204201972791e-08, + "logits/chosen": -1.0830185413360596, + "logits/rejected": -0.8066056966781616, + "logps/chosen": -0.02625204622745514, + "logps/rejected": -5.084045886993408, + "loss": 0.0426, + "odds_ratio_loss": 0.0028547344263643026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0026252048555761576, + "rewards/margins": 0.5057793855667114, + "rewards/rejected": -0.5084046125411987, + "sft_loss": 0.02625204622745514, + "step": 3892 + }, + { + "epoch": 5.629790310918294, + "grad_norm": 1.3223399901665873, + "learning_rate": 7.693463763711472e-08, + "logits/chosen": -0.7070760726928711, + "logits/rejected": -0.5678169131278992, + "logps/chosen": -0.028207141906023026, + "logps/rejected": -5.583174228668213, + "loss": 0.0541, + "odds_ratio_loss": 0.0005133537924848497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0028207143768668175, + "rewards/margins": 0.5554966330528259, + "rewards/rejected": -0.5583173632621765, + "sft_loss": 0.028207141906023026, + "step": 3893 + }, + { + "epoch": 5.631236442516269, + "grad_norm": 1.0036968645987125, + "learning_rate": 7.632959848521903e-08, + "logits/chosen": -0.7904921770095825, + "logits/rejected": -0.7324553728103638, + "logps/chosen": -0.03105638548731804, + "logps/rejected": -4.606823921203613, + "loss": 0.043, + "odds_ratio_loss": 0.0032230219803750515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003105638548731804, + "rewards/margins": 0.4575767517089844, + "rewards/rejected": -0.46068239212036133, + "sft_loss": 0.03105638548731804, + "step": 3894 + }, + { + "epoch": 5.632682574114244, + "grad_norm": 1.2934557508771576, + "learning_rate": 7.572692492882237e-08, + "logits/chosen": -1.0913989543914795, + "logits/rejected": -0.7185604572296143, + "logps/chosen": -0.06864849478006363, + "logps/rejected": -5.759372711181641, + "loss": 0.04, + "odds_ratio_loss": 0.0022043841890990734, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00686484994366765, + "rewards/margins": 0.5690724849700928, + "rewards/rejected": -0.5759373307228088, + "sft_loss": 0.06864849478006363, + "step": 3895 + }, + { + "epoch": 5.63412870571222, + "grad_norm": 0.9311986244435818, + "learning_rate": 7.512661733127723e-08, + "logits/chosen": -0.8719162940979004, + "logits/rejected": -0.5234925746917725, + "logps/chosen": -0.013395091518759727, + "logps/rejected": -7.250702857971191, + "loss": 0.0307, + "odds_ratio_loss": 0.00034357167896814644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013395091518759727, + "rewards/margins": 0.7237308025360107, + "rewards/rejected": -0.7250703573226929, + "sft_loss": 0.013395091518759727, + "step": 3896 + }, + { + "epoch": 5.6355748373101955, + "grad_norm": 1.3758621137819032, + "learning_rate": 7.452867605451318e-08, + "logits/chosen": -0.7366227507591248, + "logits/rejected": -0.581299901008606, + "logps/chosen": -0.033642686903476715, + "logps/rejected": -6.8972673416137695, + "loss": 0.0332, + "odds_ratio_loss": 0.0029167046304792166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0033642686903476715, + "rewards/margins": 0.6863625049591064, + "rewards/rejected": -0.689726710319519, + "sft_loss": 0.033642686903476715, + "step": 3897 + }, + { + "epoch": 5.63702096890817, + "grad_norm": 1.1931931524825197, + "learning_rate": 7.393310145902987e-08, + "logits/chosen": -0.883941650390625, + "logits/rejected": -0.7122170925140381, + "logps/chosen": -0.038125600665807724, + "logps/rejected": -5.787795066833496, + "loss": 0.0342, + "odds_ratio_loss": 0.0028251700568944216, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0038125598803162575, + "rewards/margins": 0.5749669671058655, + "rewards/rejected": -0.5787795186042786, + "sft_loss": 0.038125600665807724, + "step": 3898 + }, + { + "epoch": 5.638467100506146, + "grad_norm": 0.8511199612804827, + "learning_rate": 7.333989390390183e-08, + "logits/chosen": -1.15104341506958, + "logits/rejected": -0.6643425226211548, + "logps/chosen": -0.05288753658533096, + "logps/rejected": -6.539332389831543, + "loss": 0.0303, + "odds_ratio_loss": 0.006646599620580673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005288753658533096, + "rewards/margins": 0.6486445069313049, + "rewards/rejected": -0.6539332866668701, + "sft_loss": 0.05288753658533096, + "step": 3899 + }, + { + "epoch": 5.639913232104122, + "grad_norm": 0.9691714602883686, + "learning_rate": 7.274905374677631e-08, + "logits/chosen": -1.0755349397659302, + "logits/rejected": -0.599575400352478, + "logps/chosen": -0.03918579965829849, + "logps/rejected": -4.2741804122924805, + "loss": 0.0328, + "odds_ratio_loss": 0.0014571095816791058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003918579779565334, + "rewards/margins": 0.4234994649887085, + "rewards/rejected": -0.427418053150177, + "sft_loss": 0.03918579965829849, + "step": 3900 + }, + { + "epoch": 5.641359363702097, + "grad_norm": 0.8756828917317308, + "learning_rate": 7.216058134387326e-08, + "logits/chosen": -0.8658799529075623, + "logits/rejected": -0.6761690378189087, + "logps/chosen": -0.01636935956776142, + "logps/rejected": -5.539029598236084, + "loss": 0.0242, + "odds_ratio_loss": 0.000594843877479434, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016369358636438847, + "rewards/margins": 0.552266001701355, + "rewards/rejected": -0.5539029836654663, + "sft_loss": 0.01636935956776142, + "step": 3901 + }, + { + "epoch": 5.642805495300072, + "grad_norm": 1.2086469436479532, + "learning_rate": 7.157447704998443e-08, + "logits/chosen": -0.72590172290802, + "logits/rejected": -0.47788357734680176, + "logps/chosen": -0.05741055682301521, + "logps/rejected": -6.098160743713379, + "loss": 0.0431, + "odds_ratio_loss": 0.0019345948239788413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005741056054830551, + "rewards/margins": 0.6040751338005066, + "rewards/rejected": -0.6098161339759827, + "sft_loss": 0.05741055682301521, + "step": 3902 + }, + { + "epoch": 5.644251626898048, + "grad_norm": 0.960222765821685, + "learning_rate": 7.099074121847426e-08, + "logits/chosen": -0.8502532243728638, + "logits/rejected": -0.6490471363067627, + "logps/chosen": -0.027551371604204178, + "logps/rejected": -4.097101211547852, + "loss": 0.0514, + "odds_ratio_loss": 0.0017816171748563647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0027551373932510614, + "rewards/margins": 0.4069550037384033, + "rewards/rejected": -0.409710168838501, + "sft_loss": 0.027551371604204178, + "step": 3903 + }, + { + "epoch": 5.6456977584960235, + "grad_norm": 1.6231756077512831, + "learning_rate": 7.040937420127946e-08, + "logits/chosen": -0.9650408029556274, + "logits/rejected": -0.8903849720954895, + "logps/chosen": -0.08151555061340332, + "logps/rejected": -4.457081317901611, + "loss": 0.0511, + "odds_ratio_loss": 0.0022657865192741156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008151555433869362, + "rewards/margins": 0.43755653500556946, + "rewards/rejected": -0.44570815563201904, + "sft_loss": 0.08151555061340332, + "step": 3904 + }, + { + "epoch": 5.647143890093998, + "grad_norm": 0.9400284707795573, + "learning_rate": 6.983037634890809e-08, + "logits/chosen": -0.7602153420448303, + "logits/rejected": -0.5387284159660339, + "logps/chosen": -0.009475589729845524, + "logps/rejected": -6.176396369934082, + "loss": 0.0221, + "odds_ratio_loss": 0.0016149263828992844, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0009475590195506811, + "rewards/margins": 0.616692066192627, + "rewards/rejected": -0.6176395416259766, + "sft_loss": 0.009475589729845524, + "step": 3905 + }, + { + "epoch": 5.648590021691974, + "grad_norm": 0.8122607809365746, + "learning_rate": 6.925374801044048e-08, + "logits/chosen": -0.7360423803329468, + "logits/rejected": -0.7772014737129211, + "logps/chosen": -0.029816610738635063, + "logps/rejected": -5.955297470092773, + "loss": 0.0296, + "odds_ratio_loss": 0.003913013264536858, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002981661120429635, + "rewards/margins": 0.592548131942749, + "rewards/rejected": -0.5955298542976379, + "sft_loss": 0.029816610738635063, + "step": 3906 + }, + { + "epoch": 5.65003615328995, + "grad_norm": 1.3780036646742946, + "learning_rate": 6.867948953352787e-08, + "logits/chosen": -0.9336844682693481, + "logits/rejected": -0.586871862411499, + "logps/chosen": -0.06249716877937317, + "logps/rejected": -4.8502020835876465, + "loss": 0.058, + "odds_ratio_loss": 0.0021649636328220367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006249716971069574, + "rewards/margins": 0.47877049446105957, + "rewards/rejected": -0.4850202202796936, + "sft_loss": 0.06249716877937317, + "step": 3907 + }, + { + "epoch": 5.6514822848879245, + "grad_norm": 0.9200298747281895, + "learning_rate": 6.810760126439285e-08, + "logits/chosen": -0.903180718421936, + "logits/rejected": -0.6644023060798645, + "logps/chosen": -0.028901048004627228, + "logps/rejected": -5.7049994468688965, + "loss": 0.0239, + "odds_ratio_loss": 0.0007545308326371014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0028901048935949802, + "rewards/margins": 0.5676099061965942, + "rewards/rejected": -0.5705000162124634, + "sft_loss": 0.028901048004627228, + "step": 3908 + }, + { + "epoch": 5.6529284164859, + "grad_norm": 1.024708514255704, + "learning_rate": 6.753808354782898e-08, + "logits/chosen": -0.8410828113555908, + "logits/rejected": -0.6992049217224121, + "logps/chosen": -0.020299401134252548, + "logps/rejected": -3.7354702949523926, + "loss": 0.0264, + "odds_ratio_loss": 0.0014652770478278399, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002029940253123641, + "rewards/margins": 0.3715171217918396, + "rewards/rejected": -0.3735470175743103, + "sft_loss": 0.020299401134252548, + "step": 3909 + }, + { + "epoch": 5.654374548083876, + "grad_norm": 0.9967547588704444, + "learning_rate": 6.697093672720067e-08, + "logits/chosen": -0.9125003218650818, + "logits/rejected": -0.7792860269546509, + "logps/chosen": -0.027637945488095284, + "logps/rejected": -5.146880149841309, + "loss": 0.0521, + "odds_ratio_loss": 0.0023928144946694374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0027637947350740433, + "rewards/margins": 0.5119242668151855, + "rewards/rejected": -0.5146880149841309, + "sft_loss": 0.027637945488095284, + "step": 3910 + }, + { + "epoch": 5.655820679681851, + "grad_norm": 1.0733336260472248, + "learning_rate": 6.640616114444287e-08, + "logits/chosen": -0.8398350477218628, + "logits/rejected": -0.598203718662262, + "logps/chosen": -0.019008953124284744, + "logps/rejected": -3.937981605529785, + "loss": 0.0404, + "odds_ratio_loss": 0.0007247254834510386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019008952658623457, + "rewards/margins": 0.3918972611427307, + "rewards/rejected": -0.39379817247390747, + "sft_loss": 0.019008953124284744, + "step": 3911 + }, + { + "epoch": 5.657266811279826, + "grad_norm": 1.0696577971049983, + "learning_rate": 6.584375714006052e-08, + "logits/chosen": -0.9154694676399231, + "logits/rejected": -0.65110182762146, + "logps/chosen": -0.011146700009703636, + "logps/rejected": -6.324227809906006, + "loss": 0.0469, + "odds_ratio_loss": 0.0005139351123943925, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0011146699544042349, + "rewards/margins": 0.6313080787658691, + "rewards/rejected": -0.6324228048324585, + "sft_loss": 0.011146700009703636, + "step": 3912 + }, + { + "epoch": 5.658712942877802, + "grad_norm": 1.1318905034129576, + "learning_rate": 6.528372505312907e-08, + "logits/chosen": -1.1783943176269531, + "logits/rejected": -0.9010468125343323, + "logps/chosen": -0.03388334438204765, + "logps/rejected": -6.663463592529297, + "loss": 0.0409, + "odds_ratio_loss": 0.002394784474745393, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0033883345313370228, + "rewards/margins": 0.662958025932312, + "rewards/rejected": -0.6663463115692139, + "sft_loss": 0.03388334438204765, + "step": 3913 + }, + { + "epoch": 5.660159074475777, + "grad_norm": 0.9399824898174974, + "learning_rate": 6.472606522129487e-08, + "logits/chosen": -0.9045823812484741, + "logits/rejected": -0.847064733505249, + "logps/chosen": -0.0137400571256876, + "logps/rejected": -4.138844966888428, + "loss": 0.0284, + "odds_ratio_loss": 0.0009114354033954442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013740058057010174, + "rewards/margins": 0.412510484457016, + "rewards/rejected": -0.4138845205307007, + "sft_loss": 0.0137400571256876, + "step": 3914 + }, + { + "epoch": 5.6616052060737525, + "grad_norm": 0.9973586984702861, + "learning_rate": 6.417077798077209e-08, + "logits/chosen": -1.010211706161499, + "logits/rejected": -0.861878514289856, + "logps/chosen": -0.061721622943878174, + "logps/rejected": -4.818317413330078, + "loss": 0.0299, + "odds_ratio_loss": 0.004441537894308567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006172161549329758, + "rewards/margins": 0.4756595492362976, + "rewards/rejected": -0.48183172941207886, + "sft_loss": 0.061721622943878174, + "step": 3915 + }, + { + "epoch": 5.663051337671728, + "grad_norm": 0.998537365179468, + "learning_rate": 6.361786366634625e-08, + "logits/chosen": -1.0144376754760742, + "logits/rejected": -0.8828725218772888, + "logps/chosen": -0.038881972432136536, + "logps/rejected": -5.611100196838379, + "loss": 0.0355, + "odds_ratio_loss": 0.0021481101866811514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0038881972432136536, + "rewards/margins": 0.5572217702865601, + "rewards/rejected": -0.5611100196838379, + "sft_loss": 0.038881972432136536, + "step": 3916 + }, + { + "epoch": 5.664497469269704, + "grad_norm": 1.3821746245355928, + "learning_rate": 6.306732261137027e-08, + "logits/chosen": -1.0927997827529907, + "logits/rejected": -0.7127183675765991, + "logps/chosen": -0.03770868852734566, + "logps/rejected": -5.420331001281738, + "loss": 0.0344, + "odds_ratio_loss": 0.0027799506206065416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0037708692252635956, + "rewards/margins": 0.5382621884346008, + "rewards/rejected": -0.5420330762863159, + "sft_loss": 0.03770868852734566, + "step": 3917 + }, + { + "epoch": 5.665943600867679, + "grad_norm": 1.2771633707593975, + "learning_rate": 6.251915514776884e-08, + "logits/chosen": -0.7677035331726074, + "logits/rejected": -0.5734249353408813, + "logps/chosen": -0.03404954820871353, + "logps/rejected": -6.219034194946289, + "loss": 0.0283, + "odds_ratio_loss": 0.00040826547774486244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003404954681172967, + "rewards/margins": 0.6184984445571899, + "rewards/rejected": -0.6219034194946289, + "sft_loss": 0.03404954820871353, + "step": 3918 + }, + { + "epoch": 5.667389732465654, + "grad_norm": 1.0819061444739764, + "learning_rate": 6.197336160603362e-08, + "logits/chosen": -1.1142338514328003, + "logits/rejected": -0.6450099349021912, + "logps/chosen": -0.08008662611246109, + "logps/rejected": -5.973351955413818, + "loss": 0.0478, + "odds_ratio_loss": 0.0011582360602915287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008008661679923534, + "rewards/margins": 0.5893265008926392, + "rewards/rejected": -0.5973352193832397, + "sft_loss": 0.08008662611246109, + "step": 3919 + }, + { + "epoch": 5.66883586406363, + "grad_norm": 0.979593206342996, + "learning_rate": 6.142994231522492e-08, + "logits/chosen": -0.7907422184944153, + "logits/rejected": -0.5625496506690979, + "logps/chosen": -0.012852794490754604, + "logps/rejected": -4.704351425170898, + "loss": 0.0259, + "odds_ratio_loss": 0.0008764659287407994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0012852795189246535, + "rewards/margins": 0.4691498577594757, + "rewards/rejected": -0.47043511271476746, + "sft_loss": 0.012852794490754604, + "step": 3920 + }, + { + "epoch": 5.670281995661605, + "grad_norm": 0.9326543092206134, + "learning_rate": 6.088889760297312e-08, + "logits/chosen": -0.769900918006897, + "logits/rejected": -0.5406790971755981, + "logps/chosen": -0.01830982230603695, + "logps/rejected": -5.646090507507324, + "loss": 0.027, + "odds_ratio_loss": 0.002648351714015007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018309823935851455, + "rewards/margins": 0.562778115272522, + "rewards/rejected": -0.5646090507507324, + "sft_loss": 0.01830982230603695, + "step": 3921 + }, + { + "epoch": 5.6717281272595805, + "grad_norm": 1.1029927893990603, + "learning_rate": 6.035022779547549e-08, + "logits/chosen": -0.7700226306915283, + "logits/rejected": -0.752061128616333, + "logps/chosen": -0.06235011667013168, + "logps/rejected": -5.587028503417969, + "loss": 0.0411, + "odds_ratio_loss": 0.0030640983022749424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006235011853277683, + "rewards/margins": 0.5524678230285645, + "rewards/rejected": -0.558702826499939, + "sft_loss": 0.06235011667013168, + "step": 3922 + }, + { + "epoch": 5.673174258857556, + "grad_norm": 1.1957365894214984, + "learning_rate": 5.981393321749894e-08, + "logits/chosen": -0.8172686100006104, + "logits/rejected": -0.7470732927322388, + "logps/chosen": -0.06232059746980667, + "logps/rejected": -3.8870420455932617, + "loss": 0.0631, + "odds_ratio_loss": 0.007645574398338795, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006232059560716152, + "rewards/margins": 0.38247212767601013, + "rewards/rejected": -0.38870421051979065, + "sft_loss": 0.06232059746980667, + "step": 3923 + }, + { + "epoch": 5.674620390455532, + "grad_norm": 0.9288076500148977, + "learning_rate": 5.928001419237638e-08, + "logits/chosen": -0.9686081409454346, + "logits/rejected": -0.8795914649963379, + "logps/chosen": -0.009057226590812206, + "logps/rejected": -6.156771659851074, + "loss": 0.0374, + "odds_ratio_loss": 0.0005254077259451151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0009057226707227528, + "rewards/margins": 0.6147714853286743, + "rewards/rejected": -0.6156772375106812, + "sft_loss": 0.009057226590812206, + "step": 3924 + }, + { + "epoch": 5.676066522053507, + "grad_norm": 1.0560130705009327, + "learning_rate": 5.8748471042010305e-08, + "logits/chosen": -1.093846321105957, + "logits/rejected": -0.7616704702377319, + "logps/chosen": -0.03402628004550934, + "logps/rejected": -6.118165969848633, + "loss": 0.0326, + "odds_ratio_loss": 0.0014892476610839367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0034026282373815775, + "rewards/margins": 0.6084139943122864, + "rewards/rejected": -0.6118166446685791, + "sft_loss": 0.03402628004550934, + "step": 3925 + }, + { + "epoch": 5.677512653651482, + "grad_norm": 0.8648937856981704, + "learning_rate": 5.8219304086869705e-08, + "logits/chosen": -0.9695035219192505, + "logits/rejected": -0.7986159324645996, + "logps/chosen": -0.03180558234453201, + "logps/rejected": -4.99746561050415, + "loss": 0.0244, + "odds_ratio_loss": 0.003096992615610361, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003180558094754815, + "rewards/margins": 0.4965660274028778, + "rewards/rejected": -0.49974656105041504, + "sft_loss": 0.03180558234453201, + "step": 3926 + }, + { + "epoch": 5.678958785249458, + "grad_norm": 0.9465923606611053, + "learning_rate": 5.7692513645991814e-08, + "logits/chosen": -0.9850341081619263, + "logits/rejected": -0.7746860980987549, + "logps/chosen": -0.024150801822543144, + "logps/rejected": -5.850712776184082, + "loss": 0.0322, + "odds_ratio_loss": 0.0011510425247251987, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002415080089122057, + "rewards/margins": 0.58265620470047, + "rewards/rejected": -0.585071325302124, + "sft_loss": 0.024150801822543144, + "step": 3927 + }, + { + "epoch": 5.680404916847433, + "grad_norm": 1.0225694246772676, + "learning_rate": 5.716810003697947e-08, + "logits/chosen": -0.9132033586502075, + "logits/rejected": -0.6539651155471802, + "logps/chosen": -0.03499581664800644, + "logps/rejected": -5.632237911224365, + "loss": 0.0302, + "odds_ratio_loss": 0.0011239980813115835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003499581478536129, + "rewards/margins": 0.5597242116928101, + "rewards/rejected": -0.5632237195968628, + "sft_loss": 0.03499581664800644, + "step": 3928 + }, + { + "epoch": 5.681851048445409, + "grad_norm": 1.2386475856717893, + "learning_rate": 5.664606357600465e-08, + "logits/chosen": -0.8918399214744568, + "logits/rejected": -0.7364063262939453, + "logps/chosen": -0.05718269944190979, + "logps/rejected": -5.176689147949219, + "loss": 0.0438, + "odds_ratio_loss": 0.0016231231857091188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005718270316720009, + "rewards/margins": 0.511950671672821, + "rewards/rejected": -0.5176689028739929, + "sft_loss": 0.05718269944190979, + "step": 3929 + }, + { + "epoch": 5.683297180043384, + "grad_norm": 1.2202685542736273, + "learning_rate": 5.612640457780449e-08, + "logits/chosen": -0.8680728673934937, + "logits/rejected": -0.6936816573143005, + "logps/chosen": -0.035446494817733765, + "logps/rejected": -3.8519763946533203, + "loss": 0.0305, + "odds_ratio_loss": 0.002002793364226818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003544649574905634, + "rewards/margins": 0.3816530108451843, + "rewards/rejected": -0.38519763946533203, + "sft_loss": 0.035446494817733765, + "step": 3930 + }, + { + "epoch": 5.684743311641359, + "grad_norm": 1.1173651380498892, + "learning_rate": 5.5609123355683906e-08, + "logits/chosen": -0.9929153919219971, + "logits/rejected": -0.6234613060951233, + "logps/chosen": -0.020111847668886185, + "logps/rejected": -6.586942672729492, + "loss": 0.0348, + "odds_ratio_loss": 0.00016166100976988673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002011184813454747, + "rewards/margins": 0.656683087348938, + "rewards/rejected": -0.6586943864822388, + "sft_loss": 0.020111847668886185, + "step": 3931 + }, + { + "epoch": 5.686189443239335, + "grad_norm": 0.8926495914234045, + "learning_rate": 5.5094220221513e-08, + "logits/chosen": -1.0222707986831665, + "logits/rejected": -0.8124725222587585, + "logps/chosen": -0.01710999198257923, + "logps/rejected": -4.495701313018799, + "loss": 0.0258, + "odds_ratio_loss": 0.0012623387156054378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017109992913901806, + "rewards/margins": 0.44785916805267334, + "rewards/rejected": -0.4495701789855957, + "sft_loss": 0.01710999198257923, + "step": 3932 + }, + { + "epoch": 5.68763557483731, + "grad_norm": 1.2731963859242328, + "learning_rate": 5.4581695485729665e-08, + "logits/chosen": -0.7314611077308655, + "logits/rejected": -0.7094411849975586, + "logps/chosen": -0.024839885532855988, + "logps/rejected": -6.205300331115723, + "loss": 0.0328, + "odds_ratio_loss": 0.0008890972239896655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0024839891120791435, + "rewards/margins": 0.6180460453033447, + "rewards/rejected": -0.6205300688743591, + "sft_loss": 0.024839885532855988, + "step": 3933 + }, + { + "epoch": 5.689081706435285, + "grad_norm": 1.1808337226365984, + "learning_rate": 5.407154945733605e-08, + "logits/chosen": -0.7870131731033325, + "logits/rejected": -0.5856943130493164, + "logps/chosen": -0.09459728002548218, + "logps/rejected": -5.333677291870117, + "loss": 0.0516, + "odds_ratio_loss": 0.005132491700351238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009459727443754673, + "rewards/margins": 0.5239080190658569, + "rewards/rejected": -0.5333677530288696, + "sft_loss": 0.09459728002548218, + "step": 3934 + }, + { + "epoch": 5.690527838033261, + "grad_norm": 0.9096943069943451, + "learning_rate": 5.3563782443901254e-08, + "logits/chosen": -0.8053758144378662, + "logits/rejected": -0.6625349521636963, + "logps/chosen": -0.030897527933120728, + "logps/rejected": -3.146284580230713, + "loss": 0.0285, + "odds_ratio_loss": 0.0019149701111018658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003089752746745944, + "rewards/margins": 0.3115387260913849, + "rewards/rejected": -0.3146284520626068, + "sft_loss": 0.030897527933120728, + "step": 3935 + }, + { + "epoch": 5.691973969631237, + "grad_norm": 1.3116878149312352, + "learning_rate": 5.305839475156082e-08, + "logits/chosen": -0.9307630658149719, + "logits/rejected": -0.6568589806556702, + "logps/chosen": -0.05684416741132736, + "logps/rejected": -5.24498176574707, + "loss": 0.0543, + "odds_ratio_loss": 0.002522802911698818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005684417672455311, + "rewards/margins": 0.5188138484954834, + "rewards/rejected": -0.5244981646537781, + "sft_loss": 0.05684416741132736, + "step": 3936 + }, + { + "epoch": 5.693420101229211, + "grad_norm": 1.133640478705631, + "learning_rate": 5.2555386685013247e-08, + "logits/chosen": -0.8474830389022827, + "logits/rejected": -0.6886616349220276, + "logps/chosen": -0.07056474685668945, + "logps/rejected": -3.7979629039764404, + "loss": 0.0575, + "odds_ratio_loss": 0.0038194474764168262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00705647561699152, + "rewards/margins": 0.3727398216724396, + "rewards/rejected": -0.37979626655578613, + "sft_loss": 0.07056474685668945, + "step": 3937 + }, + { + "epoch": 5.694866232827187, + "grad_norm": 0.9958326615719902, + "learning_rate": 5.2054758547525724e-08, + "logits/chosen": -0.7519106268882751, + "logits/rejected": -0.6501749753952026, + "logps/chosen": -0.06976844370365143, + "logps/rejected": -6.353296279907227, + "loss": 0.0386, + "odds_ratio_loss": 0.018397467210888863, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006976844742894173, + "rewards/margins": 0.6283528208732605, + "rewards/rejected": -0.6353296041488647, + "sft_loss": 0.06976844370365143, + "step": 3938 + }, + { + "epoch": 5.696312364425163, + "grad_norm": 0.9255804107593475, + "learning_rate": 5.1556510640927476e-08, + "logits/chosen": -0.729579746723175, + "logits/rejected": -0.6220409870147705, + "logps/chosen": -0.02262992598116398, + "logps/rejected": -4.2784223556518555, + "loss": 0.0388, + "odds_ratio_loss": 0.0035235087852925062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002262992551550269, + "rewards/margins": 0.4255792498588562, + "rewards/rejected": -0.42784222960472107, + "sft_loss": 0.02262992598116398, + "step": 3939 + }, + { + "epoch": 5.697758496023138, + "grad_norm": 1.0616569156321858, + "learning_rate": 5.1060643265614655e-08, + "logits/chosen": -0.8722561597824097, + "logits/rejected": -0.7307120561599731, + "logps/chosen": -0.018616218119859695, + "logps/rejected": -4.420228481292725, + "loss": 0.0281, + "odds_ratio_loss": 0.0006978900055401027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018616218585520983, + "rewards/margins": 0.44016122817993164, + "rewards/rejected": -0.4420228898525238, + "sft_loss": 0.018616218119859695, + "step": 3940 + }, + { + "epoch": 5.699204627621113, + "grad_norm": 1.0380237647568364, + "learning_rate": 5.056715672054768e-08, + "logits/chosen": -0.9298012256622314, + "logits/rejected": -0.6356141567230225, + "logps/chosen": -0.04043480008840561, + "logps/rejected": -4.05310583114624, + "loss": 0.0241, + "odds_ratio_loss": 0.0012896271655336022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0040434799157083035, + "rewards/margins": 0.4012671113014221, + "rewards/rejected": -0.40531060099601746, + "sft_loss": 0.04043480008840561, + "step": 3941 + }, + { + "epoch": 5.700650759219089, + "grad_norm": 0.9822767657440377, + "learning_rate": 5.007605130325121e-08, + "logits/chosen": -0.8983743190765381, + "logits/rejected": -0.6858455538749695, + "logps/chosen": -0.02578556537628174, + "logps/rejected": -3.956112861633301, + "loss": 0.022, + "odds_ratio_loss": 0.0009663483360782266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002578556537628174, + "rewards/margins": 0.3930327296257019, + "rewards/rejected": -0.3956112861633301, + "sft_loss": 0.02578556537628174, + "step": 3942 + }, + { + "epoch": 5.702096890817065, + "grad_norm": 0.9640921864883987, + "learning_rate": 4.958732730981374e-08, + "logits/chosen": -0.5660995841026306, + "logits/rejected": -0.5064358115196228, + "logps/chosen": -0.04737599939107895, + "logps/rejected": -5.079545021057129, + "loss": 0.0378, + "odds_ratio_loss": 0.004981108475476503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00473759975284338, + "rewards/margins": 0.5032169222831726, + "rewards/rejected": -0.5079545378684998, + "sft_loss": 0.04737599939107895, + "step": 3943 + }, + { + "epoch": 5.703543022415039, + "grad_norm": 1.0044203070515496, + "learning_rate": 4.910098503489024e-08, + "logits/chosen": -0.5947985649108887, + "logits/rejected": -0.4939855635166168, + "logps/chosen": -0.04397990554571152, + "logps/rejected": -5.396112442016602, + "loss": 0.0367, + "odds_ratio_loss": 0.006674241274595261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004397990647703409, + "rewards/margins": 0.53521329164505, + "rewards/rejected": -0.5396112203598022, + "sft_loss": 0.04397990554571152, + "step": 3944 + }, + { + "epoch": 5.704989154013015, + "grad_norm": 1.149313184171066, + "learning_rate": 4.861702477169727e-08, + "logits/chosen": -0.9143013954162598, + "logits/rejected": -0.6934011578559875, + "logps/chosen": -0.009784114547073841, + "logps/rejected": -4.77062463760376, + "loss": 0.0218, + "odds_ratio_loss": 0.0006179651827551425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0009784114081412554, + "rewards/margins": 0.4760840833187103, + "rewards/rejected": -0.477062463760376, + "sft_loss": 0.009784114547073841, + "step": 3945 + }, + { + "epoch": 5.706435285610991, + "grad_norm": 1.2237874570885532, + "learning_rate": 4.8135446812016536e-08, + "logits/chosen": -1.1765408515930176, + "logits/rejected": -0.8189411163330078, + "logps/chosen": -0.06264963001012802, + "logps/rejected": -6.33249044418335, + "loss": 0.0452, + "odds_ratio_loss": 0.008238528855144978, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006264963187277317, + "rewards/margins": 0.6269841194152832, + "rewards/rejected": -0.633249044418335, + "sft_loss": 0.06264963001012802, + "step": 3946 + }, + { + "epoch": 5.7078814172089665, + "grad_norm": 1.02404607668381, + "learning_rate": 4.765625144619356e-08, + "logits/chosen": -0.9042776823043823, + "logits/rejected": -0.6364085674285889, + "logps/chosen": -0.04948314279317856, + "logps/rejected": -4.562315940856934, + "loss": 0.039, + "odds_ratio_loss": 0.0030646594241261482, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004948314279317856, + "rewards/margins": 0.4512832760810852, + "rewards/rejected": -0.45623159408569336, + "sft_loss": 0.04948314279317856, + "step": 3947 + }, + { + "epoch": 5.709327548806941, + "grad_norm": 1.1728511222826667, + "learning_rate": 4.717943896313681e-08, + "logits/chosen": -0.9472683072090149, + "logits/rejected": -0.7687931060791016, + "logps/chosen": -0.03766496852040291, + "logps/rejected": -5.983687400817871, + "loss": 0.0461, + "odds_ratio_loss": 0.0009112533880397677, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003766496665775776, + "rewards/margins": 0.5946022272109985, + "rewards/rejected": -0.598368763923645, + "sft_loss": 0.03766496852040291, + "step": 3948 + }, + { + "epoch": 5.710773680404917, + "grad_norm": 0.9785284414828915, + "learning_rate": 4.670500965031765e-08, + "logits/chosen": -0.8775100708007812, + "logits/rejected": -0.6726743578910828, + "logps/chosen": -0.055906932801008224, + "logps/rejected": -5.177661418914795, + "loss": 0.0339, + "odds_ratio_loss": 0.004313563462346792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005590693559497595, + "rewards/margins": 0.512175440788269, + "rewards/rejected": -0.5177661180496216, + "sft_loss": 0.055906932801008224, + "step": 3949 + }, + { + "epoch": 5.712219812002893, + "grad_norm": 4.807573142568754, + "learning_rate": 4.623296379377217e-08, + "logits/chosen": -0.856716513633728, + "logits/rejected": -0.6778161525726318, + "logps/chosen": -0.0741441547870636, + "logps/rejected": -5.552947998046875, + "loss": 0.0455, + "odds_ratio_loss": 0.008630833588540554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0074144164100289345, + "rewards/margins": 0.5478804707527161, + "rewards/rejected": -0.5552948713302612, + "sft_loss": 0.0741441547870636, + "step": 3950 + }, + { + "epoch": 5.713665943600867, + "grad_norm": 0.792542970153592, + "learning_rate": 4.5763301678098053e-08, + "logits/chosen": -1.1035850048065186, + "logits/rejected": -0.9252421855926514, + "logps/chosen": -0.04065769910812378, + "logps/rejected": -3.750488758087158, + "loss": 0.0269, + "odds_ratio_loss": 0.0019774322863668203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004065769724547863, + "rewards/margins": 0.3709830939769745, + "rewards/rejected": -0.37504884600639343, + "sft_loss": 0.04065769910812378, + "step": 3951 + }, + { + "epoch": 5.715112075198843, + "grad_norm": 0.9629993811033565, + "learning_rate": 4.5296023586456345e-08, + "logits/chosen": -0.7754743099212646, + "logits/rejected": -0.6971641778945923, + "logps/chosen": -0.020119009539484978, + "logps/rejected": -3.7750449180603027, + "loss": 0.0333, + "odds_ratio_loss": 0.04542544111609459, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.002011900767683983, + "rewards/margins": 0.3754926323890686, + "rewards/rejected": -0.37750452756881714, + "sft_loss": 0.020119009539484978, + "step": 3952 + }, + { + "epoch": 5.716558206796819, + "grad_norm": 0.7073799305020456, + "learning_rate": 4.483112980057147e-08, + "logits/chosen": -1.0943269729614258, + "logits/rejected": -0.6168371438980103, + "logps/chosen": -0.013947287574410439, + "logps/rejected": -5.2490081787109375, + "loss": 0.0179, + "odds_ratio_loss": 0.0006277449429035187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013947287807241082, + "rewards/margins": 0.5235061645507812, + "rewards/rejected": -0.5249009132385254, + "sft_loss": 0.013947287574410439, + "step": 3953 + }, + { + "epoch": 5.718004338394794, + "grad_norm": 0.9432485852232118, + "learning_rate": 4.436862060072855e-08, + "logits/chosen": -0.8107536435127258, + "logits/rejected": -0.7643659114837646, + "logps/chosen": -0.02018115669488907, + "logps/rejected": -3.54256010055542, + "loss": 0.0125, + "odds_ratio_loss": 0.001157900900579989, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0020181157160550356, + "rewards/margins": 0.35223788022994995, + "rewards/rejected": -0.3542560338973999, + "sft_loss": 0.02018115669488907, + "step": 3954 + }, + { + "epoch": 5.719450469992769, + "grad_norm": 1.6223932525353142, + "learning_rate": 4.3908496265776973e-08, + "logits/chosen": -0.651569128036499, + "logits/rejected": -0.5832317471504211, + "logps/chosen": -0.06823825091123581, + "logps/rejected": -4.777907848358154, + "loss": 0.0692, + "odds_ratio_loss": 0.004125387407839298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006823825184255838, + "rewards/margins": 0.47096699476242065, + "rewards/rejected": -0.47779080271720886, + "sft_loss": 0.06823825091123581, + "step": 3955 + }, + { + "epoch": 5.720896601590745, + "grad_norm": 1.0945610279479845, + "learning_rate": 4.3450757073126844e-08, + "logits/chosen": -1.1655569076538086, + "logits/rejected": -0.6393382549285889, + "logps/chosen": -0.023160209879279137, + "logps/rejected": -6.733799457550049, + "loss": 0.0396, + "odds_ratio_loss": 0.0007181918481364846, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0023160211276263, + "rewards/margins": 0.6710639595985413, + "rewards/rejected": -0.6733798980712891, + "sft_loss": 0.023160209879279137, + "step": 3956 + }, + { + "epoch": 5.72234273318872, + "grad_norm": 0.9881255041140731, + "learning_rate": 4.2995403298751176e-08, + "logits/chosen": -0.8412911295890808, + "logits/rejected": -0.7012451887130737, + "logps/chosen": -0.035623397678136826, + "logps/rejected": -3.769057035446167, + "loss": 0.0458, + "odds_ratio_loss": 0.002924562431871891, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003562340047210455, + "rewards/margins": 0.3733433783054352, + "rewards/rejected": -0.3769057095050812, + "sft_loss": 0.035623397678136826, + "step": 3957 + }, + { + "epoch": 5.7237888647866955, + "grad_norm": 1.2450271286281953, + "learning_rate": 4.2542435217184146e-08, + "logits/chosen": -0.7657533884048462, + "logits/rejected": -0.7611573934555054, + "logps/chosen": -0.04250839352607727, + "logps/rejected": -5.744058609008789, + "loss": 0.0353, + "odds_ratio_loss": 0.003883287776261568, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004250839352607727, + "rewards/margins": 0.5701550245285034, + "rewards/rejected": -0.5744057893753052, + "sft_loss": 0.04250839352607727, + "step": 3958 + }, + { + "epoch": 5.725234996384671, + "grad_norm": 0.9684630858692042, + "learning_rate": 4.209185310152197e-08, + "logits/chosen": -1.0507510900497437, + "logits/rejected": -0.7392194867134094, + "logps/chosen": -0.020687285810709, + "logps/rejected": -5.629661560058594, + "loss": 0.0284, + "odds_ratio_loss": 0.0012234591413289309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0020687286742031574, + "rewards/margins": 0.5608974695205688, + "rewards/rejected": -0.5629662275314331, + "sft_loss": 0.020687285810709, + "step": 3959 + }, + { + "epoch": 5.726681127982646, + "grad_norm": 1.0111903958418516, + "learning_rate": 4.164365722342245e-08, + "logits/chosen": -0.8547532558441162, + "logits/rejected": -0.7358726859092712, + "logps/chosen": -0.03250580281019211, + "logps/rejected": -4.449581623077393, + "loss": 0.0251, + "odds_ratio_loss": 0.0019836989231407642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0032505805138498545, + "rewards/margins": 0.4417075514793396, + "rewards/rejected": -0.4449581503868103, + "sft_loss": 0.03250580281019211, + "step": 3960 + }, + { + "epoch": 5.728127259580622, + "grad_norm": 0.8954341522636333, + "learning_rate": 4.119784785310454e-08, + "logits/chosen": -0.97066330909729, + "logits/rejected": -0.7040786743164062, + "logps/chosen": -0.014331339858472347, + "logps/rejected": -5.2800421714782715, + "loss": 0.0229, + "odds_ratio_loss": 0.0006313954363577068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014331340789794922, + "rewards/margins": 0.5265710949897766, + "rewards/rejected": -0.5280042290687561, + "sft_loss": 0.014331339858472347, + "step": 3961 + }, + { + "epoch": 5.729573391178597, + "grad_norm": 0.8325532370707646, + "learning_rate": 4.0754425259348355e-08, + "logits/chosen": -0.9457544088363647, + "logits/rejected": -0.6079109907150269, + "logps/chosen": -0.01744541898369789, + "logps/rejected": -5.451538562774658, + "loss": 0.0226, + "odds_ratio_loss": 0.0007178646046668291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017445420380681753, + "rewards/margins": 0.5434092879295349, + "rewards/rejected": -0.5451538562774658, + "sft_loss": 0.01744541898369789, + "step": 3962 + }, + { + "epoch": 5.731019522776573, + "grad_norm": 1.112537502022365, + "learning_rate": 4.031338970949516e-08, + "logits/chosen": -0.7553691864013672, + "logits/rejected": -0.5517176985740662, + "logps/chosen": -0.03140060976147652, + "logps/rejected": -5.191702842712402, + "loss": 0.0459, + "odds_ratio_loss": 0.0007400895119644701, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0031400611624121666, + "rewards/margins": 0.5160301923751831, + "rewards/rejected": -0.5191702842712402, + "sft_loss": 0.03140060976147652, + "step": 3963 + }, + { + "epoch": 5.732465654374548, + "grad_norm": 1.01456036033676, + "learning_rate": 3.987474146944647e-08, + "logits/chosen": -1.02970552444458, + "logits/rejected": -0.8884641528129578, + "logps/chosen": -0.013003876432776451, + "logps/rejected": -4.971007823944092, + "loss": 0.0344, + "odds_ratio_loss": 0.0005930347251705825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013003875501453876, + "rewards/margins": 0.4958004057407379, + "rewards/rejected": -0.4971008002758026, + "sft_loss": 0.013003876432776451, + "step": 3964 + }, + { + "epoch": 5.7339117859725235, + "grad_norm": 1.2032055887513502, + "learning_rate": 3.943848080366541e-08, + "logits/chosen": -1.0313166379928589, + "logits/rejected": -0.6610112190246582, + "logps/chosen": -0.013816236518323421, + "logps/rejected": -5.627511978149414, + "loss": 0.0335, + "odds_ratio_loss": 0.00041836718446575105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013816235587000847, + "rewards/margins": 0.5613695383071899, + "rewards/rejected": -0.5627512335777283, + "sft_loss": 0.013816236518323421, + "step": 3965 + }, + { + "epoch": 5.735357917570499, + "grad_norm": 1.0401773408038113, + "learning_rate": 3.9004607975174905e-08, + "logits/chosen": -0.9395818114280701, + "logits/rejected": -0.770209789276123, + "logps/chosen": -0.027776891365647316, + "logps/rejected": -4.5239949226379395, + "loss": 0.0187, + "odds_ratio_loss": 0.001369029050692916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0027776893693953753, + "rewards/margins": 0.4496217966079712, + "rewards/rejected": -0.45239946246147156, + "sft_loss": 0.027776891365647316, + "step": 3966 + }, + { + "epoch": 5.736804049168474, + "grad_norm": 0.9541026537621927, + "learning_rate": 3.857312324555862e-08, + "logits/chosen": -0.7973968386650085, + "logits/rejected": -0.5857910513877869, + "logps/chosen": -0.018500562757253647, + "logps/rejected": -5.999993324279785, + "loss": 0.0199, + "odds_ratio_loss": 0.0004114778130315244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001850056229159236, + "rewards/margins": 0.598149299621582, + "rewards/rejected": -0.5999993681907654, + "sft_loss": 0.018500562757253647, + "step": 3967 + }, + { + "epoch": 5.73825018076645, + "grad_norm": 1.1225011262565519, + "learning_rate": 3.814402687496043e-08, + "logits/chosen": -1.0490517616271973, + "logits/rejected": -0.8305553793907166, + "logps/chosen": -0.005585083272308111, + "logps/rejected": -5.262382984161377, + "loss": 0.0306, + "odds_ratio_loss": 0.0003785730223171413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0005585083272308111, + "rewards/margins": 0.5256798267364502, + "rewards/rejected": -0.5262383222579956, + "sft_loss": 0.005585083272308111, + "step": 3968 + }, + { + "epoch": 5.739696312364425, + "grad_norm": 1.026685490340686, + "learning_rate": 3.7717319122083645e-08, + "logits/chosen": -1.0224274396896362, + "logits/rejected": -0.8586872816085815, + "logps/chosen": -0.006939824670553207, + "logps/rejected": -5.025428295135498, + "loss": 0.0293, + "odds_ratio_loss": 0.00030064102611504495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0006939824670553207, + "rewards/margins": 0.5018488764762878, + "rewards/rejected": -0.5025428533554077, + "sft_loss": 0.006939824670553207, + "step": 3969 + }, + { + "epoch": 5.741142443962401, + "grad_norm": 1.1829241105985957, + "learning_rate": 3.729300024419224e-08, + "logits/chosen": -0.8081978559494019, + "logits/rejected": -0.518790602684021, + "logps/chosen": -0.026094887405633926, + "logps/rejected": -4.8559250831604, + "loss": 0.0539, + "odds_ratio_loss": 0.001554901129566133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0026094887871295214, + "rewards/margins": 0.4829829931259155, + "rewards/rejected": -0.48559248447418213, + "sft_loss": 0.026094887405633926, + "step": 3970 + }, + { + "epoch": 5.742588575560376, + "grad_norm": 1.1125981286298714, + "learning_rate": 3.687107049710958e-08, + "logits/chosen": -0.969440758228302, + "logits/rejected": -0.677169919013977, + "logps/chosen": -0.040819112211465836, + "logps/rejected": -5.158903121948242, + "loss": 0.044, + "odds_ratio_loss": 0.0023379060439765453, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0040819114074110985, + "rewards/margins": 0.511808454990387, + "rewards/rejected": -0.51589035987854, + "sft_loss": 0.040819112211465836, + "step": 3971 + }, + { + "epoch": 5.7440347071583515, + "grad_norm": 1.0875350672506494, + "learning_rate": 3.645153013521929e-08, + "logits/chosen": -1.1375861167907715, + "logits/rejected": -0.7329793572425842, + "logps/chosen": -0.025086410343647003, + "logps/rejected": -6.695981025695801, + "loss": 0.0328, + "odds_ratio_loss": 0.0005195082630962133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0025086409877985716, + "rewards/margins": 0.6670895218849182, + "rewards/rejected": -0.6695981621742249, + "sft_loss": 0.025086410343647003, + "step": 3972 + }, + { + "epoch": 5.745480838756327, + "grad_norm": 1.2322911875291596, + "learning_rate": 3.603437941146303e-08, + "logits/chosen": -0.9515436887741089, + "logits/rejected": -0.791551947593689, + "logps/chosen": -0.009043844416737556, + "logps/rejected": -4.902947425842285, + "loss": 0.0559, + "odds_ratio_loss": 0.0005873933550901711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0009043845348060131, + "rewards/margins": 0.48939037322998047, + "rewards/rejected": -0.49029478430747986, + "sft_loss": 0.009043844416737556, + "step": 3973 + }, + { + "epoch": 5.746926970354302, + "grad_norm": 1.1251125949181537, + "learning_rate": 3.561961857734275e-08, + "logits/chosen": -0.7727161645889282, + "logits/rejected": -0.610916793346405, + "logps/chosen": -0.03913598507642746, + "logps/rejected": -4.861371040344238, + "loss": 0.0393, + "odds_ratio_loss": 0.007291535846889019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003913598600775003, + "rewards/margins": 0.4822235107421875, + "rewards/rejected": -0.4861370921134949, + "sft_loss": 0.03913598507642746, + "step": 3974 + }, + { + "epoch": 5.748373101952278, + "grad_norm": 1.5473900691851843, + "learning_rate": 3.520724788291973e-08, + "logits/chosen": -0.9996442794799805, + "logits/rejected": -0.7617815732955933, + "logps/chosen": -0.06447423249483109, + "logps/rejected": -6.056490421295166, + "loss": 0.0534, + "odds_ratio_loss": 0.0018590696854516864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006447423715144396, + "rewards/margins": 0.5992016792297363, + "rewards/rejected": -0.6056490540504456, + "sft_loss": 0.06447423249483109, + "step": 3975 + }, + { + "epoch": 5.749819233550253, + "grad_norm": 1.215426207668915, + "learning_rate": 3.479726757681289e-08, + "logits/chosen": -0.7984232902526855, + "logits/rejected": -0.5945329666137695, + "logps/chosen": -0.022249722853302956, + "logps/rejected": -6.509105205535889, + "loss": 0.0322, + "odds_ratio_loss": 0.0006876873667351902, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022249724715948105, + "rewards/margins": 0.6486855745315552, + "rewards/rejected": -0.650910496711731, + "sft_loss": 0.022249722853302956, + "step": 3976 + }, + { + "epoch": 5.751265365148228, + "grad_norm": 1.1150668910106851, + "learning_rate": 3.4389677906201843e-08, + "logits/chosen": -0.9797489643096924, + "logits/rejected": -0.6965649127960205, + "logps/chosen": -0.014419843442738056, + "logps/rejected": -5.9641337394714355, + "loss": 0.0456, + "odds_ratio_loss": 0.0016502912621945143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001441984437406063, + "rewards/margins": 0.5949714183807373, + "rewards/rejected": -0.5964133739471436, + "sft_loss": 0.014419843442738056, + "step": 3977 + }, + { + "epoch": 5.752711496746204, + "grad_norm": 0.9972980551037653, + "learning_rate": 3.3984479116822896e-08, + "logits/chosen": -0.7742958664894104, + "logits/rejected": -0.6961202025413513, + "logps/chosen": -0.021118473261594772, + "logps/rejected": -4.253470420837402, + "loss": 0.0226, + "odds_ratio_loss": 0.001717887818813324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002111847046762705, + "rewards/margins": 0.4232351779937744, + "rewards/rejected": -0.42534705996513367, + "sft_loss": 0.021118473261594772, + "step": 3978 + }, + { + "epoch": 5.7541576283441795, + "grad_norm": 1.1409967671513455, + "learning_rate": 3.3581671452973084e-08, + "logits/chosen": -0.895980954170227, + "logits/rejected": -0.7544868588447571, + "logps/chosen": -0.0419120267033577, + "logps/rejected": -4.296963214874268, + "loss": 0.038, + "odds_ratio_loss": 0.0015717416536062956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0041912030428647995, + "rewards/margins": 0.4255051016807556, + "rewards/rejected": -0.42969632148742676, + "sft_loss": 0.0419120267033577, + "step": 3979 + }, + { + "epoch": 5.755603759942154, + "grad_norm": 1.054092428879142, + "learning_rate": 3.318125515750614e-08, + "logits/chosen": -1.0647022724151611, + "logits/rejected": -0.7363189458847046, + "logps/chosen": -0.05184062197804451, + "logps/rejected": -6.126248836517334, + "loss": 0.0282, + "odds_ratio_loss": 0.0024300923105329275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0051840622909367085, + "rewards/margins": 0.6074408292770386, + "rewards/rejected": -0.6126248836517334, + "sft_loss": 0.05184062197804451, + "step": 3980 + }, + { + "epoch": 5.75704989154013, + "grad_norm": 1.1317499613490938, + "learning_rate": 3.278323047183429e-08, + "logits/chosen": -0.673163890838623, + "logits/rejected": -0.6283612251281738, + "logps/chosen": -0.024727502837777138, + "logps/rejected": -5.055089473724365, + "loss": 0.0313, + "odds_ratio_loss": 0.004691070411354303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0024727503769099712, + "rewards/margins": 0.5030362606048584, + "rewards/rejected": -0.5055089592933655, + "sft_loss": 0.024727502837777138, + "step": 3981 + }, + { + "epoch": 5.758496023138106, + "grad_norm": 1.2086838902742276, + "learning_rate": 3.238759763592824e-08, + "logits/chosen": -0.9168479442596436, + "logits/rejected": -0.7292149662971497, + "logps/chosen": -0.05325371026992798, + "logps/rejected": -5.35057258605957, + "loss": 0.0847, + "odds_ratio_loss": 0.0033136121928691864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005325371399521828, + "rewards/margins": 0.5297318696975708, + "rewards/rejected": -0.5350572466850281, + "sft_loss": 0.05325371026992798, + "step": 3982 + }, + { + "epoch": 5.7599421547360805, + "grad_norm": 1.2961516935194144, + "learning_rate": 3.199435688831631e-08, + "logits/chosen": -0.8049898147583008, + "logits/rejected": -0.677711009979248, + "logps/chosen": -0.03922729194164276, + "logps/rejected": -3.4130773544311523, + "loss": 0.0436, + "odds_ratio_loss": 0.0017275214195251465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003922729752957821, + "rewards/margins": 0.33738502860069275, + "rewards/rejected": -0.34130772948265076, + "sft_loss": 0.03922729194164276, + "step": 3983 + }, + { + "epoch": 5.761388286334056, + "grad_norm": 1.188349583853395, + "learning_rate": 3.1603508466085284e-08, + "logits/chosen": -0.9943714141845703, + "logits/rejected": -0.6691036224365234, + "logps/chosen": -0.025151284411549568, + "logps/rejected": -5.576414108276367, + "loss": 0.0331, + "odds_ratio_loss": 0.0018484786851331592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002515128580853343, + "rewards/margins": 0.5551262497901917, + "rewards/rejected": -0.5576413869857788, + "sft_loss": 0.025151284411549568, + "step": 3984 + }, + { + "epoch": 5.762834417932032, + "grad_norm": 1.1340347678222187, + "learning_rate": 3.1215052604879114e-08, + "logits/chosen": -0.8503623008728027, + "logits/rejected": -0.7320210933685303, + "logps/chosen": -0.02072015404701233, + "logps/rejected": -3.383143901824951, + "loss": 0.0354, + "odds_ratio_loss": 0.0017923712730407715, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002072015544399619, + "rewards/margins": 0.3362423777580261, + "rewards/rejected": -0.338314414024353, + "sft_loss": 0.02072015404701233, + "step": 3985 + }, + { + "epoch": 5.764280549530008, + "grad_norm": 1.3273836595656827, + "learning_rate": 3.082898953889845e-08, + "logits/chosen": -0.8875366449356079, + "logits/rejected": -0.5429030060768127, + "logps/chosen": -0.04038511961698532, + "logps/rejected": -6.3761820793151855, + "loss": 0.0474, + "odds_ratio_loss": 0.004100095946341753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004038511775434017, + "rewards/margins": 0.6335797309875488, + "rewards/rejected": -0.6376181840896606, + "sft_loss": 0.04038511961698532, + "step": 3986 + }, + { + "epoch": 5.765726681127982, + "grad_norm": 1.0007986876611032, + "learning_rate": 3.044531950090334e-08, + "logits/chosen": -0.8018543720245361, + "logits/rejected": -0.6697442531585693, + "logps/chosen": -0.05521427094936371, + "logps/rejected": -5.116559982299805, + "loss": 0.0362, + "odds_ratio_loss": 0.0010527849663048983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0055214278399944305, + "rewards/margins": 0.506134569644928, + "rewards/rejected": -0.5116559863090515, + "sft_loss": 0.05521427094936371, + "step": 3987 + }, + { + "epoch": 5.767172812725958, + "grad_norm": 1.3050876636722932, + "learning_rate": 3.006404272220919e-08, + "logits/chosen": -1.0676295757293701, + "logits/rejected": -0.7167788147926331, + "logps/chosen": -0.06772436201572418, + "logps/rejected": -4.485447883605957, + "loss": 0.0367, + "odds_ratio_loss": 0.009542109444737434, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006772437132894993, + "rewards/margins": 0.44177234172821045, + "rewards/rejected": -0.44854480028152466, + "sft_loss": 0.06772436201572418, + "step": 3988 + }, + { + "epoch": 5.768618944323934, + "grad_norm": 1.275730282332951, + "learning_rate": 2.9685159432689012e-08, + "logits/chosen": -0.8527493476867676, + "logits/rejected": -0.6094472408294678, + "logps/chosen": -0.05909667909145355, + "logps/rejected": -7.165646553039551, + "loss": 0.0344, + "odds_ratio_loss": 0.003317814087495208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005909668281674385, + "rewards/margins": 0.7106549739837646, + "rewards/rejected": -0.7165646553039551, + "sft_loss": 0.05909667909145355, + "step": 3989 + }, + { + "epoch": 5.7700650759219085, + "grad_norm": 1.2166176608174204, + "learning_rate": 2.9308669860773848e-08, + "logits/chosen": -0.9953593015670776, + "logits/rejected": -0.6956831216812134, + "logps/chosen": -0.01776827871799469, + "logps/rejected": -6.028247833251953, + "loss": 0.0369, + "odds_ratio_loss": 0.002291513839736581, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017768278485164046, + "rewards/margins": 0.6010479927062988, + "rewards/rejected": -0.6028247475624084, + "sft_loss": 0.01776827871799469, + "step": 3990 + }, + { + "epoch": 5.771511207519884, + "grad_norm": 1.2792370639647088, + "learning_rate": 2.893457423344925e-08, + "logits/chosen": -0.6821169853210449, + "logits/rejected": -0.5734527111053467, + "logps/chosen": -0.019364001229405403, + "logps/rejected": -4.074793815612793, + "loss": 0.0419, + "odds_ratio_loss": 0.0023358322214335203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019364003092050552, + "rewards/margins": 0.4055430293083191, + "rewards/rejected": -0.4074794352054596, + "sft_loss": 0.019364001229405403, + "step": 3991 + }, + { + "epoch": 5.77295733911786, + "grad_norm": 0.9186218043755977, + "learning_rate": 2.8562872776260126e-08, + "logits/chosen": -0.7581536769866943, + "logits/rejected": -0.6958929300308228, + "logps/chosen": -0.025422383099794388, + "logps/rejected": -5.949764728546143, + "loss": 0.0207, + "odds_ratio_loss": 0.0005549125489778817, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0025422382168471813, + "rewards/margins": 0.5924341678619385, + "rewards/rejected": -0.5949764251708984, + "sft_loss": 0.025422383099794388, + "step": 3992 + }, + { + "epoch": 5.774403470715836, + "grad_norm": 0.9030764581302508, + "learning_rate": 2.8193565713306335e-08, + "logits/chosen": -0.9597541093826294, + "logits/rejected": -0.6982543468475342, + "logps/chosen": -0.010779261589050293, + "logps/rejected": -5.799703598022461, + "loss": 0.0222, + "odds_ratio_loss": 0.0005417458014562726, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0010779262520372868, + "rewards/margins": 0.5788924098014832, + "rewards/rejected": -0.5799703598022461, + "sft_loss": 0.010779261589050293, + "step": 3993 + }, + { + "epoch": 5.77584960231381, + "grad_norm": 0.8221212441297421, + "learning_rate": 2.7826653267243984e-08, + "logits/chosen": -0.9700363874435425, + "logits/rejected": -0.885134756565094, + "logps/chosen": -0.01687433198094368, + "logps/rejected": -5.0756330490112305, + "loss": 0.0204, + "odds_ratio_loss": 0.0016347735654562712, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016874329885467887, + "rewards/margins": 0.5058758854866028, + "rewards/rejected": -0.5075633525848389, + "sft_loss": 0.01687433198094368, + "step": 3994 + }, + { + "epoch": 5.777295733911786, + "grad_norm": 1.0148568039942327, + "learning_rate": 2.746213565928679e-08, + "logits/chosen": -0.9796086549758911, + "logits/rejected": -0.5948249697685242, + "logps/chosen": -0.02354905754327774, + "logps/rejected": -5.957253456115723, + "loss": 0.0458, + "odds_ratio_loss": 0.0007033887668512762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0023549057077616453, + "rewards/margins": 0.5933704972267151, + "rewards/rejected": -0.5957253575325012, + "sft_loss": 0.02354905754327774, + "step": 3995 + }, + { + "epoch": 5.778741865509762, + "grad_norm": 1.0693204506045595, + "learning_rate": 2.7100013109202957e-08, + "logits/chosen": -1.2479994297027588, + "logits/rejected": -0.8474553823471069, + "logps/chosen": -0.04476558417081833, + "logps/rejected": -4.169618606567383, + "loss": 0.0323, + "odds_ratio_loss": 0.004479460418224335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004476558417081833, + "rewards/margins": 0.4124853014945984, + "rewards/rejected": -0.4169618785381317, + "sft_loss": 0.04476558417081833, + "step": 3996 + }, + { + "epoch": 5.780187997107737, + "grad_norm": 1.6249830066844961, + "learning_rate": 2.6740285835317844e-08, + "logits/chosen": -0.7211387157440186, + "logits/rejected": -0.5791776776313782, + "logps/chosen": -0.08303683251142502, + "logps/rejected": -6.241660118103027, + "loss": 0.0439, + "odds_ratio_loss": 0.005026006139814854, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008303683251142502, + "rewards/margins": 0.615862250328064, + "rewards/rejected": -0.624165952205658, + "sft_loss": 0.08303683251142502, + "step": 3997 + }, + { + "epoch": 5.781634128705712, + "grad_norm": 1.112640675962451, + "learning_rate": 2.638295405451263e-08, + "logits/chosen": -0.973607063293457, + "logits/rejected": -0.6138404607772827, + "logps/chosen": -0.034950047731399536, + "logps/rejected": -5.843950271606445, + "loss": 0.0291, + "odds_ratio_loss": 0.0012881564907729626, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003495004726573825, + "rewards/margins": 0.5808999538421631, + "rewards/rejected": -0.5843949913978577, + "sft_loss": 0.034950047731399536, + "step": 3998 + }, + { + "epoch": 5.783080260303688, + "grad_norm": 0.8817721981331313, + "learning_rate": 2.602801798222387e-08, + "logits/chosen": -1.0843451023101807, + "logits/rejected": -0.8955305218696594, + "logps/chosen": -0.014239782467484474, + "logps/rejected": -3.2525007724761963, + "loss": 0.0249, + "odds_ratio_loss": 0.0006716042989864945, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014239782467484474, + "rewards/margins": 0.3238261044025421, + "rewards/rejected": -0.3252500891685486, + "sft_loss": 0.014239782467484474, + "step": 3999 + }, + { + "epoch": 5.784526391901663, + "grad_norm": 0.9708590418536193, + "learning_rate": 2.567547783244306e-08, + "logits/chosen": -0.9054265022277832, + "logits/rejected": -0.6644700765609741, + "logps/chosen": -0.03736726939678192, + "logps/rejected": -4.768521785736084, + "loss": 0.0316, + "odds_ratio_loss": 0.0016982683446258307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003736727172508836, + "rewards/margins": 0.4731154441833496, + "rewards/rejected": -0.4768521785736084, + "sft_loss": 0.03736726939678192, + "step": 4000 + }, + { + "epoch": 5.785972523499638, + "grad_norm": 1.6144577125437707, + "learning_rate": 2.5325333817719285e-08, + "logits/chosen": -0.8311997056007385, + "logits/rejected": -0.7566261291503906, + "logps/chosen": -0.05182730779051781, + "logps/rejected": -3.975294828414917, + "loss": 0.0513, + "odds_ratio_loss": 0.00553969107568264, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005182730499655008, + "rewards/margins": 0.3923467695713043, + "rewards/rejected": -0.3975295126438141, + "sft_loss": 0.05182730779051781, + "step": 4001 + }, + { + "epoch": 5.787418655097614, + "grad_norm": 1.0544511861272714, + "learning_rate": 2.4977586149154793e-08, + "logits/chosen": -0.8234452605247498, + "logits/rejected": -0.6371239423751831, + "logps/chosen": -0.055266425013542175, + "logps/rejected": -4.471240520477295, + "loss": 0.0312, + "odds_ratio_loss": 0.002024412387982011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00552664278075099, + "rewards/margins": 0.4415974020957947, + "rewards/rejected": -0.44712403416633606, + "sft_loss": 0.055266425013542175, + "step": 4002 + }, + { + "epoch": 5.788864786695589, + "grad_norm": 0.9448910379395407, + "learning_rate": 2.4632235036408544e-08, + "logits/chosen": -1.094842553138733, + "logits/rejected": -0.8462741374969482, + "logps/chosen": -0.020257651805877686, + "logps/rejected": -5.7198896408081055, + "loss": 0.0359, + "odds_ratio_loss": 0.0031071146950125694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00202576513402164, + "rewards/margins": 0.5699632167816162, + "rewards/rejected": -0.5719889402389526, + "sft_loss": 0.020257651805877686, + "step": 4003 + }, + { + "epoch": 5.790310918293565, + "grad_norm": 1.2029466658124404, + "learning_rate": 2.4289280687693093e-08, + "logits/chosen": -1.007336974143982, + "logits/rejected": -0.8913165330886841, + "logps/chosen": -0.04219206050038338, + "logps/rejected": -3.9805102348327637, + "loss": 0.0373, + "odds_ratio_loss": 0.007664866745471954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004219206050038338, + "rewards/margins": 0.3938317894935608, + "rewards/rejected": -0.39805102348327637, + "sft_loss": 0.04219206050038338, + "step": 4004 + }, + { + "epoch": 5.79175704989154, + "grad_norm": 1.5579647159504868, + "learning_rate": 2.3948723309777706e-08, + "logits/chosen": -0.90506911277771, + "logits/rejected": -0.6676803827285767, + "logps/chosen": -0.04042967036366463, + "logps/rejected": -3.899822950363159, + "loss": 0.0528, + "odds_ratio_loss": 0.0016432093689218163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004042967222630978, + "rewards/margins": 0.3859393000602722, + "rewards/rejected": -0.38998228311538696, + "sft_loss": 0.04042967036366463, + "step": 4005 + }, + { + "epoch": 5.793203181489515, + "grad_norm": 0.9664642233082743, + "learning_rate": 2.361056310798526e-08, + "logits/chosen": -1.0167044401168823, + "logits/rejected": -0.588829517364502, + "logps/chosen": -0.047890324145555496, + "logps/rejected": -5.325345039367676, + "loss": 0.0256, + "odds_ratio_loss": 0.0006860626745037735, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004789032973349094, + "rewards/margins": 0.5277454853057861, + "rewards/rejected": -0.5325345396995544, + "sft_loss": 0.047890324145555496, + "step": 4006 + }, + { + "epoch": 5.794649313087491, + "grad_norm": 1.2513957261398723, + "learning_rate": 2.3274800286193997e-08, + "logits/chosen": -1.0148444175720215, + "logits/rejected": -0.7270374298095703, + "logps/chosen": -0.026285681873559952, + "logps/rejected": -3.6439900398254395, + "loss": 0.0293, + "odds_ratio_loss": 0.004125781357288361, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0026285680942237377, + "rewards/margins": 0.3617704510688782, + "rewards/rejected": -0.3643990159034729, + "sft_loss": 0.026285681873559952, + "step": 4007 + }, + { + "epoch": 5.7960954446854664, + "grad_norm": 1.4118563796416221, + "learning_rate": 2.2941435046836654e-08, + "logits/chosen": -0.9013469219207764, + "logits/rejected": -0.8452996015548706, + "logps/chosen": -0.04646843299269676, + "logps/rejected": -6.354674816131592, + "loss": 0.033, + "odds_ratio_loss": 0.0004940081853419542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0046468437649309635, + "rewards/margins": 0.6308206915855408, + "rewards/rejected": -0.635467529296875, + "sft_loss": 0.04646843299269676, + "step": 4008 + }, + { + "epoch": 5.797541576283442, + "grad_norm": 1.5375611646583789, + "learning_rate": 2.2610467590900463e-08, + "logits/chosen": -1.030297875404358, + "logits/rejected": -0.726286768913269, + "logps/chosen": -0.022617783397436142, + "logps/rejected": -4.68665885925293, + "loss": 0.0272, + "odds_ratio_loss": 0.0016010688850656152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002261778572574258, + "rewards/margins": 0.46640413999557495, + "rewards/rejected": -0.4686659276485443, + "sft_loss": 0.022617783397436142, + "step": 4009 + }, + { + "epoch": 5.798987707881417, + "grad_norm": 0.9161126912343743, + "learning_rate": 2.2281898117926244e-08, + "logits/chosen": -1.0119342803955078, + "logits/rejected": -0.6778802275657654, + "logps/chosen": -0.019781216979026794, + "logps/rejected": -5.734238624572754, + "loss": 0.0274, + "odds_ratio_loss": 0.002621131483465433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019781216979026794, + "rewards/margins": 0.5714457631111145, + "rewards/rejected": -0.5734239220619202, + "sft_loss": 0.019781216979026794, + "step": 4010 + }, + { + "epoch": 5.800433839479393, + "grad_norm": 0.8694353605814207, + "learning_rate": 2.1955726826010655e-08, + "logits/chosen": -0.8888824582099915, + "logits/rejected": -0.6554268002510071, + "logps/chosen": -0.008277302607893944, + "logps/rejected": -4.462595462799072, + "loss": 0.0235, + "odds_ratio_loss": 0.0004954091855324805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0008277302840724587, + "rewards/margins": 0.44543182849884033, + "rewards/rejected": -0.4462595582008362, + "sft_loss": 0.008277302607893944, + "step": 4011 + }, + { + "epoch": 5.801879971077368, + "grad_norm": 1.0813962888466517, + "learning_rate": 2.1631953911803058e-08, + "logits/chosen": -0.9825663566589355, + "logits/rejected": -0.7994692325592041, + "logps/chosen": -0.014045009389519691, + "logps/rejected": -5.245484352111816, + "loss": 0.0358, + "odds_ratio_loss": 0.044613584876060486, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0014045010320842266, + "rewards/margins": 0.5231439471244812, + "rewards/rejected": -0.5245484113693237, + "sft_loss": 0.014045009389519691, + "step": 4012 + }, + { + "epoch": 5.803326102675343, + "grad_norm": 0.9881938059874377, + "learning_rate": 2.131057957050775e-08, + "logits/chosen": -0.9296178221702576, + "logits/rejected": -0.662962794303894, + "logps/chosen": -0.015109667554497719, + "logps/rejected": -5.480372428894043, + "loss": 0.0432, + "odds_ratio_loss": 0.0005065487348474562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0015109669184312224, + "rewards/margins": 0.546526312828064, + "rewards/rejected": -0.5480372905731201, + "sft_loss": 0.015109667554497719, + "step": 4013 + }, + { + "epoch": 5.804772234273319, + "grad_norm": 1.4249186140475822, + "learning_rate": 2.0991603995881736e-08, + "logits/chosen": -1.1877706050872803, + "logits/rejected": -0.7925946116447449, + "logps/chosen": -0.0431302934885025, + "logps/rejected": -6.408783912658691, + "loss": 0.0465, + "odds_ratio_loss": 0.0022244546562433243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00431302934885025, + "rewards/margins": 0.6365653872489929, + "rewards/rejected": -0.6408783793449402, + "sft_loss": 0.0431302934885025, + "step": 4014 + }, + { + "epoch": 5.8062183658712945, + "grad_norm": 1.0239024998010973, + "learning_rate": 2.0675027380237408e-08, + "logits/chosen": -0.9312243461608887, + "logits/rejected": -0.8070486783981323, + "logps/chosen": -0.02381393127143383, + "logps/rejected": -5.620185852050781, + "loss": 0.0412, + "odds_ratio_loss": 0.0018174147699028254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002381392987444997, + "rewards/margins": 0.559637188911438, + "rewards/rejected": -0.562018632888794, + "sft_loss": 0.02381393127143383, + "step": 4015 + }, + { + "epoch": 5.80766449746927, + "grad_norm": 1.1124521948539061, + "learning_rate": 2.0360849914439427e-08, + "logits/chosen": -0.933996319770813, + "logits/rejected": -0.8640280961990356, + "logps/chosen": -0.049488216638565063, + "logps/rejected": -4.251837730407715, + "loss": 0.0513, + "odds_ratio_loss": 0.004922335967421532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004948821850121021, + "rewards/margins": 0.4202350080013275, + "rewards/rejected": -0.42518380284309387, + "sft_loss": 0.049488216638565063, + "step": 4016 + }, + { + "epoch": 5.809110629067245, + "grad_norm": 1.50884766745822, + "learning_rate": 2.0049071787906933e-08, + "logits/chosen": -0.7553597092628479, + "logits/rejected": -0.6329598426818848, + "logps/chosen": -0.020680755376815796, + "logps/rejected": -4.397462368011475, + "loss": 0.037, + "odds_ratio_loss": 0.0012878633569926023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0020680755842477083, + "rewards/margins": 0.43767818808555603, + "rewards/rejected": -0.43974626064300537, + "sft_loss": 0.020680755376815796, + "step": 4017 + }, + { + "epoch": 5.810556760665221, + "grad_norm": 1.254658529361879, + "learning_rate": 1.973969318861224e-08, + "logits/chosen": -0.9069069623947144, + "logits/rejected": -0.8351166248321533, + "logps/chosen": -0.07286441326141357, + "logps/rejected": -4.8769145011901855, + "loss": 0.0591, + "odds_ratio_loss": 0.004483464173972607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00728644197806716, + "rewards/margins": 0.4804050326347351, + "rewards/rejected": -0.4876914620399475, + "sft_loss": 0.07286441326141357, + "step": 4018 + }, + { + "epoch": 5.812002892263196, + "grad_norm": 1.443364183996849, + "learning_rate": 1.9432714303080354e-08, + "logits/chosen": -0.9718085527420044, + "logits/rejected": -0.824963390827179, + "logps/chosen": -0.062426358461380005, + "logps/rejected": -4.95826530456543, + "loss": 0.0591, + "odds_ratio_loss": 0.002314309123903513, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006242636125534773, + "rewards/margins": 0.4895838499069214, + "rewards/rejected": -0.49582648277282715, + "sft_loss": 0.062426358461380005, + "step": 4019 + }, + { + "epoch": 5.813449023861171, + "grad_norm": 1.0809438633806718, + "learning_rate": 1.9128135316390348e-08, + "logits/chosen": -0.9536488652229309, + "logits/rejected": -0.690174400806427, + "logps/chosen": -0.03028007224202156, + "logps/rejected": -4.903285980224609, + "loss": 0.0359, + "odds_ratio_loss": 0.0007200206746347249, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030280069913715124, + "rewards/margins": 0.4873005747795105, + "rewards/rejected": -0.4903286099433899, + "sft_loss": 0.03028007224202156, + "step": 4020 + }, + { + "epoch": 5.814895155459147, + "grad_norm": 1.1851391157157583, + "learning_rate": 1.882595641217355e-08, + "logits/chosen": -1.0572259426116943, + "logits/rejected": -0.785851240158081, + "logps/chosen": -0.01872413232922554, + "logps/rejected": -4.886478900909424, + "loss": 0.0436, + "odds_ratio_loss": 0.0007939153583720326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018724132096394897, + "rewards/margins": 0.4867754578590393, + "rewards/rejected": -0.4886478781700134, + "sft_loss": 0.01872413232922554, + "step": 4021 + }, + { + "epoch": 5.8163412870571225, + "grad_norm": 1.0114293223244128, + "learning_rate": 1.8526177772615336e-08, + "logits/chosen": -0.8728271126747131, + "logits/rejected": -0.6743614077568054, + "logps/chosen": -0.03183284401893616, + "logps/rejected": -4.913818836212158, + "loss": 0.022, + "odds_ratio_loss": 0.0015718166250735521, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003183284541592002, + "rewards/margins": 0.4881986081600189, + "rewards/rejected": -0.4913818836212158, + "sft_loss": 0.03183284401893616, + "step": 4022 + }, + { + "epoch": 5.817787418655097, + "grad_norm": 0.9150372634239352, + "learning_rate": 1.8228799578452914e-08, + "logits/chosen": -0.879639208316803, + "logits/rejected": -0.6301390528678894, + "logps/chosen": -0.015895256772637367, + "logps/rejected": -5.926244258880615, + "loss": 0.0277, + "odds_ratio_loss": 0.0004677917459048331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0015895256074145436, + "rewards/margins": 0.5910348892211914, + "rewards/rejected": -0.5926244258880615, + "sft_loss": 0.015895256772637367, + "step": 4023 + }, + { + "epoch": 5.819233550253073, + "grad_norm": 0.8348971690681323, + "learning_rate": 1.7933822008977527e-08, + "logits/chosen": -0.8647024631500244, + "logits/rejected": -0.7957019805908203, + "logps/chosen": -0.017145434394478798, + "logps/rejected": -4.466237545013428, + "loss": 0.0225, + "odds_ratio_loss": 0.0012718301732093096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017145435558632016, + "rewards/margins": 0.4449092149734497, + "rewards/rejected": -0.4466237723827362, + "sft_loss": 0.017145434394478798, + "step": 4024 + }, + { + "epoch": 5.820679681851049, + "grad_norm": 1.1128379037048841, + "learning_rate": 1.764124524203092e-08, + "logits/chosen": -1.0014090538024902, + "logits/rejected": -0.808147668838501, + "logps/chosen": -0.0898212268948555, + "logps/rejected": -3.219820499420166, + "loss": 0.0551, + "odds_ratio_loss": 0.00917213223874569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008982122875750065, + "rewards/margins": 0.3129999041557312, + "rewards/rejected": -0.3219820261001587, + "sft_loss": 0.0898212268948555, + "step": 4025 + }, + { + "epoch": 5.8221258134490235, + "grad_norm": 1.0459193455299765, + "learning_rate": 1.735106945400977e-08, + "logits/chosen": -0.9716974496841431, + "logits/rejected": -0.6809044480323792, + "logps/chosen": -0.02012844756245613, + "logps/rejected": -5.090946674346924, + "loss": 0.0366, + "odds_ratio_loss": 0.0011630118824541569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0020128446631133556, + "rewards/margins": 0.5070818066596985, + "rewards/rejected": -0.5090946555137634, + "sft_loss": 0.02012844756245613, + "step": 4026 + }, + { + "epoch": 5.823571945046999, + "grad_norm": 1.0703583145731792, + "learning_rate": 1.706329481986213e-08, + "logits/chosen": -0.8691219687461853, + "logits/rejected": -0.5219488739967346, + "logps/chosen": -0.014108894392848015, + "logps/rejected": -5.664173126220703, + "loss": 0.0341, + "odds_ratio_loss": 0.00077531993156299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014108894392848015, + "rewards/margins": 0.56500643491745, + "rewards/rejected": -0.5664172768592834, + "sft_loss": 0.014108894392848015, + "step": 4027 + }, + { + "epoch": 5.825018076644975, + "grad_norm": 0.7928782376077622, + "learning_rate": 1.6777921513087433e-08, + "logits/chosen": -0.8064377307891846, + "logits/rejected": -0.6964023113250732, + "logps/chosen": -0.013883027248084545, + "logps/rejected": -3.529005765914917, + "loss": 0.0165, + "odds_ratio_loss": 0.000492023304104805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013883027713745832, + "rewards/margins": 0.3515123128890991, + "rewards/rejected": -0.35290059447288513, + "sft_loss": 0.013883027248084545, + "step": 4028 + }, + { + "epoch": 5.82646420824295, + "grad_norm": 1.116988376167779, + "learning_rate": 1.6494949705739613e-08, + "logits/chosen": -1.0281099081039429, + "logits/rejected": -0.9229446649551392, + "logps/chosen": -0.012211017310619354, + "logps/rejected": -4.516913414001465, + "loss": 0.0414, + "odds_ratio_loss": 0.0013461960479617119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0012211017310619354, + "rewards/margins": 0.4504702091217041, + "rewards/rejected": -0.45169129967689514, + "sft_loss": 0.012211017310619354, + "step": 4029 + }, + { + "epoch": 5.827910339840925, + "grad_norm": 1.0687469194910033, + "learning_rate": 1.62143795684222e-08, + "logits/chosen": -0.8571509718894958, + "logits/rejected": -0.6989127397537231, + "logps/chosen": -0.03518003597855568, + "logps/rejected": -4.067432403564453, + "loss": 0.0362, + "odds_ratio_loss": 0.0014950365293771029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003518004436045885, + "rewards/margins": 0.40322527289390564, + "rewards/rejected": -0.40674328804016113, + "sft_loss": 0.03518003597855568, + "step": 4030 + }, + { + "epoch": 5.829356471438901, + "grad_norm": 1.1180683825844981, + "learning_rate": 1.5936211270292765e-08, + "logits/chosen": -1.0204957723617554, + "logits/rejected": -0.8662518262863159, + "logps/chosen": -0.060904499143362045, + "logps/rejected": -4.2967681884765625, + "loss": 0.0523, + "odds_ratio_loss": 0.001792628550902009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006090449169278145, + "rewards/margins": 0.423586368560791, + "rewards/rejected": -0.42967677116394043, + "sft_loss": 0.060904499143362045, + "step": 4031 + }, + { + "epoch": 5.830802603036877, + "grad_norm": 1.1415004321422098, + "learning_rate": 1.566044497905983e-08, + "logits/chosen": -0.7944949865341187, + "logits/rejected": -0.6358327269554138, + "logps/chosen": -0.030271006748080254, + "logps/rejected": -5.495286464691162, + "loss": 0.0406, + "odds_ratio_loss": 0.0012499038130044937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003027101047337055, + "rewards/margins": 0.5465015172958374, + "rewards/rejected": -0.5495285987854004, + "sft_loss": 0.030271006748080254, + "step": 4032 + }, + { + "epoch": 5.8322487346348515, + "grad_norm": 0.9578429236167958, + "learning_rate": 1.538708086098417e-08, + "logits/chosen": -0.8756827116012573, + "logits/rejected": -0.5940501689910889, + "logps/chosen": -0.01414337195456028, + "logps/rejected": -4.33171272277832, + "loss": 0.0358, + "odds_ratio_loss": 0.0012647686526179314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001414337195456028, + "rewards/margins": 0.43175697326660156, + "rewards/rejected": -0.4331713020801544, + "sft_loss": 0.01414337195456028, + "step": 4033 + }, + { + "epoch": 5.833694866232827, + "grad_norm": 1.023502337177867, + "learning_rate": 1.511611908087751e-08, + "logits/chosen": -0.9340368509292603, + "logits/rejected": -0.7580222487449646, + "logps/chosen": -0.029262201860547066, + "logps/rejected": -5.634925842285156, + "loss": 0.0434, + "odds_ratio_loss": 0.0013582634273916483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0029262204188853502, + "rewards/margins": 0.5605663061141968, + "rewards/rejected": -0.5634925365447998, + "sft_loss": 0.029262201860547066, + "step": 4034 + }, + { + "epoch": 5.835140997830803, + "grad_norm": 1.1248514719985416, + "learning_rate": 1.4847559802103837e-08, + "logits/chosen": -0.9322491884231567, + "logits/rejected": -0.6265773177146912, + "logps/chosen": -0.0576665922999382, + "logps/rejected": -4.378793239593506, + "loss": 0.0477, + "odds_ratio_loss": 0.0018896459368988872, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005766659043729305, + "rewards/margins": 0.4321126639842987, + "rewards/rejected": -0.4378793239593506, + "sft_loss": 0.0576665922999382, + "step": 4035 + }, + { + "epoch": 5.836587129428778, + "grad_norm": 1.2991973773075023, + "learning_rate": 1.4581403186578523e-08, + "logits/chosen": -0.8614668846130371, + "logits/rejected": -0.6605913043022156, + "logps/chosen": -0.06553688645362854, + "logps/rejected": -4.359433650970459, + "loss": 0.0473, + "odds_ratio_loss": 0.00791736226528883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006553689017891884, + "rewards/margins": 0.42938968539237976, + "rewards/rejected": -0.4359433948993683, + "sft_loss": 0.06553688645362854, + "step": 4036 + }, + { + "epoch": 5.838033261026753, + "grad_norm": 0.9138828175946025, + "learning_rate": 1.4317649394768761e-08, + "logits/chosen": -0.9851160049438477, + "logits/rejected": -0.6789707541465759, + "logps/chosen": -0.009366050362586975, + "logps/rejected": -6.126029014587402, + "loss": 0.0244, + "odds_ratio_loss": 0.0009717661887407303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0009366050362586975, + "rewards/margins": 0.6116663217544556, + "rewards/rejected": -0.6126028895378113, + "sft_loss": 0.009366050362586975, + "step": 4037 + }, + { + "epoch": 5.839479392624729, + "grad_norm": 1.2300109167391127, + "learning_rate": 1.4056298585692238e-08, + "logits/chosen": -1.1383479833602905, + "logits/rejected": -0.7096400856971741, + "logps/chosen": -0.13449586927890778, + "logps/rejected": -5.386882781982422, + "loss": 0.0668, + "odds_ratio_loss": 0.0036082954611629248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013449587859213352, + "rewards/margins": 0.5252387523651123, + "rewards/rejected": -0.5386883616447449, + "sft_loss": 0.13449586927890778, + "step": 4038 + }, + { + "epoch": 5.840925524222705, + "grad_norm": 1.380616416611711, + "learning_rate": 1.3797350916918914e-08, + "logits/chosen": -0.9529186487197876, + "logits/rejected": -0.7264933586120605, + "logps/chosen": -0.02300257608294487, + "logps/rejected": -4.905316352844238, + "loss": 0.0418, + "odds_ratio_loss": 0.0007615200011059642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002300257794559002, + "rewards/margins": 0.4882313907146454, + "rewards/rejected": -0.49053165316581726, + "sft_loss": 0.02300257608294487, + "step": 4039 + }, + { + "epoch": 5.8423716558206795, + "grad_norm": 1.0373658257981904, + "learning_rate": 1.3540806544568794e-08, + "logits/chosen": -0.6172590255737305, + "logits/rejected": -0.42707759141921997, + "logps/chosen": -0.020513903349637985, + "logps/rejected": -5.75158166885376, + "loss": 0.0309, + "odds_ratio_loss": 0.001809641718864441, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002051390241831541, + "rewards/margins": 0.5731067657470703, + "rewards/rejected": -0.5751581788063049, + "sft_loss": 0.020513903349637985, + "step": 4040 + }, + { + "epoch": 5.843817787418655, + "grad_norm": 1.0530917340846524, + "learning_rate": 1.3286665623313264e-08, + "logits/chosen": -0.6495931148529053, + "logits/rejected": -0.6027662754058838, + "logps/chosen": -0.018377287313342094, + "logps/rejected": -4.457110404968262, + "loss": 0.0297, + "odds_ratio_loss": 0.0023091405164450407, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00183772889431566, + "rewards/margins": 0.4438733458518982, + "rewards/rejected": -0.44571104645729065, + "sft_loss": 0.018377287313342094, + "step": 4041 + }, + { + "epoch": 5.845263919016631, + "grad_norm": 0.8971616920905066, + "learning_rate": 1.3034928306375537e-08, + "logits/chosen": -0.9880181550979614, + "logits/rejected": -0.7460534572601318, + "logps/chosen": -0.007409685291349888, + "logps/rejected": -6.935779571533203, + "loss": 0.0375, + "odds_ratio_loss": 0.0009576200391165912, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0007409685058519244, + "rewards/margins": 0.6928369998931885, + "rewards/rejected": -0.6935780048370361, + "sft_loss": 0.007409685291349888, + "step": 4042 + }, + { + "epoch": 5.846710050614606, + "grad_norm": 1.1496577963793577, + "learning_rate": 1.2785594745528427e-08, + "logits/chosen": -0.7424166798591614, + "logits/rejected": -0.7751407623291016, + "logps/chosen": -0.08187150955200195, + "logps/rejected": -4.955060958862305, + "loss": 0.0482, + "odds_ratio_loss": 0.004987210966646671, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008187152445316315, + "rewards/margins": 0.4873189330101013, + "rewards/rejected": -0.4955061078071594, + "sft_loss": 0.08187150955200195, + "step": 4043 + }, + { + "epoch": 5.848156182212581, + "grad_norm": 1.1586948812349995, + "learning_rate": 1.2538665091096135e-08, + "logits/chosen": -1.0148038864135742, + "logits/rejected": -0.7616249322891235, + "logps/chosen": -0.051184628158807755, + "logps/rejected": -4.20012903213501, + "loss": 0.0561, + "odds_ratio_loss": 0.0011503919959068298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005118463188409805, + "rewards/margins": 0.4148944616317749, + "rewards/rejected": -0.420012891292572, + "sft_loss": 0.051184628158807755, + "step": 4044 + }, + { + "epoch": 5.849602313810557, + "grad_norm": 1.000916813030637, + "learning_rate": 1.2294139491953348e-08, + "logits/chosen": -1.1537386178970337, + "logits/rejected": -0.8139678835868835, + "logps/chosen": -0.07197534292936325, + "logps/rejected": -5.211237907409668, + "loss": 0.0403, + "odds_ratio_loss": 0.0006955383578315377, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00719753373414278, + "rewards/margins": 0.5139262080192566, + "rewards/rejected": -0.5211237668991089, + "sft_loss": 0.07197534292936325, + "step": 4045 + }, + { + "epoch": 5.851048445408532, + "grad_norm": 0.7275097227346992, + "learning_rate": 1.205201809552614e-08, + "logits/chosen": -0.868467390537262, + "logits/rejected": -0.738372802734375, + "logps/chosen": -0.008017164655029774, + "logps/rejected": -5.244393825531006, + "loss": 0.0164, + "odds_ratio_loss": 0.0011052724439650774, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0008017165237106383, + "rewards/margins": 0.5236376523971558, + "rewards/rejected": -0.5244393348693848, + "sft_loss": 0.008017164655029774, + "step": 4046 + }, + { + "epoch": 5.8524945770065075, + "grad_norm": 1.3911350884327334, + "learning_rate": 1.1812301047789741e-08, + "logits/chosen": -0.9431469440460205, + "logits/rejected": -0.5744068622589111, + "logps/chosen": -0.028782209381461143, + "logps/rejected": -5.3098039627075195, + "loss": 0.0439, + "odds_ratio_loss": 0.0012701970990747213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002878220984712243, + "rewards/margins": 0.5281022191047668, + "rewards/rejected": -0.5309804677963257, + "sft_loss": 0.028782209381461143, + "step": 4047 + }, + { + "epoch": 5.853940708604483, + "grad_norm": 1.797128317747924, + "learning_rate": 1.157498849327032e-08, + "logits/chosen": -0.7650055289268494, + "logits/rejected": -0.6522244811058044, + "logps/chosen": -0.019537970423698425, + "logps/rejected": -4.800078392028809, + "loss": 0.038, + "odds_ratio_loss": 0.0024411689955741167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001953796949237585, + "rewards/margins": 0.478054016828537, + "rewards/rejected": -0.4800078272819519, + "sft_loss": 0.019537970423698425, + "step": 4048 + }, + { + "epoch": 5.855386840202458, + "grad_norm": 1.0121678029353458, + "learning_rate": 1.134008057504543e-08, + "logits/chosen": -1.054601788520813, + "logits/rejected": -0.7696323990821838, + "logps/chosen": -0.024660352617502213, + "logps/rejected": -4.637537479400635, + "loss": 0.0491, + "odds_ratio_loss": 0.0005984175368212163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0024660350754857063, + "rewards/margins": 0.4612876772880554, + "rewards/rejected": -0.46375373005867004, + "sft_loss": 0.024660352617502213, + "step": 4049 + }, + { + "epoch": 5.856832971800434, + "grad_norm": 0.9120344722940604, + "learning_rate": 1.110757743474089e-08, + "logits/chosen": -0.880794882774353, + "logits/rejected": -0.620921790599823, + "logps/chosen": -0.027760500088334084, + "logps/rejected": -4.301288604736328, + "loss": 0.0361, + "odds_ratio_loss": 0.04534320533275604, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.002776050241664052, + "rewards/margins": 0.4273528456687927, + "rewards/rejected": -0.43012890219688416, + "sft_loss": 0.027760500088334084, + "step": 4050 + }, + { + "epoch": 5.858279103398409, + "grad_norm": 0.9597825250130472, + "learning_rate": 1.0877479212534347e-08, + "logits/chosen": -0.7906923294067383, + "logits/rejected": -0.7207321524620056, + "logps/chosen": -0.026014819741249084, + "logps/rejected": -4.051596164703369, + "loss": 0.0285, + "odds_ratio_loss": 0.045702412724494934, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0026014824397861958, + "rewards/margins": 0.4025581479072571, + "rewards/rejected": -0.405159592628479, + "sft_loss": 0.026014819741249084, + "step": 4051 + }, + { + "epoch": 5.859725234996384, + "grad_norm": 1.0139612740090866, + "learning_rate": 1.0649786047152164e-08, + "logits/chosen": -0.8806436657905579, + "logits/rejected": -0.8103398084640503, + "logps/chosen": -0.009493944235146046, + "logps/rejected": -6.130977153778076, + "loss": 0.0262, + "odds_ratio_loss": 0.00031167615088634193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0009493945399299264, + "rewards/margins": 0.6121483445167542, + "rewards/rejected": -0.6130977869033813, + "sft_loss": 0.009493944235146046, + "step": 4052 + }, + { + "epoch": 5.86117136659436, + "grad_norm": 1.1549340742035668, + "learning_rate": 1.0424498075872534e-08, + "logits/chosen": -1.0818321704864502, + "logits/rejected": -0.6798063516616821, + "logps/chosen": -0.02330070734024048, + "logps/rejected": -6.7499237060546875, + "loss": 0.0461, + "odds_ratio_loss": 0.00016744097229093313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0023300708271563053, + "rewards/margins": 0.6726623773574829, + "rewards/rejected": -0.6749923825263977, + "sft_loss": 0.02330070734024048, + "step": 4053 + }, + { + "epoch": 5.862617498192336, + "grad_norm": 1.1474299609090526, + "learning_rate": 1.020161543452147e-08, + "logits/chosen": -0.9909787178039551, + "logits/rejected": -0.7948153018951416, + "logps/chosen": -0.06000065430998802, + "logps/rejected": -5.568455219268799, + "loss": 0.0374, + "odds_ratio_loss": 0.0031150178983807564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006000065244734287, + "rewards/margins": 0.5508455038070679, + "rewards/rejected": -0.5568455457687378, + "sft_loss": 0.06000065430998802, + "step": 4054 + }, + { + "epoch": 5.864063629790311, + "grad_norm": 1.2769821049762768, + "learning_rate": 9.98113825747593e-09, + "logits/chosen": -0.5809721946716309, + "logits/rejected": -0.4099165201187134, + "logps/chosen": -0.05332209914922714, + "logps/rejected": -5.088987827301025, + "loss": 0.0353, + "odds_ratio_loss": 0.0025865475181490183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005332210101187229, + "rewards/margins": 0.503566563129425, + "rewards/rejected": -0.5088987946510315, + "sft_loss": 0.05332209914922714, + "step": 4055 + }, + { + "epoch": 5.865509761388286, + "grad_norm": 0.9667710277502991, + "learning_rate": 9.763066677662912e-09, + "logits/chosen": -0.8778347969055176, + "logits/rejected": -0.7671928405761719, + "logps/chosen": -0.01711271144449711, + "logps/rejected": -4.43693733215332, + "loss": 0.0314, + "odds_ratio_loss": 0.0010325959883630276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017112712375819683, + "rewards/margins": 0.4419825077056885, + "rewards/rejected": -0.44369375705718994, + "sft_loss": 0.01711271144449711, + "step": 4056 + }, + { + "epoch": 5.866955892986262, + "grad_norm": 1.2189116844595043, + "learning_rate": 9.547400826557694e-09, + "logits/chosen": -0.8398338556289673, + "logits/rejected": -0.68487548828125, + "logps/chosen": -0.06597983837127686, + "logps/rejected": -7.124354839324951, + "loss": 0.0583, + "odds_ratio_loss": 0.005012186244130135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006597983185201883, + "rewards/margins": 0.7058374881744385, + "rewards/rejected": -0.7124354839324951, + "sft_loss": 0.06597983837127686, + "step": 4057 + }, + { + "epoch": 5.868402024584237, + "grad_norm": 0.8571045099648362, + "learning_rate": 9.334140834186933e-09, + "logits/chosen": -0.7491556406021118, + "logits/rejected": -0.6324608325958252, + "logps/chosen": -0.020480100065469742, + "logps/rejected": -6.475594997406006, + "loss": 0.0228, + "odds_ratio_loss": 0.0007072009611874819, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002048010006546974, + "rewards/margins": 0.6455115079879761, + "rewards/rejected": -0.6475595235824585, + "sft_loss": 0.020480100065469742, + "step": 4058 + }, + { + "epoch": 5.869848156182212, + "grad_norm": 1.1139143128225042, + "learning_rate": 9.123286829125554e-09, + "logits/chosen": -0.9277915954589844, + "logits/rejected": -0.49561363458633423, + "logps/chosen": -0.05119159817695618, + "logps/rejected": -5.136085510253906, + "loss": 0.0391, + "odds_ratio_loss": 0.0015518446452915668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005119160283356905, + "rewards/margins": 0.5084893703460693, + "rewards/rejected": -0.5136085152626038, + "sft_loss": 0.05119159817695618, + "step": 4059 + }, + { + "epoch": 5.871294287780188, + "grad_norm": 0.9491723434647289, + "learning_rate": 8.914838938498093e-09, + "logits/chosen": -0.7413885593414307, + "logits/rejected": -0.6934369802474976, + "logps/chosen": -0.01211594045162201, + "logps/rejected": -4.734433174133301, + "loss": 0.0284, + "odds_ratio_loss": 0.0015892100054770708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0012115939753130078, + "rewards/margins": 0.4722316861152649, + "rewards/rejected": -0.47344329953193665, + "sft_loss": 0.01211594045162201, + "step": 4060 + }, + { + "epoch": 5.872740419378164, + "grad_norm": 1.237417223112888, + "learning_rate": 8.708797287978687e-09, + "logits/chosen": -0.7332858443260193, + "logits/rejected": -0.5800739526748657, + "logps/chosen": -0.04828640818595886, + "logps/rejected": -3.8269550800323486, + "loss": 0.0499, + "odds_ratio_loss": 0.0024078148417174816, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004828641191124916, + "rewards/margins": 0.37786686420440674, + "rewards/rejected": -0.3826954960823059, + "sft_loss": 0.04828640818595886, + "step": 4061 + }, + { + "epoch": 5.874186550976139, + "grad_norm": 0.9927827938054246, + "learning_rate": 8.505162001790189e-09, + "logits/chosen": -0.6603043079376221, + "logits/rejected": -0.5567718744277954, + "logps/chosen": -0.03948831930756569, + "logps/rejected": -6.371557235717773, + "loss": 0.0292, + "odds_ratio_loss": 0.002562589943408966, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003948831930756569, + "rewards/margins": 0.6332069039344788, + "rewards/rejected": -0.6371557712554932, + "sft_loss": 0.03948831930756569, + "step": 4062 + }, + { + "epoch": 5.875632682574114, + "grad_norm": 1.145853515466299, + "learning_rate": 8.303933202705949e-09, + "logits/chosen": -0.8784758448600769, + "logits/rejected": -0.6068578958511353, + "logps/chosen": -0.06510598957538605, + "logps/rejected": -4.118241310119629, + "loss": 0.0385, + "odds_ratio_loss": 0.004363437183201313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0065105995163321495, + "rewards/margins": 0.40531349182128906, + "rewards/rejected": -0.411824107170105, + "sft_loss": 0.06510598957538605, + "step": 4063 + }, + { + "epoch": 5.87707881417209, + "grad_norm": 1.0442896336680918, + "learning_rate": 8.105111012046696e-09, + "logits/chosen": -0.8733867406845093, + "logits/rejected": -0.4977574944496155, + "logps/chosen": -0.039922237396240234, + "logps/rejected": -5.064108848571777, + "loss": 0.0383, + "odds_ratio_loss": 0.001056623412296176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003992223646491766, + "rewards/margins": 0.5024186372756958, + "rewards/rejected": -0.5064108967781067, + "sft_loss": 0.039922237396240234, + "step": 4064 + }, + { + "epoch": 5.8785249457700655, + "grad_norm": 0.9958511906403898, + "learning_rate": 7.908695549683653e-09, + "logits/chosen": -0.8687781095504761, + "logits/rejected": -0.6260200142860413, + "logps/chosen": -0.04101001098752022, + "logps/rejected": -7.215920448303223, + "loss": 0.0274, + "odds_ratio_loss": 0.0009698215289972723, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0041010016575455666, + "rewards/margins": 0.7174910306930542, + "rewards/rejected": -0.7215920686721802, + "sft_loss": 0.04101001098752022, + "step": 4065 + }, + { + "epoch": 5.87997107736804, + "grad_norm": 1.1871995791926684, + "learning_rate": 7.714686934035874e-09, + "logits/chosen": -0.7915902137756348, + "logits/rejected": -0.627166211605072, + "logps/chosen": -0.05223681777715683, + "logps/rejected": -6.953492164611816, + "loss": 0.0521, + "odds_ratio_loss": 0.008496535010635853, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005223682150244713, + "rewards/margins": 0.6901255249977112, + "rewards/rejected": -0.6953492164611816, + "sft_loss": 0.05223681777715683, + "step": 4066 + }, + { + "epoch": 5.881417208966016, + "grad_norm": 1.109079649068121, + "learning_rate": 7.523085282072461e-09, + "logits/chosen": -0.9851240515708923, + "logits/rejected": -0.7789430618286133, + "logps/chosen": -0.03919222205877304, + "logps/rejected": -5.137519836425781, + "loss": 0.0302, + "odds_ratio_loss": 0.001989867305383086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003919222392141819, + "rewards/margins": 0.5098327398300171, + "rewards/rejected": -0.5137519836425781, + "sft_loss": 0.03919222205877304, + "step": 4067 + }, + { + "epoch": 5.882863340563992, + "grad_norm": 0.9660218011534741, + "learning_rate": 7.333890709310342e-09, + "logits/chosen": -0.8588817715644836, + "logits/rejected": -0.6250803470611572, + "logps/chosen": -0.011442586779594421, + "logps/rejected": -7.525029182434082, + "loss": 0.0248, + "odds_ratio_loss": 0.0010403214255347848, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0011442586546763778, + "rewards/margins": 0.7513586282730103, + "rewards/rejected": -0.7525028586387634, + "sft_loss": 0.011442586779594421, + "step": 4068 + }, + { + "epoch": 5.884309472161966, + "grad_norm": 0.8837517132609889, + "learning_rate": 7.147103329816051e-09, + "logits/chosen": -0.8711720705032349, + "logits/rejected": -0.5237680673599243, + "logps/chosen": -0.04525275528430939, + "logps/rejected": -6.475998878479004, + "loss": 0.0377, + "odds_ratio_loss": 0.003399358130991459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004525275435298681, + "rewards/margins": 0.643074631690979, + "rewards/rejected": -0.6475998759269714, + "sft_loss": 0.04525275528430939, + "step": 4069 + }, + { + "epoch": 5.885755603759942, + "grad_norm": 1.288411564629479, + "learning_rate": 6.962723256203951e-09, + "logits/chosen": -0.7793477773666382, + "logits/rejected": -0.7089977264404297, + "logps/chosen": -0.02952570654451847, + "logps/rejected": -6.359105110168457, + "loss": 0.0389, + "odds_ratio_loss": 0.0031170076690614223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002952571026980877, + "rewards/margins": 0.6329580545425415, + "rewards/rejected": -0.6359105706214905, + "sft_loss": 0.02952570654451847, + "step": 4070 + }, + { + "epoch": 5.887201735357918, + "grad_norm": 1.036569757462945, + "learning_rate": 6.780750599637564e-09, + "logits/chosen": -0.9587365984916687, + "logits/rejected": -0.8759207129478455, + "logps/chosen": -0.017817378044128418, + "logps/rejected": -5.159303188323975, + "loss": 0.0256, + "odds_ratio_loss": 0.0029128207825124264, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001781738013960421, + "rewards/margins": 0.5141485929489136, + "rewards/rejected": -0.5159302949905396, + "sft_loss": 0.017817378044128418, + "step": 4071 + }, + { + "epoch": 5.888647866955893, + "grad_norm": 1.0826197981867185, + "learning_rate": 6.601185469829129e-09, + "logits/chosen": -0.9221165776252747, + "logits/rejected": -0.646438479423523, + "logps/chosen": -0.013732078485190868, + "logps/rejected": -5.320560932159424, + "loss": 0.0397, + "odds_ratio_loss": 0.0002973644877783954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0013732078950852156, + "rewards/margins": 0.5306828618049622, + "rewards/rejected": -0.5320560932159424, + "sft_loss": 0.013732078485190868, + "step": 4072 + }, + { + "epoch": 5.890093998553868, + "grad_norm": 1.0761047766337628, + "learning_rate": 6.424027975038715e-09, + "logits/chosen": -0.9772175550460815, + "logits/rejected": -0.7124834656715393, + "logps/chosen": -0.06356082856655121, + "logps/rejected": -5.914202690124512, + "loss": 0.0368, + "odds_ratio_loss": 0.0028968786355108023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006356082856655121, + "rewards/margins": 0.585064172744751, + "rewards/rejected": -0.5914202928543091, + "sft_loss": 0.06356082856655121, + "step": 4073 + }, + { + "epoch": 5.891540130151844, + "grad_norm": 0.981644402128146, + "learning_rate": 6.2492782220759935e-09, + "logits/chosen": -0.8365199565887451, + "logits/rejected": -0.5724873542785645, + "logps/chosen": -0.029665423557162285, + "logps/rejected": -6.783669471740723, + "loss": 0.0422, + "odds_ratio_loss": 0.00043808904592879117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0029665424954146147, + "rewards/margins": 0.6754004955291748, + "rewards/rejected": -0.678367018699646, + "sft_loss": 0.029665423557162285, + "step": 4074 + }, + { + "epoch": 5.892986261749819, + "grad_norm": 1.2626934450956793, + "learning_rate": 6.076936316297132e-09, + "logits/chosen": -0.7279162406921387, + "logits/rejected": -0.46087944507598877, + "logps/chosen": -0.05320172384381294, + "logps/rejected": -5.229532718658447, + "loss": 0.0402, + "odds_ratio_loss": 0.00094027747400105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005320172291249037, + "rewards/margins": 0.5176330804824829, + "rewards/rejected": -0.5229532718658447, + "sft_loss": 0.05320172384381294, + "step": 4075 + }, + { + "epoch": 5.8944323933477945, + "grad_norm": 0.8948413475516522, + "learning_rate": 5.907002361608793e-09, + "logits/chosen": -0.8324787616729736, + "logits/rejected": -0.8268545866012573, + "logps/chosen": -0.026477977633476257, + "logps/rejected": -4.179195404052734, + "loss": 0.028, + "odds_ratio_loss": 0.0011238600127398968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002647798042744398, + "rewards/margins": 0.4152717590332031, + "rewards/rejected": -0.4179195463657379, + "sft_loss": 0.026477977633476257, + "step": 4076 + }, + { + "epoch": 5.89587852494577, + "grad_norm": 1.1699254325701098, + "learning_rate": 5.739476460464132e-09, + "logits/chosen": -1.1227381229400635, + "logits/rejected": -0.7359456419944763, + "logps/chosen": -0.016686441376805305, + "logps/rejected": -5.694275856018066, + "loss": 0.0226, + "odds_ratio_loss": 0.0007282921578735113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016686442540958524, + "rewards/margins": 0.5677589774131775, + "rewards/rejected": -0.5694276094436646, + "sft_loss": 0.016686441376805305, + "step": 4077 + }, + { + "epoch": 5.897324656543746, + "grad_norm": 1.093944591158634, + "learning_rate": 5.574358713865468e-09, + "logits/chosen": -0.9437555074691772, + "logits/rejected": -0.7433983683586121, + "logps/chosen": -0.11696302890777588, + "logps/rejected": -4.217023849487305, + "loss": 0.0546, + "odds_ratio_loss": 0.007626968435943127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011696303263306618, + "rewards/margins": 0.4100060760974884, + "rewards/rejected": -0.42170241475105286, + "sft_loss": 0.11696302890777588, + "step": 4078 + }, + { + "epoch": 5.898770788141721, + "grad_norm": 0.9801406350864609, + "learning_rate": 5.411649221362502e-09, + "logits/chosen": -0.710649311542511, + "logits/rejected": -0.6215068697929382, + "logps/chosen": -0.02466878667473793, + "logps/rejected": -4.457521915435791, + "loss": 0.0285, + "odds_ratio_loss": 0.001121659530326724, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0024668786209076643, + "rewards/margins": 0.44328534603118896, + "rewards/rejected": -0.44575223326683044, + "sft_loss": 0.02466878667473793, + "step": 4079 + }, + { + "epoch": 5.900216919739696, + "grad_norm": 0.8747769046834969, + "learning_rate": 5.251348081054097e-09, + "logits/chosen": -0.8850862979888916, + "logits/rejected": -0.5741367936134338, + "logps/chosen": -0.023076584562659264, + "logps/rejected": -6.402840614318848, + "loss": 0.0327, + "odds_ratio_loss": 0.0006071379175409675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002307658549398184, + "rewards/margins": 0.6379764080047607, + "rewards/rejected": -0.6402841210365295, + "sft_loss": 0.023076584562659264, + "step": 4080 + }, + { + "epoch": 5.901663051337672, + "grad_norm": 0.9270878906304059, + "learning_rate": 5.09345538958561e-09, + "logits/chosen": -0.8760882019996643, + "logits/rejected": -0.5293622016906738, + "logps/chosen": -0.03015088476240635, + "logps/rejected": -6.523216247558594, + "loss": 0.0332, + "odds_ratio_loss": 0.0021572900004684925, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030150888487696648, + "rewards/margins": 0.6493065357208252, + "rewards/rejected": -0.6523215770721436, + "sft_loss": 0.03015088476240635, + "step": 4081 + }, + { + "epoch": 5.903109182935647, + "grad_norm": 0.9858558759772296, + "learning_rate": 4.9379712421515615e-09, + "logits/chosen": -1.0308140516281128, + "logits/rejected": -0.6831567287445068, + "logps/chosen": -0.025113025680184364, + "logps/rejected": -6.239609718322754, + "loss": 0.0274, + "odds_ratio_loss": 0.0007894702721387148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002511302474886179, + "rewards/margins": 0.6214496493339539, + "rewards/rejected": -0.6239609718322754, + "sft_loss": 0.025113025680184364, + "step": 4082 + }, + { + "epoch": 5.9045553145336225, + "grad_norm": 1.236514338792531, + "learning_rate": 4.784895732493854e-09, + "logits/chosen": -0.9402459859848022, + "logits/rejected": -0.7553094029426575, + "logps/chosen": -0.01938311569392681, + "logps/rejected": -5.070533275604248, + "loss": 0.0356, + "odds_ratio_loss": 0.0006489180377684534, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019383116159588099, + "rewards/margins": 0.5051149725914001, + "rewards/rejected": -0.5070533156394958, + "sft_loss": 0.01938311569392681, + "step": 4083 + }, + { + "epoch": 5.906001446131598, + "grad_norm": 0.8111350183711371, + "learning_rate": 4.634228952902219e-09, + "logits/chosen": -0.8608274459838867, + "logits/rejected": -0.7472224235534668, + "logps/chosen": -0.023615194484591484, + "logps/rejected": -4.508618354797363, + "loss": 0.0267, + "odds_ratio_loss": 0.003320841584354639, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002361519727855921, + "rewards/margins": 0.4485003352165222, + "rewards/rejected": -0.4508618712425232, + "sft_loss": 0.023615194484591484, + "step": 4084 + }, + { + "epoch": 5.907447577729574, + "grad_norm": 1.0202535691554027, + "learning_rate": 4.485970994214661e-09, + "logits/chosen": -0.8583634495735168, + "logits/rejected": -0.5765115022659302, + "logps/chosen": -0.04926097393035889, + "logps/rejected": -4.539674758911133, + "loss": 0.0491, + "odds_ratio_loss": 0.0009557848679833114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004926097579300404, + "rewards/margins": 0.4490413963794708, + "rewards/rejected": -0.45396748185157776, + "sft_loss": 0.04926097393035889, + "step": 4085 + }, + { + "epoch": 5.908893709327549, + "grad_norm": 1.632546593521506, + "learning_rate": 4.340121945815678e-09, + "logits/chosen": -0.7539007067680359, + "logits/rejected": -0.818702220916748, + "logps/chosen": -0.051897093653678894, + "logps/rejected": -3.8027875423431396, + "loss": 0.0464, + "odds_ratio_loss": 0.006976235192269087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005189709831029177, + "rewards/margins": 0.37508904933929443, + "rewards/rejected": -0.3802787661552429, + "sft_loss": 0.051897093653678894, + "step": 4086 + }, + { + "epoch": 5.910339840925524, + "grad_norm": 0.996259047615235, + "learning_rate": 4.196681895638487e-09, + "logits/chosen": -1.0506974458694458, + "logits/rejected": -0.7812222242355347, + "logps/chosen": -0.023906368762254715, + "logps/rejected": -6.1207275390625, + "loss": 0.0326, + "odds_ratio_loss": 0.002137015340849757, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0023906370624899864, + "rewards/margins": 0.6096821427345276, + "rewards/rejected": -0.6120727062225342, + "sft_loss": 0.023906368762254715, + "step": 4087 + }, + { + "epoch": 5.9117859725235, + "grad_norm": 1.012901522073042, + "learning_rate": 4.055650930164134e-09, + "logits/chosen": -0.6814955472946167, + "logits/rejected": -0.5374518036842346, + "logps/chosen": -0.019741419702768326, + "logps/rejected": -4.390100955963135, + "loss": 0.0445, + "odds_ratio_loss": 0.0011601306032389402, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0019741421565413475, + "rewards/margins": 0.4370359778404236, + "rewards/rejected": -0.4390101432800293, + "sft_loss": 0.019741419702768326, + "step": 4088 + }, + { + "epoch": 5.913232104121475, + "grad_norm": 1.1705865509991935, + "learning_rate": 3.917029134420158e-09, + "logits/chosen": -0.8490438461303711, + "logits/rejected": -0.6280502080917358, + "logps/chosen": -0.014538794755935669, + "logps/rejected": -5.418238162994385, + "loss": 0.0617, + "odds_ratio_loss": 0.0008393567986786366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014538795221596956, + "rewards/margins": 0.5403699278831482, + "rewards/rejected": -0.5418238043785095, + "sft_loss": 0.014538794755935669, + "step": 4089 + }, + { + "epoch": 5.9146782357194505, + "grad_norm": 0.8268715863355088, + "learning_rate": 3.780816591981928e-09, + "logits/chosen": -0.925495982170105, + "logits/rejected": -0.7166245579719543, + "logps/chosen": -0.02626338042318821, + "logps/rejected": -5.465725898742676, + "loss": 0.0239, + "odds_ratio_loss": 0.000887808040715754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0026263382751494646, + "rewards/margins": 0.5439462661743164, + "rewards/rejected": -0.5465726852416992, + "sft_loss": 0.02626338042318821, + "step": 4090 + }, + { + "epoch": 5.916124367317426, + "grad_norm": 1.104740571942427, + "learning_rate": 3.6470133849735297e-09, + "logits/chosen": -0.9207170009613037, + "logits/rejected": -0.827392041683197, + "logps/chosen": -0.04282090440392494, + "logps/rejected": -4.764659881591797, + "loss": 0.0384, + "odds_ratio_loss": 0.0029545726720243692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004282090347260237, + "rewards/margins": 0.47218388319015503, + "rewards/rejected": -0.47646600008010864, + "sft_loss": 0.04282090440392494, + "step": 4091 + }, + { + "epoch": 5.917570498915401, + "grad_norm": 0.9063259918507958, + "learning_rate": 3.515619594064212e-09, + "logits/chosen": -1.0181001424789429, + "logits/rejected": -0.8038081526756287, + "logps/chosen": -0.03374544531106949, + "logps/rejected": -4.169094085693359, + "loss": 0.0323, + "odds_ratio_loss": 0.0018833805806934834, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003374544670805335, + "rewards/margins": 0.41353487968444824, + "rewards/rejected": -0.41690942645072937, + "sft_loss": 0.03374544531106949, + "step": 4092 + }, + { + "epoch": 5.919016630513377, + "grad_norm": 1.0335977859705563, + "learning_rate": 3.3866352984728285e-09, + "logits/chosen": -1.0784063339233398, + "logits/rejected": -0.7443411350250244, + "logps/chosen": -0.03114999271929264, + "logps/rejected": -4.165608882904053, + "loss": 0.0326, + "odds_ratio_loss": 0.0012884940952062607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003114999271929264, + "rewards/margins": 0.4134458899497986, + "rewards/rejected": -0.4165608882904053, + "sft_loss": 0.03114999271929264, + "step": 4093 + }, + { + "epoch": 5.920462762111352, + "grad_norm": 0.8634076399115647, + "learning_rate": 3.260060575963841e-09, + "logits/chosen": -0.8990287780761719, + "logits/rejected": -0.723576545715332, + "logps/chosen": -0.055409032851457596, + "logps/rejected": -4.412961959838867, + "loss": 0.0307, + "odds_ratio_loss": 0.005224099848419428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005540903657674789, + "rewards/margins": 0.43575531244277954, + "rewards/rejected": -0.44129621982574463, + "sft_loss": 0.055409032851457596, + "step": 4094 + }, + { + "epoch": 5.921908893709327, + "grad_norm": 1.0221510601388344, + "learning_rate": 3.1358955028495393e-09, + "logits/chosen": -0.9376885890960693, + "logits/rejected": -0.6291782855987549, + "logps/chosen": -0.016738008707761765, + "logps/rejected": -5.539161682128906, + "loss": 0.0392, + "odds_ratio_loss": 0.0006912790704518557, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016738008707761765, + "rewards/margins": 0.5522423982620239, + "rewards/rejected": -0.5539162158966064, + "sft_loss": 0.016738008707761765, + "step": 4095 + }, + { + "epoch": 5.923355025307303, + "grad_norm": 1.1064587046513101, + "learning_rate": 3.0141401539900415e-09, + "logits/chosen": -0.6891365647315979, + "logits/rejected": -0.5489487648010254, + "logps/chosen": -0.09406330436468124, + "logps/rejected": -4.19540548324585, + "loss": 0.0651, + "odds_ratio_loss": 0.0437358096241951, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.00940632913261652, + "rewards/margins": 0.4101342260837555, + "rewards/rejected": -0.4195405840873718, + "sft_loss": 0.09406330436468124, + "step": 4096 + }, + { + "epoch": 5.9248011569052785, + "grad_norm": 1.3920590957005463, + "learning_rate": 2.894794602791517e-09, + "logits/chosen": -1.16707181930542, + "logits/rejected": -0.9460763931274414, + "logps/chosen": -0.023493018001317978, + "logps/rejected": -5.3490495681762695, + "loss": 0.072, + "odds_ratio_loss": 0.001594938919879496, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002349301939830184, + "rewards/margins": 0.5325556993484497, + "rewards/rejected": -0.534904956817627, + "sft_loss": 0.023493018001317978, + "step": 4097 + }, + { + "epoch": 5.926247288503253, + "grad_norm": 1.091799372443769, + "learning_rate": 2.777858921208409e-09, + "logits/chosen": -0.741195559501648, + "logits/rejected": -0.6491609811782837, + "logps/chosen": -0.02929551713168621, + "logps/rejected": -3.8806638717651367, + "loss": 0.0334, + "odds_ratio_loss": 0.0010335511760786176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0029295519925653934, + "rewards/margins": 0.38513684272766113, + "rewards/rejected": -0.3880664110183716, + "sft_loss": 0.02929551713168621, + "step": 4098 + }, + { + "epoch": 5.927693420101229, + "grad_norm": 1.2381780680971113, + "learning_rate": 2.663333179741212e-09, + "logits/chosen": -0.9526777267456055, + "logits/rejected": -0.7601011395454407, + "logps/chosen": -0.05737978592514992, + "logps/rejected": -4.47212553024292, + "loss": 0.0468, + "odds_ratio_loss": 0.0037496332079172134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005737978499382734, + "rewards/margins": 0.441474586725235, + "rewards/rejected": -0.4472126066684723, + "sft_loss": 0.05737978592514992, + "step": 4099 + }, + { + "epoch": 5.929139551699205, + "grad_norm": 1.1290870074892114, + "learning_rate": 2.5512174474382475e-09, + "logits/chosen": -0.8009154796600342, + "logits/rejected": -0.6760156154632568, + "logps/chosen": -0.038477495312690735, + "logps/rejected": -4.925846099853516, + "loss": 0.0438, + "odds_ratio_loss": 0.0019320531282573938, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003847749438136816, + "rewards/margins": 0.4887368679046631, + "rewards/rejected": -0.49258458614349365, + "sft_loss": 0.038477495312690735, + "step": 4100 + }, + { + "epoch": 5.93058568329718, + "grad_norm": 1.1307251208355675, + "learning_rate": 2.441511791894335e-09, + "logits/chosen": -0.9180901050567627, + "logits/rejected": -0.581337034702301, + "logps/chosen": -0.07019197195768356, + "logps/rejected": -6.812994956970215, + "loss": 0.0422, + "odds_ratio_loss": 0.0022215438075363636, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007019197568297386, + "rewards/margins": 0.6742802858352661, + "rewards/rejected": -0.6812995076179504, + "sft_loss": 0.07019197195768356, + "step": 4101 + }, + { + "epoch": 5.932031814895155, + "grad_norm": 0.9974677158336643, + "learning_rate": 2.3342162792516772e-09, + "logits/chosen": -0.832883358001709, + "logits/rejected": -0.5870314836502075, + "logps/chosen": -0.03074686974287033, + "logps/rejected": -5.959352493286133, + "loss": 0.0416, + "odds_ratio_loss": 0.001421600696630776, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0030746874399483204, + "rewards/margins": 0.5928605794906616, + "rewards/rejected": -0.5959352254867554, + "sft_loss": 0.03074686974287033, + "step": 4102 + }, + { + "epoch": 5.933477946493131, + "grad_norm": 1.4469006514086222, + "learning_rate": 2.229330974198529e-09, + "logits/chosen": -0.8372644186019897, + "logits/rejected": -0.6750832200050354, + "logps/chosen": -0.04740104451775551, + "logps/rejected": -6.3367462158203125, + "loss": 0.0425, + "odds_ratio_loss": 0.002821737201884389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004740104544907808, + "rewards/margins": 0.6289345622062683, + "rewards/rejected": -0.6336746215820312, + "sft_loss": 0.04740104451775551, + "step": 4103 + }, + { + "epoch": 5.934924078091107, + "grad_norm": 0.9240492769234988, + "learning_rate": 2.126855939971417e-09, + "logits/chosen": -0.8022180795669556, + "logits/rejected": -0.7096238732337952, + "logps/chosen": -0.01701393537223339, + "logps/rejected": -3.7467222213745117, + "loss": 0.0328, + "odds_ratio_loss": 0.0013247316237539053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017013936303555965, + "rewards/margins": 0.372970849275589, + "rewards/rejected": -0.3746722340583801, + "sft_loss": 0.01701393537223339, + "step": 4104 + }, + { + "epoch": 5.936370209689081, + "grad_norm": 0.9598971144707034, + "learning_rate": 2.0267912383520324e-09, + "logits/chosen": -0.9782860279083252, + "logits/rejected": -0.621020495891571, + "logps/chosen": -0.05000707879662514, + "logps/rejected": -5.905955791473389, + "loss": 0.0432, + "odds_ratio_loss": 0.0016283662989735603, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005000708159059286, + "rewards/margins": 0.585594892501831, + "rewards/rejected": -0.5905956625938416, + "sft_loss": 0.05000707879662514, + "step": 4105 + }, + { + "epoch": 5.937816341287057, + "grad_norm": 2.7765226016012057, + "learning_rate": 1.9291369296707825e-09, + "logits/chosen": -0.8960604071617126, + "logits/rejected": -0.817623496055603, + "logps/chosen": -0.014780182391405106, + "logps/rejected": -6.727514266967773, + "loss": 0.054, + "odds_ratio_loss": 0.0007389021338894963, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014780182391405106, + "rewards/margins": 0.671273410320282, + "rewards/rejected": -0.6727514266967773, + "sft_loss": 0.014780182391405106, + "step": 4106 + }, + { + "epoch": 5.939262472885033, + "grad_norm": 1.1274069239257218, + "learning_rate": 1.8338930728027946e-09, + "logits/chosen": -0.964221715927124, + "logits/rejected": -0.6402559876441956, + "logps/chosen": -0.05291515588760376, + "logps/rejected": -7.261722087860107, + "loss": 0.0399, + "odds_ratio_loss": 0.0011750500416383147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005291515961289406, + "rewards/margins": 0.7208806276321411, + "rewards/rejected": -0.7261722087860107, + "sft_loss": 0.05291515588760376, + "step": 4107 + }, + { + "epoch": 5.940708604483008, + "grad_norm": 0.924331347038122, + "learning_rate": 1.7410597251719116e-09, + "logits/chosen": -0.9784330725669861, + "logits/rejected": -0.5999252796173096, + "logps/chosen": -0.03772180899977684, + "logps/rejected": -6.771916389465332, + "loss": 0.0248, + "odds_ratio_loss": 0.0036124063190072775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0037721809931099415, + "rewards/margins": 0.6734194755554199, + "rewards/rejected": -0.6771916151046753, + "sft_loss": 0.03772180899977684, + "step": 4108 + }, + { + "epoch": 5.942154736080983, + "grad_norm": 1.3007265136932227, + "learning_rate": 1.650636942746697e-09, + "logits/chosen": -0.9911519289016724, + "logits/rejected": -0.7490329742431641, + "logps/chosen": -0.04981255158782005, + "logps/rejected": -5.104490280151367, + "loss": 0.0629, + "odds_ratio_loss": 0.0008861341630108654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004981255158782005, + "rewards/margins": 0.5054677724838257, + "rewards/rejected": -0.5104490518569946, + "sft_loss": 0.04981255158782005, + "step": 4109 + }, + { + "epoch": 5.943600867678959, + "grad_norm": 0.9034752594114349, + "learning_rate": 1.5626247800444303e-09, + "logits/chosen": -0.9556131362915039, + "logits/rejected": -0.6480990648269653, + "logps/chosen": -0.02443695440888405, + "logps/rejected": -5.567079544067383, + "loss": 0.0258, + "odds_ratio_loss": 0.0025377324782311916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002443695208057761, + "rewards/margins": 0.5542643070220947, + "rewards/rejected": -0.5567079782485962, + "sft_loss": 0.02443695440888405, + "step": 4110 + }, + { + "epoch": 5.945046999276935, + "grad_norm": 1.27019890219729, + "learning_rate": 1.4770232901271107e-09, + "logits/chosen": -0.7591812610626221, + "logits/rejected": -0.641418993473053, + "logps/chosen": -0.024198981001973152, + "logps/rejected": -3.361138343811035, + "loss": 0.0345, + "odds_ratio_loss": 0.002360550919547677, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002419898519292474, + "rewards/margins": 0.3336939215660095, + "rewards/rejected": -0.336113840341568, + "sft_loss": 0.024198981001973152, + "step": 4111 + }, + { + "epoch": 5.946493130874909, + "grad_norm": 1.4078149386948933, + "learning_rate": 1.3938325246045656e-09, + "logits/chosen": -1.0027142763137817, + "logits/rejected": -0.9888021945953369, + "logps/chosen": -0.02432049624621868, + "logps/rejected": -3.972032070159912, + "loss": 0.0295, + "odds_ratio_loss": 0.003202601568773389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0024320497177541256, + "rewards/margins": 0.39477115869522095, + "rewards/rejected": -0.3972032070159912, + "sft_loss": 0.02432049624621868, + "step": 4112 + }, + { + "epoch": 5.947939262472885, + "grad_norm": 0.955294787070582, + "learning_rate": 1.313052533633119e-09, + "logits/chosen": -0.8551870584487915, + "logits/rejected": -0.6820370554924011, + "logps/chosen": -0.010347655974328518, + "logps/rejected": -4.61668586730957, + "loss": 0.0274, + "odds_ratio_loss": 0.00016555214824620634, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0010347655043005943, + "rewards/margins": 0.460633784532547, + "rewards/rejected": -0.46166858077049255, + "sft_loss": 0.010347655974328518, + "step": 4113 + }, + { + "epoch": 5.949385394070861, + "grad_norm": 1.2465098226086702, + "learning_rate": 1.2346833659147016e-09, + "logits/chosen": -0.8677682876586914, + "logits/rejected": -0.7311995029449463, + "logps/chosen": -0.03744862973690033, + "logps/rejected": -4.943976402282715, + "loss": 0.0577, + "odds_ratio_loss": 0.0047299060970544815, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0037448632065206766, + "rewards/margins": 0.49065274000167847, + "rewards/rejected": -0.4943976402282715, + "sft_loss": 0.03744862973690033, + "step": 4114 + }, + { + "epoch": 5.9508315256688356, + "grad_norm": 1.020861812684583, + "learning_rate": 1.1587250686986294e-09, + "logits/chosen": -0.9044225811958313, + "logits/rejected": -0.594450056552887, + "logps/chosen": -0.03550455719232559, + "logps/rejected": -6.127573013305664, + "loss": 0.0468, + "odds_ratio_loss": 0.0004433983704075217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003550455439835787, + "rewards/margins": 0.6092069149017334, + "rewards/rejected": -0.6127573847770691, + "sft_loss": 0.03550455719232559, + "step": 4115 + }, + { + "epoch": 5.952277657266811, + "grad_norm": 1.0219147309467258, + "learning_rate": 1.085177687780714e-09, + "logits/chosen": -0.8875457644462585, + "logits/rejected": -0.6773673295974731, + "logps/chosen": -0.03830548748373985, + "logps/rejected": -5.578269004821777, + "loss": 0.0257, + "odds_ratio_loss": 0.0014839848736301064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0038305488415062428, + "rewards/margins": 0.5539963841438293, + "rewards/rejected": -0.5578269362449646, + "sft_loss": 0.03830548748373985, + "step": 4116 + }, + { + "epoch": 5.953723788864787, + "grad_norm": 1.0919805731483818, + "learning_rate": 1.0140412675023747e-09, + "logits/chosen": -1.0210849046707153, + "logits/rejected": -0.7989510297775269, + "logps/chosen": -0.011519749648869038, + "logps/rejected": -4.2892608642578125, + "loss": 0.051, + "odds_ratio_loss": 0.0009958500741049647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0011519748950377107, + "rewards/margins": 0.4277741014957428, + "rewards/rejected": -0.42892611026763916, + "sft_loss": 0.011519749648869038, + "step": 4117 + }, + { + "epoch": 5.955169920462762, + "grad_norm": 1.038445647093431, + "learning_rate": 9.453158507528592e-10, + "logits/chosen": -0.9300838112831116, + "logits/rejected": -0.8889990448951721, + "logps/chosen": -0.08415620028972626, + "logps/rejected": -4.28843879699707, + "loss": 0.0552, + "odds_ratio_loss": 0.0070391446352005005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008415620774030685, + "rewards/margins": 0.4204282760620117, + "rewards/rejected": -0.4288438856601715, + "sft_loss": 0.08415620028972626, + "step": 4118 + }, + { + "epoch": 5.956616052060737, + "grad_norm": 1.8490327493708787, + "learning_rate": 8.790014789661348e-10, + "logits/chosen": -0.842028796672821, + "logits/rejected": -0.7200061678886414, + "logps/chosen": -0.04375598579645157, + "logps/rejected": -4.8058576583862305, + "loss": 0.0314, + "odds_ratio_loss": 0.0012701171217486262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004375598393380642, + "rewards/margins": 0.47621017694473267, + "rewards/rejected": -0.4805857241153717, + "sft_loss": 0.04375598579645157, + "step": 4119 + }, + { + "epoch": 5.958062183658713, + "grad_norm": 1.5751740362087219, + "learning_rate": 8.150981921239975e-10, + "logits/chosen": -1.0434660911560059, + "logits/rejected": -0.7287979125976562, + "logps/chosen": -0.026583680883049965, + "logps/rejected": -6.822535991668701, + "loss": 0.0562, + "odds_ratio_loss": 0.0017852864693850279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0026583680883049965, + "rewards/margins": 0.6795952320098877, + "rewards/rejected": -0.6822535991668701, + "sft_loss": 0.026583680883049965, + "step": 4120 + }, + { + "epoch": 5.959508315256688, + "grad_norm": 1.1963090985219154, + "learning_rate": 7.536060287534063e-10, + "logits/chosen": -0.8197766542434692, + "logits/rejected": -0.6914808750152588, + "logps/chosen": -0.05560974031686783, + "logps/rejected": -7.24658727645874, + "loss": 0.0612, + "odds_ratio_loss": 0.010990302078425884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00556097412481904, + "rewards/margins": 0.7190977931022644, + "rewards/rejected": -0.724658727645874, + "sft_loss": 0.05560974031686783, + "step": 4121 + }, + { + "epoch": 5.960954446854664, + "grad_norm": 1.8334323902812746, + "learning_rate": 6.94525025928705e-10, + "logits/chosen": -0.8709729313850403, + "logits/rejected": -0.594711184501648, + "logps/chosen": -0.03179216384887695, + "logps/rejected": -10.035598754882812, + "loss": 0.035, + "odds_ratio_loss": 0.0020171799696981907, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003179216757416725, + "rewards/margins": 1.0003807544708252, + "rewards/rejected": -1.0035598278045654, + "sft_loss": 0.03179216384887695, + "step": 4122 + }, + { + "epoch": 5.962400578452639, + "grad_norm": 0.8575311426686644, + "learning_rate": 6.37855219269845e-10, + "logits/chosen": -0.7879198789596558, + "logits/rejected": -0.7165445685386658, + "logps/chosen": -0.015290379524230957, + "logps/rejected": -4.180186748504639, + "loss": 0.0287, + "odds_ratio_loss": 0.0012166141532361507, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0015290379524230957, + "rewards/margins": 0.4164896011352539, + "rewards/rejected": -0.418018639087677, + "sft_loss": 0.015290379524230957, + "step": 4123 + }, + { + "epoch": 5.963846710050615, + "grad_norm": 1.037426781554966, + "learning_rate": 5.835966429432737e-10, + "logits/chosen": -0.7037444710731506, + "logits/rejected": -0.450633704662323, + "logps/chosen": -0.05810614302754402, + "logps/rejected": -5.363962650299072, + "loss": 0.0327, + "odds_ratio_loss": 0.0023976389784365892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005810614209622145, + "rewards/margins": 0.5305856466293335, + "rewards/rejected": -0.5363962650299072, + "sft_loss": 0.05810614302754402, + "step": 4124 + }, + { + "epoch": 5.96529284164859, + "grad_norm": 1.2952351363680497, + "learning_rate": 5.317493296614906e-10, + "logits/chosen": -0.9828850030899048, + "logits/rejected": -0.8129862546920776, + "logps/chosen": -0.027900131419301033, + "logps/rejected": -4.454830169677734, + "loss": 0.0626, + "odds_ratio_loss": 0.0007958858041092753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0027900133281946182, + "rewards/margins": 0.44269299507141113, + "rewards/rejected": -0.4454830288887024, + "sft_loss": 0.027900131419301033, + "step": 4125 + }, + { + "epoch": 5.966738973246565, + "grad_norm": 0.9134255978257638, + "learning_rate": 4.82313310683935e-10, + "logits/chosen": -0.9281235933303833, + "logits/rejected": -0.725083589553833, + "logps/chosen": -0.01804085075855255, + "logps/rejected": -5.371147155761719, + "loss": 0.0258, + "odds_ratio_loss": 0.0014887560391798615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018040850991383195, + "rewards/margins": 0.535310685634613, + "rewards/rejected": -0.5371147394180298, + "sft_loss": 0.01804085075855255, + "step": 4126 + }, + { + "epoch": 5.968185104844541, + "grad_norm": 0.9621230174545535, + "learning_rate": 4.3528861581521026e-10, + "logits/chosen": -0.9386307001113892, + "logits/rejected": -0.6423209309577942, + "logps/chosen": -0.01551287341862917, + "logps/rejected": -5.554164886474609, + "loss": 0.0274, + "odds_ratio_loss": 0.0015682197408750653, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0015512873651459813, + "rewards/margins": 0.5538651943206787, + "rewards/rejected": -0.555416464805603, + "sft_loss": 0.01551287341862917, + "step": 4127 + }, + { + "epoch": 5.969631236442516, + "grad_norm": 1.4551812506826591, + "learning_rate": 3.906752734073038e-10, + "logits/chosen": -0.9425130486488342, + "logits/rejected": -0.6164754629135132, + "logps/chosen": -0.03378310054540634, + "logps/rejected": -5.581912994384766, + "loss": 0.0562, + "odds_ratio_loss": 0.0012506656348705292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003378310240805149, + "rewards/margins": 0.5548129677772522, + "rewards/rejected": -0.5581912994384766, + "sft_loss": 0.03378310054540634, + "step": 4128 + }, + { + "epoch": 5.971077368040492, + "grad_norm": 1.430946768404967, + "learning_rate": 3.4847331035736673e-10, + "logits/chosen": -0.7740320563316345, + "logits/rejected": -0.625731348991394, + "logps/chosen": -0.11199574172496796, + "logps/rejected": -3.3949060440063477, + "loss": 0.0756, + "odds_ratio_loss": 0.010786962695419788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011199574917554855, + "rewards/margins": 0.32829102873802185, + "rewards/rejected": -0.3394905924797058, + "sft_loss": 0.11199574172496796, + "step": 4129 + }, + { + "epoch": 5.972523499638467, + "grad_norm": 1.0795984072522444, + "learning_rate": 3.0868275210904624e-10, + "logits/chosen": -0.8547477722167969, + "logits/rejected": -0.7329878807067871, + "logps/chosen": -0.03964013606309891, + "logps/rejected": -5.210457801818848, + "loss": 0.0474, + "odds_ratio_loss": 0.003418078413233161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003964013420045376, + "rewards/margins": 0.5170817971229553, + "rewards/rejected": -0.5210458040237427, + "sft_loss": 0.03964013606309891, + "step": 4130 + }, + { + "epoch": 5.973969631236443, + "grad_norm": 0.9997914713259782, + "learning_rate": 2.713036226520415e-10, + "logits/chosen": -0.8122600317001343, + "logits/rejected": -0.599828839302063, + "logps/chosen": -0.014296657405793667, + "logps/rejected": -6.734135627746582, + "loss": 0.0385, + "odds_ratio_loss": 0.000533695740159601, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014296658337116241, + "rewards/margins": 0.6719839572906494, + "rewards/rejected": -0.6734135746955872, + "sft_loss": 0.014296657405793667, + "step": 4131 + }, + { + "epoch": 5.975415762834418, + "grad_norm": 0.955240881670924, + "learning_rate": 2.363359445229918e-10, + "logits/chosen": -0.9173808693885803, + "logits/rejected": -0.8124196529388428, + "logps/chosen": -0.046773605048656464, + "logps/rejected": -4.923510551452637, + "loss": 0.0306, + "odds_ratio_loss": 0.0019419525051489472, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004677360877394676, + "rewards/margins": 0.48767372965812683, + "rewards/rejected": -0.49235108494758606, + "sft_loss": 0.046773605048656464, + "step": 4132 + }, + { + "epoch": 5.9768618944323935, + "grad_norm": 1.0552507197470187, + "learning_rate": 2.037797388036999e-10, + "logits/chosen": -0.6335623860359192, + "logits/rejected": -0.5481550693511963, + "logps/chosen": -0.014158733189105988, + "logps/rejected": -5.313653469085693, + "loss": 0.028, + "odds_ratio_loss": 0.0016354866093024611, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0014158734120428562, + "rewards/margins": 0.5299494862556458, + "rewards/rejected": -0.5313653349876404, + "sft_loss": 0.014158733189105988, + "step": 4133 + }, + { + "epoch": 5.978308026030369, + "grad_norm": 1.3318819399781283, + "learning_rate": 1.7363502512246497e-10, + "logits/chosen": -0.861269474029541, + "logits/rejected": -0.8041742444038391, + "logps/chosen": -0.06389249116182327, + "logps/rejected": -4.00693416595459, + "loss": 0.061, + "odds_ratio_loss": 0.003401533467695117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006389249116182327, + "rewards/margins": 0.39430415630340576, + "rewards/rejected": -0.400693416595459, + "sft_loss": 0.06389249116182327, + "step": 4134 + }, + { + "epoch": 5.979754157628344, + "grad_norm": 1.1397991365853217, + "learning_rate": 1.4590182165363785e-10, + "logits/chosen": -0.9926788210868835, + "logits/rejected": -0.7052085399627686, + "logps/chosen": -0.01665268838405609, + "logps/rejected": -6.901638984680176, + "loss": 0.0334, + "odds_ratio_loss": 0.0005437415675260127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001665269024670124, + "rewards/margins": 0.6884986758232117, + "rewards/rejected": -0.6901639103889465, + "sft_loss": 0.01665268838405609, + "step": 4135 + }, + { + "epoch": 5.98120028922632, + "grad_norm": 1.0293090818178157, + "learning_rate": 1.2058014511717728e-10, + "logits/chosen": -0.7687342762947083, + "logits/rejected": -0.5340158343315125, + "logps/chosen": -0.012591686099767685, + "logps/rejected": -6.006010055541992, + "loss": 0.032, + "odds_ratio_loss": 0.0005818761419504881, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0012591686099767685, + "rewards/margins": 0.5993418097496033, + "rewards/rejected": -0.600601077079773, + "sft_loss": 0.012591686099767685, + "step": 4136 + }, + { + "epoch": 5.982646420824295, + "grad_norm": 1.0500799088028352, + "learning_rate": 9.767001078087034e-11, + "logits/chosen": -0.8796525001525879, + "logits/rejected": -0.6548293232917786, + "logps/chosen": -0.05132802948355675, + "logps/rejected": -5.8061323165893555, + "loss": 0.0436, + "odds_ratio_loss": 0.002121829893440008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005132803227752447, + "rewards/margins": 0.5754804015159607, + "rewards/rejected": -0.5806131958961487, + "sft_loss": 0.05132802948355675, + "step": 4137 + }, + { + "epoch": 5.98409255242227, + "grad_norm": 1.3440124989758566, + "learning_rate": 7.717143245589142e-11, + "logits/chosen": -1.0992934703826904, + "logits/rejected": -0.7260419130325317, + "logps/chosen": -0.04053273797035217, + "logps/rejected": -5.290022373199463, + "loss": 0.0648, + "odds_ratio_loss": 0.0017471958417445421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00405327370390296, + "rewards/margins": 0.5249489545822144, + "rewards/rejected": -0.5290022492408752, + "sft_loss": 0.04053273797035217, + "step": 4138 + }, + { + "epoch": 5.985538684020246, + "grad_norm": 1.1554344347871206, + "learning_rate": 5.908442250168733e-11, + "logits/chosen": -0.9692047834396362, + "logits/rejected": -0.6804442405700684, + "logps/chosen": -0.048383478075265884, + "logps/rejected": -6.995375633239746, + "loss": 0.0316, + "odds_ratio_loss": 0.0027454195078462362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004838347434997559, + "rewards/margins": 0.6946991682052612, + "rewards/rejected": -0.6995375156402588, + "sft_loss": 0.048383478075265884, + "step": 4139 + }, + { + "epoch": 5.9869848156182215, + "grad_norm": 0.8871899630146395, + "learning_rate": 4.3408991823312704e-11, + "logits/chosen": -0.9535342454910278, + "logits/rejected": -0.8384948968887329, + "logps/chosen": -0.029851065948605537, + "logps/rejected": -3.435246467590332, + "loss": 0.0232, + "odds_ratio_loss": 0.0023980343248695135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002985106548294425, + "rewards/margins": 0.3405395746231079, + "rewards/rejected": -0.34352466464042664, + "sft_loss": 0.029851065948605537, + "step": 4140 + }, + { + "epoch": 5.988430947216196, + "grad_norm": 0.9226185349789879, + "learning_rate": 3.014514987054184e-11, + "logits/chosen": -0.9060051441192627, + "logits/rejected": -0.8312381505966187, + "logps/chosen": -0.0319242998957634, + "logps/rejected": -4.604552745819092, + "loss": 0.0285, + "odds_ratio_loss": 0.0014555552043020725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003192430129274726, + "rewards/margins": 0.4572628140449524, + "rewards/rejected": -0.4604552388191223, + "sft_loss": 0.0319242998957634, + "step": 4141 + }, + { + "epoch": 5.989877078814172, + "grad_norm": 0.9592362753561068, + "learning_rate": 1.9292904640977324e-11, + "logits/chosen": -1.004399061203003, + "logits/rejected": -0.97425377368927, + "logps/chosen": -0.029115300625562668, + "logps/rejected": -4.420380115509033, + "loss": 0.0256, + "odds_ratio_loss": 0.002002717461436987, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0029115299694240093, + "rewards/margins": 0.4391264319419861, + "rewards/rejected": -0.4420379400253296, + "sft_loss": 0.029115300625562668, + "step": 4142 + }, + { + "epoch": 5.991323210412148, + "grad_norm": 1.1190789757311155, + "learning_rate": 1.0852262677385482e-11, + "logits/chosen": -0.7953178286552429, + "logits/rejected": -0.6801662445068359, + "logps/chosen": -0.028218841180205345, + "logps/rejected": -4.215031623840332, + "loss": 0.0411, + "odds_ratio_loss": 0.0011911361943930387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0028218841180205345, + "rewards/margins": 0.4186813235282898, + "rewards/rejected": -0.4215031862258911, + "sft_loss": 0.028218841180205345, + "step": 4143 + }, + { + "epoch": 5.9927693420101225, + "grad_norm": 0.9158155976864413, + "learning_rate": 4.823229068140478e-12, + "logits/chosen": -0.8988432884216309, + "logits/rejected": -0.7506909966468811, + "logps/chosen": -0.029472343623638153, + "logps/rejected": -3.428910970687866, + "loss": 0.0258, + "odds_ratio_loss": 0.0006631941068917513, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002947234082967043, + "rewards/margins": 0.33994388580322266, + "rewards/rejected": -0.3428910970687866, + "sft_loss": 0.029472343623638153, + "step": 4144 + }, + { + "epoch": 5.994215473608098, + "grad_norm": 1.056239819329163, + "learning_rate": 1.2058074490006732e-12, + "logits/chosen": -0.9353763461112976, + "logits/rejected": -0.7485519647598267, + "logps/chosen": -0.03557208925485611, + "logps/rejected": -5.1342082023620605, + "loss": 0.0394, + "odds_ratio_loss": 0.0014125681482255459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0035572086926549673, + "rewards/margins": 0.5098636150360107, + "rewards/rejected": -0.513420820236206, + "sft_loss": 0.03557208925485611, + "step": 4145 + }, + { + "epoch": 5.995661605206074, + "grad_norm": 1.0557108508160415, + "learning_rate": 0.0, + "logits/chosen": -0.6718660593032837, + "logits/rejected": -0.5680021643638611, + "logps/chosen": -0.026484325528144836, + "logps/rejected": -5.1328911781311035, + "loss": 0.0381, + "odds_ratio_loss": 0.004209108650684357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0026484327390789986, + "rewards/margins": 0.5106407403945923, + "rewards/rejected": -0.5132891535758972, + "sft_loss": 0.026484325528144836, + "step": 4146 + } + ], + "logging_steps": 1.0, + "max_steps": 4146, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "total_flos": 1411327439929344.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}