{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_losses": 0.6931471824645996, "epoch": 0.0, "grad_norm": 1.666406547592859, "learning_rate": 1.3054830287206266e-09, "logits/chosen": -2.7590973377227783, "logits/rejected": -2.847461462020874, "logps/chosen": -183.89276123046875, "logps/rejected": -240.56399536132812, "loss": 0.6931, "positive_losses": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_losses": 0.6931865215301514, "epoch": 0.0, "grad_norm": 30.19681129042229, "learning_rate": 1.3054830287206264e-08, "logits/chosen": -2.865060329437256, "logits/rejected": -2.7412307262420654, "logps/chosen": -287.0556640625, "logps/rejected": -190.19590759277344, "loss": 0.6958, "positive_losses": 0.018089719116687775, "rewards/accuracies": 0.0833333358168602, "rewards/chosen": 5.116145621286705e-05, "rewards/margins": -7.855256990296766e-05, "rewards/margins_max": 0.0002732997527346015, "rewards/margins_min": -0.00045305039384402335, "rewards/margins_std": 0.00030465322197414935, "rewards/rejected": 0.0001297140261158347, "step": 10 }, { "dpo_losses": 0.6932038068771362, "epoch": 0.01, "grad_norm": 26.185775226613973, "learning_rate": 2.610966057441253e-08, "logits/chosen": -2.9035611152648926, "logits/rejected": -2.83616042137146, "logps/chosen": -350.1943359375, "logps/rejected": -269.9788818359375, "loss": 0.7027, "positive_losses": 0.07787647098302841, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00026953319320455194, "rewards/margins": -0.00011073592031607404, "rewards/margins_max": 0.003399646608158946, "rewards/margins_min": -0.0036606634967029095, "rewards/margins_std": 0.0031352010555565357, "rewards/rejected": 0.00038026898982934654, "step": 20 }, { "dpo_losses": 0.6929337978363037, "epoch": 0.01, "grad_norm": 32.03306397302637, "learning_rate": 3.91644908616188e-08, "logits/chosen": -2.8334875106811523, "logits/rejected": -2.848536729812622, "logps/chosen": -251.3134307861328, "logps/rejected": -251.9360809326172, "loss": 0.6993, "positive_losses": 0.05228748172521591, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0004572083707898855, "rewards/margins": 0.0004286346083972603, "rewards/margins_max": 0.003322898643091321, "rewards/margins_min": -0.00229655927978456, "rewards/margins_std": 0.0024444316513836384, "rewards/rejected": 2.8573758754646406e-05, "step": 30 }, { "dpo_losses": 0.6932997107505798, "epoch": 0.01, "grad_norm": 28.33388144767012, "learning_rate": 5.221932114882506e-08, "logits/chosen": -2.803170919418335, "logits/rejected": -2.8049044609069824, "logps/chosen": -225.2213592529297, "logps/rejected": -243.0467071533203, "loss": 0.7001, "positive_losses": 0.05524027347564697, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00021809982717968524, "rewards/margins": -0.0003027426719199866, "rewards/margins_max": 0.002072124509140849, "rewards/margins_min": -0.004094444215297699, "rewards/margins_std": 0.0028748027980327606, "rewards/rejected": 0.0005208424990996718, "step": 40 }, { "dpo_losses": 0.69282066822052, "epoch": 0.01, "grad_norm": 8.755876584092501, "learning_rate": 6.527415143603133e-08, "logits/chosen": -2.954601764678955, "logits/rejected": -2.915971279144287, "logps/chosen": -341.208984375, "logps/rejected": -307.0840148925781, "loss": 0.6972, "positive_losses": 0.0233170036226511, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0012528609950095415, "rewards/margins": 0.0006549887475557625, "rewards/margins_max": 0.0030162143521010876, "rewards/margins_min": -0.0022026845254004, "rewards/margins_std": 0.002286398783326149, "rewards/rejected": 0.0005978723056614399, "step": 50 }, { "dpo_losses": 0.6929444074630737, "epoch": 0.02, "grad_norm": 16.87014271095211, "learning_rate": 7.83289817232376e-08, "logits/chosen": -2.739804744720459, "logits/rejected": -2.6878609657287598, "logps/chosen": -247.22518920898438, "logps/rejected": -251.5140838623047, "loss": 0.6955, "positive_losses": 0.025938797742128372, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0018961990717798471, "rewards/margins": 0.00040858075954020023, "rewards/margins_max": 0.004767539910972118, "rewards/margins_min": -0.002739040181040764, "rewards/margins_std": 0.003281188430264592, "rewards/rejected": 0.0014876185450702906, "step": 60 }, { "dpo_losses": 0.693077802658081, "epoch": 0.02, "grad_norm": 1.6246733703996752, "learning_rate": 9.138381201044386e-08, "logits/chosen": -2.8502087593078613, "logits/rejected": -2.812725305557251, "logps/chosen": -260.4454650878906, "logps/rejected": -244.32363891601562, "loss": 0.6952, "positive_losses": 0.01544876117259264, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00289685046300292, "rewards/margins": 0.000141006734338589, "rewards/margins_max": 0.0029947375878691673, "rewards/margins_min": -0.00309961661696434, "rewards/margins_std": 0.002640590537339449, "rewards/rejected": 0.0027558435685932636, "step": 70 }, { "dpo_losses": 0.692978024482727, "epoch": 0.02, "grad_norm": 2.4895937824961045, "learning_rate": 1.0443864229765012e-07, "logits/chosen": -2.748595952987671, "logits/rejected": -2.7783217430114746, "logps/chosen": -296.84954833984375, "logps/rejected": -234.31381225585938, "loss": 0.6952, "positive_losses": 0.01120681781321764, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.003352649509906769, "rewards/margins": 0.00034075783332809806, "rewards/margins_max": 0.003386855823919177, "rewards/margins_min": -0.003298679366707802, "rewards/margins_std": 0.002919359365478158, "rewards/rejected": 0.003011892084032297, "step": 80 }, { "dpo_losses": 0.6929032206535339, "epoch": 0.02, "grad_norm": 16.15562665020882, "learning_rate": 1.174934725848564e-07, "logits/chosen": -2.96268892288208, "logits/rejected": -2.9666476249694824, "logps/chosen": -355.95611572265625, "logps/rejected": -323.64129638671875, "loss": 0.6945, "positive_losses": 0.0033386230934411287, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0049633425660431385, "rewards/margins": 0.0004909709095954895, "rewards/margins_max": 0.004024089779704809, "rewards/margins_min": -0.0036105443723499775, "rewards/margins_std": 0.003409436671063304, "rewards/rejected": 0.004472372122108936, "step": 90 }, { "dpo_losses": 0.6929186582565308, "epoch": 0.03, "grad_norm": 19.22387984128968, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.705064535140991, "logits/rejected": -2.6813759803771973, "logps/chosen": -297.81158447265625, "logps/rejected": -226.8807373046875, "loss": 0.6943, "positive_losses": 0.011927795596420765, "rewards/accuracies": 0.5, "rewards/chosen": 0.004185602534562349, "rewards/margins": 0.00045963405864313245, "rewards/margins_max": 0.003522497834637761, "rewards/margins_min": -0.0030625914223492146, "rewards/margins_std": 0.0029240392614156008, "rewards/rejected": 0.0037259687669575214, "step": 100 }, { "epoch": 0.03, "eval_dpo_losses": 0.6929485201835632, "eval_logits/chosen": -2.8206725120544434, "eval_logits/rejected": -2.782633066177368, "eval_logps/chosen": -283.98443603515625, "eval_logps/rejected": -261.6789855957031, "eval_loss": 0.693632960319519, "eval_positive_losses": 0.0069915228523314, "eval_rewards/accuracies": 0.5555555820465088, "eval_rewards/chosen": 0.005104683805257082, "eval_rewards/margins": 0.0004005789814982563, "eval_rewards/margins_max": 0.005383267533034086, "eval_rewards/margins_min": -0.00492233969271183, "eval_rewards/margins_std": 0.003428671509027481, "eval_rewards/rejected": 0.004704104270786047, "eval_runtime": 388.5975, "eval_samples_per_second": 5.147, "eval_steps_per_second": 0.162, "step": 100 }, { "dpo_losses": 0.6934686899185181, "epoch": 0.03, "grad_norm": 11.788573237859406, "learning_rate": 1.4360313315926893e-07, "logits/chosen": -2.8273611068725586, "logits/rejected": -2.814751386642456, "logps/chosen": -266.32379150390625, "logps/rejected": -263.38446044921875, "loss": 0.6939, "positive_losses": 0.01450958289206028, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.004909469746053219, "rewards/margins": -0.0006404476007446647, "rewards/margins_max": 0.002480804454535246, "rewards/margins_min": -0.003917304333299398, "rewards/margins_std": 0.0027779233641922474, "rewards/rejected": 0.005549917463213205, "step": 110 }, { "dpo_losses": 0.693248450756073, "epoch": 0.03, "grad_norm": 10.899515515680617, "learning_rate": 1.566579634464752e-07, "logits/chosen": -2.8180179595947266, "logits/rejected": -2.7959036827087402, "logps/chosen": -248.500244140625, "logps/rejected": -239.03329467773438, "loss": 0.6937, "positive_losses": 0.02230529859662056, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.005369266960769892, "rewards/margins": -0.00020054270862601697, "rewards/margins_max": 0.0017862394452095032, "rewards/margins_min": -0.0030471840873360634, "rewards/margins_std": 0.0022475814912468195, "rewards/rejected": 0.005569809582084417, "step": 120 }, { "dpo_losses": 0.6927030086517334, "epoch": 0.03, "grad_norm": 13.482247442238627, "learning_rate": 1.6971279373368143e-07, "logits/chosen": -2.7805299758911133, "logits/rejected": -2.7365939617156982, "logps/chosen": -278.9297790527344, "logps/rejected": -392.56842041015625, "loss": 0.6933, "positive_losses": 0.0033142089378088713, "rewards/accuracies": 0.625, "rewards/chosen": 0.005917480681091547, "rewards/margins": 0.0008933775825425982, "rewards/margins_max": 0.006129108369350433, "rewards/margins_min": -0.00324707361869514, "rewards/margins_std": 0.004221538081765175, "rewards/rejected": 0.005024102982133627, "step": 130 }, { "dpo_losses": 0.6928731203079224, "epoch": 0.04, "grad_norm": 8.023760795580056, "learning_rate": 1.8276762402088773e-07, "logits/chosen": -2.7624192237854004, "logits/rejected": -2.7280631065368652, "logps/chosen": -233.57473754882812, "logps/rejected": -216.6000518798828, "loss": 0.6953, "positive_losses": 0.006344604305922985, "rewards/accuracies": 0.625, "rewards/chosen": 0.00621133903041482, "rewards/margins": 0.0005498835816979408, "rewards/margins_max": 0.0035002590157091618, "rewards/margins_min": -0.0017805719980970025, "rewards/margins_std": 0.00228001456707716, "rewards/rejected": 0.005661455448716879, "step": 140 }, { "dpo_losses": 0.6931334137916565, "epoch": 0.04, "grad_norm": 1.8466653566091873, "learning_rate": 1.95822454308094e-07, "logits/chosen": -2.7768664360046387, "logits/rejected": -2.756864070892334, "logps/chosen": -218.4131622314453, "logps/rejected": -248.4625244140625, "loss": 0.6934, "positive_losses": 0.0017547607421875, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.006885058246552944, "rewards/margins": 3.0700430215802044e-05, "rewards/margins_max": 0.004112632479518652, "rewards/margins_min": -0.0041068727150559425, "rewards/margins_std": 0.003603233490139246, "rewards/rejected": 0.0068543581292033195, "step": 150 }, { "dpo_losses": 0.6929692029953003, "epoch": 0.04, "grad_norm": 10.628778481407727, "learning_rate": 2.0887728459530023e-07, "logits/chosen": -2.8364949226379395, "logits/rejected": -2.828240156173706, "logps/chosen": -263.7819519042969, "logps/rejected": -227.7448272705078, "loss": 0.6931, "positive_losses": 0.00244903564453125, "rewards/accuracies": 0.5, "rewards/chosen": 0.00650602113455534, "rewards/margins": 0.00035825843224301934, "rewards/margins_max": 0.0036457558162510395, "rewards/margins_min": -0.0027374648489058018, "rewards/margins_std": 0.0028280240949243307, "rewards/rejected": 0.006147762760519981, "step": 160 }, { "dpo_losses": 0.6932560205459595, "epoch": 0.04, "grad_norm": 1.8417289130737209, "learning_rate": 2.2193211488250652e-07, "logits/chosen": -2.7584662437438965, "logits/rejected": -2.7368972301483154, "logps/chosen": -244.76736450195312, "logps/rejected": -219.04421997070312, "loss": 0.6933, "positive_losses": 0.00555496197193861, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.006044226232916117, "rewards/margins": -0.00021317150094546378, "rewards/margins_max": 0.0035289092920720577, "rewards/margins_min": -0.004762549884617329, "rewards/margins_std": 0.003684843424707651, "rewards/rejected": 0.006257397588342428, "step": 170 }, { "dpo_losses": 0.692129373550415, "epoch": 0.05, "grad_norm": 8.525853053395064, "learning_rate": 2.349869451697128e-07, "logits/chosen": -2.7833104133605957, "logits/rejected": -2.7295079231262207, "logps/chosen": -312.50042724609375, "logps/rejected": -244.234375, "loss": 0.6931, "positive_losses": 0.012158965691924095, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.00696224719285965, "rewards/margins": 0.0020412951707839966, "rewards/margins_max": 0.006969262845814228, "rewards/margins_min": -0.0024727259296923876, "rewards/margins_std": 0.004140241537243128, "rewards/rejected": 0.00492095248773694, "step": 180 }, { "dpo_losses": 0.6932461857795715, "epoch": 0.05, "grad_norm": 6.851936912963262, "learning_rate": 2.4804177545691903e-07, "logits/chosen": -2.8912341594696045, "logits/rejected": -2.83351993560791, "logps/chosen": -282.2035217285156, "logps/rejected": -211.73965454101562, "loss": 0.6933, "positive_losses": 0.0056968689896166325, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.007540510501712561, "rewards/margins": -0.0001929441059473902, "rewards/margins_max": 0.004584602080285549, "rewards/margins_min": -0.0037689092569053173, "rewards/margins_std": 0.0036659010220319033, "rewards/rejected": 0.007733455393463373, "step": 190 }, { "dpo_losses": 0.6919293999671936, "epoch": 0.05, "grad_norm": 8.245672767261967, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.8154959678649902, "logits/rejected": -2.7628376483917236, "logps/chosen": -322.11981201171875, "logps/rejected": -254.35159301757812, "loss": 0.6937, "positive_losses": 0.0044952393509447575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.009347244165837765, "rewards/margins": 0.002443180652335286, "rewards/margins_max": 0.00754980742931366, "rewards/margins_min": -0.0015672739828005433, "rewards/margins_std": 0.004176629241555929, "rewards/rejected": 0.006904063280671835, "step": 200 }, { "epoch": 0.05, "eval_dpo_losses": 0.6924603581428528, "eval_logits/chosen": -2.8224804401397705, "eval_logits/rejected": -2.784637928009033, "eval_logps/chosen": -283.6239929199219, "eval_logps/rejected": -261.4164733886719, "eval_loss": 0.693173348903656, "eval_positive_losses": 0.005795312114059925, "eval_rewards/accuracies": 0.591269850730896, "eval_rewards/chosen": 0.008708693087100983, "eval_rewards/margins": 0.001379763358272612, "eval_rewards/margins_max": 0.008318053558468819, "eval_rewards/margins_min": -0.005318824201822281, "eval_rewards/margins_std": 0.004544954281300306, "eval_rewards/rejected": 0.00732893031090498, "eval_runtime": 387.8762, "eval_samples_per_second": 5.156, "eval_steps_per_second": 0.162, "step": 200 }, { "dpo_losses": 0.6922547221183777, "epoch": 0.05, "grad_norm": 19.29159055932387, "learning_rate": 2.7415143603133156e-07, "logits/chosen": -2.798863649368286, "logits/rejected": -2.7512946128845215, "logps/chosen": -340.0872802734375, "logps/rejected": -274.6262512207031, "loss": 0.6926, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.010198279283940792, "rewards/margins": 0.0018013219814747572, "rewards/margins_max": 0.008373035117983818, "rewards/margins_min": -0.002471528248861432, "rewards/margins_std": 0.004959521349519491, "rewards/rejected": 0.008396958000957966, "step": 210 }, { "dpo_losses": 0.692650318145752, "epoch": 0.06, "grad_norm": 1.7940467137152343, "learning_rate": 2.8720626631853785e-07, "logits/chosen": -2.7393550872802734, "logits/rejected": -2.6828718185424805, "logps/chosen": -265.1138000488281, "logps/rejected": -295.94427490234375, "loss": 0.6926, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.009583190083503723, "rewards/margins": 0.0009993657004088163, "rewards/margins_max": 0.00544703658670187, "rewards/margins_min": -0.004009666386991739, "rewards/margins_std": 0.004203102085739374, "rewards/rejected": 0.008583825081586838, "step": 220 }, { "dpo_losses": 0.6927578449249268, "epoch": 0.06, "grad_norm": 1.8830337077895423, "learning_rate": 3.002610966057441e-07, "logits/chosen": -2.8563461303710938, "logits/rejected": -2.8284122943878174, "logps/chosen": -306.5498352050781, "logps/rejected": -253.4873046875, "loss": 0.6928, "positive_losses": 0.018727874383330345, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.01034473441541195, "rewards/margins": 0.0007884896476753056, "rewards/margins_max": 0.008461997844278812, "rewards/margins_min": -0.006560837384313345, "rewards/margins_std": 0.006800378207117319, "rewards/rejected": 0.009556243196129799, "step": 230 }, { "dpo_losses": 0.6917439699172974, "epoch": 0.06, "grad_norm": 2.1350279539542942, "learning_rate": 3.133159268929504e-07, "logits/chosen": -2.729341506958008, "logits/rejected": -2.6087653636932373, "logps/chosen": -274.9852600097656, "logps/rejected": -224.34011840820312, "loss": 0.6919, "positive_losses": 0.0032783509232103825, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.012574483640491962, "rewards/margins": 0.002816675463691354, "rewards/margins_max": 0.009632418863475323, "rewards/margins_min": -0.0025681552942842245, "rewards/margins_std": 0.005478983279317617, "rewards/rejected": 0.009757807478308678, "step": 240 }, { "dpo_losses": 0.6922317743301392, "epoch": 0.07, "grad_norm": 1.6556520642278625, "learning_rate": 3.263707571801567e-07, "logits/chosen": -2.7859044075012207, "logits/rejected": -2.8035387992858887, "logps/chosen": -284.1858215332031, "logps/rejected": -252.52474975585938, "loss": 0.6925, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01263434998691082, "rewards/margins": 0.0018406830495223403, "rewards/margins_max": 0.007658310234546661, "rewards/margins_min": -0.0037867389619350433, "rewards/margins_std": 0.0052383774891495705, "rewards/rejected": 0.01079366635531187, "step": 250 }, { "dpo_losses": 0.6910029649734497, "epoch": 0.07, "grad_norm": 3.2893183737781766, "learning_rate": 3.3942558746736286e-07, "logits/chosen": -2.8234846591949463, "logits/rejected": -2.789393901824951, "logps/chosen": -349.8510437011719, "logps/rejected": -324.61932373046875, "loss": 0.6916, "positive_losses": 0.0, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.012757892720401287, "rewards/margins": 0.0043033878318965435, "rewards/margins_max": 0.009838690981268883, "rewards/margins_min": -0.002494791056960821, "rewards/margins_std": 0.005606819875538349, "rewards/rejected": 0.008454503491520882, "step": 260 }, { "dpo_losses": 0.6918529272079468, "epoch": 0.07, "grad_norm": 1.7905826690090754, "learning_rate": 3.5248041775456916e-07, "logits/chosen": -2.8090696334838867, "logits/rejected": -2.7536022663116455, "logps/chosen": -324.2267761230469, "logps/rejected": -298.77349853515625, "loss": 0.6916, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.015475763007998466, "rewards/margins": 0.002604722511023283, "rewards/margins_max": 0.011933142319321632, "rewards/margins_min": -0.004363273270428181, "rewards/margins_std": 0.007273535244166851, "rewards/rejected": 0.012871041893959045, "step": 270 }, { "dpo_losses": 0.6923967003822327, "epoch": 0.07, "grad_norm": 9.155954193440616, "learning_rate": 3.6553524804177545e-07, "logits/chosen": -2.903094530105591, "logits/rejected": -2.8605706691741943, "logps/chosen": -276.69207763671875, "logps/rejected": -249.9399871826172, "loss": 0.6923, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.015179550275206566, "rewards/margins": 0.001524943159893155, "rewards/margins_max": 0.012009905651211739, "rewards/margins_min": -0.010082701221108437, "rewards/margins_std": 0.009722010232508183, "rewards/rejected": 0.013654607348144054, "step": 280 }, { "dpo_losses": 0.6904204487800598, "epoch": 0.08, "grad_norm": 10.5627611116718, "learning_rate": 3.785900783289817e-07, "logits/chosen": -2.8307137489318848, "logits/rejected": -2.7851150035858154, "logps/chosen": -299.5377502441406, "logps/rejected": -239.16635131835938, "loss": 0.6916, "positive_losses": 0.015967559069395065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.015545010566711426, "rewards/margins": 0.005487080663442612, "rewards/margins_max": 0.015951601788401604, "rewards/margins_min": -0.004018872976303101, "rewards/margins_std": 0.00920058973133564, "rewards/rejected": 0.01005792897194624, "step": 290 }, { "dpo_losses": 0.6916751861572266, "epoch": 0.08, "grad_norm": 1.8403639655253654, "learning_rate": 3.91644908616188e-07, "logits/chosen": -2.8961424827575684, "logits/rejected": -2.8574767112731934, "logps/chosen": -295.5889587402344, "logps/rejected": -239.2697296142578, "loss": 0.6932, "positive_losses": 0.01744537428021431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01415463350713253, "rewards/margins": 0.0029651448130607605, "rewards/margins_max": 0.013198497705161572, "rewards/margins_min": -0.0035127296578139067, "rewards/margins_std": 0.007742941379547119, "rewards/rejected": 0.01118948683142662, "step": 300 }, { "epoch": 0.08, "eval_dpo_losses": 0.6907697319984436, "eval_logits/chosen": -2.8172719478607178, "eval_logits/rejected": -2.779116153717041, "eval_logps/chosen": -282.73956298828125, "eval_logps/rejected": -260.8730163574219, "eval_loss": 0.6918479204177856, "eval_positive_losses": 0.011128379963338375, "eval_rewards/accuracies": 0.670634925365448, "eval_rewards/chosen": 0.017553498968482018, "eval_rewards/margins": 0.004790027160197496, "eval_rewards/margins_max": 0.021332116797566414, "eval_rewards/margins_min": -0.010061729699373245, "eval_rewards/margins_std": 0.010371250100433826, "eval_rewards/rejected": 0.012763473205268383, "eval_runtime": 401.6615, "eval_samples_per_second": 4.979, "eval_steps_per_second": 0.157, "step": 300 }, { "dpo_losses": 0.6913976669311523, "epoch": 0.08, "grad_norm": 2.2550175140445052, "learning_rate": 4.046997389033943e-07, "logits/chosen": -2.865398406982422, "logits/rejected": -2.8224949836730957, "logps/chosen": -286.3303527832031, "logps/rejected": -305.35626220703125, "loss": 0.6913, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0183410681784153, "rewards/margins": 0.0035322513431310654, "rewards/margins_max": 0.01414328534156084, "rewards/margins_min": -0.0053473422303795815, "rewards/margins_std": 0.00869253184646368, "rewards/rejected": 0.014808815903961658, "step": 310 }, { "dpo_losses": 0.6903918981552124, "epoch": 0.08, "grad_norm": 9.72644405675876, "learning_rate": 4.1775456919060046e-07, "logits/chosen": -2.8780643939971924, "logits/rejected": -2.8514111042022705, "logps/chosen": -307.68963623046875, "logps/rejected": -265.0461730957031, "loss": 0.6915, "positive_losses": 0.0016929625999182463, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.020931031554937363, "rewards/margins": 0.0055774180218577385, "rewards/margins_max": 0.02189483866095543, "rewards/margins_min": -0.009504149667918682, "rewards/margins_std": 0.014499841257929802, "rewards/rejected": 0.015353617258369923, "step": 320 }, { "dpo_losses": 0.6896374225616455, "epoch": 0.09, "grad_norm": 2.2611511256087677, "learning_rate": 4.3080939947780675e-07, "logits/chosen": -2.8339321613311768, "logits/rejected": -2.773601770401001, "logps/chosen": -285.16424560546875, "logps/rejected": -290.3405456542969, "loss": 0.6906, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.024154286831617355, "rewards/margins": 0.007089032791554928, "rewards/margins_max": 0.023823006078600883, "rewards/margins_min": -0.0061348374001681805, "rewards/margins_std": 0.013329845853149891, "rewards/rejected": 0.017065253108739853, "step": 330 }, { "dpo_losses": 0.6893516778945923, "epoch": 0.09, "grad_norm": 5.815296443818852, "learning_rate": 4.4386422976501305e-07, "logits/chosen": -2.714614152908325, "logits/rejected": -2.7669053077697754, "logps/chosen": -321.5865173339844, "logps/rejected": -296.74346923828125, "loss": 0.6921, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02406420186161995, "rewards/margins": 0.0076649547554552555, "rewards/margins_max": 0.025969142094254494, "rewards/margins_min": -0.004413464106619358, "rewards/margins_std": 0.013922092504799366, "rewards/rejected": 0.016399245709180832, "step": 340 }, { "dpo_losses": 0.6887660026550293, "epoch": 0.09, "grad_norm": 3.686826870790532, "learning_rate": 4.569190600522193e-07, "logits/chosen": -2.757418632507324, "logits/rejected": -2.725862979888916, "logps/chosen": -306.6487731933594, "logps/rejected": -232.5929718017578, "loss": 0.6917, "positive_losses": 0.03159179538488388, "rewards/accuracies": 0.75, "rewards/chosen": 0.024827757850289345, "rewards/margins": 0.008878981694579124, "rewards/margins_max": 0.031129935756325722, "rewards/margins_min": -0.011993775144219398, "rewards/margins_std": 0.019443338736891747, "rewards/rejected": 0.01594877615571022, "step": 350 }, { "dpo_losses": 0.6898253560066223, "epoch": 0.09, "grad_norm": 2.1252427803924783, "learning_rate": 4.699738903394256e-07, "logits/chosen": -2.947758197784424, "logits/rejected": -2.923774480819702, "logps/chosen": -374.69000244140625, "logps/rejected": -284.81695556640625, "loss": 0.6914, "positive_losses": 0.04075012356042862, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.024526912719011307, "rewards/margins": 0.006719623692333698, "rewards/margins_max": 0.024043092504143715, "rewards/margins_min": -0.010639860294759274, "rewards/margins_std": 0.015631282702088356, "rewards/rejected": 0.017807289958000183, "step": 360 }, { "dpo_losses": 0.6916292309761047, "epoch": 0.1, "grad_norm": 1.8193199496915609, "learning_rate": 4.830287206266319e-07, "logits/chosen": -2.6774754524230957, "logits/rejected": -2.7066006660461426, "logps/chosen": -211.9786376953125, "logps/rejected": -250.2374725341797, "loss": 0.6923, "positive_losses": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.02256864868104458, "rewards/margins": 0.0030855671502649784, "rewards/margins_max": 0.016740715131163597, "rewards/margins_min": -0.009803814813494682, "rewards/margins_std": 0.01225105207413435, "rewards/rejected": 0.019483083859086037, "step": 370 }, { "dpo_losses": 0.6892152428627014, "epoch": 0.1, "grad_norm": 1.8605234141835845, "learning_rate": 4.960835509138381e-07, "logits/chosen": -2.7890875339508057, "logits/rejected": -2.6685492992401123, "logps/chosen": -278.82476806640625, "logps/rejected": -283.8394470214844, "loss": 0.691, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.02961309812963009, "rewards/margins": 0.007972048595547676, "rewards/margins_max": 0.029571373015642166, "rewards/margins_min": -0.009612159803509712, "rewards/margins_std": 0.018037427216768265, "rewards/rejected": 0.021641049534082413, "step": 380 }, { "dpo_losses": 0.6870719194412231, "epoch": 0.1, "grad_norm": 2.048819630098869, "learning_rate": 4.999948856244767e-07, "logits/chosen": -2.91493558883667, "logits/rejected": -2.873018741607666, "logps/chosen": -285.544677734375, "logps/rejected": -257.34625244140625, "loss": 0.6894, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.03688435256481171, "rewards/margins": 0.012254970148205757, "rewards/margins_max": 0.031174445524811745, "rewards/margins_min": -0.004109539091587067, "rewards/margins_std": 0.015911713242530823, "rewards/rejected": 0.0246293805539608, "step": 390 }, { "dpo_losses": 0.6913290023803711, "epoch": 0.1, "grad_norm": 1.6765440869631212, "learning_rate": 4.999698361256577e-07, "logits/chosen": -2.7324068546295166, "logits/rejected": -2.728743076324463, "logps/chosen": -252.0926971435547, "logps/rejected": -232.7787628173828, "loss": 0.6923, "positive_losses": 0.08732833713293076, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.030203068628907204, "rewards/margins": 0.0038094078190624714, "rewards/margins_max": 0.032358523458242416, "rewards/margins_min": -0.020671313628554344, "rewards/margins_std": 0.023817040026187897, "rewards/rejected": 0.02639366313815117, "step": 400 }, { "epoch": 0.1, "eval_dpo_losses": 0.6882805824279785, "eval_logits/chosen": -2.817486047744751, "eval_logits/rejected": -2.7794790267944336, "eval_logps/chosen": -280.82073974609375, "eval_logps/rejected": -259.46270751953125, "eval_loss": 0.6901962161064148, "eval_positive_losses": 0.01547150406986475, "eval_rewards/accuracies": 0.6785714030265808, "eval_rewards/chosen": 0.03674148768186569, "eval_rewards/margins": 0.009874720126390457, "eval_rewards/margins_max": 0.042554907500743866, "eval_rewards/margins_min": -0.020002564415335655, "eval_rewards/margins_std": 0.0206435713917017, "eval_rewards/rejected": 0.026866771280765533, "eval_runtime": 399.8496, "eval_samples_per_second": 5.002, "eval_steps_per_second": 0.158, "step": 400 }, { "dpo_losses": 0.6877659559249878, "epoch": 0.11, "grad_norm": 1.8218651563532247, "learning_rate": 4.99923914217458e-07, "logits/chosen": -2.7876436710357666, "logits/rejected": -2.679112434387207, "logps/chosen": -315.75225830078125, "logps/rejected": -291.5887145996094, "loss": 0.6933, "positive_losses": 0.06274566799402237, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0355767160654068, "rewards/margins": 0.01087341457605362, "rewards/margins_max": 0.030309131368994713, "rewards/margins_min": -0.007646770682185888, "rewards/margins_std": 0.016793150454759598, "rewards/rejected": 0.02470330148935318, "step": 410 }, { "dpo_losses": 0.6863200664520264, "epoch": 0.11, "grad_norm": 4.46313080590029, "learning_rate": 4.99857123734344e-07, "logits/chosen": -2.7639195919036865, "logits/rejected": -2.7428085803985596, "logps/chosen": -267.6919860839844, "logps/rejected": -230.65151977539062, "loss": 0.6869, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03915037587285042, "rewards/margins": 0.013787394389510155, "rewards/margins_max": 0.035141073167324066, "rewards/margins_min": -0.005410642828792334, "rewards/margins_std": 0.01859271712601185, "rewards/rejected": 0.025362977758049965, "step": 420 }, { "dpo_losses": 0.6852012872695923, "epoch": 0.11, "grad_norm": 12.85730423598963, "learning_rate": 4.997694702533016e-07, "logits/chosen": -2.793182373046875, "logits/rejected": -2.6905808448791504, "logps/chosen": -296.16558837890625, "logps/rejected": -218.5005645751953, "loss": 0.6865, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.047698576003313065, "rewards/margins": 0.01618335209786892, "rewards/margins_max": 0.04767851531505585, "rewards/margins_min": -0.011801841668784618, "rewards/margins_std": 0.025887608528137207, "rewards/rejected": 0.0315152183175087, "step": 430 }, { "dpo_losses": 0.6853216886520386, "epoch": 0.12, "grad_norm": 1.8741895831606603, "learning_rate": 4.996609610933712e-07, "logits/chosen": -2.8489298820495605, "logits/rejected": -2.844210624694824, "logps/chosen": -280.28546142578125, "logps/rejected": -257.695068359375, "loss": 0.687, "positive_losses": 0.0074554444290697575, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.04551452770829201, "rewards/margins": 0.01586158759891987, "rewards/margins_max": 0.0459897443652153, "rewards/margins_min": -0.002934148535132408, "rewards/margins_std": 0.022323202341794968, "rewards/rejected": 0.029652941972017288, "step": 440 }, { "dpo_losses": 0.6888092756271362, "epoch": 0.12, "grad_norm": 2.0146739993228095, "learning_rate": 4.995316053150366e-07, "logits/chosen": -2.8890292644500732, "logits/rejected": -2.8318934440612793, "logps/chosen": -281.5171203613281, "logps/rejected": -230.30618286132812, "loss": 0.6877, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04681592434644699, "rewards/margins": 0.008902020752429962, "rewards/margins_max": 0.04170208051800728, "rewards/margins_min": -0.020604267716407776, "rewards/margins_std": 0.02728102169930935, "rewards/rejected": 0.037913911044597626, "step": 450 }, { "dpo_losses": 0.6891074180603027, "epoch": 0.12, "grad_norm": 2.0343813894613922, "learning_rate": 4.99381413719468e-07, "logits/chosen": -2.7213335037231445, "logits/rejected": -2.719630241394043, "logps/chosen": -247.08349609375, "logps/rejected": -261.12261962890625, "loss": 0.687, "positive_losses": 0.0772472396492958, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04410042613744736, "rewards/margins": 0.008285664021968842, "rewards/margins_max": 0.04062087833881378, "rewards/margins_min": -0.022265169769525528, "rewards/margins_std": 0.027511686086654663, "rewards/rejected": 0.03581475839018822, "step": 460 }, { "dpo_losses": 0.6866526007652283, "epoch": 0.12, "grad_norm": 1.3789397065423763, "learning_rate": 4.992103988476205e-07, "logits/chosen": -2.767130136489868, "logits/rejected": -2.8326306343078613, "logps/chosen": -282.37945556640625, "logps/rejected": -272.17578125, "loss": 0.6883, "positive_losses": 0.0, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.05081998184323311, "rewards/margins": 0.013341334648430347, "rewards/margins_max": 0.0426035150885582, "rewards/margins_min": -0.01956797018647194, "rewards/margins_std": 0.028744056820869446, "rewards/rejected": 0.037478648126125336, "step": 470 }, { "dpo_losses": 0.6860936880111694, "epoch": 0.13, "grad_norm": 1.4536725725069872, "learning_rate": 4.990185749791864e-07, "logits/chosen": -2.8672947883605957, "logits/rejected": -2.805577516555786, "logps/chosen": -270.2125549316406, "logps/rejected": -237.41305541992188, "loss": 0.6892, "positive_losses": 0.008692169561982155, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.05292079970240593, "rewards/margins": 0.014513748697936535, "rewards/margins_max": 0.051043324172496796, "rewards/margins_min": -0.026107680052518845, "rewards/margins_std": 0.03386848792433739, "rewards/rejected": 0.03840705007314682, "step": 480 }, { "dpo_losses": 0.6850025057792664, "epoch": 0.13, "grad_norm": 11.915723501018912, "learning_rate": 4.988059581314039e-07, "logits/chosen": -2.8255181312561035, "logits/rejected": -2.832737445831299, "logps/chosen": -317.77325439453125, "logps/rejected": -313.11309814453125, "loss": 0.6886, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05682548135519028, "rewards/margins": 0.01671528071165085, "rewards/margins_max": 0.06309463828802109, "rewards/margins_min": -0.02339771017432213, "rewards/margins_std": 0.03948745131492615, "rewards/rejected": 0.040110208094120026, "step": 490 }, { "dpo_losses": 0.6825217604637146, "epoch": 0.13, "grad_norm": 1.7951523193932593, "learning_rate": 4.985725660577184e-07, "logits/chosen": -2.7949891090393066, "logits/rejected": -2.7831168174743652, "logps/chosen": -293.61553955078125, "logps/rejected": -242.59927368164062, "loss": 0.6931, "positive_losses": 0.17864075303077698, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.059095822274684906, "rewards/margins": 0.021753598004579544, "rewards/margins_max": 0.06915644556283951, "rewards/margins_min": -0.015057327225804329, "rewards/margins_std": 0.03711111471056938, "rewards/rejected": 0.03734221309423447, "step": 500 }, { "epoch": 0.13, "eval_dpo_losses": 0.6845101714134216, "eval_logits/chosen": -2.8117895126342773, "eval_logits/rejected": -2.773566722869873, "eval_logps/chosen": -278.5678405761719, "eval_logps/rejected": -257.99334716796875, "eval_loss": 0.6883335113525391, "eval_positive_losses": 0.026538720354437828, "eval_rewards/accuracies": 0.6865079402923584, "eval_rewards/chosen": 0.0592704601585865, "eval_rewards/margins": 0.01771017536520958, "eval_rewards/margins_max": 0.07632049918174744, "eval_rewards/margins_min": -0.034054145216941833, "eval_rewards/margins_std": 0.03624986857175827, "eval_rewards/rejected": 0.04156028851866722, "eval_runtime": 388.2124, "eval_samples_per_second": 5.152, "eval_steps_per_second": 0.162, "step": 500 }, { "dpo_losses": 0.6836234331130981, "epoch": 0.13, "grad_norm": 1.7674795218094803, "learning_rate": 4.983184182463008e-07, "logits/chosen": -2.680546522140503, "logits/rejected": -2.707136392593384, "logps/chosen": -286.67022705078125, "logps/rejected": -249.7257080078125, "loss": 0.6859, "positive_losses": 0.023099135607481003, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.057285137474536896, "rewards/margins": 0.01968751661479473, "rewards/margins_max": 0.07216949760913849, "rewards/margins_min": -0.025082409381866455, "rewards/margins_std": 0.04353880137205124, "rewards/rejected": 0.037597618997097015, "step": 510 }, { "dpo_losses": 0.6839173436164856, "epoch": 0.14, "grad_norm": 1.7000138450507565, "learning_rate": 4.980435359184203e-07, "logits/chosen": -2.7557244300842285, "logits/rejected": -2.666841745376587, "logps/chosen": -268.85308837890625, "logps/rejected": -226.5156707763672, "loss": 0.6881, "positive_losses": 0.038689423352479935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.05459153652191162, "rewards/margins": 0.018840208649635315, "rewards/margins_max": 0.05965876579284668, "rewards/margins_min": -0.014995383098721504, "rewards/margins_std": 0.03378991782665253, "rewards/rejected": 0.035751327872276306, "step": 520 }, { "dpo_losses": 0.6827250719070435, "epoch": 0.14, "grad_norm": 2.027310115704259, "learning_rate": 4.977479420266723e-07, "logits/chosen": -2.8694539070129395, "logits/rejected": -2.8462345600128174, "logps/chosen": -320.22021484375, "logps/rejected": -287.8216552734375, "loss": 0.6863, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.06608148664236069, "rewards/margins": 0.021329481154680252, "rewards/margins_max": 0.06904635578393936, "rewards/margins_min": -0.013772931881248951, "rewards/margins_std": 0.038150329142808914, "rewards/rejected": 0.044752009212970734, "step": 530 }, { "dpo_losses": 0.6873120069503784, "epoch": 0.14, "grad_norm": 1.9903594362387926, "learning_rate": 4.974316612530614e-07, "logits/chosen": -2.791822910308838, "logits/rejected": -2.7837400436401367, "logps/chosen": -282.8401794433594, "logps/rejected": -246.26394653320312, "loss": 0.6846, "positive_losses": 0.026886368170380592, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.061132561415433884, "rewards/margins": 0.012306076474487782, "rewards/margins_max": 0.06637242436408997, "rewards/margins_min": -0.038782063871622086, "rewards/margins_std": 0.04558960720896721, "rewards/rejected": 0.04882648214697838, "step": 540 }, { "dpo_losses": 0.6811521053314209, "epoch": 0.14, "grad_norm": 1.7338383849703332, "learning_rate": 4.970947200069415e-07, "logits/chosen": -2.896639347076416, "logits/rejected": -2.8555636405944824, "logps/chosen": -291.7834167480469, "logps/rejected": -262.4410705566406, "loss": 0.686, "positive_losses": 0.01641998253762722, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.06366805732250214, "rewards/margins": 0.024676566943526268, "rewards/margins_max": 0.07537047564983368, "rewards/margins_min": -0.02262812666594982, "rewards/margins_std": 0.04408256709575653, "rewards/rejected": 0.038991499692201614, "step": 550 }, { "dpo_losses": 0.6864355802536011, "epoch": 0.15, "grad_norm": 8.223314997365627, "learning_rate": 4.967371464228095e-07, "logits/chosen": -2.8287415504455566, "logits/rejected": -2.8781867027282715, "logps/chosen": -292.7107238769531, "logps/rejected": -259.1818542480469, "loss": 0.6862, "positive_losses": 0.03968963772058487, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.07235224545001984, "rewards/margins": 0.014133909717202187, "rewards/margins_max": 0.06725303828716278, "rewards/margins_min": -0.04378097131848335, "rewards/margins_std": 0.04836827144026756, "rewards/rejected": 0.0582183413207531, "step": 560 }, { "dpo_losses": 0.6787232756614685, "epoch": 0.15, "grad_norm": 1.704103031540377, "learning_rate": 4.963589703579569e-07, "logits/chosen": -2.806661367416382, "logits/rejected": -2.766479969024658, "logps/chosen": -265.540771484375, "logps/rejected": -223.92953491210938, "loss": 0.6854, "positive_losses": 0.04793090745806694, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.07533114403486252, "rewards/margins": 0.0297311432659626, "rewards/margins_max": 0.08528684079647064, "rewards/margins_min": -0.022052111104130745, "rewards/margins_std": 0.04731011018157005, "rewards/rejected": 0.045600004494190216, "step": 570 }, { "dpo_losses": 0.673004686832428, "epoch": 0.15, "grad_norm": 9.335852918680665, "learning_rate": 4.959602233899761e-07, "logits/chosen": -2.9390029907226562, "logits/rejected": -2.87630033493042, "logps/chosen": -360.3851623535156, "logps/rejected": -260.69146728515625, "loss": 0.6863, "positive_losses": 0.02658386155962944, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.09925512969493866, "rewards/margins": 0.04134751111268997, "rewards/margins_max": 0.08122013509273529, "rewards/margins_min": -0.0026921990793198347, "rewards/margins_std": 0.03794340789318085, "rewards/rejected": 0.05790762975811958, "step": 580 }, { "dpo_losses": 0.6868249177932739, "epoch": 0.15, "grad_norm": 1.5910712524849442, "learning_rate": 4.955409388141243e-07, "logits/chosen": -2.8337204456329346, "logits/rejected": -2.7996432781219482, "logps/chosen": -251.7097625732422, "logps/rejected": -233.59188842773438, "loss": 0.684, "positive_losses": 0.04377937316894531, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.07259367406368256, "rewards/margins": 0.013290634378790855, "rewards/margins_max": 0.06030074879527092, "rewards/margins_min": -0.041350338608026505, "rewards/margins_std": 0.04475581645965576, "rewards/rejected": 0.05930304527282715, "step": 590 }, { "dpo_losses": 0.6834608912467957, "epoch": 0.16, "grad_norm": 1.6366856100510174, "learning_rate": 4.951011516405429e-07, "logits/chosen": -2.904270648956299, "logits/rejected": -2.8730576038360596, "logps/chosen": -372.6429748535156, "logps/rejected": -296.50006103515625, "loss": 0.6831, "positive_losses": 0.017871856689453125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.08529319614171982, "rewards/margins": 0.02043917588889599, "rewards/margins_max": 0.08833781629800797, "rewards/margins_min": -0.039192624390125275, "rewards/margins_std": 0.057125210762023926, "rewards/rejected": 0.06485401839017868, "step": 600 }, { "epoch": 0.16, "eval_dpo_losses": 0.68129962682724, "eval_logits/chosen": -2.8138515949249268, "eval_logits/rejected": -2.7760884761810303, "eval_logps/chosen": -276.13824462890625, "eval_logps/rejected": -256.2458190917969, "eval_loss": 0.6870078444480896, "eval_positive_losses": 0.03946831822395325, "eval_rewards/accuracies": 0.6964285969734192, "eval_rewards/chosen": 0.08356665074825287, "eval_rewards/margins": 0.024530887603759766, "eval_rewards/margins_max": 0.10627257823944092, "eval_rewards/margins_min": -0.047064412385225296, "eval_rewards/margins_std": 0.05019204691052437, "eval_rewards/rejected": 0.0590357705950737, "eval_runtime": 388.6656, "eval_samples_per_second": 5.146, "eval_steps_per_second": 0.162, "step": 600 }, { "dpo_losses": 0.6777583360671997, "epoch": 0.16, "grad_norm": 8.536802615813908, "learning_rate": 4.946408985913344e-07, "logits/chosen": -2.8229687213897705, "logits/rejected": -2.768874168395996, "logps/chosen": -334.3592834472656, "logps/rejected": -264.8375549316406, "loss": 0.6821, "positive_losses": 0.14571304619312286, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.09296802431344986, "rewards/margins": 0.03191830962896347, "rewards/margins_max": 0.0921122133731842, "rewards/margins_min": -0.02777908369898796, "rewards/margins_std": 0.056708864867687225, "rewards/rejected": 0.06104971095919609, "step": 610 }, { "dpo_losses": 0.6855438947677612, "epoch": 0.16, "grad_norm": 7.472399980903684, "learning_rate": 4.941602180974958e-07, "logits/chosen": -2.8649790287017822, "logits/rejected": -2.816356897354126, "logps/chosen": -259.48626708984375, "logps/rejected": -225.6002960205078, "loss": 0.6815, "positive_losses": 0.0016845703357830644, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06873434782028198, "rewards/margins": 0.016284704208374023, "rewards/margins_max": 0.07119239866733551, "rewards/margins_min": -0.036721087992191315, "rewards/margins_std": 0.050042442977428436, "rewards/rejected": 0.05244964361190796, "step": 620 }, { "dpo_losses": 0.6780611872673035, "epoch": 0.16, "grad_norm": 1.7025178999667026, "learning_rate": 4.936591502957101e-07, "logits/chosen": -2.820408582687378, "logits/rejected": -2.810722589492798, "logps/chosen": -285.3260498046875, "logps/rejected": -270.40850830078125, "loss": 0.6891, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09266801178455353, "rewards/margins": 0.03133053332567215, "rewards/margins_max": 0.09321032464504242, "rewards/margins_min": -0.029340893030166626, "rewards/margins_std": 0.053192298859357834, "rewards/rejected": 0.06133747845888138, "step": 630 }, { "dpo_losses": 0.6769616007804871, "epoch": 0.17, "grad_norm": 9.36334439838426, "learning_rate": 4.931377370249945e-07, "logits/chosen": -2.8390297889709473, "logits/rejected": -2.7957053184509277, "logps/chosen": -335.83233642578125, "logps/rejected": -253.6592559814453, "loss": 0.6861, "positive_losses": 0.010548400692641735, "rewards/accuracies": 0.75, "rewards/chosen": 0.09559468179941177, "rewards/margins": 0.03336268663406372, "rewards/margins_max": 0.08360464125871658, "rewards/margins_min": -0.01670246385037899, "rewards/margins_std": 0.045782435685396194, "rewards/rejected": 0.06223199516534805, "step": 640 }, { "dpo_losses": 0.6821959614753723, "epoch": 0.17, "grad_norm": 2.141563710905807, "learning_rate": 4.925960218232072e-07, "logits/chosen": -2.853116035461426, "logits/rejected": -2.851471424102783, "logps/chosen": -270.49798583984375, "logps/rejected": -272.83270263671875, "loss": 0.683, "positive_losses": 0.07119579613208771, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08185555040836334, "rewards/margins": 0.022922957316040993, "rewards/margins_max": 0.09071676433086395, "rewards/margins_min": -0.03058004379272461, "rewards/margins_std": 0.053156398236751556, "rewards/rejected": 0.0589325949549675, "step": 650 }, { "dpo_losses": 0.6855509877204895, "epoch": 0.17, "grad_norm": 1.6887987578473747, "learning_rate": 4.920340499234116e-07, "logits/chosen": -2.834197998046875, "logits/rejected": -2.8090615272521973, "logps/chosen": -231.2488555908203, "logps/rejected": -246.72470092773438, "loss": 0.688, "positive_losses": 0.10030250251293182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08967218548059464, "rewards/margins": 0.016500327736139297, "rewards/margins_max": 0.09120874851942062, "rewards/margins_min": -0.06212595850229263, "rewards/margins_std": 0.06792709976434708, "rewards/rejected": 0.07317186146974564, "step": 660 }, { "dpo_losses": 0.6879802346229553, "epoch": 0.18, "grad_norm": 1.766555473439177, "learning_rate": 4.914518682500995e-07, "logits/chosen": -2.834306240081787, "logits/rejected": -2.8038411140441895, "logps/chosen": -239.792724609375, "logps/rejected": -240.0399169921875, "loss": 0.6927, "positive_losses": 0.2406913787126541, "rewards/accuracies": 0.625, "rewards/chosen": 0.07756872475147247, "rewards/margins": 0.011351787485182285, "rewards/margins_max": 0.0727124959230423, "rewards/margins_min": -0.05722881481051445, "rewards/margins_std": 0.05708152800798416, "rewards/rejected": 0.06621693819761276, "step": 670 }, { "dpo_losses": 0.6810533404350281, "epoch": 0.18, "grad_norm": 1.791549885810375, "learning_rate": 4.90849525415273e-07, "logits/chosen": -2.761420488357544, "logits/rejected": -2.7136807441711426, "logps/chosen": -319.48455810546875, "logps/rejected": -283.24859619140625, "loss": 0.6814, "positive_losses": 0.04459686204791069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09917163103818893, "rewards/margins": 0.02512495219707489, "rewards/margins_max": 0.07780589163303375, "rewards/margins_min": -0.030969727784395218, "rewards/margins_std": 0.048123858869075775, "rewards/rejected": 0.07404667884111404, "step": 680 }, { "dpo_losses": 0.6744126081466675, "epoch": 0.18, "grad_norm": 1.7544814358215872, "learning_rate": 4.902270717143858e-07, "logits/chosen": -2.786694288253784, "logits/rejected": -2.731166362762451, "logps/chosen": -276.16217041015625, "logps/rejected": -235.80343627929688, "loss": 0.6849, "positive_losses": 0.00790252722799778, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.08214370906352997, "rewards/margins": 0.03874465078115463, "rewards/margins_max": 0.09999613463878632, "rewards/margins_min": -0.015251509845256805, "rewards/margins_std": 0.04996136948466301, "rewards/rejected": 0.043399058282375336, "step": 690 }, { "dpo_losses": 0.6797881126403809, "epoch": 0.18, "grad_norm": 8.765541878644441, "learning_rate": 4.895845591221426e-07, "logits/chosen": -2.7817792892456055, "logits/rejected": -2.749866008758545, "logps/chosen": -264.08209228515625, "logps/rejected": -254.13796997070312, "loss": 0.6843, "positive_losses": 0.13192901015281677, "rewards/accuracies": 0.625, "rewards/chosen": 0.08249294757843018, "rewards/margins": 0.028021205216646194, "rewards/margins_max": 0.1065501719713211, "rewards/margins_min": -0.030667319893836975, "rewards/margins_std": 0.060979198664426804, "rewards/rejected": 0.05447175353765488, "step": 700 }, { "epoch": 0.18, "eval_dpo_losses": 0.6787125468254089, "eval_logits/chosen": -2.8047168254852295, "eval_logits/rejected": -2.766716480255127, "eval_logps/chosen": -275.483642578125, "eval_logps/rejected": -256.1453857421875, "eval_loss": 0.6863225102424622, "eval_positive_losses": 0.05308011174201965, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 0.09011287987232208, "eval_rewards/margins": 0.030073018744587898, "eval_rewards/margins_max": 0.1277952939271927, "eval_rewards/margins_min": -0.05525188893079758, "eval_rewards/margins_std": 0.059921521693468094, "eval_rewards/rejected": 0.060039862990379333, "eval_runtime": 389.1185, "eval_samples_per_second": 5.14, "eval_steps_per_second": 0.162, "step": 700 }, { "dpo_losses": 0.6902292966842651, "epoch": 0.19, "grad_norm": 1.685836395824075, "learning_rate": 4.8892204128816e-07, "logits/chosen": -2.836456775665283, "logits/rejected": -2.8118834495544434, "logps/chosen": -216.6274871826172, "logps/rejected": -182.49378967285156, "loss": 0.6842, "positive_losses": 0.022132491692900658, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0716724544763565, "rewards/margins": 0.006178057286888361, "rewards/margins_max": 0.04306349158287048, "rewards/margins_min": -0.03043294884264469, "rewards/margins_std": 0.03296063840389252, "rewards/rejected": 0.06549438089132309, "step": 710 }, { "dpo_losses": 0.6763695478439331, "epoch": 0.19, "grad_norm": 2.1240272539477205, "learning_rate": 4.882395735324863e-07, "logits/chosen": -2.751163959503174, "logits/rejected": -2.6967310905456543, "logps/chosen": -358.1401062011719, "logps/rejected": -294.829833984375, "loss": 0.6781, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11145804822444916, "rewards/margins": 0.035284630954265594, "rewards/margins_max": 0.11927734315395355, "rewards/margins_min": -0.04303320124745369, "rewards/margins_std": 0.07272578030824661, "rewards/rejected": 0.07617342472076416, "step": 720 }, { "dpo_losses": 0.6721448302268982, "epoch": 0.19, "grad_norm": 2.0424881145453653, "learning_rate": 4.875372128409829e-07, "logits/chosen": -2.9102444648742676, "logits/rejected": -2.827521562576294, "logps/chosen": -305.36785888671875, "logps/rejected": -248.75381469726562, "loss": 0.679, "positive_losses": 0.03110790252685547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10629504919052124, "rewards/margins": 0.043546758592128754, "rewards/margins_max": 0.11803199350833893, "rewards/margins_min": -0.011555962264537811, "rewards/margins_std": 0.06165642663836479, "rewards/rejected": 0.06274829059839249, "step": 730 }, { "dpo_losses": 0.6869141459465027, "epoch": 0.19, "grad_norm": 1.7550163579107814, "learning_rate": 4.868150178605653e-07, "logits/chosen": -2.800950050354004, "logits/rejected": -2.857544422149658, "logps/chosen": -259.14111328125, "logps/rejected": -322.3183898925781, "loss": 0.6882, "positive_losses": 0.17758464813232422, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.09367353469133377, "rewards/margins": 0.014143924228847027, "rewards/margins_max": 0.10260222107172012, "rewards/margins_min": -0.0663861557841301, "rewards/margins_std": 0.07564298063516617, "rewards/rejected": 0.07952960580587387, "step": 740 }, { "dpo_losses": 0.6848233938217163, "epoch": 0.2, "grad_norm": 2.134068604758163, "learning_rate": 4.860730488943068e-07, "logits/chosen": -2.7879323959350586, "logits/rejected": -2.856818914413452, "logps/chosen": -243.7280731201172, "logps/rejected": -260.96710205078125, "loss": 0.6914, "positive_losses": 0.08630981296300888, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.09625248610973358, "rewards/margins": 0.017914965748786926, "rewards/margins_max": 0.09145281463861465, "rewards/margins_min": -0.04039599746465683, "rewards/margins_std": 0.058698005974292755, "rewards/rejected": 0.07833750545978546, "step": 750 }, { "dpo_losses": 0.664521336555481, "epoch": 0.2, "grad_norm": 1.9969820856504785, "learning_rate": 4.853113678964021e-07, "logits/chosen": -2.883469820022583, "logits/rejected": -2.8021795749664307, "logps/chosen": -285.27667236328125, "logps/rejected": -236.72909545898438, "loss": 0.6755, "positive_losses": 0.0, "rewards/accuracies": 0.875, "rewards/chosen": 0.12383918464183807, "rewards/margins": 0.058944981545209885, "rewards/margins_max": 0.12066600471735, "rewards/margins_min": 0.015928596258163452, "rewards/margins_std": 0.04676266759634018, "rewards/rejected": 0.06489420682191849, "step": 760 }, { "dpo_losses": 0.6735109686851501, "epoch": 0.2, "grad_norm": 1.87848136681826, "learning_rate": 4.845300384669957e-07, "logits/chosen": -2.84818172454834, "logits/rejected": -2.7825305461883545, "logps/chosen": -304.6402893066406, "logps/rejected": -265.74615478515625, "loss": 0.6755, "positive_losses": 0.07650699466466904, "rewards/accuracies": 0.625, "rewards/chosen": 0.10168993473052979, "rewards/margins": 0.041267722845077515, "rewards/margins_max": 0.14881454408168793, "rewards/margins_min": -0.037732165306806564, "rewards/margins_std": 0.08322665095329285, "rewards/rejected": 0.06042221933603287, "step": 770 }, { "dpo_losses": 0.6793437004089355, "epoch": 0.2, "grad_norm": 9.001723654254144, "learning_rate": 4.8372912584687e-07, "logits/chosen": -2.8889095783233643, "logits/rejected": -2.825033664703369, "logps/chosen": -301.7953796386719, "logps/rejected": -273.88629150390625, "loss": 0.6872, "positive_losses": 0.2866264283657074, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10100637376308441, "rewards/margins": 0.02894478105008602, "rewards/margins_max": 0.10271352529525757, "rewards/margins_min": -0.028590435162186623, "rewards/margins_std": 0.06048471853137016, "rewards/rejected": 0.07206159085035324, "step": 780 }, { "dpo_losses": 0.6845918893814087, "epoch": 0.21, "grad_norm": 1.9355510630788444, "learning_rate": 4.829086969119983e-07, "logits/chosen": -2.9074501991271973, "logits/rejected": -2.8751301765441895, "logps/chosen": -280.39056396484375, "logps/rejected": -259.8931579589844, "loss": 0.6946, "positive_losses": 0.09826965630054474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.08458300679922104, "rewards/margins": 0.0185464508831501, "rewards/margins_max": 0.0906822457909584, "rewards/margins_min": -0.058589279651641846, "rewards/margins_std": 0.06743675470352173, "rewards/rejected": 0.06603654474020004, "step": 790 }, { "dpo_losses": 0.6688677072525024, "epoch": 0.21, "grad_norm": 8.99267152598806, "learning_rate": 4.820688201679605e-07, "logits/chosen": -2.7524070739746094, "logits/rejected": -2.7194697856903076, "logps/chosen": -343.477294921875, "logps/rejected": -247.39419555664062, "loss": 0.678, "positive_losses": 0.03696594387292862, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.09179838001728058, "rewards/margins": 0.05082429200410843, "rewards/margins_max": 0.15096034109592438, "rewards/margins_min": -0.020338425412774086, "rewards/margins_std": 0.07840122282505035, "rewards/rejected": 0.04097408801317215, "step": 800 }, { "epoch": 0.21, "eval_dpo_losses": 0.6756108999252319, "eval_logits/chosen": -2.8027398586273193, "eval_logits/rejected": -2.7648611068725586, "eval_logps/chosen": -274.7101745605469, "eval_logps/rejected": -256.04681396484375, "eval_loss": 0.6881770491600037, "eval_positive_losses": 0.09072524309158325, "eval_rewards/accuracies": 0.7003968358039856, "eval_rewards/chosen": 0.09784739464521408, "eval_rewards/margins": 0.03682180121541023, "eval_rewards/margins_max": 0.15395967662334442, "eval_rewards/margins_min": -0.06626705825328827, "eval_rewards/margins_std": 0.07224141061306, "eval_rewards/rejected": 0.06102558597922325, "eval_runtime": 388.4571, "eval_samples_per_second": 5.149, "eval_steps_per_second": 0.162, "step": 800 }, { "dpo_losses": 0.6745853424072266, "epoch": 0.21, "grad_norm": 28.433617094113924, "learning_rate": 4.812095657442231e-07, "logits/chosen": -2.875488519668579, "logits/rejected": -2.8277158737182617, "logps/chosen": -298.0208435058594, "logps/rejected": -262.80096435546875, "loss": 0.688, "positive_losses": 0.01885681226849556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1029839739203453, "rewards/margins": 0.039221733808517456, "rewards/margins_max": 0.14284637570381165, "rewards/margins_min": -0.045712970197200775, "rewards/margins_std": 0.0840546116232872, "rewards/rejected": 0.06376224756240845, "step": 810 }, { "dpo_losses": 0.6649328470230103, "epoch": 0.21, "grad_norm": 15.648486742464959, "learning_rate": 4.803310053882831e-07, "logits/chosen": -2.736679792404175, "logits/rejected": -2.6685967445373535, "logps/chosen": -259.08319091796875, "logps/rejected": -206.3892364501953, "loss": 0.6822, "positive_losses": 0.01941223070025444, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.10811863094568253, "rewards/margins": 0.05877862125635147, "rewards/margins_max": 0.1449100822210312, "rewards/margins_min": -0.013782364316284657, "rewards/margins_std": 0.06897840648889542, "rewards/rejected": 0.04934001341462135, "step": 820 }, { "dpo_losses": 0.6795674562454224, "epoch": 0.22, "grad_norm": 1.9645455287668387, "learning_rate": 4.794332124596775e-07, "logits/chosen": -2.768758535385132, "logits/rejected": -2.727670669555664, "logps/chosen": -312.9381103515625, "logps/rejected": -294.339599609375, "loss": 0.7017, "positive_losses": 0.09621696174144745, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.09689287841320038, "rewards/margins": 0.029259273782372475, "rewards/margins_max": 0.11800308525562286, "rewards/margins_min": -0.03915540874004364, "rewards/margins_std": 0.07343783229589462, "rewards/rejected": 0.06763359904289246, "step": 830 }, { "dpo_losses": 0.6785067319869995, "epoch": 0.22, "grad_norm": 18.788130174373894, "learning_rate": 4.785162619238574e-07, "logits/chosen": -2.8277535438537598, "logits/rejected": -2.8106484413146973, "logps/chosen": -246.59152221679688, "logps/rejected": -203.9308624267578, "loss": 0.6877, "positive_losses": 0.08641128242015839, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.08607254922389984, "rewards/margins": 0.030666109174489975, "rewards/margins_max": 0.10051500797271729, "rewards/margins_min": -0.037806764245033264, "rewards/margins_std": 0.06067012995481491, "rewards/rejected": 0.055406440049409866, "step": 840 }, { "dpo_losses": 0.6822186708450317, "epoch": 0.22, "grad_norm": 2.1377190683995804, "learning_rate": 4.775802303459287e-07, "logits/chosen": -2.753934144973755, "logits/rejected": -2.715641736984253, "logps/chosen": -223.06021118164062, "logps/rejected": -232.00149536132812, "loss": 0.6773, "positive_losses": 0.04858856275677681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09215366095304489, "rewards/margins": 0.023227885365486145, "rewards/margins_max": 0.07749857753515244, "rewards/margins_min": -0.0410555824637413, "rewards/margins_std": 0.055815864354372025, "rewards/rejected": 0.06892578303813934, "step": 850 }, { "dpo_losses": 0.6828041076660156, "epoch": 0.23, "grad_norm": 1.9617373594360756, "learning_rate": 4.766251958842589e-07, "logits/chosen": -2.727834463119507, "logits/rejected": -2.806042194366455, "logps/chosen": -139.188720703125, "logps/rejected": -196.4252166748047, "loss": 0.6854, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.09413941949605942, "rewards/margins": 0.021806001663208008, "rewards/margins_max": 0.08895751088857651, "rewards/margins_min": -0.04246797785162926, "rewards/margins_std": 0.05625462532043457, "rewards/rejected": 0.07233341783285141, "step": 860 }, { "dpo_losses": 0.6727257966995239, "epoch": 0.23, "grad_norm": 1.9573024173567872, "learning_rate": 4.756512382839506e-07, "logits/chosen": -2.760953187942505, "logits/rejected": -2.732851505279541, "logps/chosen": -296.09368896484375, "logps/rejected": -229.08847045898438, "loss": 0.6927, "positive_losses": 0.15652236342430115, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1098223552107811, "rewards/margins": 0.0430893711745739, "rewards/margins_max": 0.1367332488298416, "rewards/margins_min": -0.034009821712970734, "rewards/margins_std": 0.07827076315879822, "rewards/rejected": 0.0667329877614975, "step": 870 }, { "dpo_losses": 0.6918343305587769, "epoch": 0.23, "grad_norm": 1.8900833615079249, "learning_rate": 4.746584388701831e-07, "logits/chosen": -2.8361616134643555, "logits/rejected": -2.8230698108673096, "logps/chosen": -255.2892608642578, "logps/rejected": -299.9884033203125, "loss": 0.6891, "positive_losses": 0.1362869292497635, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.10028652846813202, "rewards/margins": 0.003979234956204891, "rewards/margins_max": 0.07634096592664719, "rewards/margins_min": -0.07755633443593979, "rewards/margins_std": 0.06962737441062927, "rewards/rejected": 0.09630729258060455, "step": 880 }, { "dpo_losses": 0.6761296987533569, "epoch": 0.23, "grad_norm": 15.70216978593494, "learning_rate": 4.736468805414218e-07, "logits/chosen": -2.7853493690490723, "logits/rejected": -2.737541675567627, "logps/chosen": -294.12774658203125, "logps/rejected": -244.80654907226562, "loss": 0.6859, "positive_losses": 0.10331420600414276, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.10274813324213028, "rewards/margins": 0.036404069513082504, "rewards/margins_max": 0.12679249048233032, "rewards/margins_min": -0.040073007345199585, "rewards/margins_std": 0.07672830671072006, "rewards/rejected": 0.06634406000375748, "step": 890 }, { "dpo_losses": 0.6793249845504761, "epoch": 0.24, "grad_norm": 1.811255981772413, "learning_rate": 4.7261664776249595e-07, "logits/chosen": -2.782780885696411, "logits/rejected": -2.7844455242156982, "logps/chosen": -217.6700439453125, "logps/rejected": -262.0546569824219, "loss": 0.6788, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12033214420080185, "rewards/margins": 0.029326725751161575, "rewards/margins_max": 0.10667027533054352, "rewards/margins_min": -0.06298742443323135, "rewards/margins_std": 0.07723227888345718, "rewards/rejected": 0.09100539982318878, "step": 900 }, { "epoch": 0.24, "eval_dpo_losses": 0.6740882396697998, "eval_logits/chosen": -2.8025319576263428, "eval_logits/rejected": -2.7642123699188232, "eval_logps/chosen": -272.8671875, "eval_logps/rejected": -254.53573608398438, "eval_loss": 0.6861064434051514, "eval_positive_losses": 0.0828118622303009, "eval_rewards/accuracies": 0.7123016119003296, "eval_rewards/chosen": 0.11627738922834396, "eval_rewards/margins": 0.04014097899198532, "eval_rewards/margins_max": 0.16791382431983948, "eval_rewards/margins_min": -0.06933987140655518, "eval_rewards/margins_std": 0.07767920196056366, "eval_rewards/rejected": 0.07613641023635864, "eval_runtime": 388.6477, "eval_samples_per_second": 5.146, "eval_steps_per_second": 0.162, "step": 900 }, { "dpo_losses": 0.6753177046775818, "epoch": 0.24, "grad_norm": 11.841192366899621, "learning_rate": 4.7156782655754624e-07, "logits/chosen": -2.8079895973205566, "logits/rejected": -2.7867820262908936, "logps/chosen": -257.70550537109375, "logps/rejected": -235.9842529296875, "loss": 0.6895, "positive_losses": 0.18343773484230042, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11322704702615738, "rewards/margins": 0.03787517175078392, "rewards/margins_max": 0.11973077058792114, "rewards/margins_min": -0.05645657330751419, "rewards/margins_std": 0.07862423360347748, "rewards/rejected": 0.07535187900066376, "step": 910 }, { "dpo_losses": 0.6834079623222351, "epoch": 0.24, "grad_norm": 8.095979950408838, "learning_rate": 4.705005045028414e-07, "logits/chosen": -2.822742462158203, "logits/rejected": -2.7415847778320312, "logps/chosen": -278.8885498046875, "logps/rejected": -243.7444305419922, "loss": 0.6852, "positive_losses": 0.06942252814769745, "rewards/accuracies": 0.625, "rewards/chosen": 0.11463338136672974, "rewards/margins": 0.021800417453050613, "rewards/margins_max": 0.1151091605424881, "rewards/margins_min": -0.07252896577119827, "rewards/margins_std": 0.08234294503927231, "rewards/rejected": 0.09283297508955002, "step": 920 }, { "dpo_losses": 0.6700179576873779, "epoch": 0.24, "grad_norm": 6.0696545100674415, "learning_rate": 4.694147707194659e-07, "logits/chosen": -2.7774658203125, "logits/rejected": -2.689136028289795, "logps/chosen": -322.48638916015625, "logps/rejected": -286.92474365234375, "loss": 0.6783, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12033887207508087, "rewards/margins": 0.04955907538533211, "rewards/margins_max": 0.16666612029075623, "rewards/margins_min": -0.06129683926701546, "rewards/margins_std": 0.10141287744045258, "rewards/rejected": 0.07077980041503906, "step": 930 }, { "dpo_losses": 0.6771188974380493, "epoch": 0.25, "grad_norm": 10.970419210597651, "learning_rate": 4.683107158658781e-07, "logits/chosen": -2.7917165756225586, "logits/rejected": -2.7873263359069824, "logps/chosen": -296.0466003417969, "logps/rejected": -269.75799560546875, "loss": 0.6823, "positive_losses": 0.11498375236988068, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11963772773742676, "rewards/margins": 0.03364872187376022, "rewards/margins_max": 0.1158483475446701, "rewards/margins_min": -0.04820042848587036, "rewards/margins_std": 0.07006116211414337, "rewards/rejected": 0.08598899841308594, "step": 940 }, { "dpo_losses": 0.6775475740432739, "epoch": 0.25, "grad_norm": 8.962956381527691, "learning_rate": 4.6718843213034066e-07, "logits/chosen": -2.8136909008026123, "logits/rejected": -2.8473093509674072, "logps/chosen": -261.88470458984375, "logps/rejected": -299.9488220214844, "loss": 0.6828, "positive_losses": 0.07727966457605362, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11626646667718887, "rewards/margins": 0.03341097757220268, "rewards/margins_max": 0.12769392132759094, "rewards/margins_min": -0.06785953044891357, "rewards/margins_std": 0.08994203805923462, "rewards/rejected": 0.08285548537969589, "step": 950 }, { "dpo_losses": 0.6749299764633179, "epoch": 0.25, "grad_norm": 1.979032277681068, "learning_rate": 4.660480132232224e-07, "logits/chosen": -2.8280081748962402, "logits/rejected": -2.727102756500244, "logps/chosen": -358.97918701171875, "logps/rejected": -292.6557922363281, "loss": 0.6805, "positive_losses": 0.10472335666418076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11151669174432755, "rewards/margins": 0.038539350032806396, "rewards/margins_max": 0.1373523324728012, "rewards/margins_min": -0.047525554895401, "rewards/margins_std": 0.08352877199649811, "rewards/rejected": 0.07297734171152115, "step": 960 }, { "dpo_losses": 0.6668910980224609, "epoch": 0.25, "grad_norm": 1.888579531032461, "learning_rate": 4.64889554369174e-07, "logits/chosen": -2.7996866703033447, "logits/rejected": -2.7919023036956787, "logps/chosen": -318.7244567871094, "logps/rejected": -272.59552001953125, "loss": 0.6809, "positive_losses": 0.05099544674158096, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.12181965261697769, "rewards/margins": 0.054952751845121384, "rewards/margins_max": 0.14387230575084686, "rewards/margins_min": -0.03384255990386009, "rewards/margins_std": 0.07864460349082947, "rewards/rejected": 0.0668669119477272, "step": 970 }, { "dpo_losses": 0.6755861043930054, "epoch": 0.26, "grad_norm": 1.7811269884967806, "learning_rate": 4.637131522991764e-07, "logits/chosen": -2.8695130348205566, "logits/rejected": -2.8582377433776855, "logps/chosen": -295.46136474609375, "logps/rejected": -235.12045288085938, "loss": 0.6812, "positive_losses": 0.07995452731847763, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10505534708499908, "rewards/margins": 0.03675197437405586, "rewards/margins_max": 0.11415354907512665, "rewards/margins_min": -0.03910742700099945, "rewards/margins_std": 0.06744858622550964, "rewards/rejected": 0.06830336898565292, "step": 980 }, { "dpo_losses": 0.6747775077819824, "epoch": 0.26, "grad_norm": 1.7332998156363515, "learning_rate": 4.6251890524246375e-07, "logits/chosen": -2.793673038482666, "logits/rejected": -2.7048346996307373, "logps/chosen": -328.4063415527344, "logps/rejected": -283.97613525390625, "loss": 0.6845, "positive_losses": 0.3158671259880066, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11113989353179932, "rewards/margins": 0.039355430752038956, "rewards/margins_max": 0.12707474827766418, "rewards/margins_min": -0.06679748743772507, "rewards/margins_std": 0.08647724986076355, "rewards/rejected": 0.07178448140621185, "step": 990 }, { "dpo_losses": 0.6791088581085205, "epoch": 0.26, "grad_norm": 11.379057108294523, "learning_rate": 4.613069129183218e-07, "logits/chosen": -2.796535015106201, "logits/rejected": -2.753174304962158, "logps/chosen": -227.5648193359375, "logps/rejected": -206.048095703125, "loss": 0.6883, "positive_losses": 0.13388271629810333, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.09791069477796555, "rewards/margins": 0.029800478368997574, "rewards/margins_max": 0.11369460821151733, "rewards/margins_min": -0.057328272610902786, "rewards/margins_std": 0.07600894570350647, "rewards/rejected": 0.06811021268367767, "step": 1000 }, { "epoch": 0.26, "eval_dpo_losses": 0.6726115345954895, "eval_logits/chosen": -2.803205966949463, "eval_logits/rejected": -2.7647831439971924, "eval_logps/chosen": -272.3450622558594, "eval_logps/rejected": -254.33460998535156, "eval_loss": 0.6859395503997803, "eval_positive_losses": 0.09098844230175018, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": 0.12149831652641296, "eval_rewards/margins": 0.04335065931081772, "eval_rewards/margins_max": 0.17721711099147797, "eval_rewards/margins_min": -0.0735059604048729, "eval_rewards/margins_std": 0.0821399986743927, "eval_rewards/rejected": 0.07814766466617584, "eval_runtime": 388.4799, "eval_samples_per_second": 5.148, "eval_steps_per_second": 0.162, "step": 1000 }, { "dpo_losses": 0.6663374900817871, "epoch": 0.26, "grad_norm": 1.8741883110446367, "learning_rate": 4.6007727652776065e-07, "logits/chosen": -2.8542165756225586, "logits/rejected": -2.7912869453430176, "logps/chosen": -271.45343017578125, "logps/rejected": -273.6263427734375, "loss": 0.6792, "positive_losses": 0.1299796998500824, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11328332126140594, "rewards/margins": 0.05630182474851608, "rewards/margins_max": 0.16808533668518066, "rewards/margins_min": -0.02279646322131157, "rewards/margins_std": 0.08499892055988312, "rewards/rejected": 0.05698147416114807, "step": 1010 }, { "dpo_losses": 0.6849120855331421, "epoch": 0.27, "grad_norm": 1.6245219187621243, "learning_rate": 4.588300987450652e-07, "logits/chosen": -2.9056270122528076, "logits/rejected": -2.8224825859069824, "logps/chosen": -256.3388977050781, "logps/rejected": -264.8789978027344, "loss": 0.6981, "positive_losses": 0.16140174865722656, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.10395157337188721, "rewards/margins": 0.017809074372053146, "rewards/margins_max": 0.08751221001148224, "rewards/margins_min": -0.04618312045931816, "rewards/margins_std": 0.062398601323366165, "rewards/rejected": 0.08614251017570496, "step": 1020 }, { "dpo_losses": 0.6712228059768677, "epoch": 0.27, "grad_norm": 10.216083517021472, "learning_rate": 4.5756548370922134e-07, "logits/chosen": -2.810163736343384, "logits/rejected": -2.8218376636505127, "logps/chosen": -289.07733154296875, "logps/rejected": -286.98638916015625, "loss": 0.6867, "positive_losses": 0.20693854987621307, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11013605445623398, "rewards/margins": 0.04705684632062912, "rewards/margins_max": 0.1697276532649994, "rewards/margins_min": -0.04310871288180351, "rewards/margins_std": 0.09699669480323792, "rewards/rejected": 0.06307922303676605, "step": 1030 }, { "dpo_losses": 0.6609120965003967, "epoch": 0.27, "grad_norm": 1.8241041911257145, "learning_rate": 4.5628353701522047e-07, "logits/chosen": -2.743584632873535, "logits/rejected": -2.688563823699951, "logps/chosen": -285.1751708984375, "logps/rejected": -252.52377319335938, "loss": 0.6819, "positive_losses": 0.06223297119140625, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.12947912514209747, "rewards/margins": 0.06809819489717484, "rewards/margins_max": 0.18549703061580658, "rewards/margins_min": -0.023058805614709854, "rewards/margins_std": 0.09334772080183029, "rewards/rejected": 0.06138092279434204, "step": 1040 }, { "dpo_losses": 0.6756216287612915, "epoch": 0.27, "grad_norm": 11.2175543952917, "learning_rate": 4.549843657052429e-07, "logits/chosen": -2.802422046661377, "logits/rejected": -2.721710205078125, "logps/chosen": -314.43414306640625, "logps/rejected": -288.079345703125, "loss": 0.6818, "positive_losses": 0.06746216118335724, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12974092364311218, "rewards/margins": 0.03823871165513992, "rewards/margins_max": 0.15835562348365784, "rewards/margins_min": -0.07311789691448212, "rewards/margins_std": 0.10263626277446747, "rewards/rejected": 0.09150221198797226, "step": 1050 }, { "dpo_losses": 0.6694084405899048, "epoch": 0.28, "grad_norm": 1.9713422270144096, "learning_rate": 4.5366807825971907e-07, "logits/chosen": -2.846350908279419, "logits/rejected": -2.7868642807006836, "logps/chosen": -340.27117919921875, "logps/rejected": -330.462646484375, "loss": 0.6793, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13603763282299042, "rewards/margins": 0.050386250019073486, "rewards/margins_max": 0.14835581183433533, "rewards/margins_min": -0.02989931032061577, "rewards/margins_std": 0.0805257111787796, "rewards/rejected": 0.08565138280391693, "step": 1060 }, { "dpo_losses": 0.6645206212997437, "epoch": 0.28, "grad_norm": 5.586548330348038, "learning_rate": 4.5233478458827176e-07, "logits/chosen": -2.840010166168213, "logits/rejected": -2.842350959777832, "logps/chosen": -247.47793579101562, "logps/rejected": -243.4279022216797, "loss": 0.6846, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.11831434071063995, "rewards/margins": 0.060487616807222366, "rewards/margins_max": 0.1816258579492569, "rewards/margins_min": -0.043870292603969574, "rewards/margins_std": 0.09767617285251617, "rewards/rejected": 0.05782672017812729, "step": 1070 }, { "dpo_losses": 0.6843565702438354, "epoch": 0.28, "grad_norm": 1.8777669835140915, "learning_rate": 4.509845960205389e-07, "logits/chosen": -2.8549046516418457, "logits/rejected": -2.7536978721618652, "logps/chosen": -310.20135498046875, "logps/rejected": -312.7437744140625, "loss": 0.6844, "positive_losses": 0.03246307373046875, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.10435821115970612, "rewards/margins": 0.020014088600873947, "rewards/margins_max": 0.10428164899349213, "rewards/margins_min": -0.08708689361810684, "rewards/margins_std": 0.0864337682723999, "rewards/rejected": 0.08434412628412247, "step": 1080 }, { "dpo_losses": 0.6706165075302124, "epoch": 0.29, "grad_norm": 2.032524530654526, "learning_rate": 4.4961762529687736e-07, "logits/chosen": -2.8304388523101807, "logits/rejected": -2.756781578063965, "logps/chosen": -243.0189666748047, "logps/rejected": -204.32138061523438, "loss": 0.6773, "positive_losses": 0.16189880669116974, "rewards/accuracies": 0.75, "rewards/chosen": 0.11879110336303711, "rewards/margins": 0.04820892959833145, "rewards/margins_max": 0.13754215836524963, "rewards/margins_min": -0.05152437090873718, "rewards/margins_std": 0.08577823638916016, "rewards/rejected": 0.07058216631412506, "step": 1090 }, { "dpo_losses": 0.6765921711921692, "epoch": 0.29, "grad_norm": 8.889417086908429, "learning_rate": 4.482339865589492e-07, "logits/chosen": -2.7313170433044434, "logits/rejected": -2.742469549179077, "logps/chosen": -277.88238525390625, "logps/rejected": -260.5584411621094, "loss": 0.692, "positive_losses": 0.09104885905981064, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.09932636469602585, "rewards/margins": 0.034714534878730774, "rewards/margins_max": 0.10242130607366562, "rewards/margins_min": -0.04449024051427841, "rewards/margins_std": 0.06656105071306229, "rewards/rejected": 0.06461183726787567, "step": 1100 }, { "epoch": 0.29, "eval_dpo_losses": 0.6716243028640747, "eval_logits/chosen": -2.808830738067627, "eval_logits/rejected": -2.770286798477173, "eval_logps/chosen": -271.9104919433594, "eval_logps/rejected": -254.11595153808594, "eval_loss": 0.685075581073761, "eval_positive_losses": 0.09168452024459839, "eval_rewards/accuracies": 0.7023809552192688, "eval_rewards/chosen": 0.12584403157234192, "eval_rewards/margins": 0.04550952836871147, "eval_rewards/margins_max": 0.18452809751033783, "eval_rewards/margins_min": -0.07608763873577118, "eval_rewards/margins_std": 0.08527926355600357, "eval_rewards/rejected": 0.08033448457717896, "eval_runtime": 388.4713, "eval_samples_per_second": 5.148, "eval_steps_per_second": 0.162, "step": 1100 }, { "dpo_losses": 0.6707266569137573, "epoch": 0.29, "grad_norm": 2.1967816307227843, "learning_rate": 4.4683379534019076e-07, "logits/chosen": -2.8102149963378906, "logits/rejected": -2.807123899459839, "logps/chosen": -261.33843994140625, "logps/rejected": -248.5167999267578, "loss": 0.6784, "positive_losses": 0.09397812187671661, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11437442153692245, "rewards/margins": 0.04670627787709236, "rewards/margins_max": 0.13146765530109406, "rewards/margins_min": -0.022779863327741623, "rewards/margins_std": 0.06781142950057983, "rewards/rejected": 0.06766814738512039, "step": 1110 }, { "dpo_losses": 0.6730443835258484, "epoch": 0.29, "grad_norm": 14.967622527345275, "learning_rate": 4.4541716855616593e-07, "logits/chosen": -2.8452019691467285, "logits/rejected": -2.7901253700256348, "logps/chosen": -257.3014831542969, "logps/rejected": -195.07559204101562, "loss": 0.6867, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1417730152606964, "rewards/margins": 0.04227545112371445, "rewards/margins_max": 0.11548507213592529, "rewards/margins_min": -0.0384410098195076, "rewards/margins_std": 0.0674058347940445, "rewards/rejected": 0.09949756413698196, "step": 1120 }, { "dpo_losses": 0.664470911026001, "epoch": 0.3, "grad_norm": 2.0627318743208862, "learning_rate": 4.4398422449480357e-07, "logits/chosen": -2.9047281742095947, "logits/rejected": -2.886209726333618, "logps/chosen": -281.00128173828125, "logps/rejected": -247.7537384033203, "loss": 0.6671, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13944607973098755, "rewards/margins": 0.060615021735429764, "rewards/margins_max": 0.17492035031318665, "rewards/margins_min": -0.026431847363710403, "rewards/margins_std": 0.08961961418390274, "rewards/rejected": 0.07883106172084808, "step": 1130 }, { "dpo_losses": 0.6733167171478271, "epoch": 0.3, "grad_norm": 16.524324924455595, "learning_rate": 4.4253508280652036e-07, "logits/chosen": -2.821608066558838, "logits/rejected": -2.7850451469421387, "logps/chosen": -217.66879272460938, "logps/rejected": -190.71035766601562, "loss": 0.6828, "positive_losses": 0.09988708794116974, "rewards/accuracies": 0.75, "rewards/chosen": 0.11857322603464127, "rewards/margins": 0.04155484959483147, "rewards/margins_max": 0.11579285562038422, "rewards/margins_min": -0.034957364201545715, "rewards/margins_std": 0.06700852513313293, "rewards/rejected": 0.0770183727145195, "step": 1140 }, { "dpo_losses": 0.6570025682449341, "epoch": 0.3, "grad_norm": 2.147627244228236, "learning_rate": 4.410698644942302e-07, "logits/chosen": -2.8799514770507812, "logits/rejected": -2.866712808609009, "logps/chosen": -294.8155822753906, "logps/rejected": -241.6029052734375, "loss": 0.6726, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.14851152896881104, "rewards/margins": 0.07623986899852753, "rewards/margins_max": 0.18512070178985596, "rewards/margins_min": -0.04003595933318138, "rewards/margins_std": 0.10335429012775421, "rewards/rejected": 0.07227165997028351, "step": 1150 }, { "dpo_losses": 0.6709702610969543, "epoch": 0.3, "grad_norm": 2.018872151650389, "learning_rate": 4.3958869190324057e-07, "logits/chosen": -2.773653268814087, "logits/rejected": -2.753624200820923, "logps/chosen": -179.3298797607422, "logps/rejected": -187.51380920410156, "loss": 0.6789, "positive_losses": 0.06843414157629013, "rewards/accuracies": 0.625, "rewards/chosen": 0.12052376568317413, "rewards/margins": 0.04608723521232605, "rewards/margins_max": 0.13753105700016022, "rewards/margins_min": -0.018335824832320213, "rewards/margins_std": 0.0684729665517807, "rewards/rejected": 0.07443653792142868, "step": 1160 }, { "dpo_losses": 0.6723008155822754, "epoch": 0.31, "grad_norm": 1.9989585549130962, "learning_rate": 4.380916887110365e-07, "logits/chosen": -2.788508176803589, "logits/rejected": -2.764782428741455, "logps/chosen": -235.6918487548828, "logps/rejected": -247.7445526123047, "loss": 0.6773, "positive_losses": 0.0934390053153038, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12237273156642914, "rewards/margins": 0.044223930686712265, "rewards/margins_max": 0.12869183719158173, "rewards/margins_min": -0.05642607808113098, "rewards/margins_std": 0.08192186057567596, "rewards/rejected": 0.07814880460500717, "step": 1170 }, { "dpo_losses": 0.6675256490707397, "epoch": 0.31, "grad_norm": 1.8728500756068767, "learning_rate": 4.3657897991695394e-07, "logits/chosen": -2.7991480827331543, "logits/rejected": -2.7395639419555664, "logps/chosen": -255.3263702392578, "logps/rejected": -236.8482208251953, "loss": 0.6748, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1421738862991333, "rewards/margins": 0.053548168390989304, "rewards/margins_max": 0.13603021204471588, "rewards/margins_min": -0.020064514130353928, "rewards/margins_std": 0.07148457318544388, "rewards/rejected": 0.0886257067322731, "step": 1180 }, { "dpo_losses": 0.6752602458000183, "epoch": 0.31, "grad_norm": 4.44738270802729, "learning_rate": 4.350506918317416e-07, "logits/chosen": -2.8661818504333496, "logits/rejected": -2.868072748184204, "logps/chosen": -247.4238739013672, "logps/rejected": -208.9613800048828, "loss": 0.6928, "positive_losses": 0.06978531181812286, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1355341225862503, "rewards/margins": 0.03722946345806122, "rewards/margins_max": 0.10634903609752655, "rewards/margins_min": -0.03015676699578762, "rewards/margins_std": 0.05882125347852707, "rewards/rejected": 0.09830465167760849, "step": 1190 }, { "dpo_losses": 0.6878072619438171, "epoch": 0.31, "grad_norm": 1.8532475397208668, "learning_rate": 4.335069520670149e-07, "logits/chosen": -2.7378358840942383, "logits/rejected": -2.694200277328491, "logps/chosen": -217.06900024414062, "logps/rejected": -233.5520477294922, "loss": 0.6781, "positive_losses": 0.0, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1189335361123085, "rewards/margins": 0.012583857402205467, "rewards/margins_max": 0.1028449758887291, "rewards/margins_min": -0.064508818089962, "rewards/margins_std": 0.07415572553873062, "rewards/rejected": 0.10634968429803848, "step": 1200 }, { "epoch": 0.31, "eval_dpo_losses": 0.6704273819923401, "eval_logits/chosen": -2.8056468963623047, "eval_logits/rejected": -2.767200231552124, "eval_logps/chosen": -271.125244140625, "eval_logps/rejected": -253.5946807861328, "eval_loss": 0.6848036050796509, "eval_positive_losses": 0.08884982764720917, "eval_rewards/accuracies": 0.716269850730896, "eval_rewards/chosen": 0.133696511387825, "eval_rewards/margins": 0.04814951494336128, "eval_rewards/margins_max": 0.19328844547271729, "eval_rewards/margins_min": -0.07867568731307983, "eval_rewards/margins_std": 0.0892573893070221, "eval_rewards/rejected": 0.08554700762033463, "eval_runtime": 409.1152, "eval_samples_per_second": 4.889, "eval_steps_per_second": 0.154, "step": 1200 }, { "dpo_losses": 0.6781080961227417, "epoch": 0.32, "grad_norm": 1.9127627572464478, "learning_rate": 4.319478895245999e-07, "logits/chosen": -2.8690595626831055, "logits/rejected": -2.8480398654937744, "logps/chosen": -293.2466735839844, "logps/rejected": -253.9585723876953, "loss": 0.6756, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.15057069063186646, "rewards/margins": 0.033803313970565796, "rewards/margins_max": 0.16536416113376617, "rewards/margins_min": -0.08521204441785812, "rewards/margins_std": 0.11450543254613876, "rewards/rejected": 0.11676736921072006, "step": 1210 }, { "dpo_losses": 0.6695367693901062, "epoch": 0.32, "grad_norm": 1.971512566824635, "learning_rate": 4.3037363438577036e-07, "logits/chosen": -2.86470365524292, "logits/rejected": -2.800983428955078, "logps/chosen": -275.1996765136719, "logps/rejected": -262.8089294433594, "loss": 0.673, "positive_losses": 0.01301498431712389, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11730766296386719, "rewards/margins": 0.05050047114491463, "rewards/margins_max": 0.1559460461139679, "rewards/margins_min": -0.055488090962171555, "rewards/margins_std": 0.09744496643543243, "rewards/rejected": 0.06680719554424286, "step": 1220 }, { "dpo_losses": 0.6828508377075195, "epoch": 0.32, "grad_norm": 7.802496036331749, "learning_rate": 4.2878431810037716e-07, "logits/chosen": -2.7981674671173096, "logits/rejected": -2.8011233806610107, "logps/chosen": -264.6150817871094, "logps/rejected": -263.75469970703125, "loss": 0.69, "positive_losses": 0.25762253999710083, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.10469107329845428, "rewards/margins": 0.022840503603219986, "rewards/margins_max": 0.11655263602733612, "rewards/margins_min": -0.06864559650421143, "rewards/margins_std": 0.08273427188396454, "rewards/rejected": 0.0818505734205246, "step": 1230 }, { "dpo_losses": 0.6616807579994202, "epoch": 0.32, "grad_norm": 8.05648555721981, "learning_rate": 4.271800733758729e-07, "logits/chosen": -2.6408848762512207, "logits/rejected": -2.6748952865600586, "logps/chosen": -241.6726531982422, "logps/rejected": -206.65603637695312, "loss": 0.6785, "positive_losses": 0.004642486572265625, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14186879992485046, "rewards/margins": 0.06542383134365082, "rewards/margins_max": 0.1544228345155716, "rewards/margins_min": -0.00011541061394382268, "rewards/margins_std": 0.0685013085603714, "rewards/rejected": 0.07644496113061905, "step": 1240 }, { "dpo_losses": 0.6754915118217468, "epoch": 0.33, "grad_norm": 74.92735414011027, "learning_rate": 4.255610341662304e-07, "logits/chosen": -2.7187135219573975, "logits/rejected": -2.7095625400543213, "logps/chosen": -259.538818359375, "logps/rejected": -245.74658203125, "loss": 0.6953, "positive_losses": 0.20729827880859375, "rewards/accuracies": 0.625, "rewards/chosen": 0.12701359391212463, "rewards/margins": 0.037803538143634796, "rewards/margins_max": 0.13912460207939148, "rewards/margins_min": -0.06255535036325455, "rewards/margins_std": 0.08858311921358109, "rewards/rejected": 0.08921004831790924, "step": 1250 }, { "dpo_losses": 0.6648576259613037, "epoch": 0.33, "grad_norm": 11.821818879145804, "learning_rate": 4.2392733566075757e-07, "logits/chosen": -2.747102975845337, "logits/rejected": -2.7102127075195312, "logps/chosen": -221.44418334960938, "logps/rejected": -193.30667114257812, "loss": 0.6788, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1266448050737381, "rewards/margins": 0.05905945971608162, "rewards/margins_max": 0.14472545683383942, "rewards/margins_min": -0.018324170261621475, "rewards/margins_std": 0.07059869915246964, "rewards/rejected": 0.06758534163236618, "step": 1260 }, { "dpo_losses": 0.6734832525253296, "epoch": 0.33, "grad_norm": 10.816641130304586, "learning_rate": 4.2227911427280973e-07, "logits/chosen": -2.790510416030884, "logits/rejected": -2.817786455154419, "logps/chosen": -277.05218505859375, "logps/rejected": -285.94293212890625, "loss": 0.6962, "positive_losses": 0.008324814029037952, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1278315931558609, "rewards/margins": 0.04247141629457474, "rewards/margins_max": 0.1652711182832718, "rewards/margins_min": -0.047576092183589935, "rewards/margins_std": 0.09299333393573761, "rewards/rejected": 0.08536018431186676, "step": 1270 }, { "dpo_losses": 0.6762028932571411, "epoch": 0.33, "grad_norm": 2.326050826477611, "learning_rate": 4.206165076283982e-07, "logits/chosen": -2.67604398727417, "logits/rejected": -2.7359328269958496, "logps/chosen": -212.44430541992188, "logps/rejected": -236.13589477539062, "loss": 0.6915, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.12036889791488647, "rewards/margins": 0.03659987077116966, "rewards/margins_max": 0.12502220273017883, "rewards/margins_min": -0.06223265454173088, "rewards/margins_std": 0.0837571993470192, "rewards/rejected": 0.08376900851726532, "step": 1280 }, { "dpo_losses": 0.6659013032913208, "epoch": 0.34, "grad_norm": 9.185758464565456, "learning_rate": 4.1893965455469946e-07, "logits/chosen": -2.783086061477661, "logits/rejected": -2.7353854179382324, "logps/chosen": -231.83056640625, "logps/rejected": -233.4626922607422, "loss": 0.6817, "positive_losses": 0.008187675848603249, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13617418706417084, "rewards/margins": 0.05796939134597778, "rewards/margins_max": 0.16482461988925934, "rewards/margins_min": -0.047472696751356125, "rewards/margins_std": 0.0955493301153183, "rewards/rejected": 0.07820478826761246, "step": 1290 }, { "dpo_losses": 0.6711560487747192, "epoch": 0.34, "grad_norm": 7.688879370025054, "learning_rate": 4.172486950684626e-07, "logits/chosen": -2.8060498237609863, "logits/rejected": -2.763659954071045, "logps/chosen": -196.8289794921875, "logps/rejected": -220.77841186523438, "loss": 0.6977, "positive_losses": 0.6763796806335449, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11435681581497192, "rewards/margins": 0.04713314771652222, "rewards/margins_max": 0.1410677134990692, "rewards/margins_min": -0.06612564623355865, "rewards/margins_std": 0.09420207142829895, "rewards/rejected": 0.06722366809844971, "step": 1300 }, { "epoch": 0.34, "eval_dpo_losses": 0.6696622967720032, "eval_logits/chosen": -2.8049025535583496, "eval_logits/rejected": -2.7666215896606445, "eval_logps/chosen": -270.84783935546875, "eval_logps/rejected": -253.48585510253906, "eval_loss": 0.6843598484992981, "eval_positive_losses": 0.09547553956508636, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": 0.13647052645683289, "eval_rewards/margins": 0.04983547329902649, "eval_rewards/margins_max": 0.19835184514522552, "eval_rewards/margins_min": -0.0814487487077713, "eval_rewards/margins_std": 0.09169920533895493, "eval_rewards/rejected": 0.0866350531578064, "eval_runtime": 408.2798, "eval_samples_per_second": 4.899, "eval_steps_per_second": 0.154, "step": 1300 }, { "dpo_losses": 0.6705228686332703, "epoch": 0.34, "grad_norm": 9.831172209939382, "learning_rate": 4.155437703643181e-07, "logits/chosen": -2.797250747680664, "logits/rejected": -2.759342908859253, "logps/chosen": -281.2738952636719, "logps/rejected": -258.2003479003906, "loss": 0.6783, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14924822747707367, "rewards/margins": 0.048123955726623535, "rewards/margins_max": 0.15159112215042114, "rewards/margins_min": -0.04315425828099251, "rewards/margins_std": 0.08673207461833954, "rewards/rejected": 0.10112428665161133, "step": 1310 }, { "dpo_losses": 0.6825979948043823, "epoch": 0.35, "grad_norm": 14.90326780801519, "learning_rate": 4.138250228029881e-07, "logits/chosen": -2.8126060962677, "logits/rejected": -2.771669626235962, "logps/chosen": -255.3660888671875, "logps/rejected": -250.10989379882812, "loss": 0.6879, "positive_losses": 0.28322991728782654, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.12895464897155762, "rewards/margins": 0.024239787831902504, "rewards/margins_max": 0.16116644442081451, "rewards/margins_min": -0.0756097361445427, "rewards/margins_std": 0.1079140156507492, "rewards/rejected": 0.10471485555171967, "step": 1320 }, { "dpo_losses": 0.6643841862678528, "epoch": 0.35, "grad_norm": 1.8995218829857292, "learning_rate": 4.1209259589939935e-07, "logits/chosen": -2.8548474311828613, "logits/rejected": -2.847646474838257, "logps/chosen": -260.09259033203125, "logps/rejected": -255.16983032226562, "loss": 0.6853, "positive_losses": 0.0995582565665245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15146958827972412, "rewards/margins": 0.06084311753511429, "rewards/margins_max": 0.166192427277565, "rewards/margins_min": -0.032936133444309235, "rewards/margins_std": 0.08817348629236221, "rewards/rejected": 0.09062648564577103, "step": 1330 }, { "dpo_losses": 0.6773894429206848, "epoch": 0.35, "grad_norm": 1.9373582972729462, "learning_rate": 4.103466343106998e-07, "logits/chosen": -2.6664412021636963, "logits/rejected": -2.6139297485351562, "logps/chosen": -329.4361267089844, "logps/rejected": -256.4240417480469, "loss": 0.6795, "positive_losses": 0.013278961181640625, "rewards/accuracies": 0.625, "rewards/chosen": 0.11371631920337677, "rewards/margins": 0.034226398915052414, "rewards/margins_max": 0.12248452007770538, "rewards/margins_min": -0.06852659583091736, "rewards/margins_std": 0.09230764210224152, "rewards/rejected": 0.07948991656303406, "step": 1340 }, { "dpo_losses": 0.6746512055397034, "epoch": 0.35, "grad_norm": 2.1153716573785166, "learning_rate": 4.085872838241796e-07, "logits/chosen": -2.8163838386535645, "logits/rejected": -2.8192570209503174, "logps/chosen": -283.8049621582031, "logps/rejected": -240.8268585205078, "loss": 0.6828, "positive_losses": 0.1023712158203125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12577365338802338, "rewards/margins": 0.03993845731019974, "rewards/margins_max": 0.15859736502170563, "rewards/margins_min": -0.08044598996639252, "rewards/margins_std": 0.1035895124077797, "rewards/rejected": 0.08583520352840424, "step": 1350 }, { "dpo_losses": 0.6808849573135376, "epoch": 0.36, "grad_norm": 2.2392466328970273, "learning_rate": 4.06814691345098e-07, "logits/chosen": -2.7731029987335205, "logits/rejected": -2.790194511413574, "logps/chosen": -204.32423400878906, "logps/rejected": -191.43460083007812, "loss": 0.6798, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.12921229004859924, "rewards/margins": 0.026992127299308777, "rewards/margins_max": 0.144235298037529, "rewards/margins_min": -0.0724717378616333, "rewards/margins_std": 0.09821444004774094, "rewards/rejected": 0.10222016274929047, "step": 1360 }, { "dpo_losses": 0.6734897494316101, "epoch": 0.36, "grad_norm": 1.873548963474501, "learning_rate": 4.0502900488441707e-07, "logits/chosen": -2.881939172744751, "logits/rejected": -2.820146322250366, "logps/chosen": -267.6750793457031, "logps/rejected": -236.7908172607422, "loss": 0.6721, "positive_losses": 0.13007812201976776, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13470585644245148, "rewards/margins": 0.0420747809112072, "rewards/margins_max": 0.1425054520368576, "rewards/margins_min": -0.050276100635528564, "rewards/margins_std": 0.08550667762756348, "rewards/rejected": 0.09263106435537338, "step": 1370 }, { "dpo_losses": 0.6570364236831665, "epoch": 0.36, "grad_norm": 1.6063052780198595, "learning_rate": 4.032303735464422e-07, "logits/chosen": -2.7805044651031494, "logits/rejected": -2.788447856903076, "logps/chosen": -271.3067626953125, "logps/rejected": -262.964599609375, "loss": 0.6738, "positive_losses": 0.008975982666015625, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16087505221366882, "rewards/margins": 0.07718931138515472, "rewards/margins_max": 0.21250836551189423, "rewards/margins_min": -0.028435688465833664, "rewards/margins_std": 0.10952192544937134, "rewards/rejected": 0.0836857259273529, "step": 1380 }, { "dpo_losses": 0.666195273399353, "epoch": 0.36, "grad_norm": 1.9813531409531817, "learning_rate": 4.014189475163726e-07, "logits/chosen": -2.8767552375793457, "logits/rejected": -2.779618978500366, "logps/chosen": -256.12518310546875, "logps/rejected": -206.1197967529297, "loss": 0.6808, "positive_losses": 0.04650726169347763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13240960240364075, "rewards/margins": 0.05797024443745613, "rewards/margins_max": 0.17624790966510773, "rewards/margins_min": -0.05525298789143562, "rewards/margins_std": 0.10630662739276886, "rewards/rejected": 0.07443936169147491, "step": 1390 }, { "dpo_losses": 0.6712676882743835, "epoch": 0.37, "grad_norm": 1.8304967900081062, "learning_rate": 3.995948780477605e-07, "logits/chosen": -2.7871243953704834, "logits/rejected": -2.703411817550659, "logps/chosen": -264.5346374511719, "logps/rejected": -212.4336395263672, "loss": 0.6773, "positive_losses": 0.18939360976219177, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13132144510746002, "rewards/margins": 0.047726646065711975, "rewards/margins_max": 0.16912686824798584, "rewards/margins_min": -0.05112838000059128, "rewards/margins_std": 0.09666156768798828, "rewards/rejected": 0.08359479904174805, "step": 1400 }, { "epoch": 0.37, "eval_dpo_losses": 0.6683064699172974, "eval_logits/chosen": -2.8007144927978516, "eval_logits/rejected": -2.7625784873962402, "eval_logps/chosen": -270.8923034667969, "eval_logps/rejected": -253.83428955078125, "eval_loss": 0.6851855516433716, "eval_positive_losses": 0.1091412678360939, "eval_rewards/accuracies": 0.716269850730896, "eval_rewards/chosen": 0.1360260248184204, "eval_rewards/margins": 0.0528750941157341, "eval_rewards/margins_max": 0.20844675600528717, "eval_rewards/margins_min": -0.0865757167339325, "eval_rewards/margins_std": 0.0966721922159195, "eval_rewards/rejected": 0.08315093070268631, "eval_runtime": 388.262, "eval_samples_per_second": 5.151, "eval_steps_per_second": 0.162, "step": 1400 }, { "dpo_losses": 0.6750961542129517, "epoch": 0.37, "grad_norm": 10.10602364271861, "learning_rate": 3.977583174498816e-07, "logits/chosen": -2.7661843299865723, "logits/rejected": -2.786423444747925, "logps/chosen": -217.5006866455078, "logps/rejected": -231.273681640625, "loss": 0.6789, "positive_losses": 0.026738548651337624, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12990109622478485, "rewards/margins": 0.038937196135520935, "rewards/margins_max": 0.1547955423593521, "rewards/margins_min": -0.06859288364648819, "rewards/margins_std": 0.09591875970363617, "rewards/rejected": 0.09096390753984451, "step": 1410 }, { "dpo_losses": 0.6683656573295593, "epoch": 0.37, "grad_norm": 9.593516194728377, "learning_rate": 3.9590941907501717e-07, "logits/chosen": -2.807760715484619, "logits/rejected": -2.7231407165527344, "logps/chosen": -235.6064910888672, "logps/rejected": -185.83670043945312, "loss": 0.6906, "positive_losses": 0.01294860802590847, "rewards/accuracies": 0.75, "rewards/chosen": 0.13492132723331451, "rewards/margins": 0.05229531601071358, "rewards/margins_max": 0.15305258333683014, "rewards/margins_min": -0.03562027961015701, "rewards/margins_std": 0.08461041748523712, "rewards/rejected": 0.08262600749731064, "step": 1420 }, { "dpo_losses": 0.6754311323165894, "epoch": 0.37, "grad_norm": 2.4033266615367173, "learning_rate": 3.9404833730564974e-07, "logits/chosen": -2.844654083251953, "logits/rejected": -2.864405632019043, "logps/chosen": -199.28477478027344, "logps/rejected": -222.0977783203125, "loss": 0.6963, "positive_losses": 0.055442046374082565, "rewards/accuracies": 0.625, "rewards/chosen": 0.10395960509777069, "rewards/margins": 0.03751935809850693, "rewards/margins_max": 0.12094000726938248, "rewards/margins_min": -0.02232285775244236, "rewards/margins_std": 0.06388449668884277, "rewards/rejected": 0.06644026190042496, "step": 1430 }, { "dpo_losses": 0.6634265780448914, "epoch": 0.38, "grad_norm": 13.354282769438905, "learning_rate": 3.9217522754157117e-07, "logits/chosen": -2.815695285797119, "logits/rejected": -2.7707958221435547, "logps/chosen": -314.9673156738281, "logps/rejected": -316.7956237792969, "loss": 0.669, "positive_losses": 0.024194717407226562, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1410738229751587, "rewards/margins": 0.06209304928779602, "rewards/margins_max": 0.15248478949069977, "rewards/margins_min": -0.021471448242664337, "rewards/margins_std": 0.07498336583375931, "rewards/rejected": 0.07898075878620148, "step": 1440 }, { "dpo_losses": 0.657206654548645, "epoch": 0.38, "grad_norm": 8.045910705268138, "learning_rate": 3.9029024618690785e-07, "logits/chosen": -2.8342227935791016, "logits/rejected": -2.754894495010376, "logps/chosen": -305.3880920410156, "logps/rejected": -259.7880859375, "loss": 0.6751, "positive_losses": 0.1556953489780426, "rewards/accuracies": 0.75, "rewards/chosen": 0.14140953123569489, "rewards/margins": 0.07678183168172836, "rewards/margins_max": 0.18914107978343964, "rewards/margins_min": -0.04185623675584793, "rewards/margins_std": 0.10669572651386261, "rewards/rejected": 0.06462768465280533, "step": 1450 }, { "dpo_losses": 0.659722626209259, "epoch": 0.38, "grad_norm": 1.8479703027395027, "learning_rate": 3.883935506370605e-07, "logits/chosen": -2.7728967666625977, "logits/rejected": -2.7116799354553223, "logps/chosen": -286.15850830078125, "logps/rejected": -233.5446014404297, "loss": 0.6665, "positive_losses": 0.06868667900562286, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.13742755353450775, "rewards/margins": 0.07048223912715912, "rewards/margins_max": 0.1735135018825531, "rewards/margins_min": -0.046617552638053894, "rewards/margins_std": 0.09807271510362625, "rewards/rejected": 0.06694532930850983, "step": 1460 }, { "dpo_losses": 0.6679905652999878, "epoch": 0.38, "grad_norm": 9.303180306667144, "learning_rate": 3.864852992655616e-07, "logits/chosen": -2.8669474124908447, "logits/rejected": -2.833524227142334, "logps/chosen": -239.24234008789062, "logps/rejected": -227.33242797851562, "loss": 0.6741, "positive_losses": 0.14037056267261505, "rewards/accuracies": 0.75, "rewards/chosen": 0.12945117056369781, "rewards/margins": 0.05349854752421379, "rewards/margins_max": 0.15357722342014313, "rewards/margins_min": -0.04545611888170242, "rewards/margins_std": 0.09062834084033966, "rewards/rejected": 0.07595261186361313, "step": 1470 }, { "dpo_losses": 0.6731019616127014, "epoch": 0.39, "grad_norm": 1.9833142306372231, "learning_rate": 3.845656514108515e-07, "logits/chosen": -2.8072688579559326, "logits/rejected": -2.7603797912597656, "logps/chosen": -226.1439208984375, "logps/rejected": -260.9153747558594, "loss": 0.6837, "positive_losses": 0.23621253669261932, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11956068128347397, "rewards/margins": 0.04162520542740822, "rewards/margins_max": 0.1307816356420517, "rewards/margins_min": -0.028078163042664528, "rewards/margins_std": 0.06945054233074188, "rewards/rejected": 0.07793547958135605, "step": 1480 }, { "dpo_losses": 0.6623013615608215, "epoch": 0.39, "grad_norm": 9.77740929612468, "learning_rate": 3.8263476736297375e-07, "logits/chosen": -2.6985790729522705, "logits/rejected": -2.696704864501953, "logps/chosen": -262.1617431640625, "logps/rejected": -234.5415802001953, "loss": 0.681, "positive_losses": 0.22186526656150818, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.13082730770111084, "rewards/margins": 0.06536930799484253, "rewards/margins_max": 0.19230645895004272, "rewards/margins_min": -0.032393865287303925, "rewards/margins_std": 0.10202561318874359, "rewards/rejected": 0.06545799970626831, "step": 1490 }, { "dpo_losses": 0.6603057980537415, "epoch": 0.39, "grad_norm": 1.674629580404987, "learning_rate": 3.8069280835019055e-07, "logits/chosen": -2.8541314601898193, "logits/rejected": -2.781519651412964, "logps/chosen": -225.85855102539062, "logps/rejected": -193.4555206298828, "loss": 0.6802, "positive_losses": 0.39620399475097656, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.1380954384803772, "rewards/margins": 0.07008825242519379, "rewards/margins_max": 0.20899653434753418, "rewards/margins_min": -0.04256419837474823, "rewards/margins_std": 0.11214927583932877, "rewards/rejected": 0.06800718605518341, "step": 1500 }, { "epoch": 0.39, "eval_dpo_losses": 0.667341411113739, "eval_logits/chosen": -2.7934176921844482, "eval_logits/rejected": -2.7549002170562744, "eval_logps/chosen": -270.5391540527344, "eval_logps/rejected": -253.69781494140625, "eval_loss": 0.6854449510574341, "eval_positive_losses": 0.12425005435943604, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": 0.13955748081207275, "eval_rewards/margins": 0.0550418496131897, "eval_rewards/margins_max": 0.21547779440879822, "eval_rewards/margins_min": -0.08949919790029526, "eval_rewards/margins_std": 0.10008959472179413, "eval_rewards/rejected": 0.08451561629772186, "eval_runtime": 388.5958, "eval_samples_per_second": 5.147, "eval_steps_per_second": 0.162, "step": 1500 }, { "dpo_losses": 0.6754101514816284, "epoch": 0.4, "grad_norm": 2.220484740457583, "learning_rate": 3.7873993652552073e-07, "logits/chosen": -2.8614954948425293, "logits/rejected": -2.7796661853790283, "logps/chosen": -292.0195617675781, "logps/rejected": -251.2197265625, "loss": 0.6926, "positive_losses": 0.2932479977607727, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12026111036539078, "rewards/margins": 0.03840740770101547, "rewards/margins_max": 0.1505320966243744, "rewards/margins_min": -0.07637099921703339, "rewards/margins_std": 0.10024436563253403, "rewards/rejected": 0.08185369521379471, "step": 1510 }, { "dpo_losses": 0.663870632648468, "epoch": 0.4, "grad_norm": 2.203422580398517, "learning_rate": 3.767763149531995e-07, "logits/chosen": -2.823636054992676, "logits/rejected": -2.744633197784424, "logps/chosen": -271.84796142578125, "logps/rejected": -210.0604705810547, "loss": 0.6816, "positive_losses": 0.22603663802146912, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13827243447303772, "rewards/margins": 0.06253242492675781, "rewards/margins_max": 0.19688096642494202, "rewards/margins_min": -0.054659806191921234, "rewards/margins_std": 0.110364630818367, "rewards/rejected": 0.07573998719453812, "step": 1520 }, { "dpo_losses": 0.6732046604156494, "epoch": 0.4, "grad_norm": 1.8093560015219587, "learning_rate": 3.7480210759506326e-07, "logits/chosen": -2.7630019187927246, "logits/rejected": -2.7755062580108643, "logps/chosen": -263.8840637207031, "logps/rejected": -230.9619598388672, "loss": 0.6767, "positive_losses": 0.01628875732421875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13674938678741455, "rewards/margins": 0.04170869663357735, "rewards/margins_max": 0.14020602405071259, "rewards/margins_min": -0.040444426238536835, "rewards/margins_std": 0.07792656123638153, "rewards/rejected": 0.0950406938791275, "step": 1530 }, { "dpo_losses": 0.6739233136177063, "epoch": 0.4, "grad_norm": 2.148534710158725, "learning_rate": 3.728174792968582e-07, "logits/chosen": -2.7825496196746826, "logits/rejected": -2.7479727268218994, "logps/chosen": -363.34619140625, "logps/rejected": -367.6847839355469, "loss": 0.678, "positive_losses": 0.06572417914867401, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14340153336524963, "rewards/margins": 0.042062122374773026, "rewards/margins_max": 0.15321552753448486, "rewards/margins_min": -0.05941515043377876, "rewards/margins_std": 0.0947646051645279, "rewards/rejected": 0.1013394147157669, "step": 1540 }, { "dpo_losses": 0.6643571853637695, "epoch": 0.41, "grad_norm": 6.019472630615777, "learning_rate": 3.70822595774476e-07, "logits/chosen": -2.748858690261841, "logits/rejected": -2.7964751720428467, "logps/chosen": -291.16668701171875, "logps/rejected": -272.07269287109375, "loss": 0.6811, "positive_losses": 0.009188842959702015, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16058219969272614, "rewards/margins": 0.06413300335407257, "rewards/margins_max": 0.2249596118927002, "rewards/margins_min": -0.04774421080946922, "rewards/margins_std": 0.12317200750112534, "rewards/rejected": 0.09644921123981476, "step": 1550 }, { "dpo_losses": 0.6761940121650696, "epoch": 0.41, "grad_norm": 2.066512395371634, "learning_rate": 3.688176236001168e-07, "logits/chosen": -2.7642502784729004, "logits/rejected": -2.744047164916992, "logps/chosen": -256.73370361328125, "logps/rejected": -243.156982421875, "loss": 0.6854, "positive_losses": 0.03718109056353569, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.13707491755485535, "rewards/margins": 0.03892552852630615, "rewards/margins_max": 0.19296100735664368, "rewards/margins_min": -0.10505714267492294, "rewards/margins_std": 0.13237911462783813, "rewards/rejected": 0.098149374127388, "step": 1560 }, { "dpo_losses": 0.6572185754776001, "epoch": 0.41, "grad_norm": 8.177892274792171, "learning_rate": 3.6680273018838016e-07, "logits/chosen": -2.798093318939209, "logits/rejected": -2.7817108631134033, "logps/chosen": -345.5523986816406, "logps/rejected": -253.31076049804688, "loss": 0.6728, "positive_losses": 0.03433532640337944, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1384911984205246, "rewards/margins": 0.07636358588933945, "rewards/margins_max": 0.19075119495391846, "rewards/margins_min": -0.03996484354138374, "rewards/margins_std": 0.10142095386981964, "rewards/rejected": 0.06212761253118515, "step": 1570 }, { "dpo_losses": 0.6835813522338867, "epoch": 0.41, "grad_norm": 14.303723172028716, "learning_rate": 3.6477808378228596e-07, "logits/chosen": -2.8051185607910156, "logits/rejected": -2.855712413787842, "logps/chosen": -256.68109130859375, "logps/rejected": -242.5233612060547, "loss": 0.68, "positive_losses": 0.11357422173023224, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.11277903616428375, "rewards/margins": 0.021566368639469147, "rewards/margins_max": 0.11864233016967773, "rewards/margins_min": -0.06020314246416092, "rewards/margins_std": 0.07883908599615097, "rewards/rejected": 0.09121266007423401, "step": 1580 }, { "dpo_losses": 0.6786788105964661, "epoch": 0.42, "grad_norm": 1.940205807338258, "learning_rate": 3.6274385343922674e-07, "logits/chosen": -2.8355250358581543, "logits/rejected": -2.888169765472412, "logps/chosen": -294.1134338378906, "logps/rejected": -279.9457092285156, "loss": 0.6724, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.13763108849525452, "rewards/margins": 0.03273003175854683, "rewards/margins_max": 0.1772867739200592, "rewards/margins_min": -0.08706100285053253, "rewards/margins_std": 0.11496637761592865, "rewards/rejected": 0.1049010381102562, "step": 1590 }, { "dpo_losses": 0.6733390092849731, "epoch": 0.42, "grad_norm": 1.767002412393906, "learning_rate": 3.6070020901685057e-07, "logits/chosen": -2.7376842498779297, "logits/rejected": -2.7593541145324707, "logps/chosen": -239.187744140625, "logps/rejected": -196.0511932373047, "loss": 0.6816, "positive_losses": 0.295419305562973, "rewards/accuracies": 0.625, "rewards/chosen": 0.1259082853794098, "rewards/margins": 0.042100995779037476, "rewards/margins_max": 0.14133401215076447, "rewards/margins_min": -0.03318362310528755, "rewards/margins_std": 0.07885169237852097, "rewards/rejected": 0.08380730450153351, "step": 1600 }, { "epoch": 0.42, "eval_dpo_losses": 0.6668837070465088, "eval_logits/chosen": -2.795292377471924, "eval_logits/rejected": -2.7573533058166504, "eval_logps/chosen": -270.2237548828125, "eval_logps/rejected": -253.4888458251953, "eval_loss": 0.6848409175872803, "eval_positive_losses": 0.12261239439249039, "eval_rewards/accuracies": 0.726190447807312, "eval_rewards/chosen": 0.14271163940429688, "eval_rewards/margins": 0.05610635504126549, "eval_rewards/margins_max": 0.21962520480155945, "eval_rewards/margins_min": -0.0916348546743393, "eval_rewards/margins_std": 0.10248645395040512, "eval_rewards/rejected": 0.08660528808832169, "eval_runtime": 389.7, "eval_samples_per_second": 5.132, "eval_steps_per_second": 0.162, "step": 1600 }, { "dpo_losses": 0.663489818572998, "epoch": 0.42, "grad_norm": 1.8293309594872051, "learning_rate": 3.5864732115887863e-07, "logits/chosen": -2.747380495071411, "logits/rejected": -2.742300510406494, "logps/chosen": -261.0233459472656, "logps/rejected": -240.55416870117188, "loss": 0.6719, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14141225814819336, "rewards/margins": 0.0640452578663826, "rewards/margins_max": 0.1955864131450653, "rewards/margins_min": -0.028682339936494827, "rewards/margins_std": 0.1018737331032753, "rewards/rejected": 0.07736701518297195, "step": 1610 }, { "dpo_losses": 0.6576683521270752, "epoch": 0.42, "grad_norm": 2.427551115305182, "learning_rate": 3.565853612808562e-07, "logits/chosen": -2.799431562423706, "logits/rejected": -2.7337801456451416, "logps/chosen": -269.6554260253906, "logps/rejected": -235.99038696289062, "loss": 0.6778, "positive_losses": 0.008718108758330345, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16144730150699615, "rewards/margins": 0.07694243639707565, "rewards/margins_max": 0.20366080105304718, "rewards/margins_min": -0.026887202635407448, "rewards/margins_std": 0.10323099046945572, "rewards/rejected": 0.0845048725605011, "step": 1620 }, { "dpo_losses": 0.6518866419792175, "epoch": 0.43, "grad_norm": 10.793623306246538, "learning_rate": 3.5451450155583984e-07, "logits/chosen": -2.915398359298706, "logits/rejected": -2.8122076988220215, "logps/chosen": -282.4895324707031, "logps/rejected": -249.7183074951172, "loss": 0.685, "positive_losses": 0.1295158416032791, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.15124383568763733, "rewards/margins": 0.08674627542495728, "rewards/margins_max": 0.17734436690807343, "rewards/margins_min": -0.00693632522597909, "rewards/margins_std": 0.08227060735225677, "rewards/rejected": 0.06449756771326065, "step": 1630 }, { "dpo_losses": 0.6686679720878601, "epoch": 0.43, "grad_norm": 10.090910615538187, "learning_rate": 3.5243491490002055e-07, "logits/chosen": -2.876235008239746, "logits/rejected": -2.8371047973632812, "logps/chosen": -268.490966796875, "logps/rejected": -227.34738159179688, "loss": 0.6765, "positive_losses": 0.04539031907916069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14302578568458557, "rewards/margins": 0.0534796416759491, "rewards/margins_max": 0.19195103645324707, "rewards/margins_min": -0.06843677908182144, "rewards/margins_std": 0.11482664197683334, "rewards/rejected": 0.08954615145921707, "step": 1640 }, { "dpo_losses": 0.6530863046646118, "epoch": 0.43, "grad_norm": 2.2831072628106055, "learning_rate": 3.503467749582857e-07, "logits/chosen": -2.8279144763946533, "logits/rejected": -2.712979793548584, "logps/chosen": -374.8084716796875, "logps/rejected": -279.04193115234375, "loss": 0.6833, "positive_losses": 0.15686893463134766, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.16030217707157135, "rewards/margins": 0.08735756576061249, "rewards/margins_max": 0.21465222537517548, "rewards/margins_min": -0.06027594953775406, "rewards/margins_std": 0.12725059688091278, "rewards/rejected": 0.07294458150863647, "step": 1650 }, { "dpo_losses": 0.6651492118835449, "epoch": 0.43, "grad_norm": 8.591032063098105, "learning_rate": 3.482502560897194e-07, "logits/chosen": -2.7746264934539795, "logits/rejected": -2.7501578330993652, "logps/chosen": -237.156494140625, "logps/rejected": -277.9545593261719, "loss": 0.6731, "positive_losses": 0.10025139153003693, "rewards/accuracies": 0.625, "rewards/chosen": 0.1401684582233429, "rewards/margins": 0.060249678790569305, "rewards/margins_max": 0.17184853553771973, "rewards/margins_min": -0.05993221327662468, "rewards/margins_std": 0.1048416867852211, "rewards/rejected": 0.079918771982193, "step": 1660 }, { "dpo_losses": 0.6637318730354309, "epoch": 0.44, "grad_norm": 1.7493149233877625, "learning_rate": 3.4614553335304403e-07, "logits/chosen": -2.8647806644439697, "logits/rejected": -2.821565628051758, "logps/chosen": -248.33609008789062, "logps/rejected": -221.92294311523438, "loss": 0.679, "positive_losses": 0.35750922560691833, "rewards/accuracies": 0.75, "rewards/chosen": 0.14442947506904602, "rewards/margins": 0.0623578205704689, "rewards/margins_max": 0.17120857536792755, "rewards/margins_min": -0.0359518863260746, "rewards/margins_std": 0.09716440737247467, "rewards/rejected": 0.08207164704799652, "step": 1670 }, { "dpo_losses": 0.6731246709823608, "epoch": 0.44, "grad_norm": 8.473047046081899, "learning_rate": 3.440327824920022e-07, "logits/chosen": -2.7717459201812744, "logits/rejected": -2.715242385864258, "logps/chosen": -297.0379943847656, "logps/rejected": -245.60391235351562, "loss": 0.682, "positive_losses": 0.05773162841796875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13697683811187744, "rewards/margins": 0.042325470596551895, "rewards/margins_max": 0.14347949624061584, "rewards/margins_min": -0.0518612377345562, "rewards/margins_std": 0.08395689725875854, "rewards/rejected": 0.09465137869119644, "step": 1680 }, { "dpo_losses": 0.6721340417861938, "epoch": 0.44, "grad_norm": 6.683373434782019, "learning_rate": 3.4191217992068287e-07, "logits/chosen": -2.6870052814483643, "logits/rejected": -2.6862998008728027, "logps/chosen": -227.3953094482422, "logps/rejected": -246.79638671875, "loss": 0.6696, "positive_losses": 0.04251289367675781, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.13126316666603088, "rewards/margins": 0.0451076440513134, "rewards/margins_max": 0.1354115754365921, "rewards/margins_min": -0.04070950672030449, "rewards/margins_std": 0.07816879451274872, "rewards/rejected": 0.08615552634000778, "step": 1690 }, { "dpo_losses": 0.6801950931549072, "epoch": 0.44, "grad_norm": 1.983917724306998, "learning_rate": 3.3978390270879056e-07, "logits/chosen": -2.7492804527282715, "logits/rejected": -2.744694232940674, "logps/chosen": -199.28369140625, "logps/rejected": -252.99630737304688, "loss": 0.6737, "positive_losses": 0.18640442192554474, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.11901885271072388, "rewards/margins": 0.029588323086500168, "rewards/margins_max": 0.13933536410331726, "rewards/margins_min": -0.08721192181110382, "rewards/margins_std": 0.09942667186260223, "rewards/rejected": 0.0894305482506752, "step": 1700 }, { "epoch": 0.44, "eval_dpo_losses": 0.6653831005096436, "eval_logits/chosen": -2.793133020401001, "eval_logits/rejected": -2.7549686431884766, "eval_logps/chosen": -270.14947509765625, "eval_logps/rejected": -253.75079345703125, "eval_loss": 0.6862542629241943, "eval_positive_losses": 0.14278966188430786, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": 0.1434543877840042, "eval_rewards/margins": 0.059468500316143036, "eval_rewards/margins_max": 0.23018646240234375, "eval_rewards/margins_min": -0.0957309901714325, "eval_rewards/margins_std": 0.10726842284202576, "eval_rewards/rejected": 0.08398589491844177, "eval_runtime": 389.1458, "eval_samples_per_second": 5.139, "eval_steps_per_second": 0.162, "step": 1700 }, { "dpo_losses": 0.6800428628921509, "epoch": 0.45, "grad_norm": 14.473184981063904, "learning_rate": 3.376481285668599e-07, "logits/chosen": -2.860978603363037, "logits/rejected": -2.842038631439209, "logps/chosen": -240.4024200439453, "logps/rejected": -230.64913940429688, "loss": 0.6958, "positive_losses": 0.13513031601905823, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14729043841362, "rewards/margins": 0.02992558851838112, "rewards/margins_max": 0.14201180636882782, "rewards/margins_min": -0.07873591035604477, "rewards/margins_std": 0.09927816689014435, "rewards/rejected": 0.11736486107110977, "step": 1710 }, { "dpo_losses": 0.6766767501831055, "epoch": 0.45, "grad_norm": 2.140319802462128, "learning_rate": 3.355050358314172e-07, "logits/chosen": -2.800375461578369, "logits/rejected": -2.7702994346618652, "logps/chosen": -244.689697265625, "logps/rejected": -273.8795471191406, "loss": 0.6794, "positive_losses": 0.05370616912841797, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14131440222263336, "rewards/margins": 0.03704925253987312, "rewards/margins_max": 0.16323471069335938, "rewards/margins_min": -0.08994705975055695, "rewards/margins_std": 0.11388404667377472, "rewards/rejected": 0.10426516830921173, "step": 1720 }, { "dpo_losses": 0.6766036748886108, "epoch": 0.45, "grad_norm": 2.4323905799461247, "learning_rate": 3.33354803450089e-07, "logits/chosen": -2.851832866668701, "logits/rejected": -2.7648284435272217, "logps/chosen": -265.53814697265625, "logps/rejected": -291.24462890625, "loss": 0.6799, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14070968329906464, "rewards/margins": 0.03625965490937233, "rewards/margins_max": 0.1697661578655243, "rewards/margins_min": -0.06259065866470337, "rewards/margins_std": 0.10244093835353851, "rewards/rejected": 0.1044500470161438, "step": 1730 }, { "dpo_losses": 0.654105544090271, "epoch": 0.46, "grad_norm": 10.980245712115805, "learning_rate": 3.311976109666605e-07, "logits/chosen": -2.7741200923919678, "logits/rejected": -2.701322078704834, "logps/chosen": -306.71246337890625, "logps/rejected": -232.8634490966797, "loss": 0.6806, "positive_losses": 0.13347473740577698, "rewards/accuracies": 0.75, "rewards/chosen": 0.1625155210494995, "rewards/margins": 0.08344938606023788, "rewards/margins_max": 0.2080841064453125, "rewards/margins_min": -0.047725483775138855, "rewards/margins_std": 0.11776135861873627, "rewards/rejected": 0.07906611263751984, "step": 1740 }, { "dpo_losses": 0.6599873900413513, "epoch": 0.46, "grad_norm": 12.333233875870272, "learning_rate": 3.2903363850608317e-07, "logits/chosen": -2.7754392623901367, "logits/rejected": -2.762289047241211, "logps/chosen": -258.095458984375, "logps/rejected": -257.9560241699219, "loss": 0.6767, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.15608219802379608, "rewards/margins": 0.07032088190317154, "rewards/margins_max": 0.18100914359092712, "rewards/margins_min": -0.04148901253938675, "rewards/margins_std": 0.10034122318029404, "rewards/rejected": 0.08576132357120514, "step": 1750 }, { "dpo_losses": 0.6527787446975708, "epoch": 0.46, "grad_norm": 12.099111073526487, "learning_rate": 3.2686306675943477e-07, "logits/chosen": -2.6830406188964844, "logits/rejected": -2.641451597213745, "logps/chosen": -256.3885803222656, "logps/rejected": -233.4460906982422, "loss": 0.6736, "positive_losses": 0.0111083984375, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.18093226850032806, "rewards/margins": 0.08572479337453842, "rewards/margins_max": 0.22395269572734833, "rewards/margins_min": -0.010747433640062809, "rewards/margins_std": 0.10372080653905869, "rewards/rejected": 0.09520746767520905, "step": 1760 }, { "dpo_losses": 0.6773605942726135, "epoch": 0.46, "grad_norm": 11.900600745160016, "learning_rate": 3.2468607696883145e-07, "logits/chosen": -2.8585307598114014, "logits/rejected": -2.8548781871795654, "logps/chosen": -291.2569885253906, "logps/rejected": -283.5742492675781, "loss": 0.6797, "positive_losses": 0.02425079420208931, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.14840981364250183, "rewards/margins": 0.03481290489435196, "rewards/margins_max": 0.16832002997398376, "rewards/margins_min": -0.06583338230848312, "rewards/margins_std": 0.10414840281009674, "rewards/rejected": 0.11359691619873047, "step": 1770 }, { "dpo_losses": 0.6759124994277954, "epoch": 0.47, "grad_norm": 2.0423300567114206, "learning_rate": 3.2250285091229435e-07, "logits/chosen": -2.799861431121826, "logits/rejected": -2.733640670776367, "logps/chosen": -268.4593200683594, "logps/rejected": -248.49301147460938, "loss": 0.6701, "positive_losses": 0.1370445191860199, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1430630385875702, "rewards/margins": 0.03846190497279167, "rewards/margins_max": 0.14976127445697784, "rewards/margins_min": -0.09638581424951553, "rewards/margins_std": 0.10821805894374847, "rewards/rejected": 0.10460114479064941, "step": 1780 }, { "dpo_losses": 0.6586912870407104, "epoch": 0.47, "grad_norm": 12.508617024992398, "learning_rate": 3.2031357088857083e-07, "logits/chosen": -2.816551923751831, "logits/rejected": -2.7572274208068848, "logps/chosen": -268.2996826171875, "logps/rejected": -223.4669647216797, "loss": 0.6824, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16138462722301483, "rewards/margins": 0.07290495932102203, "rewards/margins_max": 0.20781588554382324, "rewards/margins_min": -0.025771930813789368, "rewards/margins_std": 0.10554580390453339, "rewards/rejected": 0.0884796530008316, "step": 1790 }, { "dpo_losses": 0.6669738292694092, "epoch": 0.47, "grad_norm": 2.193588068275639, "learning_rate": 3.1811841970191267e-07, "logits/chosen": -2.8713536262512207, "logits/rejected": -2.7351841926574707, "logps/chosen": -335.30426025390625, "logps/rejected": -294.9971618652344, "loss": 0.6913, "positive_losses": 0.09362602233886719, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14786486327648163, "rewards/margins": 0.055970776826143265, "rewards/margins_max": 0.17564713954925537, "rewards/margins_min": -0.04175977408885956, "rewards/margins_std": 0.09620113670825958, "rewards/rejected": 0.09189409017562866, "step": 1800 }, { "epoch": 0.47, "eval_dpo_losses": 0.666235089302063, "eval_logits/chosen": -2.7922110557556152, "eval_logits/rejected": -2.754131555557251, "eval_logps/chosen": -269.03106689453125, "eval_logps/rejected": -252.44105529785156, "eval_loss": 0.6821897625923157, "eval_positive_losses": 0.1096610277891159, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": 0.15463854372501373, "eval_rewards/margins": 0.05755544453859329, "eval_rewards/margins_max": 0.2257683426141739, "eval_rewards/margins_min": -0.09160422533750534, "eval_rewards/margins_std": 0.10461423546075821, "eval_rewards/rejected": 0.09708309173583984, "eval_runtime": 388.9754, "eval_samples_per_second": 5.142, "eval_steps_per_second": 0.162, "step": 1800 }, { "dpo_losses": 0.6621342897415161, "epoch": 0.47, "grad_norm": 2.1187524396043984, "learning_rate": 3.1591758064681257e-07, "logits/chosen": -2.818814992904663, "logits/rejected": -2.7996132373809814, "logps/chosen": -315.8829040527344, "logps/rejected": -281.2411804199219, "loss": 0.6787, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.17190691828727722, "rewards/margins": 0.06481160968542099, "rewards/margins_max": 0.1503869891166687, "rewards/margins_min": -0.014813661575317383, "rewards/margins_std": 0.07438337802886963, "rewards/rejected": 0.10709531605243683, "step": 1810 }, { "dpo_losses": 0.6597224473953247, "epoch": 0.48, "grad_norm": 9.000680685793558, "learning_rate": 3.13711237492698e-07, "logits/chosen": -2.77579402923584, "logits/rejected": -2.6911988258361816, "logps/chosen": -275.7292785644531, "logps/rejected": -284.06390380859375, "loss": 0.686, "positive_losses": 0.08585052192211151, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1561126410961151, "rewards/margins": 0.07060912251472473, "rewards/margins_max": 0.1737409383058548, "rewards/margins_min": -0.014506603591144085, "rewards/margins_std": 0.08497828245162964, "rewards/rejected": 0.08550353348255157, "step": 1820 }, { "dpo_losses": 0.6639354228973389, "epoch": 0.48, "grad_norm": 4.982395460656168, "learning_rate": 3.1149957446858767e-07, "logits/chosen": -2.7818925380706787, "logits/rejected": -2.7447280883789062, "logps/chosen": -301.8582458496094, "logps/rejected": -402.88580322265625, "loss": 0.6831, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16820065677165985, "rewards/margins": 0.06484942138195038, "rewards/margins_max": 0.21876180171966553, "rewards/margins_min": -0.08734156936407089, "rewards/margins_std": 0.1392730474472046, "rewards/rejected": 0.10335125029087067, "step": 1830 }, { "dpo_losses": 0.6596516370773315, "epoch": 0.48, "grad_norm": 2.5992478162942168, "learning_rate": 3.0928277624770736e-07, "logits/chosen": -2.810455799102783, "logits/rejected": -2.7492587566375732, "logps/chosen": -231.97482299804688, "logps/rejected": -233.7396697998047, "loss": 0.6789, "positive_losses": 0.08066530525684357, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.146644726395607, "rewards/margins": 0.07179627567529678, "rewards/margins_max": 0.1726509928703308, "rewards/margins_min": -0.04879069700837135, "rewards/margins_std": 0.10129410028457642, "rewards/rejected": 0.07484843581914902, "step": 1840 }, { "dpo_losses": 0.6742871999740601, "epoch": 0.48, "grad_norm": 7.1406853671203745, "learning_rate": 3.0706102793207073e-07, "logits/chosen": -2.8016586303710938, "logits/rejected": -2.7073276042938232, "logps/chosen": -223.1007537841797, "logps/rejected": -206.54574584960938, "loss": 0.6759, "positive_losses": 0.20059967041015625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1434759795665741, "rewards/margins": 0.04163094609975815, "rewards/margins_max": 0.1384052038192749, "rewards/margins_min": -0.074953094124794, "rewards/margins_std": 0.0965457558631897, "rewards/rejected": 0.10184504091739655, "step": 1850 }, { "dpo_losses": 0.6566171050071716, "epoch": 0.49, "grad_norm": 6.202071170028259, "learning_rate": 3.048345150370226e-07, "logits/chosen": -2.6942129135131836, "logits/rejected": -2.657686471939087, "logps/chosen": -268.5079345703125, "logps/rejected": -256.33282470703125, "loss": 0.6743, "positive_losses": 0.3425118327140808, "rewards/accuracies": 0.75, "rewards/chosen": 0.16685865819454193, "rewards/margins": 0.07956352084875107, "rewards/margins_max": 0.22378845512866974, "rewards/margins_min": -0.03199433535337448, "rewards/margins_std": 0.11701379716396332, "rewards/rejected": 0.08729512244462967, "step": 1860 }, { "dpo_losses": 0.6586459875106812, "epoch": 0.49, "grad_norm": 2.081671535200173, "learning_rate": 3.0260342347574913e-07, "logits/chosen": -2.705451488494873, "logits/rejected": -2.723024606704712, "logps/chosen": -259.8138732910156, "logps/rejected": -275.05108642578125, "loss": 0.6623, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.15820066630840302, "rewards/margins": 0.07316181063652039, "rewards/margins_max": 0.16405954957008362, "rewards/margins_min": -0.02333083376288414, "rewards/margins_std": 0.08455059677362442, "rewards/rejected": 0.08503885567188263, "step": 1870 }, { "dpo_losses": 0.6664489507675171, "epoch": 0.49, "grad_norm": 15.607777129020638, "learning_rate": 3.0036793954375357e-07, "logits/chosen": -2.789748430252075, "logits/rejected": -2.715607166290283, "logps/chosen": -258.93841552734375, "logps/rejected": -248.56689453125, "loss": 0.6671, "positive_losses": 0.006903409957885742, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15480181574821472, "rewards/margins": 0.05630043148994446, "rewards/margins_max": 0.15635153651237488, "rewards/margins_min": -0.04761496186256409, "rewards/margins_std": 0.09293356537818909, "rewards/rejected": 0.09850136935710907, "step": 1880 }, { "dpo_losses": 0.6808010339736938, "epoch": 0.49, "grad_norm": 2.1090421060742464, "learning_rate": 2.9812824990330085e-07, "logits/chosen": -2.8155131340026855, "logits/rejected": -2.814131259918213, "logps/chosen": -288.41436767578125, "logps/rejected": -339.4684753417969, "loss": 0.6773, "positive_losses": 0.14153671264648438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12658026814460754, "rewards/margins": 0.02811632677912712, "rewards/margins_max": 0.14468391239643097, "rewards/margins_min": -0.0852559506893158, "rewards/margins_std": 0.1038467064499855, "rewards/rejected": 0.09846396744251251, "step": 1890 }, { "dpo_losses": 0.6638901233673096, "epoch": 0.5, "grad_norm": 2.030096695011095, "learning_rate": 2.958845415678316e-07, "logits/chosen": -2.841752052307129, "logits/rejected": -2.759030818939209, "logps/chosen": -267.19378662109375, "logps/rejected": -222.11886596679688, "loss": 0.691, "positive_losses": 0.27200716733932495, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.144225612282753, "rewards/margins": 0.06363539397716522, "rewards/margins_max": 0.20767728984355927, "rewards/margins_min": -0.053487379103899, "rewards/margins_std": 0.11568088829517365, "rewards/rejected": 0.08059023320674896, "step": 1900 }, { "epoch": 0.5, "eval_dpo_losses": 0.6649388074874878, "eval_logits/chosen": -2.78464412689209, "eval_logits/rejected": -2.746262788772583, "eval_logps/chosen": -269.3756408691406, "eval_logps/rejected": -253.0802459716797, "eval_loss": 0.6836426854133606, "eval_positive_losses": 0.13374747335910797, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": 0.15119239687919617, "eval_rewards/margins": 0.06050121411681175, "eval_rewards/margins_max": 0.23447264730930328, "eval_rewards/margins_min": -0.09601601213216782, "eval_rewards/margins_std": 0.10917651653289795, "eval_rewards/rejected": 0.09069117158651352, "eval_runtime": 390.0032, "eval_samples_per_second": 5.128, "eval_steps_per_second": 0.162, "step": 1900 }, { "dpo_losses": 0.6559640765190125, "epoch": 0.5, "grad_norm": 12.147662505424783, "learning_rate": 2.936370018863459e-07, "logits/chosen": -2.8035902976989746, "logits/rejected": -2.748021125793457, "logps/chosen": -241.41848754882812, "logps/rejected": -229.57290649414062, "loss": 0.6783, "positive_losses": 0.0, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.16947892308235168, "rewards/margins": 0.0783829391002655, "rewards/margins_max": 0.19369716942310333, "rewards/margins_min": -0.01700790971517563, "rewards/margins_std": 0.09164775907993317, "rewards/rejected": 0.09109597653150558, "step": 1910 }, { "dpo_losses": 0.6684740781784058, "epoch": 0.5, "grad_norm": 4.552578734000376, "learning_rate": 2.913858185277605e-07, "logits/chosen": -2.7616660594940186, "logits/rejected": -2.737854242324829, "logps/chosen": -258.0267028808594, "logps/rejected": -198.5316925048828, "loss": 0.6805, "positive_losses": 0.19840697944164276, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14402440190315247, "rewards/margins": 0.052981119602918625, "rewards/margins_max": 0.17679978907108307, "rewards/margins_min": -0.04407616704702377, "rewards/margins_std": 0.09694145619869232, "rewards/rejected": 0.09104329347610474, "step": 1920 }, { "dpo_losses": 0.65994793176651, "epoch": 0.51, "grad_norm": 7.729074592418925, "learning_rate": 2.89131179465238e-07, "logits/chosen": -2.8554089069366455, "logits/rejected": -2.733668327331543, "logps/chosen": -340.9375305175781, "logps/rejected": -238.2502899169922, "loss": 0.6778, "positive_losses": 0.10468940436840057, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14773961901664734, "rewards/margins": 0.0713655948638916, "rewards/margins_max": 0.19929155707359314, "rewards/margins_min": -0.032435785979032516, "rewards/margins_std": 0.10413695871829987, "rewards/rejected": 0.07637403905391693, "step": 1930 }, { "dpo_losses": 0.6654216647148132, "epoch": 0.51, "grad_norm": 19.203100367406904, "learning_rate": 2.8687327296049125e-07, "logits/chosen": -2.791396379470825, "logits/rejected": -2.7972702980041504, "logps/chosen": -253.9713592529297, "logps/rejected": -249.3101348876953, "loss": 0.6877, "positive_losses": 0.1420997679233551, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13887836039066315, "rewards/margins": 0.05868466570973396, "rewards/margins_max": 0.15763770043849945, "rewards/margins_min": -0.03611772507429123, "rewards/margins_std": 0.08720506727695465, "rewards/rejected": 0.08019369840621948, "step": 1940 }, { "dpo_losses": 0.6656764149665833, "epoch": 0.51, "grad_norm": 1.9695130337905855, "learning_rate": 2.846122875480637e-07, "logits/chosen": -2.816713333129883, "logits/rejected": -2.8190550804138184, "logps/chosen": -278.9736022949219, "logps/rejected": -269.1643981933594, "loss": 0.6844, "positive_losses": 0.1202617660164833, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14900079369544983, "rewards/margins": 0.05904467776417732, "rewards/margins_max": 0.17524728178977966, "rewards/margins_min": -0.06684452295303345, "rewards/margins_std": 0.10755829513072968, "rewards/rejected": 0.0899561196565628, "step": 1950 }, { "dpo_losses": 0.6570440530776978, "epoch": 0.51, "grad_norm": 10.334737948863516, "learning_rate": 2.8234841201958647e-07, "logits/chosen": -2.8929736614227295, "logits/rejected": -2.8422701358795166, "logps/chosen": -280.4637756347656, "logps/rejected": -246.18167114257812, "loss": 0.6811, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16151626408100128, "rewards/margins": 0.07666916400194168, "rewards/margins_max": 0.19590993225574493, "rewards/margins_min": -0.03747622296214104, "rewards/margins_std": 0.10714240372180939, "rewards/rejected": 0.0848471149802208, "step": 1960 }, { "dpo_losses": 0.6631742715835571, "epoch": 0.52, "grad_norm": 9.15214743507445, "learning_rate": 2.800818354080148e-07, "logits/chosen": -2.8967909812927246, "logits/rejected": -2.877074718475342, "logps/chosen": -274.1905517578125, "logps/rejected": -249.51174926757812, "loss": 0.6852, "positive_losses": 0.31591281294822693, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15203619003295898, "rewards/margins": 0.064478799700737, "rewards/margins_max": 0.17947709560394287, "rewards/margins_min": -0.04000955447554588, "rewards/margins_std": 0.09873761236667633, "rewards/rejected": 0.08755739033222198, "step": 1970 }, { "dpo_losses": 0.6746450662612915, "epoch": 0.52, "grad_norm": 11.208157524989577, "learning_rate": 2.778127469718435e-07, "logits/chosen": -2.818582534790039, "logits/rejected": -2.7697455883026123, "logps/chosen": -195.31723022460938, "logps/rejected": -208.20095825195312, "loss": 0.6837, "positive_losses": 0.24081268906593323, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.15358874201774597, "rewards/margins": 0.0399480015039444, "rewards/margins_max": 0.15406204760074615, "rewards/margins_min": -0.04506593197584152, "rewards/margins_std": 0.09146953374147415, "rewards/rejected": 0.11364071071147919, "step": 1980 }, { "dpo_losses": 0.6614011526107788, "epoch": 0.52, "grad_norm": 1.8222015140618013, "learning_rate": 2.755413361793039e-07, "logits/chosen": -2.841437816619873, "logits/rejected": -2.704587936401367, "logps/chosen": -250.64920043945312, "logps/rejected": -241.6019287109375, "loss": 0.6725, "positive_losses": 0.10384368896484375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14233501255512238, "rewards/margins": 0.0674857422709465, "rewards/margins_max": 0.18242308497428894, "rewards/margins_min": -0.029808182269334793, "rewards/margins_std": 0.09713619947433472, "rewards/rejected": 0.07484927773475647, "step": 1990 }, { "dpo_losses": 0.6818459630012512, "epoch": 0.52, "grad_norm": 1.9816734366893491, "learning_rate": 2.7326779269254356e-07, "logits/chosen": -2.785947799682617, "logits/rejected": -2.7597334384918213, "logps/chosen": -222.0934600830078, "logps/rejected": -214.197021484375, "loss": 0.6743, "positive_losses": 0.22264710068702698, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.13967108726501465, "rewards/margins": 0.02557896077632904, "rewards/margins_max": 0.13581949472427368, "rewards/margins_min": -0.0820813775062561, "rewards/margins_std": 0.10081374645233154, "rewards/rejected": 0.11409211158752441, "step": 2000 }, { "epoch": 0.52, "eval_dpo_losses": 0.6653285622596741, "eval_logits/chosen": -2.78452730178833, "eval_logits/rejected": -2.7460243701934814, "eval_logps/chosen": -268.96856689453125, "eval_logps/rejected": -252.5889129638672, "eval_loss": 0.6819599866867065, "eval_positive_losses": 0.11702584475278854, "eval_rewards/accuracies": 0.7182539701461792, "eval_rewards/chosen": 0.15526309609413147, "eval_rewards/margins": 0.059658586978912354, "eval_rewards/margins_max": 0.23282285034656525, "eval_rewards/margins_min": -0.09585469961166382, "eval_rewards/margins_std": 0.10850544273853302, "eval_rewards/rejected": 0.09560451656579971, "eval_runtime": 389.4379, "eval_samples_per_second": 5.136, "eval_steps_per_second": 0.162, "step": 2000 }, { "dpo_losses": 0.6639446020126343, "epoch": 0.53, "grad_norm": 2.148193567473034, "learning_rate": 2.709923063517895e-07, "logits/chosen": -2.7333264350891113, "logits/rejected": -2.756164073944092, "logps/chosen": -242.3228759765625, "logps/rejected": -218.51513671875, "loss": 0.6658, "positive_losses": 0.08923111110925674, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1582655906677246, "rewards/margins": 0.06178750470280647, "rewards/margins_max": 0.17090222239494324, "rewards/margins_min": -0.03684164583683014, "rewards/margins_std": 0.09074191749095917, "rewards/rejected": 0.09647808969020844, "step": 2010 }, { "dpo_losses": 0.6573031544685364, "epoch": 0.53, "grad_norm": 10.929660276279535, "learning_rate": 2.68715067159496e-07, "logits/chosen": -2.8864777088165283, "logits/rejected": -2.8166232109069824, "logps/chosen": -290.9087219238281, "logps/rejected": -229.5659637451172, "loss": 0.6787, "positive_losses": 0.08258895576000214, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17253132164478302, "rewards/margins": 0.0766986683011055, "rewards/margins_max": 0.20891022682189941, "rewards/margins_min": -0.05160030722618103, "rewards/margins_std": 0.11397655308246613, "rewards/rejected": 0.09583264589309692, "step": 2020 }, { "dpo_losses": 0.6642154455184937, "epoch": 0.53, "grad_norm": 7.321003721451175, "learning_rate": 2.664362652644806e-07, "logits/chosen": -2.8380165100097656, "logits/rejected": -2.8278119564056396, "logps/chosen": -271.6103820800781, "logps/rejected": -254.341796875, "loss": 0.6834, "positive_losses": 0.34309062361717224, "rewards/accuracies": 0.75, "rewards/chosen": 0.148406982421875, "rewards/margins": 0.06311032921075821, "rewards/margins_max": 0.21719925105571747, "rewards/margins_min": -0.05854882672429085, "rewards/margins_std": 0.12032978236675262, "rewards/rejected": 0.0852966457605362, "step": 2030 }, { "dpo_losses": 0.6638925075531006, "epoch": 0.53, "grad_norm": 2.022652020676524, "learning_rate": 2.6415609094604555e-07, "logits/chosen": -2.616114854812622, "logits/rejected": -2.670973062515259, "logps/chosen": -285.9636535644531, "logps/rejected": -204.63404846191406, "loss": 0.6706, "positive_losses": 0.14490394294261932, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.137128084897995, "rewards/margins": 0.062432728707790375, "rewards/margins_max": 0.17718909680843353, "rewards/margins_min": -0.04789603129029274, "rewards/margins_std": 0.09822549670934677, "rewards/rejected": 0.07469536364078522, "step": 2040 }, { "dpo_losses": 0.6626821160316467, "epoch": 0.54, "grad_norm": 17.23544803436547, "learning_rate": 2.618747345980904e-07, "logits/chosen": -2.7996604442596436, "logits/rejected": -2.7959325313568115, "logps/chosen": -266.400146484375, "logps/rejected": -245.243896484375, "loss": 0.6937, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14582450687885284, "rewards/margins": 0.06493877619504929, "rewards/margins_max": 0.17728032171726227, "rewards/margins_min": -0.05895475670695305, "rewards/margins_std": 0.10525840520858765, "rewards/rejected": 0.08088572323322296, "step": 2050 }, { "dpo_losses": 0.6693453192710876, "epoch": 0.54, "grad_norm": 10.744319390725588, "learning_rate": 2.595923867132136e-07, "logits/chosen": -2.8013222217559814, "logits/rejected": -2.7826759815216064, "logps/chosen": -293.9305419921875, "logps/rejected": -248.7853240966797, "loss": 0.6889, "positive_losses": 0.3333267271518707, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16735167801380157, "rewards/margins": 0.05091765522956848, "rewards/margins_max": 0.16602104902267456, "rewards/margins_min": -0.04544571787118912, "rewards/margins_std": 0.09119518101215363, "rewards/rejected": 0.1164340227842331, "step": 2060 }, { "dpo_losses": 0.6830196976661682, "epoch": 0.54, "grad_norm": 1.9390942567104548, "learning_rate": 2.5730923786680667e-07, "logits/chosen": -2.7646231651306152, "logits/rejected": -2.766101837158203, "logps/chosen": -214.64804077148438, "logps/rejected": -267.1541748046875, "loss": 0.6715, "positive_losses": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.14950905740261078, "rewards/margins": 0.02300949953496456, "rewards/margins_max": 0.11503533273935318, "rewards/margins_min": -0.07700347900390625, "rewards/margins_std": 0.08432716876268387, "rewards/rejected": 0.12649956345558167, "step": 2070 }, { "dpo_losses": 0.6756635308265686, "epoch": 0.54, "grad_norm": 2.0422233549208757, "learning_rate": 2.5502547870114135e-07, "logits/chosen": -2.8308072090148926, "logits/rejected": -2.7787580490112305, "logps/chosen": -208.3933563232422, "logps/rejected": -225.6388397216797, "loss": 0.6679, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15099266171455383, "rewards/margins": 0.03903906047344208, "rewards/margins_max": 0.14198842644691467, "rewards/margins_min": -0.09506646543741226, "rewards/margins_std": 0.10874740779399872, "rewards/rejected": 0.11195359379053116, "step": 2080 }, { "dpo_losses": 0.6744376420974731, "epoch": 0.55, "grad_norm": 8.139545273885084, "learning_rate": 2.527412999094506e-07, "logits/chosen": -2.7396240234375, "logits/rejected": -2.7286412715911865, "logps/chosen": -257.28582763671875, "logps/rejected": -290.82537841796875, "loss": 0.6855, "positive_losses": 0.23939552903175354, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12208755314350128, "rewards/margins": 0.04112662002444267, "rewards/margins_max": 0.15945208072662354, "rewards/margins_min": -0.07624942809343338, "rewards/margins_std": 0.10272153466939926, "rewards/rejected": 0.08096092194318771, "step": 2090 }, { "dpo_losses": 0.6722500920295715, "epoch": 0.55, "grad_norm": 1.7640498739488435, "learning_rate": 2.5045689222000636e-07, "logits/chosen": -2.7935140132904053, "logits/rejected": -2.794473171234131, "logps/chosen": -237.05728149414062, "logps/rejected": -202.26231384277344, "loss": 0.6787, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13129688799381256, "rewards/margins": 0.04541920870542526, "rewards/margins_max": 0.1597176045179367, "rewards/margins_min": -0.04207003861665726, "rewards/margins_std": 0.09029006958007812, "rewards/rejected": 0.0858776792883873, "step": 2100 }, { "epoch": 0.55, "eval_dpo_losses": 0.6646059155464172, "eval_logits/chosen": -2.783231735229492, "eval_logits/rejected": -2.7445175647735596, "eval_logps/chosen": -269.0392761230469, "eval_logps/rejected": -252.820556640625, "eval_loss": 0.6826277375221252, "eval_positive_losses": 0.12551376223564148, "eval_rewards/accuracies": 0.7182539701461792, "eval_rewards/chosen": 0.15455636382102966, "eval_rewards/margins": 0.06126810982823372, "eval_rewards/margins_max": 0.2372942566871643, "eval_rewards/margins_min": -0.09696952998638153, "eval_rewards/margins_std": 0.11048813909292221, "eval_rewards/rejected": 0.09328825026750565, "eval_runtime": 399.0656, "eval_samples_per_second": 5.012, "eval_steps_per_second": 0.158, "step": 2100 }, { "dpo_losses": 0.6669995188713074, "epoch": 0.55, "grad_norm": 1.8424762916851205, "learning_rate": 2.481724463801933e-07, "logits/chosen": -2.7471299171447754, "logits/rejected": -2.6674296855926514, "logps/chosen": -251.7501983642578, "logps/rejected": -212.1397247314453, "loss": 0.6783, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15100225806236267, "rewards/margins": 0.0555257685482502, "rewards/margins_max": 0.18034568428993225, "rewards/margins_min": -0.029644068330526352, "rewards/margins_std": 0.09330420196056366, "rewards/rejected": 0.09547650068998337, "step": 2110 }, { "dpo_losses": 0.6630354523658752, "epoch": 0.55, "grad_norm": 21.202486662507965, "learning_rate": 2.4588815314058154e-07, "logits/chosen": -2.729743003845215, "logits/rejected": -2.711665391921997, "logps/chosen": -227.17294311523438, "logps/rejected": -248.97134399414062, "loss": 0.673, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.1562889814376831, "rewards/margins": 0.06316865980625153, "rewards/margins_max": 0.13936588168144226, "rewards/margins_min": -0.014763864688575268, "rewards/margins_std": 0.07098125666379929, "rewards/rejected": 0.09312032163143158, "step": 2120 }, { "dpo_losses": 0.6803591847419739, "epoch": 0.56, "grad_norm": 2.3717799191522384, "learning_rate": 2.4360420323899917e-07, "logits/chosen": -2.7886428833007812, "logits/rejected": -2.7954256534576416, "logps/chosen": -194.2913055419922, "logps/rejected": -247.00051879882812, "loss": 0.6785, "positive_losses": 0.07509269565343857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14525488018989563, "rewards/margins": 0.02834526263177395, "rewards/margins_max": 0.1326114535331726, "rewards/margins_min": -0.0706716850399971, "rewards/margins_std": 0.09119173139333725, "rewards/rejected": 0.11690962314605713, "step": 2130 }, { "dpo_losses": 0.6663404107093811, "epoch": 0.56, "grad_norm": 2.462527388212223, "learning_rate": 2.4132078738460583e-07, "logits/chosen": -2.8206303119659424, "logits/rejected": -2.8253495693206787, "logps/chosen": -274.0443420410156, "logps/rejected": -268.05853271484375, "loss": 0.6789, "positive_losses": 0.10881118476390839, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16541233658790588, "rewards/margins": 0.0581609308719635, "rewards/margins_max": 0.18914298713207245, "rewards/margins_min": -0.05697988346219063, "rewards/margins_std": 0.10790137201547623, "rewards/rejected": 0.10725139081478119, "step": 2140 }, { "dpo_losses": 0.6610409617424011, "epoch": 0.56, "grad_norm": 8.627465821659698, "learning_rate": 2.390380962419682e-07, "logits/chosen": -2.8324084281921387, "logits/rejected": -2.7803261280059814, "logps/chosen": -268.6856994628906, "logps/rejected": -203.6580810546875, "loss": 0.6753, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1652398705482483, "rewards/margins": 0.06922824680805206, "rewards/margins_max": 0.20310267806053162, "rewards/margins_min": -0.04356042295694351, "rewards/margins_std": 0.11182417720556259, "rewards/rejected": 0.09601160883903503, "step": 2150 }, { "dpo_losses": 0.672164797782898, "epoch": 0.57, "grad_norm": 6.040860598055274, "learning_rate": 2.3675632041513977e-07, "logits/chosen": -2.655301570892334, "logits/rejected": -2.660374879837036, "logps/chosen": -208.69058227539062, "logps/rejected": -226.7170867919922, "loss": 0.6758, "positive_losses": 0.11751461029052734, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1308482438325882, "rewards/margins": 0.04433317855000496, "rewards/margins_max": 0.1322614550590515, "rewards/margins_min": -0.02744942344725132, "rewards/margins_std": 0.07075206935405731, "rewards/rejected": 0.08651508390903473, "step": 2160 }, { "dpo_losses": 0.6654033064842224, "epoch": 0.57, "grad_norm": 9.69070952377682, "learning_rate": 2.344756504317453e-07, "logits/chosen": -2.625159502029419, "logits/rejected": -2.6462783813476562, "logps/chosen": -237.1065216064453, "logps/rejected": -233.6138916015625, "loss": 0.6773, "positive_losses": 0.12059593200683594, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14112631976604462, "rewards/margins": 0.05939141660928726, "rewards/margins_max": 0.1860862672328949, "rewards/margins_min": -0.05647587776184082, "rewards/margins_std": 0.10976777970790863, "rewards/rejected": 0.08173491060733795, "step": 2170 }, { "dpo_losses": 0.6683284640312195, "epoch": 0.57, "grad_norm": 20.456535597036282, "learning_rate": 2.3219627672707237e-07, "logits/chosen": -2.898542642593384, "logits/rejected": -2.832404375076294, "logps/chosen": -323.9666442871094, "logps/rejected": -243.0486297607422, "loss": 0.675, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16770347952842712, "rewards/margins": 0.0541713610291481, "rewards/margins_max": 0.19824166595935822, "rewards/margins_min": -0.07504001259803772, "rewards/margins_std": 0.12081035226583481, "rewards/rejected": 0.11353211104869843, "step": 2180 }, { "dpo_losses": 0.6708992719650269, "epoch": 0.57, "grad_norm": 5.65471210964526, "learning_rate": 2.2991838962816918e-07, "logits/chosen": -2.854893207550049, "logits/rejected": -2.881042718887329, "logps/chosen": -269.02606201171875, "logps/rejected": -322.48980712890625, "loss": 0.6757, "positive_losses": 0.005802154541015625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13117845356464386, "rewards/margins": 0.048213545233011246, "rewards/margins_max": 0.16716358065605164, "rewards/margins_min": -0.08829066902399063, "rewards/margins_std": 0.11207526922225952, "rewards/rejected": 0.08296488225460052, "step": 2190 }, { "dpo_losses": 0.6560848951339722, "epoch": 0.58, "grad_norm": 2.031732237865772, "learning_rate": 2.2764217933795297e-07, "logits/chosen": -2.8572869300842285, "logits/rejected": -2.7690012454986572, "logps/chosen": -347.05364990234375, "logps/rejected": -270.6988525390625, "loss": 0.6738, "positive_losses": 0.16602382063865662, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17479580640792847, "rewards/margins": 0.07932952046394348, "rewards/margins_max": 0.21747556328773499, "rewards/margins_min": -0.0325501449406147, "rewards/margins_std": 0.11231104284524918, "rewards/rejected": 0.09546627104282379, "step": 2200 }, { "epoch": 0.58, "eval_dpo_losses": 0.6644929647445679, "eval_logits/chosen": -2.780320644378662, "eval_logits/rejected": -2.7417633533477783, "eval_logps/chosen": -268.6586608886719, "eval_logps/rejected": -252.46463012695312, "eval_loss": 0.6815550327301025, "eval_positive_losses": 0.1156652644276619, "eval_rewards/accuracies": 0.7182539701461792, "eval_rewards/chosen": 0.1583622545003891, "eval_rewards/margins": 0.06151484698057175, "eval_rewards/margins_max": 0.23826445639133453, "eval_rewards/margins_min": -0.09692387282848358, "eval_rewards/margins_std": 0.11083362251520157, "eval_rewards/rejected": 0.09684741497039795, "eval_runtime": 389.3755, "eval_samples_per_second": 5.136, "eval_steps_per_second": 0.162, "step": 2200 }, { "dpo_losses": 0.6702221035957336, "epoch": 0.58, "grad_norm": 1.8553276570532913, "learning_rate": 2.253678359193278e-07, "logits/chosen": -2.837172746658325, "logits/rejected": -2.839228868484497, "logps/chosen": -249.7177276611328, "logps/rejected": -244.18408203125, "loss": 0.676, "positive_losses": 0.37015992403030396, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16470842063426971, "rewards/margins": 0.0498427152633667, "rewards/margins_max": 0.17060586810112, "rewards/margins_min": -0.08222378045320511, "rewards/margins_std": 0.11427643150091171, "rewards/rejected": 0.11486568301916122, "step": 2210 }, { "dpo_losses": 0.6609566807746887, "epoch": 0.58, "grad_norm": 2.680723118088112, "learning_rate": 2.230955492793149e-07, "logits/chosen": -2.8189260959625244, "logits/rejected": -2.804194688796997, "logps/chosen": -306.3068542480469, "logps/rejected": -256.5824890136719, "loss": 0.6763, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15530112385749817, "rewards/margins": 0.06911114603281021, "rewards/margins_max": 0.2111690789461136, "rewards/margins_min": -0.06732382625341415, "rewards/margins_std": 0.1288124918937683, "rewards/rejected": 0.08619000017642975, "step": 2220 }, { "dpo_losses": 0.6690382957458496, "epoch": 0.58, "grad_norm": 6.015237533179085, "learning_rate": 2.2082550915319468e-07, "logits/chosen": -2.770582675933838, "logits/rejected": -2.7712674140930176, "logps/chosen": -246.59390258789062, "logps/rejected": -265.6105041503906, "loss": 0.6801, "positive_losses": 0.2791542112827301, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1340925395488739, "rewards/margins": 0.05243048071861267, "rewards/margins_max": 0.18013811111450195, "rewards/margins_min": -0.05579759553074837, "rewards/margins_std": 0.10320155322551727, "rewards/rejected": 0.08166205883026123, "step": 2230 }, { "dpo_losses": 0.667000412940979, "epoch": 0.59, "grad_norm": 2.614393513591382, "learning_rate": 2.1855790508866433e-07, "logits/chosen": -2.8280067443847656, "logits/rejected": -2.782679557800293, "logps/chosen": -272.31341552734375, "logps/rejected": -226.23159790039062, "loss": 0.6911, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15997463464736938, "rewards/margins": 0.05621781200170517, "rewards/margins_max": 0.1634344756603241, "rewards/margins_min": -0.05732632428407669, "rewards/margins_std": 0.1002826914191246, "rewards/rejected": 0.10375680774450302, "step": 2240 }, { "dpo_losses": 0.6544098258018494, "epoch": 0.59, "grad_norm": 6.138037993242967, "learning_rate": 2.162929264300107e-07, "logits/chosen": -2.8011958599090576, "logits/rejected": -2.787996292114258, "logps/chosen": -271.0982360839844, "logps/rejected": -234.0089111328125, "loss": 0.6835, "positive_losses": 0.12156429141759872, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1650722324848175, "rewards/margins": 0.08291522413492203, "rewards/margins_max": 0.24250057339668274, "rewards/margins_min": -0.02806040272116661, "rewards/margins_std": 0.12254680693149567, "rewards/rejected": 0.08215700834989548, "step": 2250 }, { "dpo_losses": 0.667629599571228, "epoch": 0.59, "grad_norm": 2.031461968732793, "learning_rate": 2.1403076230230005e-07, "logits/chosen": -2.7610487937927246, "logits/rejected": -2.715662717819214, "logps/chosen": -227.35494995117188, "logps/rejected": -223.0266571044922, "loss": 0.6805, "positive_losses": 0.009455109015107155, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.139937624335289, "rewards/margins": 0.05409275367856026, "rewards/margins_max": 0.167481929063797, "rewards/margins_min": -0.03251870721578598, "rewards/margins_std": 0.09049482643604279, "rewards/rejected": 0.08584487438201904, "step": 2260 }, { "dpo_losses": 0.6574854850769043, "epoch": 0.59, "grad_norm": 2.015774625207423, "learning_rate": 2.1177160159558596e-07, "logits/chosen": -2.7960407733917236, "logits/rejected": -2.708859443664551, "logps/chosen": -246.37698364257812, "logps/rejected": -247.0392608642578, "loss": 0.6688, "positive_losses": 0.21091079711914062, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17055392265319824, "rewards/margins": 0.07748468220233917, "rewards/margins_max": 0.24710354208946228, "rewards/margins_min": -0.05870268493890762, "rewards/margins_std": 0.13558170199394226, "rewards/rejected": 0.09306924045085907, "step": 2270 }, { "dpo_losses": 0.6679006814956665, "epoch": 0.6, "grad_norm": 10.93004175120596, "learning_rate": 2.0951563294913734e-07, "logits/chosen": -2.8326520919799805, "logits/rejected": -2.8108348846435547, "logps/chosen": -244.97140502929688, "logps/rejected": -255.8402862548828, "loss": 0.6984, "positive_losses": 0.2469741851091385, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14643892645835876, "rewards/margins": 0.05538954213261604, "rewards/margins_max": 0.17577563226222992, "rewards/margins_min": -0.08614195883274078, "rewards/margins_std": 0.12158197164535522, "rewards/rejected": 0.09104935824871063, "step": 2280 }, { "dpo_losses": 0.6632338762283325, "epoch": 0.6, "grad_norm": 1.677939068316556, "learning_rate": 2.072630447356869e-07, "logits/chosen": -2.7692759037017822, "logits/rejected": -2.681032657623291, "logps/chosen": -209.3841094970703, "logps/rejected": -208.48489379882812, "loss": 0.6739, "positive_losses": 0.0, "rewards/accuracies": 0.875, "rewards/chosen": 0.17293371260166168, "rewards/margins": 0.06350459158420563, "rewards/margins_max": 0.17482289671897888, "rewards/margins_min": -0.030863529071211815, "rewards/margins_std": 0.09300075471401215, "rewards/rejected": 0.10942912101745605, "step": 2290 }, { "dpo_losses": 0.6755298376083374, "epoch": 0.6, "grad_norm": 9.301373484924401, "learning_rate": 2.0501402504570232e-07, "logits/chosen": -2.7473981380462646, "logits/rejected": -2.7651193141937256, "logps/chosen": -241.1534881591797, "logps/rejected": -235.19644165039062, "loss": 0.675, "positive_losses": 0.027724647894501686, "rewards/accuracies": 0.625, "rewards/chosen": 0.14800508320331573, "rewards/margins": 0.04038618132472038, "rewards/margins_max": 0.1762312650680542, "rewards/margins_min": -0.09547598659992218, "rewards/margins_std": 0.1235724464058876, "rewards/rejected": 0.10761890560388565, "step": 2300 }, { "epoch": 0.6, "eval_dpo_losses": 0.6642228960990906, "eval_logits/chosen": -2.7834320068359375, "eval_logits/rejected": -2.744966506958008, "eval_logps/chosen": -268.5911560058594, "eval_logps/rejected": -252.45948791503906, "eval_loss": 0.6816120743751526, "eval_positive_losses": 0.12096957862377167, "eval_rewards/accuracies": 0.7242063283920288, "eval_rewards/chosen": 0.15903764963150024, "eval_rewards/margins": 0.06213868409395218, "eval_rewards/margins_max": 0.24037744104862213, "eval_rewards/margins_min": -0.09743154793977737, "eval_rewards/margins_std": 0.11178537458181381, "eval_rewards/rejected": 0.09689898043870926, "eval_runtime": 390.1632, "eval_samples_per_second": 5.126, "eval_steps_per_second": 0.161, "step": 2300 }, { "dpo_losses": 0.6589769124984741, "epoch": 0.6, "grad_norm": 5.2593957294275455, "learning_rate": 2.027687616716804e-07, "logits/chosen": -2.848520517349243, "logits/rejected": -2.7592415809631348, "logps/chosen": -311.2559509277344, "logps/rejected": -236.20046997070312, "loss": 0.6641, "positive_losses": 0.06801052391529083, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16040074825286865, "rewards/margins": 0.07437174022197723, "rewards/margins_max": 0.25320133566856384, "rewards/margins_min": -0.03677508980035782, "rewards/margins_std": 0.1288147270679474, "rewards/rejected": 0.08602902293205261, "step": 2310 }, { "dpo_losses": 0.677810788154602, "epoch": 0.61, "grad_norm": 11.593297903289386, "learning_rate": 2.005274420924668e-07, "logits/chosen": -2.7715988159179688, "logits/rejected": -2.731139659881592, "logps/chosen": -266.9964599609375, "logps/rejected": -257.47216796875, "loss": 0.6717, "positive_losses": 0.07237549126148224, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16181442141532898, "rewards/margins": 0.03579026088118553, "rewards/margins_max": 0.17425528168678284, "rewards/margins_min": -0.10560673475265503, "rewards/margins_std": 0.12581433355808258, "rewards/rejected": 0.12602415680885315, "step": 2320 }, { "dpo_losses": 0.6600149869918823, "epoch": 0.61, "grad_norm": 6.220344684041331, "learning_rate": 1.9829025345760121e-07, "logits/chosen": -2.8089206218719482, "logits/rejected": -2.7957355976104736, "logps/chosen": -284.0976257324219, "logps/rejected": -308.8465881347656, "loss": 0.6715, "positive_losses": 0.14536181092262268, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16044309735298157, "rewards/margins": 0.0716666579246521, "rewards/margins_max": 0.19783233106136322, "rewards/margins_min": -0.04586394503712654, "rewards/margins_std": 0.1109810620546341, "rewards/rejected": 0.08877645432949066, "step": 2330 }, { "dpo_losses": 0.679665744304657, "epoch": 0.61, "grad_norm": 2.0598822656338167, "learning_rate": 1.960573825716911e-07, "logits/chosen": -2.822514533996582, "logits/rejected": -2.7786355018615723, "logps/chosen": -318.4324951171875, "logps/rejected": -318.00213623046875, "loss": 0.676, "positive_losses": 0.089727021753788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.15282617509365082, "rewards/margins": 0.03167200833559036, "rewards/margins_max": 0.17082737386226654, "rewards/margins_min": -0.11809341609477997, "rewards/margins_std": 0.12609757483005524, "rewards/rejected": 0.12115416675806046, "step": 2340 }, { "dpo_losses": 0.6711980700492859, "epoch": 0.62, "grad_norm": 2.115547248617438, "learning_rate": 1.9382901587881273e-07, "logits/chosen": -2.834235429763794, "logits/rejected": -2.8224523067474365, "logps/chosen": -294.26800537109375, "logps/rejected": -243.66738891601562, "loss": 0.6746, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1676826775074005, "rewards/margins": 0.047168977558612823, "rewards/margins_max": 0.14759781956672668, "rewards/margins_min": -0.06013220548629761, "rewards/margins_std": 0.09161853790283203, "rewards/rejected": 0.1205136775970459, "step": 2350 }, { "dpo_losses": 0.6665584444999695, "epoch": 0.62, "grad_norm": 7.707595487455357, "learning_rate": 1.9160533944694364e-07, "logits/chosen": -2.813931703567505, "logits/rejected": -2.763089179992676, "logps/chosen": -270.51202392578125, "logps/rejected": -204.17807006835938, "loss": 0.6854, "positive_losses": 0.49280548095703125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15664884448051453, "rewards/margins": 0.05828050523996353, "rewards/margins_max": 0.1808789223432541, "rewards/margins_min": -0.05433495715260506, "rewards/margins_std": 0.10690093040466309, "rewards/rejected": 0.09836836159229279, "step": 2360 }, { "dpo_losses": 0.6780496835708618, "epoch": 0.62, "grad_norm": 2.712138500823685, "learning_rate": 1.8938653895242602e-07, "logits/chosen": -2.854897975921631, "logits/rejected": -2.810944080352783, "logps/chosen": -243.7271270751953, "logps/rejected": -226.6142120361328, "loss": 0.6882, "positive_losses": 0.14659099280834198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13124720752239227, "rewards/margins": 0.0334155447781086, "rewards/margins_max": 0.12614381313323975, "rewards/margins_min": -0.08595071732997894, "rewards/margins_std": 0.09597662836313248, "rewards/rejected": 0.09783166646957397, "step": 2370 }, { "dpo_losses": 0.6700248718261719, "epoch": 0.62, "grad_norm": 2.1139950489863195, "learning_rate": 1.8717279966446264e-07, "logits/chosen": -2.8533968925476074, "logits/rejected": -2.8190789222717285, "logps/chosen": -231.2190704345703, "logps/rejected": -203.7569580078125, "loss": 0.6723, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.14022384583950043, "rewards/margins": 0.04948741942644119, "rewards/margins_max": 0.1588309109210968, "rewards/margins_min": -0.044143229722976685, "rewards/margins_std": 0.09119255840778351, "rewards/rejected": 0.09073643386363983, "step": 2380 }, { "dpo_losses": 0.6732557415962219, "epoch": 0.63, "grad_norm": 23.04693061433968, "learning_rate": 1.8496430642964694e-07, "logits/chosen": -2.847764492034912, "logits/rejected": -2.849989414215088, "logps/chosen": -266.61572265625, "logps/rejected": -279.8492431640625, "loss": 0.6739, "positive_losses": 0.03173675388097763, "rewards/accuracies": 0.625, "rewards/chosen": 0.16146275401115417, "rewards/margins": 0.044871553778648376, "rewards/margins_max": 0.18712307512760162, "rewards/margins_min": -0.1083759069442749, "rewards/margins_std": 0.12768994271755219, "rewards/rejected": 0.1165911927819252, "step": 2390 }, { "dpo_losses": 0.6605753898620605, "epoch": 0.63, "grad_norm": 1.9620799350067777, "learning_rate": 1.8276124365652855e-07, "logits/chosen": -2.7887072563171387, "logits/rejected": -2.7103452682495117, "logps/chosen": -257.5548095703125, "logps/rejected": -290.41815185546875, "loss": 0.6821, "positive_losses": 0.10989990085363388, "rewards/accuracies": 0.75, "rewards/chosen": 0.18278703093528748, "rewards/margins": 0.07031223922967911, "rewards/margins_max": 0.2028309404850006, "rewards/margins_min": -0.05043686553835869, "rewards/margins_std": 0.11253918707370758, "rewards/rejected": 0.11247478425502777, "step": 2400 }, { "epoch": 0.63, "eval_dpo_losses": 0.663316547870636, "eval_logits/chosen": -2.784898042678833, "eval_logits/rejected": -2.7465882301330566, "eval_logps/chosen": -268.8607482910156, "eval_logps/rejected": -252.93472290039062, "eval_loss": 0.683151125907898, "eval_positive_losses": 0.14105184376239777, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": 0.15634165704250336, "eval_rewards/margins": 0.06419505923986435, "eval_rewards/margins_max": 0.2465353161096573, "eval_rewards/margins_min": -0.10097683221101761, "eval_rewards/margins_std": 0.11476168036460876, "eval_rewards/rejected": 0.09214659780263901, "eval_runtime": 389.2899, "eval_samples_per_second": 5.138, "eval_steps_per_second": 0.162, "step": 2400 }, { "dpo_losses": 0.6710997819900513, "epoch": 0.63, "grad_norm": 2.284679604403075, "learning_rate": 1.805637953002149e-07, "logits/chosen": -2.8017404079437256, "logits/rejected": -2.7698140144348145, "logps/chosen": -288.701416015625, "logps/rejected": -244.4599609375, "loss": 0.6725, "positive_losses": 0.056781768798828125, "rewards/accuracies": 0.625, "rewards/chosen": 0.17063315212726593, "rewards/margins": 0.04911976680159569, "rewards/margins_max": 0.19049224257469177, "rewards/margins_min": -0.08515028655529022, "rewards/margins_std": 0.12400348484516144, "rewards/rejected": 0.12151336669921875, "step": 2410 }, { "dpo_losses": 0.6658786535263062, "epoch": 0.63, "grad_norm": 11.696369253359672, "learning_rate": 1.7837214484701153e-07, "logits/chosen": -2.834495782852173, "logits/rejected": -2.774224042892456, "logps/chosen": -341.9219970703125, "logps/rejected": -286.7921142578125, "loss": 0.6863, "positive_losses": 0.20940017700195312, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16808286309242249, "rewards/margins": 0.05958018824458122, "rewards/margins_max": 0.17783430218696594, "rewards/margins_min": -0.08136147260665894, "rewards/margins_std": 0.11622990667819977, "rewards/rejected": 0.10850267112255096, "step": 2420 }, { "dpo_losses": 0.6711954474449158, "epoch": 0.64, "grad_norm": 2.085611496499541, "learning_rate": 1.761864752991004e-07, "logits/chosen": -2.8036391735076904, "logits/rejected": -2.780700922012329, "logps/chosen": -275.9913635253906, "logps/rejected": -250.76846313476562, "loss": 0.6908, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.16365042328834534, "rewards/margins": 0.048547059297561646, "rewards/margins_max": 0.2041003406047821, "rewards/margins_min": -0.08558313548564911, "rewards/margins_std": 0.12569783627986908, "rewards/rejected": 0.11510336399078369, "step": 2430 }, { "dpo_losses": 0.6707032918930054, "epoch": 0.64, "grad_norm": 6.078052196704086, "learning_rate": 1.7400696915925995e-07, "logits/chosen": -2.7483696937561035, "logits/rejected": -2.706458568572998, "logps/chosen": -277.4805908203125, "logps/rejected": -247.01528930664062, "loss": 0.6767, "positive_losses": 0.11072997748851776, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1552312672138214, "rewards/margins": 0.05023397132754326, "rewards/margins_max": 0.2068198025226593, "rewards/margins_min": -0.0821874588727951, "rewards/margins_std": 0.12569008767604828, "rewards/rejected": 0.10499731451272964, "step": 2440 }, { "dpo_losses": 0.6737798452377319, "epoch": 0.64, "grad_norm": 1.9705347115847889, "learning_rate": 1.718338084156254e-07, "logits/chosen": -2.836562156677246, "logits/rejected": -2.8479294776916504, "logps/chosen": -281.20611572265625, "logps/rejected": -335.68292236328125, "loss": 0.6803, "positive_losses": 0.18814153969287872, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1451827734708786, "rewards/margins": 0.043305326253175735, "rewards/margins_max": 0.14558716118335724, "rewards/margins_min": -0.09249875694513321, "rewards/margins_std": 0.1084173321723938, "rewards/rejected": 0.10187745094299316, "step": 2450 }, { "dpo_losses": 0.6762491464614868, "epoch": 0.64, "grad_norm": 9.533198923439455, "learning_rate": 1.696671745264937e-07, "logits/chosen": -2.7576818466186523, "logits/rejected": -2.7617905139923096, "logps/chosen": -230.6830291748047, "logps/rejected": -228.86172485351562, "loss": 0.6838, "positive_losses": 0.06390075385570526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14102314412593842, "rewards/margins": 0.038981709629297256, "rewards/margins_max": 0.1658230423927307, "rewards/margins_min": -0.1091441735625267, "rewards/margins_std": 0.12223289906978607, "rewards/rejected": 0.10204143822193146, "step": 2460 }, { "dpo_losses": 0.6576075553894043, "epoch": 0.65, "grad_norm": 7.047728087221175, "learning_rate": 1.67507248405171e-07, "logits/chosen": -2.7969839572906494, "logits/rejected": -2.811170816421509, "logps/chosen": -324.28948974609375, "logps/rejected": -287.66900634765625, "loss": 0.6849, "positive_losses": 0.25962066650390625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16784675419330597, "rewards/margins": 0.07765379548072815, "rewards/margins_max": 0.24460339546203613, "rewards/margins_min": -0.05037788301706314, "rewards/margins_std": 0.13253864645957947, "rewards/rejected": 0.09019295871257782, "step": 2470 }, { "dpo_losses": 0.6706292033195496, "epoch": 0.65, "grad_norm": 1.7907451063422377, "learning_rate": 1.6535421040486683e-07, "logits/chosen": -2.9084179401397705, "logits/rejected": -2.8807246685028076, "logps/chosen": -287.76953125, "logps/rejected": -237.6548614501953, "loss": 0.681, "positive_losses": 0.00829315185546875, "rewards/accuracies": 0.625, "rewards/chosen": 0.15879002213478088, "rewards/margins": 0.04883551970124245, "rewards/margins_max": 0.16194190084934235, "rewards/margins_min": -0.0865946039557457, "rewards/margins_std": 0.11053447425365448, "rewards/rejected": 0.10995452105998993, "step": 2480 }, { "dpo_losses": 0.6417179703712463, "epoch": 0.65, "grad_norm": 7.307945463297551, "learning_rate": 1.6320824030363456e-07, "logits/chosen": -2.678077220916748, "logits/rejected": -2.6205215454101562, "logps/chosen": -313.7894287109375, "logps/rejected": -258.7198791503906, "loss": 0.6577, "positive_losses": 0.048689745366573334, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.18587902188301086, "rewards/margins": 0.11005387455224991, "rewards/margins_max": 0.25625452399253845, "rewards/margins_min": -0.04094923287630081, "rewards/margins_std": 0.12633609771728516, "rewards/rejected": 0.07582515478134155, "step": 2490 }, { "dpo_losses": 0.6599959135055542, "epoch": 0.65, "grad_norm": 7.724047121209455, "learning_rate": 1.6106951728936024e-07, "logits/chosen": -2.7736682891845703, "logits/rejected": -2.790349245071411, "logps/chosen": -239.8456573486328, "logps/rejected": -275.7630310058594, "loss": 0.6881, "positive_losses": 0.1184925064444542, "rewards/accuracies": 0.75, "rewards/chosen": 0.1587900072336197, "rewards/margins": 0.07100746780633926, "rewards/margins_max": 0.21270182728767395, "rewards/margins_min": -0.03353138267993927, "rewards/margins_std": 0.11178290843963623, "rewards/rejected": 0.08778254687786102, "step": 2500 }, { "epoch": 0.65, "eval_dpo_losses": 0.6630551815032959, "eval_logits/chosen": -2.787369966506958, "eval_logits/rejected": -2.749213695526123, "eval_logps/chosen": -268.79345703125, "eval_logps/rejected": -252.9271697998047, "eval_loss": 0.6830382347106934, "eval_positive_losses": 0.14260074496269226, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": 0.15701442956924438, "eval_rewards/margins": 0.06479236483573914, "eval_rewards/margins_max": 0.24739092588424683, "eval_rewards/margins_min": -0.10220367461442947, "eval_rewards/margins_std": 0.11561723798513412, "eval_rewards/rejected": 0.09222202748060226, "eval_runtime": 389.6283, "eval_samples_per_second": 5.133, "eval_steps_per_second": 0.162, "step": 2500 }, { "dpo_losses": 0.6517472267150879, "epoch": 0.66, "grad_norm": 1.9878626534549249, "learning_rate": 1.5893821994479994e-07, "logits/chosen": -2.7108683586120605, "logits/rejected": -2.649024486541748, "logps/chosen": -240.2740478515625, "logps/rejected": -232.99734497070312, "loss": 0.6704, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.16711577773094177, "rewards/margins": 0.08962143212556839, "rewards/margins_max": 0.23768293857574463, "rewards/margins_min": -0.05823158472776413, "rewards/margins_std": 0.12899050116539001, "rewards/rejected": 0.07749433815479279, "step": 2510 }, { "dpo_losses": 0.6465938687324524, "epoch": 0.66, "grad_norm": 12.428593652764857, "learning_rate": 1.5681452623266867e-07, "logits/chosen": -2.8300561904907227, "logits/rejected": -2.7787134647369385, "logps/chosen": -244.55810546875, "logps/rejected": -229.2414093017578, "loss": 0.6752, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1652805656194687, "rewards/margins": 0.10005130618810654, "rewards/margins_max": 0.251314252614975, "rewards/margins_min": -0.03838371858000755, "rewards/margins_std": 0.12975125014781952, "rewards/rejected": 0.06522925198078156, "step": 2520 }, { "dpo_losses": 0.6602810025215149, "epoch": 0.66, "grad_norm": 7.703697446402158, "learning_rate": 1.546986134807801e-07, "logits/chosen": -2.7837038040161133, "logits/rejected": -2.819833517074585, "logps/chosen": -269.00579833984375, "logps/rejected": -288.3307800292969, "loss": 0.6664, "positive_losses": 0.2041366547346115, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14903002977371216, "rewards/margins": 0.07221291214227676, "rewards/margins_max": 0.24120278656482697, "rewards/margins_min": -0.07335890829563141, "rewards/margins_std": 0.13466687500476837, "rewards/rejected": 0.0768171027302742, "step": 2530 }, { "dpo_losses": 0.6748021841049194, "epoch": 0.66, "grad_norm": 2.0470362010115877, "learning_rate": 1.5259065836724034e-07, "logits/chosen": -2.728024959564209, "logits/rejected": -2.743741512298584, "logps/chosen": -272.1194152832031, "logps/rejected": -314.8052062988281, "loss": 0.683, "positive_losses": 0.09459342807531357, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12930192053318024, "rewards/margins": 0.04087451845407486, "rewards/margins_max": 0.16113092005252838, "rewards/margins_min": -0.0958721712231636, "rewards/margins_std": 0.11435544490814209, "rewards/rejected": 0.08842740207910538, "step": 2540 }, { "dpo_losses": 0.6604206562042236, "epoch": 0.67, "grad_norm": 1.6351465216728645, "learning_rate": 1.5049083690569454e-07, "logits/chosen": -2.8244802951812744, "logits/rejected": -2.795187473297119, "logps/chosen": -257.86102294921875, "logps/rejected": -270.40325927734375, "loss": 0.6749, "positive_losses": 0.15022125840187073, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1673004925251007, "rewards/margins": 0.07084228098392487, "rewards/margins_max": 0.20384936034679413, "rewards/margins_min": -0.05480436235666275, "rewards/margins_std": 0.1157851442694664, "rewards/rejected": 0.09645821899175644, "step": 2550 }, { "dpo_losses": 0.6679819822311401, "epoch": 0.67, "grad_norm": 2.1467085106051065, "learning_rate": 1.4839932443063056e-07, "logits/chosen": -2.6807057857513428, "logits/rejected": -2.6425938606262207, "logps/chosen": -185.5140380859375, "logps/rejected": -180.81089782714844, "loss": 0.6694, "positive_losses": 0.00739707937464118, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.14547543227672577, "rewards/margins": 0.05403967574238777, "rewards/margins_max": 0.18117661774158478, "rewards/margins_min": -0.05315234512090683, "rewards/margins_std": 0.10514678806066513, "rewards/rejected": 0.09143576771020889, "step": 2560 }, { "dpo_losses": 0.6649169921875, "epoch": 0.67, "grad_norm": 13.485934241396896, "learning_rate": 1.46316295582738e-07, "logits/chosen": -2.8397908210754395, "logits/rejected": -2.802277088165283, "logps/chosen": -268.9765319824219, "logps/rejected": -267.4847717285156, "loss": 0.6949, "positive_losses": 0.15890884399414062, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.16603443026542664, "rewards/margins": 0.0631035789847374, "rewards/margins_max": 0.17990688979625702, "rewards/margins_min": -0.059853047132492065, "rewards/margins_std": 0.10987289249897003, "rewards/rejected": 0.10293082892894745, "step": 2570 }, { "dpo_losses": 0.6597756147384644, "epoch": 0.68, "grad_norm": 9.548073651225087, "learning_rate": 1.4424192429432655e-07, "logits/chosen": -2.796020030975342, "logits/rejected": -2.770440101623535, "logps/chosen": -251.7606201171875, "logps/rejected": -265.84588623046875, "loss": 0.6803, "positive_losses": 0.09473152458667755, "rewards/accuracies": 0.75, "rewards/chosen": 0.16906090080738068, "rewards/margins": 0.071074478328228, "rewards/margins_max": 0.1876489222049713, "rewards/margins_min": -0.04980349540710449, "rewards/margins_std": 0.10807528346776962, "rewards/rejected": 0.09798641502857208, "step": 2580 }, { "dpo_losses": 0.6544302105903625, "epoch": 0.68, "grad_norm": 4.496828020922733, "learning_rate": 1.4217638377480158e-07, "logits/chosen": -2.798560380935669, "logits/rejected": -2.7506070137023926, "logps/chosen": -284.4412841796875, "logps/rejected": -232.6147918701172, "loss": 0.6724, "positive_losses": 0.13090820610523224, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1532086730003357, "rewards/margins": 0.08290395885705948, "rewards/margins_max": 0.20642821490764618, "rewards/margins_min": -0.04796721413731575, "rewards/margins_std": 0.11846397072076797, "rewards/rejected": 0.07030472904443741, "step": 2590 }, { "dpo_losses": 0.6792913675308228, "epoch": 0.68, "grad_norm": 13.357376038313298, "learning_rate": 1.401198464962021e-07, "logits/chosen": -2.6972367763519287, "logits/rejected": -2.7522435188293457, "logps/chosen": -216.80258178710938, "logps/rejected": -253.00991821289062, "loss": 0.6871, "positive_losses": 0.30749091506004333, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12594708800315857, "rewards/margins": 0.030662814155220985, "rewards/margins_max": 0.13051792979240417, "rewards/margins_min": -0.06012769415974617, "rewards/margins_std": 0.08853994309902191, "rewards/rejected": 0.09528429806232452, "step": 2600 }, { "epoch": 0.68, "eval_dpo_losses": 0.6637357473373413, "eval_logits/chosen": -2.783560037612915, "eval_logits/rejected": -2.7451324462890625, "eval_logps/chosen": -268.1626281738281, "eval_logps/rejected": -252.1409149169922, "eval_loss": 0.6808480620384216, "eval_positive_losses": 0.11579447984695435, "eval_rewards/accuracies": 0.7242063283920288, "eval_rewards/chosen": 0.16332247853279114, "eval_rewards/margins": 0.06323818117380142, "eval_rewards/margins_max": 0.24468624591827393, "eval_rewards/margins_min": -0.09912735968828201, "eval_rewards/margins_std": 0.11344098299741745, "eval_rewards/rejected": 0.10008430480957031, "eval_runtime": 389.2769, "eval_samples_per_second": 5.138, "eval_steps_per_second": 0.162, "step": 2600 }, { "dpo_losses": 0.6520320773124695, "epoch": 0.68, "grad_norm": 9.598610507226221, "learning_rate": 1.3807248417879894e-07, "logits/chosen": -2.7677581310272217, "logits/rejected": -2.6992528438568115, "logps/chosen": -243.98825073242188, "logps/rejected": -180.00013732910156, "loss": 0.6777, "positive_losses": 0.11720962822437286, "rewards/accuracies": 0.75, "rewards/chosen": 0.1732184886932373, "rewards/margins": 0.08967794477939606, "rewards/margins_max": 0.27268117666244507, "rewards/margins_min": -0.042021073400974274, "rewards/margins_std": 0.14438588917255402, "rewards/rejected": 0.08354054391384125, "step": 2610 }, { "dpo_losses": 0.6666117906570435, "epoch": 0.69, "grad_norm": 6.692568560574354, "learning_rate": 1.3603446777675665e-07, "logits/chosen": -2.6856884956359863, "logits/rejected": -2.6961658000946045, "logps/chosen": -234.836181640625, "logps/rejected": -251.9840087890625, "loss": 0.6944, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.16049371659755707, "rewards/margins": 0.05744044855237007, "rewards/margins_max": 0.19652999937534332, "rewards/margins_min": -0.07018232345581055, "rewards/margins_std": 0.11920342594385147, "rewards/rejected": 0.1030532717704773, "step": 2620 }, { "dpo_losses": 0.6739298105239868, "epoch": 0.69, "grad_norm": 2.7563088616208113, "learning_rate": 1.3400596746385814e-07, "logits/chosen": -2.7988457679748535, "logits/rejected": -2.8075976371765137, "logps/chosen": -243.51730346679688, "logps/rejected": -238.9121551513672, "loss": 0.6832, "positive_losses": 0.37547796964645386, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13830161094665527, "rewards/margins": 0.04181267321109772, "rewards/margins_max": 0.17611472308635712, "rewards/margins_min": -0.087108314037323, "rewards/margins_std": 0.11464836448431015, "rewards/rejected": 0.09648893028497696, "step": 2630 }, { "dpo_losses": 0.6748381853103638, "epoch": 0.69, "grad_norm": 15.581915207796786, "learning_rate": 1.3198715261929586e-07, "logits/chosen": -2.8534445762634277, "logits/rejected": -2.8705265522003174, "logps/chosen": -301.2350769042969, "logps/rejected": -282.84124755859375, "loss": 0.6904, "positive_losses": 0.09905795753002167, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1465148627758026, "rewards/margins": 0.041211504489183426, "rewards/margins_max": 0.20768475532531738, "rewards/margins_min": -0.06541645526885986, "rewards/margins_std": 0.1251172572374344, "rewards/rejected": 0.10530336201190948, "step": 2640 }, { "dpo_losses": 0.6479775309562683, "epoch": 0.69, "grad_norm": 1.7643300937257929, "learning_rate": 1.299781918135282e-07, "logits/chosen": -2.8581717014312744, "logits/rejected": -2.779258966445923, "logps/chosen": -227.37393188476562, "logps/rejected": -206.65213012695312, "loss": 0.6719, "positive_losses": 0.15762968361377716, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1743009388446808, "rewards/margins": 0.09570324420928955, "rewards/margins_max": 0.21518942713737488, "rewards/margins_min": -0.016625795513391495, "rewards/margins_std": 0.1057358831167221, "rewards/rejected": 0.07859767973423004, "step": 2650 }, { "dpo_losses": 0.6573314070701599, "epoch": 0.7, "grad_norm": 10.786175561891858, "learning_rate": 1.279792527942045e-07, "logits/chosen": -2.8398871421813965, "logits/rejected": -2.837266206741333, "logps/chosen": -312.1396484375, "logps/rejected": -241.70187377929688, "loss": 0.6739, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17937134206295013, "rewards/margins": 0.07706346362829208, "rewards/margins_max": 0.17430905997753143, "rewards/margins_min": -0.03193662688136101, "rewards/margins_std": 0.09627391397953033, "rewards/rejected": 0.10230787843465805, "step": 2660 }, { "dpo_losses": 0.6637741923332214, "epoch": 0.7, "grad_norm": 5.670050041072099, "learning_rate": 1.259905024721576e-07, "logits/chosen": -2.8783836364746094, "logits/rejected": -2.7891175746917725, "logps/chosen": -275.72515869140625, "logps/rejected": -226.322265625, "loss": 0.6759, "positive_losses": 0.051831819117069244, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13656029105186462, "rewards/margins": 0.06270195543766022, "rewards/margins_max": 0.17947831749916077, "rewards/margins_min": -0.04513033479452133, "rewards/margins_std": 0.10144752264022827, "rewards/rejected": 0.07385829836130142, "step": 2670 }, { "dpo_losses": 0.6624875068664551, "epoch": 0.7, "grad_norm": 9.926135588667393, "learning_rate": 1.2401210690746703e-07, "logits/chosen": -2.85262131690979, "logits/rejected": -2.817472457885742, "logps/chosen": -333.7018127441406, "logps/rejected": -368.96319580078125, "loss": 0.6893, "positive_losses": 0.582872748374939, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1505921632051468, "rewards/margins": 0.06744848191738129, "rewards/margins_max": 0.22661364078521729, "rewards/margins_min": -0.09928081184625626, "rewards/margins_std": 0.13975416123867035, "rewards/rejected": 0.08314366638660431, "step": 2680 }, { "dpo_losses": 0.6623440980911255, "epoch": 0.7, "grad_norm": 2.2882187886567933, "learning_rate": 1.2204423129559305e-07, "logits/chosen": -2.705098867416382, "logits/rejected": -2.6861870288848877, "logps/chosen": -183.91455078125, "logps/rejected": -223.2926025390625, "loss": 0.68, "positive_losses": 0.22965507209300995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1502687633037567, "rewards/margins": 0.06848227977752686, "rewards/margins_max": 0.20223240554332733, "rewards/margins_min": -0.08552606403827667, "rewards/margins_std": 0.12745562195777893, "rewards/rejected": 0.08178650587797165, "step": 2690 }, { "dpo_losses": 0.6769312620162964, "epoch": 0.71, "grad_norm": 13.211990251334853, "learning_rate": 1.2008703995358299e-07, "logits/chosen": -2.7353789806365967, "logits/rejected": -2.6951496601104736, "logps/chosen": -232.9313507080078, "logps/rejected": -224.88162231445312, "loss": 0.683, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14097455143928528, "rewards/margins": 0.03472239524126053, "rewards/margins_max": 0.12374613434076309, "rewards/margins_min": -0.053682826459407806, "rewards/margins_std": 0.08197776973247528, "rewards/rejected": 0.10625217854976654, "step": 2700 }, { "epoch": 0.71, "eval_dpo_losses": 0.6639883518218994, "eval_logits/chosen": -2.782484769821167, "eval_logits/rejected": -2.7438457012176514, "eval_logps/chosen": -268.0138244628906, "eval_logps/rejected": -251.9336395263672, "eval_loss": 0.6799085736274719, "eval_positive_losses": 0.10900817066431046, "eval_rewards/accuracies": 0.7242063283920288, "eval_rewards/chosen": 0.1648109257221222, "eval_rewards/margins": 0.0626538097858429, "eval_rewards/margins_max": 0.2421714812517166, "eval_rewards/margins_min": -0.09798896312713623, "eval_rewards/margins_std": 0.11240836977958679, "eval_rewards/rejected": 0.1021571010351181, "eval_runtime": 389.6711, "eval_samples_per_second": 5.133, "eval_steps_per_second": 0.162, "step": 2700 }, { "dpo_losses": 0.6747549772262573, "epoch": 0.71, "grad_norm": 2.0042003393483747, "learning_rate": 1.1814069630635068e-07, "logits/chosen": -2.9221720695495605, "logits/rejected": -2.870772123336792, "logps/chosen": -270.75146484375, "logps/rejected": -264.449951171875, "loss": 0.6896, "positive_losses": 0.2701377868652344, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.16046664118766785, "rewards/margins": 0.039906375110149384, "rewards/margins_max": 0.15826861560344696, "rewards/margins_min": -0.06516039371490479, "rewards/margins_std": 0.10186408460140228, "rewards/rejected": 0.12056026607751846, "step": 2710 }, { "dpo_losses": 0.6686604619026184, "epoch": 0.71, "grad_norm": 2.744524446300366, "learning_rate": 1.1620536287303051e-07, "logits/chosen": -2.77477765083313, "logits/rejected": -2.7189297676086426, "logps/chosen": -204.58444213867188, "logps/rejected": -225.67245483398438, "loss": 0.6675, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16270171105861664, "rewards/margins": 0.05269937589764595, "rewards/margins_max": 0.13205501437187195, "rewards/margins_min": -0.06587193161249161, "rewards/margins_std": 0.09286411851644516, "rewards/rejected": 0.11000235378742218, "step": 2720 }, { "dpo_losses": 0.6740443706512451, "epoch": 0.71, "grad_norm": 2.114891130066726, "learning_rate": 1.1428120125340716e-07, "logits/chosen": -2.720109462738037, "logits/rejected": -2.7368974685668945, "logps/chosen": -315.4144592285156, "logps/rejected": -291.7226867675781, "loss": 0.6874, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1961982250213623, "rewards/margins": 0.04646676033735275, "rewards/margins_max": 0.23910515010356903, "rewards/margins_min": -0.11346033960580826, "rewards/margins_std": 0.1614883840084076, "rewards/rejected": 0.14973145723342896, "step": 2730 }, { "dpo_losses": 0.6759941577911377, "epoch": 0.72, "grad_norm": 9.69964507721924, "learning_rate": 1.123683721144223e-07, "logits/chosen": -2.813666343688965, "logits/rejected": -2.8336429595947266, "logps/chosen": -207.52584838867188, "logps/rejected": -249.69692993164062, "loss": 0.6861, "positive_losses": 0.02162322960793972, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1626010537147522, "rewards/margins": 0.03769798204302788, "rewards/margins_max": 0.16509708762168884, "rewards/margins_min": -0.07659967988729477, "rewards/margins_std": 0.10486801713705063, "rewards/rejected": 0.12490306049585342, "step": 2740 }, { "dpo_losses": 0.6610895395278931, "epoch": 0.72, "grad_norm": 10.986869604711686, "learning_rate": 1.1046703517675845e-07, "logits/chosen": -2.8068175315856934, "logits/rejected": -2.7834575176239014, "logps/chosen": -257.51422119140625, "logps/rejected": -221.0384979248047, "loss": 0.6955, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1787460744380951, "rewards/margins": 0.06879962235689163, "rewards/margins_max": 0.17229793965816498, "rewards/margins_min": -0.027008920907974243, "rewards/margins_std": 0.09230650961399078, "rewards/rejected": 0.10994645208120346, "step": 2750 }, { "dpo_losses": 0.6526534557342529, "epoch": 0.72, "grad_norm": 10.098201372495632, "learning_rate": 1.085773492015028e-07, "logits/chosen": -2.8394532203674316, "logits/rejected": -2.745370388031006, "logps/chosen": -351.73455810546875, "logps/rejected": -284.814208984375, "loss": 0.6655, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.18526779115200043, "rewards/margins": 0.08610363304615021, "rewards/margins_max": 0.20476289093494415, "rewards/margins_min": -0.028496265411376953, "rewards/margins_std": 0.10529184341430664, "rewards/rejected": 0.09916415065526962, "step": 2760 }, { "dpo_losses": 0.6613882780075073, "epoch": 0.72, "grad_norm": 1.9894765322581731, "learning_rate": 1.0669947197689033e-07, "logits/chosen": -2.9060561656951904, "logits/rejected": -2.8399486541748047, "logps/chosen": -290.3299560546875, "logps/rejected": -259.8872985839844, "loss": 0.693, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16502395272254944, "rewards/margins": 0.06983888149261475, "rewards/margins_max": 0.2175408899784088, "rewards/margins_min": -0.08632582426071167, "rewards/margins_std": 0.13449755311012268, "rewards/rejected": 0.0951850563287735, "step": 2770 }, { "dpo_losses": 0.6637939214706421, "epoch": 0.73, "grad_norm": 1.693095361528334, "learning_rate": 1.048335603051291e-07, "logits/chosen": -2.7786593437194824, "logits/rejected": -2.721432685852051, "logps/chosen": -325.7188720703125, "logps/rejected": -225.9957275390625, "loss": 0.6761, "positive_losses": 0.10735664516687393, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17629331350326538, "rewards/margins": 0.06277813762426376, "rewards/margins_max": 0.18301381170749664, "rewards/margins_min": -0.04662441462278366, "rewards/margins_std": 0.10360904037952423, "rewards/rejected": 0.11351517587900162, "step": 2780 }, { "dpo_losses": 0.6610652804374695, "epoch": 0.73, "grad_norm": 1.5668810650373743, "learning_rate": 1.0297976998930663e-07, "logits/chosen": -2.918388843536377, "logits/rejected": -2.8212196826934814, "logps/chosen": -342.0752868652344, "logps/rejected": -275.6966857910156, "loss": 0.6758, "positive_losses": 0.0019851685501635075, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19239678978919983, "rewards/margins": 0.06978907436132431, "rewards/margins_max": 0.19509926438331604, "rewards/margins_min": -0.04050639644265175, "rewards/margins_std": 0.10559085756540298, "rewards/rejected": 0.12260772287845612, "step": 2790 }, { "dpo_losses": 0.6732099652290344, "epoch": 0.73, "grad_norm": 11.168298023302597, "learning_rate": 1.0113825582038077e-07, "logits/chosen": -2.82181978225708, "logits/rejected": -2.7915401458740234, "logps/chosen": -226.28927612304688, "logps/rejected": -222.4562225341797, "loss": 0.6785, "positive_losses": 0.1656135618686676, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1473783701658249, "rewards/margins": 0.04381309077143669, "rewards/margins_max": 0.18773579597473145, "rewards/margins_min": -0.08167463541030884, "rewards/margins_std": 0.12001528590917587, "rewards/rejected": 0.1035652607679367, "step": 2800 }, { "epoch": 0.73, "eval_dpo_losses": 0.6633877158164978, "eval_logits/chosen": -2.7828617095947266, "eval_logits/rejected": -2.74420428276062, "eval_logps/chosen": -268.234130859375, "eval_logps/rejected": -252.28933715820312, "eval_loss": 0.6808531284332275, "eval_positive_losses": 0.11935018002986908, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": 0.16260772943496704, "eval_rewards/margins": 0.06400728970766068, "eval_rewards/margins_max": 0.2456274777650833, "eval_rewards/margins_min": -0.10007373243570328, "eval_rewards/margins_std": 0.11423919349908829, "eval_rewards/rejected": 0.09860043972730637, "eval_runtime": 391.2169, "eval_samples_per_second": 5.112, "eval_steps_per_second": 0.161, "step": 2800 }, { "dpo_losses": 0.6562505960464478, "epoch": 0.74, "grad_norm": 2.6651011481065163, "learning_rate": 9.930917156425475e-08, "logits/chosen": -2.7254879474639893, "logits/rejected": -2.7593536376953125, "logps/chosen": -224.43960571289062, "logps/rejected": -202.2880401611328, "loss": 0.6673, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.16689622402191162, "rewards/margins": 0.08031634986400604, "rewards/margins_max": 0.19498199224472046, "rewards/margins_min": -0.03201249986886978, "rewards/margins_std": 0.10381577908992767, "rewards/rejected": 0.08657988160848618, "step": 2810 }, { "dpo_losses": 0.6722387075424194, "epoch": 0.74, "grad_norm": 2.395446900978545, "learning_rate": 9.749266994893754e-08, "logits/chosen": -2.7374165058135986, "logits/rejected": -2.7058839797973633, "logps/chosen": -225.48974609375, "logps/rejected": -256.9203186035156, "loss": 0.6653, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.15854783356189728, "rewards/margins": 0.04549305886030197, "rewards/margins_max": 0.17629720270633698, "rewards/margins_min": -0.06231803819537163, "rewards/margins_std": 0.10453619807958603, "rewards/rejected": 0.1130547747015953, "step": 2820 }, { "dpo_losses": 0.6781406998634338, "epoch": 0.74, "grad_norm": 2.149688245659782, "learning_rate": 9.568890265179128e-08, "logits/chosen": -2.719682216644287, "logits/rejected": -2.696131467819214, "logps/chosen": -271.04443359375, "logps/rejected": -413.84649658203125, "loss": 0.6691, "positive_losses": 0.12154903262853622, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13876976072788239, "rewards/margins": 0.0326072983443737, "rewards/margins_max": 0.1310979127883911, "rewards/margins_min": -0.08175252377986908, "rewards/margins_std": 0.09425903856754303, "rewards/rejected": 0.10616246610879898, "step": 2830 }, { "dpo_losses": 0.6699537634849548, "epoch": 0.74, "grad_norm": 1.9474233182763336, "learning_rate": 9.389802028686616e-08, "logits/chosen": -2.719886302947998, "logits/rejected": -2.709803819656372, "logps/chosen": -194.5177001953125, "logps/rejected": -175.28614807128906, "loss": 0.6668, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.14552441239356995, "rewards/margins": 0.04965772479772568, "rewards/margins_max": 0.16322022676467896, "rewards/margins_min": -0.06379050016403198, "rewards/margins_std": 0.10277913510799408, "rewards/rejected": 0.09586669504642487, "step": 2840 }, { "dpo_losses": 0.6644850969314575, "epoch": 0.75, "grad_norm": 2.927693494596595, "learning_rate": 9.212017239232426e-08, "logits/chosen": -2.8066647052764893, "logits/rejected": -2.832144260406494, "logps/chosen": -265.63226318359375, "logps/rejected": -243.2608184814453, "loss": 0.6782, "positive_losses": 0.06858177483081818, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15173125267028809, "rewards/margins": 0.06289365887641907, "rewards/margins_max": 0.1837751567363739, "rewards/margins_min": -0.12117429822683334, "rewards/margins_std": 0.13747075200080872, "rewards/rejected": 0.08883760124444962, "step": 2850 }, { "dpo_losses": 0.6735584735870361, "epoch": 0.75, "grad_norm": 2.2505464949789444, "learning_rate": 9.035550741795328e-08, "logits/chosen": -2.802375316619873, "logits/rejected": -2.763209819793701, "logps/chosen": -247.94754028320312, "logps/rejected": -231.13558959960938, "loss": 0.6617, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15921562910079956, "rewards/margins": 0.04222042113542557, "rewards/margins_max": 0.14945976436138153, "rewards/margins_min": -0.07940498739480972, "rewards/margins_std": 0.10080856084823608, "rewards/rejected": 0.1169952005147934, "step": 2860 }, { "dpo_losses": 0.6567317247390747, "epoch": 0.75, "grad_norm": 2.502330647320227, "learning_rate": 8.860417271277065e-08, "logits/chosen": -2.6637213230133057, "logits/rejected": -2.6016478538513184, "logps/chosen": -274.00079345703125, "logps/rejected": -253.66122436523438, "loss": 0.6672, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1512971669435501, "rewards/margins": 0.07740378379821777, "rewards/margins_max": 0.19368486106395721, "rewards/margins_min": -0.03223549947142601, "rewards/margins_std": 0.10386307537555695, "rewards/rejected": 0.07389337569475174, "step": 2870 }, { "dpo_losses": 0.6591945290565491, "epoch": 0.75, "grad_norm": 6.093505541470816, "learning_rate": 8.686631451272029e-08, "logits/chosen": -2.8315553665161133, "logits/rejected": -2.7398500442504883, "logps/chosen": -287.47442626953125, "logps/rejected": -230.60415649414062, "loss": 0.6897, "positive_losses": 0.5427009463310242, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14553631842136383, "rewards/margins": 0.0738452821969986, "rewards/margins_max": 0.2060689479112625, "rewards/margins_min": -0.03878684341907501, "rewards/margins_std": 0.10930664837360382, "rewards/rejected": 0.07169099897146225, "step": 2880 }, { "dpo_losses": 0.6751716732978821, "epoch": 0.76, "grad_norm": 112.45037968199574, "learning_rate": 8.514207792846168e-08, "logits/chosen": -2.7375433444976807, "logits/rejected": -2.722531795501709, "logps/chosen": -269.4003601074219, "logps/rejected": -297.16485595703125, "loss": 0.6822, "positive_losses": 0.5430085062980652, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1424095630645752, "rewards/margins": 0.04013755917549133, "rewards/margins_max": 0.16956102848052979, "rewards/margins_min": -0.08579131960868835, "rewards/margins_std": 0.11141952127218246, "rewards/rejected": 0.10227201133966446, "step": 2890 }, { "dpo_losses": 0.6576107144355774, "epoch": 0.76, "grad_norm": 14.170585060505838, "learning_rate": 8.343160693325355e-08, "logits/chosen": -2.8384392261505127, "logits/rejected": -2.801820993423462, "logps/chosen": -288.8127136230469, "logps/rejected": -245.9336700439453, "loss": 0.6804, "positive_losses": 0.13667449355125427, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.15546861290931702, "rewards/margins": 0.07626311480998993, "rewards/margins_max": 0.18484382331371307, "rewards/margins_min": -0.040147729218006134, "rewards/margins_std": 0.10086791217327118, "rewards/rejected": 0.07920549809932709, "step": 2900 }, { "epoch": 0.76, "eval_dpo_losses": 0.6628540754318237, "eval_logits/chosen": -2.7846555709838867, "eval_logits/rejected": -2.7461493015289307, "eval_logps/chosen": -268.41455078125, "eval_logps/rejected": -252.5924835205078, "eval_loss": 0.6822347640991211, "eval_positive_losses": 0.13458429276943207, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": 0.16080327332019806, "eval_rewards/margins": 0.06523434817790985, "eval_rewards/margins_max": 0.2495136708021164, "eval_rewards/margins_min": -0.10225551575422287, "eval_rewards/margins_std": 0.11622842401266098, "eval_rewards/rejected": 0.0955689400434494, "eval_runtime": 389.5641, "eval_samples_per_second": 5.134, "eval_steps_per_second": 0.162, "step": 2900 }, { "dpo_losses": 0.6652017831802368, "epoch": 0.76, "grad_norm": 11.459882337888212, "learning_rate": 8.173504435093173e-08, "logits/chosen": -2.7680153846740723, "logits/rejected": -2.692370891571045, "logps/chosen": -260.47747802734375, "logps/rejected": -214.68319702148438, "loss": 0.6668, "positive_losses": 0.044872283935546875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15751729905605316, "rewards/margins": 0.05986157804727554, "rewards/margins_max": 0.18254561722278595, "rewards/margins_min": -0.04402995854616165, "rewards/margins_std": 0.10341081768274307, "rewards/rejected": 0.09765572845935822, "step": 2910 }, { "dpo_losses": 0.6544531583786011, "epoch": 0.76, "grad_norm": 20.25977532189708, "learning_rate": 8.005253184398359e-08, "logits/chosen": -2.7313950061798096, "logits/rejected": -2.6855309009552, "logps/chosen": -265.0918884277344, "logps/rejected": -204.74412536621094, "loss": 0.6786, "positive_losses": 0.5254768133163452, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16128066182136536, "rewards/margins": 0.08340667188167572, "rewards/margins_max": 0.2493068277835846, "rewards/margins_min": -0.04549757391214371, "rewards/margins_std": 0.12967939674854279, "rewards/rejected": 0.07787398993968964, "step": 2920 }, { "dpo_losses": 0.6423149108886719, "epoch": 0.77, "grad_norm": 15.548518875088533, "learning_rate": 7.838420990171926e-08, "logits/chosen": -2.730849504470825, "logits/rejected": -2.702017068862915, "logps/chosen": -290.5140380859375, "logps/rejected": -235.5591583251953, "loss": 0.6799, "positive_losses": 0.0, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.19081711769104004, "rewards/margins": 0.10711536556482315, "rewards/margins_max": 0.2387050837278366, "rewards/margins_min": -0.004120032303035259, "rewards/margins_std": 0.10651262104511261, "rewards/rejected": 0.08370174467563629, "step": 2930 }, { "dpo_losses": 0.6588220596313477, "epoch": 0.77, "grad_norm": 1.7419158313182692, "learning_rate": 7.673021782854083e-08, "logits/chosen": -2.8104755878448486, "logits/rejected": -2.8491647243499756, "logps/chosen": -283.4486083984375, "logps/rejected": -260.8815612792969, "loss": 0.6847, "positive_losses": 0.36250799894332886, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.171676367521286, "rewards/margins": 0.07349254935979843, "rewards/margins_max": 0.22684898972511292, "rewards/margins_min": -0.038581203669309616, "rewards/margins_std": 0.117698073387146, "rewards/rejected": 0.09818382561206818, "step": 2940 }, { "dpo_losses": 0.6470920443534851, "epoch": 0.77, "grad_norm": 20.281353072975755, "learning_rate": 7.509069373231039e-08, "logits/chosen": -2.9167816638946533, "logits/rejected": -2.873137950897217, "logps/chosen": -268.0244445800781, "logps/rejected": -281.0406188964844, "loss": 0.6973, "positive_losses": 0.5683601498603821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1619308590888977, "rewards/margins": 0.09898541867733002, "rewards/margins_max": 0.23728612065315247, "rewards/margins_min": -0.035527534782886505, "rewards/margins_std": 0.11940480768680573, "rewards/rejected": 0.06294544041156769, "step": 2950 }, { "dpo_losses": 0.6785377264022827, "epoch": 0.77, "grad_norm": 1.9344681429277137, "learning_rate": 7.346577451281821e-08, "logits/chosen": -2.729431629180908, "logits/rejected": -2.7387375831604004, "logps/chosen": -242.9123992919922, "logps/rejected": -236.4193878173828, "loss": 0.667, "positive_losses": 0.1447097808122635, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14629048109054565, "rewards/margins": 0.034385181963443756, "rewards/margins_max": 0.18973104655742645, "rewards/margins_min": -0.11912697553634644, "rewards/margins_std": 0.13839221000671387, "rewards/rejected": 0.11190527677536011, "step": 2960 }, { "dpo_losses": 0.6690041422843933, "epoch": 0.78, "grad_norm": 8.081258553414122, "learning_rate": 7.185559585035136e-08, "logits/chosen": -2.8309693336486816, "logits/rejected": -2.8083622455596924, "logps/chosen": -219.78732299804688, "logps/rejected": -195.7440643310547, "loss": 0.6817, "positive_losses": 0.05683441087603569, "rewards/accuracies": 0.625, "rewards/chosen": 0.14709046483039856, "rewards/margins": 0.05245544761419296, "rewards/margins_max": 0.172337144613266, "rewards/margins_min": -0.05212901905179024, "rewards/margins_std": 0.10182130336761475, "rewards/rejected": 0.0946350246667862, "step": 2970 }, { "dpo_losses": 0.6723123788833618, "epoch": 0.78, "grad_norm": 2.180158448016903, "learning_rate": 7.026029219436502e-08, "logits/chosen": -2.7011642456054688, "logits/rejected": -2.672740936279297, "logps/chosen": -292.22528076171875, "logps/rejected": -297.1006164550781, "loss": 0.6716, "positive_losses": 0.20120945572853088, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.15260522067546844, "rewards/margins": 0.04637325555086136, "rewards/margins_max": 0.19567745923995972, "rewards/margins_min": -0.10428061336278915, "rewards/margins_std": 0.1315806806087494, "rewards/rejected": 0.10623196512460709, "step": 2980 }, { "dpo_losses": 0.6591792106628418, "epoch": 0.78, "grad_norm": 2.4447852593104873, "learning_rate": 6.867999675225522e-08, "logits/chosen": -2.7815351486206055, "logits/rejected": -2.7108778953552246, "logps/chosen": -291.34375, "logps/rejected": -299.01239013671875, "loss": 0.6753, "positive_losses": 0.1278236359357834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1643422693014145, "rewards/margins": 0.0730084627866745, "rewards/margins_max": 0.2041645050048828, "rewards/margins_min": -0.034510623663663864, "rewards/margins_std": 0.11097339540719986, "rewards/rejected": 0.09133382886648178, "step": 2990 }, { "dpo_losses": 0.6654072999954224, "epoch": 0.79, "grad_norm": 2.581985542481455, "learning_rate": 6.711484147823662e-08, "logits/chosen": -2.7847719192504883, "logits/rejected": -2.7462828159332275, "logps/chosen": -224.4231719970703, "logps/rejected": -255.90652465820312, "loss": 0.6741, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16300776600837708, "rewards/margins": 0.05929435044527054, "rewards/margins_max": 0.18764983117580414, "rewards/margins_min": -0.046713314950466156, "rewards/margins_std": 0.10569945722818375, "rewards/rejected": 0.10371343791484833, "step": 3000 }, { "epoch": 0.79, "eval_dpo_losses": 0.6630541682243347, "eval_logits/chosen": -2.7844808101654053, "eval_logits/rejected": -2.7461202144622803, "eval_logps/chosen": -268.1099548339844, "eval_logps/rejected": -252.24093627929688, "eval_loss": 0.6808385252952576, "eval_positive_losses": 0.1180451363325119, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": 0.16384930908679962, "eval_rewards/margins": 0.06476480513811111, "eval_rewards/margins_max": 0.2480427622795105, "eval_rewards/margins_min": -0.10046318173408508, "eval_rewards/margins_std": 0.11531291902065277, "eval_rewards/rejected": 0.0990845113992691, "eval_runtime": 389.5682, "eval_samples_per_second": 5.134, "eval_steps_per_second": 0.162, "step": 3000 }, { "dpo_losses": 0.6563786268234253, "epoch": 0.79, "grad_norm": 2.150591737799825, "learning_rate": 6.556495706232412e-08, "logits/chosen": -2.824885368347168, "logits/rejected": -2.781872272491455, "logps/chosen": -292.45086669921875, "logps/rejected": -249.26254272460938, "loss": 0.6716, "positive_losses": 0.17349128425121307, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16372860968112946, "rewards/margins": 0.07855953276157379, "rewards/margins_max": 0.19966019690036774, "rewards/margins_min": -0.03546585515141487, "rewards/margins_std": 0.10650608688592911, "rewards/rejected": 0.08516907691955566, "step": 3010 }, { "dpo_losses": 0.6640094518661499, "epoch": 0.79, "grad_norm": 6.759203252845801, "learning_rate": 6.403047291942057e-08, "logits/chosen": -2.7413415908813477, "logits/rejected": -2.7051734924316406, "logps/chosen": -225.3956756591797, "logps/rejected": -183.965087890625, "loss": 0.6774, "positive_losses": 0.107610322535038, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1544293761253357, "rewards/margins": 0.06344745308160782, "rewards/margins_max": 0.18341758847236633, "rewards/margins_min": -0.0742439404129982, "rewards/margins_std": 0.10965760797262192, "rewards/rejected": 0.09098193049430847, "step": 3020 }, { "dpo_losses": 0.6746552586555481, "epoch": 0.79, "grad_norm": 1.989812534837794, "learning_rate": 6.251151717851021e-08, "logits/chosen": -2.761890172958374, "logits/rejected": -2.772231340408325, "logps/chosen": -210.30636596679688, "logps/rejected": -281.67205810546875, "loss": 0.6891, "positive_losses": 0.038213349878787994, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.15557453036308289, "rewards/margins": 0.040930986404418945, "rewards/margins_max": 0.14809675514698029, "rewards/margins_min": -0.07007072120904922, "rewards/margins_std": 0.09487451612949371, "rewards/rejected": 0.11464353650808334, "step": 3030 }, { "dpo_losses": 0.6591284275054932, "epoch": 0.8, "grad_norm": 8.24123633803779, "learning_rate": 6.100821667196041e-08, "logits/chosen": -2.8116676807403564, "logits/rejected": -2.753418445587158, "logps/chosen": -283.1061706542969, "logps/rejected": -318.7002868652344, "loss": 0.6655, "positive_losses": 0.12325821071863174, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.19477012753486633, "rewards/margins": 0.07227279245853424, "rewards/margins_max": 0.18873360753059387, "rewards/margins_min": -0.058577846735715866, "rewards/margins_std": 0.111363984644413, "rewards/rejected": 0.12249733507633209, "step": 3040 }, { "dpo_losses": 0.6539517045021057, "epoch": 0.8, "grad_norm": 105.3660147785447, "learning_rate": 5.952069692493061e-08, "logits/chosen": -2.8027291297912598, "logits/rejected": -2.773585796356201, "logps/chosen": -280.40838623046875, "logps/rejected": -266.98675537109375, "loss": 0.677, "positive_losses": 0.14699554443359375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16817526519298553, "rewards/margins": 0.08418744802474976, "rewards/margins_max": 0.21738891303539276, "rewards/margins_min": -0.03661995381116867, "rewards/margins_std": 0.1140676960349083, "rewards/rejected": 0.08398783951997757, "step": 3050 }, { "dpo_losses": 0.6710882186889648, "epoch": 0.8, "grad_norm": 2.4427182997857435, "learning_rate": 5.8049082144891794e-08, "logits/chosen": -2.7883102893829346, "logits/rejected": -2.7700586318969727, "logps/chosen": -290.86627197265625, "logps/rejected": -272.36419677734375, "loss": 0.6659, "positive_losses": 0.049817658960819244, "rewards/accuracies": 0.625, "rewards/chosen": 0.15865769982337952, "rewards/margins": 0.04813426360487938, "rewards/margins_max": 0.17915073037147522, "rewards/margins_min": -0.06630988419055939, "rewards/margins_std": 0.11496686935424805, "rewards/rejected": 0.11052343994379044, "step": 3060 }, { "dpo_losses": 0.6638559103012085, "epoch": 0.8, "grad_norm": 12.267114011928014, "learning_rate": 5.659349521125459e-08, "logits/chosen": -2.6516406536102295, "logits/rejected": -2.6265311241149902, "logps/chosen": -234.54946899414062, "logps/rejected": -211.96896362304688, "loss": 0.6734, "positive_losses": 0.022372817620635033, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1678735613822937, "rewards/margins": 0.06170881539583206, "rewards/margins_max": 0.16219031810760498, "rewards/margins_min": -0.03331523388624191, "rewards/margins_std": 0.08575156331062317, "rewards/rejected": 0.10616473853588104, "step": 3070 }, { "dpo_losses": 0.6670681834220886, "epoch": 0.81, "grad_norm": 7.370967473746132, "learning_rate": 5.5154057665109e-08, "logits/chosen": -2.777247190475464, "logits/rejected": -2.767956495285034, "logps/chosen": -224.2619171142578, "logps/rejected": -257.23052978515625, "loss": 0.6841, "positive_losses": 0.1429794281721115, "rewards/accuracies": 0.625, "rewards/chosen": 0.15937839448451996, "rewards/margins": 0.05837244912981987, "rewards/margins_max": 0.1938154101371765, "rewards/margins_min": -0.0703662633895874, "rewards/margins_std": 0.11851917207241058, "rewards/rejected": 0.10100595653057098, "step": 3080 }, { "dpo_losses": 0.6655889749526978, "epoch": 0.81, "grad_norm": 1.9205130736947094, "learning_rate": 5.3730889699075853e-08, "logits/chosen": -2.8654658794403076, "logits/rejected": -2.750743865966797, "logps/chosen": -295.4491271972656, "logps/rejected": -248.1079864501953, "loss": 0.6867, "positive_losses": 0.30791252851486206, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1357293426990509, "rewards/margins": 0.05936115235090256, "rewards/margins_max": 0.17789778113365173, "rewards/margins_min": -0.05033574625849724, "rewards/margins_std": 0.09862684458494186, "rewards/rejected": 0.07636817544698715, "step": 3090 }, { "dpo_losses": 0.6691070795059204, "epoch": 0.81, "grad_norm": 15.561858749966415, "learning_rate": 5.2324110147270893e-08, "logits/chosen": -2.8081393241882324, "logits/rejected": -2.7991766929626465, "logps/chosen": -278.1468505859375, "logps/rejected": -275.0913391113281, "loss": 0.6856, "positive_losses": 0.03154640272259712, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17184340953826904, "rewards/margins": 0.05207213759422302, "rewards/margins_max": 0.1620274782180786, "rewards/margins_min": -0.06637358665466309, "rewards/margins_std": 0.10301927477121353, "rewards/rejected": 0.11977127939462662, "step": 3100 }, { "epoch": 0.81, "eval_dpo_losses": 0.6627760529518127, "eval_logits/chosen": -2.7823197841644287, "eval_logits/rejected": -2.743809223175049, "eval_logps/chosen": -268.22344970703125, "eval_logps/rejected": -252.41836547851562, "eval_loss": 0.6812266707420349, "eval_positive_losses": 0.12757235765457153, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": 0.162714421749115, "eval_rewards/margins": 0.06540438532829285, "eval_rewards/margins_max": 0.2497921884059906, "eval_rewards/margins_min": -0.10203025490045547, "eval_rewards/margins_std": 0.1164335235953331, "eval_rewards/rejected": 0.09731005132198334, "eval_runtime": 390.3466, "eval_samples_per_second": 5.124, "eval_steps_per_second": 0.161, "step": 3100 }, { "dpo_losses": 0.6689059138298035, "epoch": 0.81, "grad_norm": 2.2747325027317773, "learning_rate": 5.0933836475381795e-08, "logits/chosen": -2.811298131942749, "logits/rejected": -2.7352309226989746, "logps/chosen": -325.28240966796875, "logps/rejected": -284.04608154296875, "loss": 0.6792, "positive_losses": 0.2840765118598938, "rewards/accuracies": 0.625, "rewards/chosen": 0.16880102455615997, "rewards/margins": 0.05320410802960396, "rewards/margins_max": 0.20439854264259338, "rewards/margins_min": -0.06459472328424454, "rewards/margins_std": 0.12115363031625748, "rewards/rejected": 0.11559691280126572, "step": 3110 }, { "dpo_losses": 0.6610020995140076, "epoch": 0.82, "grad_norm": 1.9238160418600554, "learning_rate": 4.956018477086005e-08, "logits/chosen": -2.8051600456237793, "logits/rejected": -2.7575900554656982, "logps/chosen": -258.21368408203125, "logps/rejected": -248.81930541992188, "loss": 0.6664, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17074163258075714, "rewards/margins": 0.06870418787002563, "rewards/margins_max": 0.1845478117465973, "rewards/margins_min": -0.056457966566085815, "rewards/margins_std": 0.10763120651245117, "rewards/rejected": 0.10203742980957031, "step": 3120 }, { "dpo_losses": 0.6620965003967285, "epoch": 0.82, "grad_norm": 1.8853223857876538, "learning_rate": 4.820326973322763e-08, "logits/chosen": -2.7807884216308594, "logits/rejected": -2.77821683883667, "logps/chosen": -241.8834686279297, "logps/rejected": -221.46566772460938, "loss": 0.6621, "positive_losses": 0.169139102101326, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.169694721698761, "rewards/margins": 0.06687848269939423, "rewards/margins_max": 0.2040960043668747, "rewards/margins_min": -0.06565778702497482, "rewards/margins_std": 0.12235186249017715, "rewards/rejected": 0.10281624644994736, "step": 3130 }, { "dpo_losses": 0.6488478779792786, "epoch": 0.82, "grad_norm": 2.114513665415584, "learning_rate": 4.686320466449981e-08, "logits/chosen": -2.6740877628326416, "logits/rejected": -2.723538875579834, "logps/chosen": -207.9197235107422, "logps/rejected": -207.06576538085938, "loss": 0.6765, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16196046769618988, "rewards/margins": 0.09336929768323898, "rewards/margins_max": 0.211838960647583, "rewards/margins_min": -0.024275779724121094, "rewards/margins_std": 0.10504420846700668, "rewards/rejected": 0.0685911625623703, "step": 3140 }, { "dpo_losses": 0.6786141991615295, "epoch": 0.82, "grad_norm": 2.2968035836048664, "learning_rate": 4.554010145972417e-08, "logits/chosen": -2.783653736114502, "logits/rejected": -2.8021936416625977, "logps/chosen": -274.2801208496094, "logps/rejected": -278.07366943359375, "loss": 0.6777, "positive_losses": 0.1276847869157791, "rewards/accuracies": 0.625, "rewards/chosen": 0.15860846638679504, "rewards/margins": 0.033648617565631866, "rewards/margins_max": 0.18964725732803345, "rewards/margins_min": -0.11228775978088379, "rewards/margins_std": 0.13103614747524261, "rewards/rejected": 0.12495984137058258, "step": 3150 }, { "dpo_losses": 0.6525992155075073, "epoch": 0.83, "grad_norm": 6.062725888792658, "learning_rate": 4.423407059763745e-08, "logits/chosen": -2.8857107162475586, "logits/rejected": -2.815931558609009, "logps/chosen": -311.3929443359375, "logps/rejected": -211.2099609375, "loss": 0.6679, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.18596258759498596, "rewards/margins": 0.08752071857452393, "rewards/margins_max": 0.2264481484889984, "rewards/margins_min": -0.046439796686172485, "rewards/margins_std": 0.12269928306341171, "rewards/rejected": 0.09844187647104263, "step": 3160 }, { "dpo_losses": 0.6678873896598816, "epoch": 0.83, "grad_norm": 9.53360466559852, "learning_rate": 4.294522113144078e-08, "logits/chosen": -2.8117034435272217, "logits/rejected": -2.791072368621826, "logps/chosen": -299.37103271484375, "logps/rejected": -281.37615966796875, "loss": 0.6766, "positive_losses": 0.3987411558628082, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1582927405834198, "rewards/margins": 0.05428671836853027, "rewards/margins_max": 0.17796790599822998, "rewards/margins_min": -0.06260037422180176, "rewards/margins_std": 0.11026357114315033, "rewards/rejected": 0.10400601476430893, "step": 3170 }, { "dpo_losses": 0.6631403565406799, "epoch": 0.83, "grad_norm": 8.95303126314368, "learning_rate": 4.1673660679693804e-08, "logits/chosen": -2.6631081104278564, "logits/rejected": -2.671422243118286, "logps/chosen": -269.2955017089844, "logps/rejected": -231.2158203125, "loss": 0.6828, "positive_losses": 0.08814601600170135, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17081721127033234, "rewards/margins": 0.0644526481628418, "rewards/margins_max": 0.17851416766643524, "rewards/margins_min": -0.05811725929379463, "rewards/margins_std": 0.10786614567041397, "rewards/rejected": 0.10636456310749054, "step": 3180 }, { "dpo_losses": 0.660934567451477, "epoch": 0.83, "grad_norm": 1.9643955819256005, "learning_rate": 4.041949541732825e-08, "logits/chosen": -2.74068021774292, "logits/rejected": -2.6296112537384033, "logps/chosen": -231.2447052001953, "logps/rejected": -222.8256072998047, "loss": 0.6751, "positive_losses": 0.3026248812675476, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1583462655544281, "rewards/margins": 0.07087197154760361, "rewards/margins_max": 0.22593124210834503, "rewards/margins_min": -0.08451506495475769, "rewards/margins_std": 0.14156809449195862, "rewards/rejected": 0.08747430145740509, "step": 3190 }, { "dpo_losses": 0.6655724048614502, "epoch": 0.84, "grad_norm": 1.8090603516961787, "learning_rate": 3.9182830066782605e-08, "logits/chosen": -2.692063570022583, "logits/rejected": -2.7058863639831543, "logps/chosen": -261.7070007324219, "logps/rejected": -234.983154296875, "loss": 0.6678, "positive_losses": 0.02298889122903347, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.144895538687706, "rewards/margins": 0.05822502821683884, "rewards/margins_max": 0.1596185564994812, "rewards/margins_min": -0.04484047740697861, "rewards/margins_std": 0.08938737213611603, "rewards/rejected": 0.08667052537202835, "step": 3200 }, { "epoch": 0.84, "eval_dpo_losses": 0.6627413630485535, "eval_logits/chosen": -2.7853500843048096, "eval_logits/rejected": -2.747154712677002, "eval_logps/chosen": -268.1344909667969, "eval_logps/rejected": -252.33535766601562, "eval_loss": 0.6808730363845825, "eval_positive_losses": 0.12440192699432373, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": 0.1636039912700653, "eval_rewards/margins": 0.06546396762132645, "eval_rewards/margins_max": 0.25000351667404175, "eval_rewards/margins_min": -0.10156844556331635, "eval_rewards/margins_std": 0.11613842099905014, "eval_rewards/rejected": 0.09814003854990005, "eval_runtime": 389.6112, "eval_samples_per_second": 5.133, "eval_steps_per_second": 0.162, "step": 3200 }, { "dpo_losses": 0.6610216498374939, "epoch": 0.84, "grad_norm": 8.077637648500255, "learning_rate": 3.79637678892577e-08, "logits/chosen": -2.8092358112335205, "logits/rejected": -2.786480188369751, "logps/chosen": -241.1995391845703, "logps/rejected": -257.9070739746094, "loss": 0.677, "positive_losses": 0.05450744554400444, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14913347363471985, "rewards/margins": 0.06916774064302444, "rewards/margins_max": 0.20991599559783936, "rewards/margins_min": -0.05254759639501572, "rewards/margins_std": 0.11959397792816162, "rewards/rejected": 0.0799657329916954, "step": 3210 }, { "dpo_losses": 0.6678776741027832, "epoch": 0.84, "grad_norm": 2.4239630773017296, "learning_rate": 3.6762410676094645e-08, "logits/chosen": -2.718482494354248, "logits/rejected": -2.672642469406128, "logps/chosen": -247.31820678710938, "logps/rejected": -254.6053009033203, "loss": 0.6796, "positive_losses": 0.03541602939367294, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1772005707025528, "rewards/margins": 0.05370795726776123, "rewards/margins_max": 0.16246002912521362, "rewards/margins_min": -0.04772583767771721, "rewards/margins_std": 0.09779927134513855, "rewards/rejected": 0.12349263578653336, "step": 3220 }, { "dpo_losses": 0.6720653772354126, "epoch": 0.85, "grad_norm": 2.523629479442598, "learning_rate": 3.557885874027497e-08, "logits/chosen": -2.775148868560791, "logits/rejected": -2.758457899093628, "logps/chosen": -300.9856872558594, "logps/rejected": -315.10015869140625, "loss": 0.6848, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14731472730636597, "rewards/margins": 0.048546768724918365, "rewards/margins_max": 0.21185681223869324, "rewards/margins_min": -0.10242996364831924, "rewards/margins_std": 0.13624873757362366, "rewards/rejected": 0.098767951130867, "step": 3230 }, { "dpo_losses": 0.6623659133911133, "epoch": 0.85, "grad_norm": 11.640982848945923, "learning_rate": 3.441321090804469e-08, "logits/chosen": -2.9063901901245117, "logits/rejected": -2.827007293701172, "logps/chosen": -249.5447998046875, "logps/rejected": -290.7762145996094, "loss": 0.6814, "positive_losses": 0.161967471241951, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1393720656633377, "rewards/margins": 0.06511013209819794, "rewards/margins_max": 0.16636310517787933, "rewards/margins_min": -0.03109910525381565, "rewards/margins_std": 0.09238677471876144, "rewards/rejected": 0.07426193356513977, "step": 3240 }, { "dpo_losses": 0.6672154068946838, "epoch": 0.85, "grad_norm": 10.181580491202146, "learning_rate": 3.326556451066234e-08, "logits/chosen": -2.848665952682495, "logits/rejected": -2.8185975551605225, "logps/chosen": -284.4683532714844, "logps/rejected": -275.9518127441406, "loss": 0.6871, "positive_losses": 0.097559355199337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15505456924438477, "rewards/margins": 0.055702775716781616, "rewards/margins_max": 0.17600347101688385, "rewards/margins_min": -0.041048236191272736, "rewards/margins_std": 0.09688388556241989, "rewards/rejected": 0.09935178607702255, "step": 3250 }, { "dpo_losses": 0.6560079455375671, "epoch": 0.85, "grad_norm": 14.361829479425445, "learning_rate": 3.2136015376271946e-08, "logits/chosen": -2.8152987957000732, "logits/rejected": -2.8128440380096436, "logps/chosen": -237.4013214111328, "logps/rejected": -266.92230224609375, "loss": 0.6727, "positive_losses": 0.08050384372472763, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.15222205221652985, "rewards/margins": 0.0787474662065506, "rewards/margins_max": 0.198978453874588, "rewards/margins_min": -0.04368770867586136, "rewards/margins_std": 0.10543539375066757, "rewards/rejected": 0.07347457110881805, "step": 3260 }, { "dpo_losses": 0.6642253994941711, "epoch": 0.86, "grad_norm": 1.8469226501824152, "learning_rate": 3.102465782190106e-08, "logits/chosen": -2.7718749046325684, "logits/rejected": -2.741318702697754, "logps/chosen": -236.1094970703125, "logps/rejected": -247.9010772705078, "loss": 0.6721, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15767602622509003, "rewards/margins": 0.06289416551589966, "rewards/margins_max": 0.21374940872192383, "rewards/margins_min": -0.06856616586446762, "rewards/margins_std": 0.12185641378164291, "rewards/rejected": 0.09478186070919037, "step": 3270 }, { "dpo_losses": 0.6692901849746704, "epoch": 0.86, "grad_norm": 2.1990584049075594, "learning_rate": 2.993158464558565e-08, "logits/chosen": -2.859236717224121, "logits/rejected": -2.7646913528442383, "logps/chosen": -198.5013427734375, "logps/rejected": -175.32022094726562, "loss": 0.6738, "positive_losses": 0.055785369127988815, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1464720368385315, "rewards/margins": 0.052502166479825974, "rewards/margins_max": 0.19133032858371735, "rewards/margins_min": -0.0839792937040329, "rewards/margins_std": 0.11933982372283936, "rewards/rejected": 0.09396988153457642, "step": 3280 }, { "dpo_losses": 0.646541953086853, "epoch": 0.86, "grad_norm": 9.628112685985919, "learning_rate": 2.8856887118621358e-08, "logits/chosen": -2.747638702392578, "logits/rejected": -2.724984645843506, "logps/chosen": -279.7018737792969, "logps/rejected": -280.64251708984375, "loss": 0.6694, "positive_losses": 0.02620544470846653, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17957869172096252, "rewards/margins": 0.09934578835964203, "rewards/margins_max": 0.22174513339996338, "rewards/margins_min": -0.04211106896400452, "rewards/margins_std": 0.11883123964071274, "rewards/rejected": 0.0802329033613205, "step": 3290 }, { "dpo_losses": 0.6547456383705139, "epoch": 0.86, "grad_norm": 13.829934556741401, "learning_rate": 2.7800654977942482e-08, "logits/chosen": -2.870971202850342, "logits/rejected": -2.828831911087036, "logps/chosen": -324.831787109375, "logps/rejected": -328.6128234863281, "loss": 0.6786, "positive_losses": 0.2483566254377365, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1950651854276657, "rewards/margins": 0.08232709765434265, "rewards/margins_max": 0.21603958308696747, "rewards/margins_min": -0.06784708052873611, "rewards/margins_std": 0.12510669231414795, "rewards/rejected": 0.11273808777332306, "step": 3300 }, { "epoch": 0.86, "eval_dpo_losses": 0.6626967787742615, "eval_logits/chosen": -2.7811217308044434, "eval_logits/rejected": -2.7425291538238525, "eval_logps/chosen": -268.10919189453125, "eval_logps/rejected": -252.32167053222656, "eval_loss": 0.6810629963874817, "eval_positive_losses": 0.12673041224479675, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": 0.16385717689990997, "eval_rewards/margins": 0.06558007746934891, "eval_rewards/margins_max": 0.2502087950706482, "eval_rewards/margins_min": -0.10188627988100052, "eval_rewards/margins_std": 0.1165127083659172, "eval_rewards/rejected": 0.09827709943056107, "eval_runtime": 389.5445, "eval_samples_per_second": 5.134, "eval_steps_per_second": 0.162, "step": 3300 }, { "dpo_losses": 0.6536380052566528, "epoch": 0.87, "grad_norm": 10.468335317113388, "learning_rate": 2.676297641862879e-08, "logits/chosen": -2.733778476715088, "logits/rejected": -2.673896551132202, "logps/chosen": -204.01513671875, "logps/rejected": -214.28353881835938, "loss": 0.6713, "positive_losses": 0.09598731994628906, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14947354793548584, "rewards/margins": 0.08431442081928253, "rewards/margins_max": 0.2124217450618744, "rewards/margins_min": -0.031432487070560455, "rewards/margins_std": 0.10755596309900284, "rewards/rejected": 0.0651591345667839, "step": 3310 }, { "dpo_losses": 0.6592745780944824, "epoch": 0.87, "grad_norm": 2.4810075309912802, "learning_rate": 2.5743938086541352e-08, "logits/chosen": -2.5734775066375732, "logits/rejected": -2.5956883430480957, "logps/chosen": -288.0391540527344, "logps/rejected": -232.1073455810547, "loss": 0.6769, "positive_losses": 0.35831108689308167, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16860854625701904, "rewards/margins": 0.0769532322883606, "rewards/margins_max": 0.2719673216342926, "rewards/margins_min": -0.0690711960196495, "rewards/margins_std": 0.15144231915473938, "rewards/rejected": 0.09165529161691666, "step": 3320 }, { "dpo_losses": 0.6756593585014343, "epoch": 0.87, "grad_norm": 11.08969308523051, "learning_rate": 2.474362507108757e-08, "logits/chosen": -2.821928024291992, "logits/rejected": -2.852421760559082, "logps/chosen": -265.82354736328125, "logps/rejected": -295.21044921875, "loss": 0.6664, "positive_losses": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.15832513570785522, "rewards/margins": 0.03733016178011894, "rewards/margins_max": 0.13323178887367249, "rewards/margins_min": -0.049654535949230194, "rewards/margins_std": 0.0812670961022377, "rewards/rejected": 0.1209949478507042, "step": 3330 }, { "dpo_losses": 0.6848273277282715, "epoch": 0.87, "grad_norm": 1.935696162097246, "learning_rate": 2.3762120898116495e-08, "logits/chosen": -2.805917501449585, "logits/rejected": -2.8301444053649902, "logps/chosen": -257.66290283203125, "logps/rejected": -310.5917053222656, "loss": 0.6745, "positive_losses": 0.20382681488990784, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.13177986443042755, "rewards/margins": 0.019999397918581963, "rewards/margins_max": 0.14217321574687958, "rewards/margins_min": -0.11165271699428558, "rewards/margins_std": 0.11129863560199738, "rewards/rejected": 0.11178047955036163, "step": 3340 }, { "dpo_losses": 0.6613301038742065, "epoch": 0.88, "grad_norm": 6.993539280993901, "learning_rate": 2.2799507522944044e-08, "logits/chosen": -2.863429546356201, "logits/rejected": -2.7525746822357178, "logps/chosen": -277.91192626953125, "logps/rejected": -247.7657928466797, "loss": 0.6798, "positive_losses": 0.2877071499824524, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16407844424247742, "rewards/margins": 0.06978114694356918, "rewards/margins_max": 0.21672868728637695, "rewards/margins_min": -0.06434138119220734, "rewards/margins_std": 0.1292802393436432, "rewards/rejected": 0.09429730474948883, "step": 3350 }, { "dpo_losses": 0.6659659147262573, "epoch": 0.88, "grad_norm": 1.7625116902498015, "learning_rate": 2.1855865323510054e-08, "logits/chosen": -2.8760766983032227, "logits/rejected": -2.812631845474243, "logps/chosen": -266.16845703125, "logps/rejected": -319.29388427734375, "loss": 0.6708, "positive_losses": 0.0024131773971021175, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15420962870121002, "rewards/margins": 0.05738378316164017, "rewards/margins_max": 0.15622182190418243, "rewards/margins_min": -0.0385439433157444, "rewards/margins_std": 0.08907020092010498, "rewards/rejected": 0.09682585299015045, "step": 3360 }, { "dpo_losses": 0.6743310689926147, "epoch": 0.88, "grad_norm": 8.324356228334473, "learning_rate": 2.0931273093666573e-08, "logits/chosen": -2.8275797367095947, "logits/rejected": -2.8362526893615723, "logps/chosen": -254.84768676757812, "logps/rejected": -265.889404296875, "loss": 0.6689, "positive_losses": 0.006755066104233265, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.16076824069023132, "rewards/margins": 0.04187941551208496, "rewards/margins_max": 0.18899044394493103, "rewards/margins_min": -0.08825278282165527, "rewards/margins_std": 0.12628883123397827, "rewards/rejected": 0.11888883262872696, "step": 3370 }, { "dpo_losses": 0.669025182723999, "epoch": 0.88, "grad_norm": 1.7763929569667392, "learning_rate": 2.002580803659873e-08, "logits/chosen": -2.854543685913086, "logits/rejected": -2.7524867057800293, "logps/chosen": -269.3807373046875, "logps/rejected": -230.5249786376953, "loss": 0.6985, "positive_losses": 0.07552261650562286, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.13878145813941956, "rewards/margins": 0.05284886434674263, "rewards/margins_max": 0.17732667922973633, "rewards/margins_min": -0.05076334998011589, "rewards/margins_std": 0.10122529417276382, "rewards/rejected": 0.08593259006738663, "step": 3380 }, { "dpo_losses": 0.6597247123718262, "epoch": 0.89, "grad_norm": 1.943915972722519, "learning_rate": 1.9139545758378256e-08, "logits/chosen": -2.76768159866333, "logits/rejected": -2.7349982261657715, "logps/chosen": -241.35092163085938, "logps/rejected": -259.1753845214844, "loss": 0.6668, "positive_losses": 0.09793557971715927, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15099754929542542, "rewards/margins": 0.07219112664461136, "rewards/margins_max": 0.21532170474529266, "rewards/margins_min": -0.08124847710132599, "rewards/margins_std": 0.12921682000160217, "rewards/rejected": 0.07880643010139465, "step": 3390 }, { "dpo_losses": 0.6647534370422363, "epoch": 0.89, "grad_norm": 1.8916313078456324, "learning_rate": 1.8272560261650277e-08, "logits/chosen": -2.796069860458374, "logits/rejected": -2.7577872276306152, "logps/chosen": -260.0286865234375, "logps/rejected": -275.72344970703125, "loss": 0.675, "positive_losses": 0.06406746059656143, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14727066457271576, "rewards/margins": 0.060552507638931274, "rewards/margins_max": 0.1828928291797638, "rewards/margins_min": -0.04774991795420647, "rewards/margins_std": 0.10030458122491837, "rewards/rejected": 0.08671815693378448, "step": 3400 }, { "epoch": 0.89, "eval_dpo_losses": 0.6627320647239685, "eval_logits/chosen": -2.7832562923431396, "eval_logits/rejected": -2.744842767715454, "eval_logps/chosen": -268.03973388671875, "eval_logps/rejected": -252.24195861816406, "eval_loss": 0.680766224861145, "eval_positive_losses": 0.12215282768011093, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": 0.16455139219760895, "eval_rewards/margins": 0.06547729671001434, "eval_rewards/margins_max": 0.2496509701013565, "eval_rewards/margins_min": -0.10108717530965805, "eval_rewards/margins_std": 0.11614864319562912, "eval_rewards/rejected": 0.09907408058643341, "eval_runtime": 390.0194, "eval_samples_per_second": 5.128, "eval_steps_per_second": 0.162, "step": 3400 }, { "dpo_losses": 0.6754037141799927, "epoch": 0.89, "grad_norm": 1.626861972705387, "learning_rate": 1.742492393945427e-08, "logits/chosen": -2.612673282623291, "logits/rejected": -2.6271557807922363, "logps/chosen": -228.8592071533203, "logps/rejected": -260.7303466796875, "loss": 0.6826, "positive_losses": 0.2633230686187744, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11556209623813629, "rewards/margins": 0.03941815719008446, "rewards/margins_max": 0.1536892205476761, "rewards/margins_min": -0.08365072309970856, "rewards/margins_std": 0.1035921722650528, "rewards/rejected": 0.07614392787218094, "step": 3410 }, { "dpo_losses": 0.6367681622505188, "epoch": 0.9, "grad_norm": 2.493987865146216, "learning_rate": 1.6596707569179302e-08, "logits/chosen": -2.8057358264923096, "logits/rejected": -2.7106568813323975, "logps/chosen": -344.5505676269531, "logps/rejected": -280.78411865234375, "loss": 0.6756, "positive_losses": 0.057952117174863815, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.18896682560443878, "rewards/margins": 0.12057340145111084, "rewards/margins_max": 0.27455177903175354, "rewards/margins_min": -0.008689996786415577, "rewards/margins_std": 0.12518426775932312, "rewards/rejected": 0.06839345395565033, "step": 3420 }, { "dpo_losses": 0.6571991443634033, "epoch": 0.9, "grad_norm": 2.113252819384774, "learning_rate": 1.5787980306653848e-08, "logits/chosen": -2.8217930793762207, "logits/rejected": -2.7686524391174316, "logps/chosen": -317.1985778808594, "logps/rejected": -278.02716064453125, "loss": 0.6726, "positive_losses": 0.03348579257726669, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1714903563261032, "rewards/margins": 0.07688074558973312, "rewards/margins_max": 0.20531579852104187, "rewards/margins_min": -0.014707878232002258, "rewards/margins_std": 0.10292376577854156, "rewards/rejected": 0.09460960328578949, "step": 3430 }, { "dpo_losses": 0.6721744537353516, "epoch": 0.9, "grad_norm": 1.8059559117382162, "learning_rate": 1.499880968037165e-08, "logits/chosen": -2.713167667388916, "logits/rejected": -2.7415285110473633, "logps/chosen": -268.3160400390625, "logps/rejected": -277.2546691894531, "loss": 0.6743, "positive_losses": 0.1692478209733963, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14324799180030823, "rewards/margins": 0.045404307544231415, "rewards/margins_max": 0.15480300784111023, "rewards/margins_min": -0.07178832590579987, "rewards/margins_std": 0.10168786346912384, "rewards/rejected": 0.09784368425607681, "step": 3440 }, { "dpo_losses": 0.6616576313972473, "epoch": 0.9, "grad_norm": 10.562875594283618, "learning_rate": 1.4229261585852803e-08, "logits/chosen": -2.7967777252197266, "logits/rejected": -2.7842721939086914, "logps/chosen": -257.3331298828125, "logps/rejected": -245.29470825195312, "loss": 0.6779, "positive_losses": 0.047858428210020065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1710709184408188, "rewards/margins": 0.06789450347423553, "rewards/margins_max": 0.17945295572280884, "rewards/margins_min": -0.05227668210864067, "rewards/margins_std": 0.10495837032794952, "rewards/rejected": 0.10317642986774445, "step": 3450 }, { "dpo_losses": 0.6492521166801453, "epoch": 0.91, "grad_norm": 3.0638981955525155, "learning_rate": 1.3479400280141883e-08, "logits/chosen": -2.8031203746795654, "logits/rejected": -2.7489144802093506, "logps/chosen": -299.0844421386719, "logps/rejected": -206.0711212158203, "loss": 0.667, "positive_losses": 0.0, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.1845073699951172, "rewards/margins": 0.09424134343862534, "rewards/margins_max": 0.22934171557426453, "rewards/margins_min": -0.028472676873207092, "rewards/margins_std": 0.1167953833937645, "rewards/rejected": 0.09026604890823364, "step": 3460 }, { "dpo_losses": 0.6643989682197571, "epoch": 0.91, "grad_norm": 6.962641343714408, "learning_rate": 1.2749288376442042e-08, "logits/chosen": -2.7879843711853027, "logits/rejected": -2.724153757095337, "logps/chosen": -260.9576721191406, "logps/rejected": -277.0984191894531, "loss": 0.6717, "positive_losses": 0.01393737830221653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16241498291492462, "rewards/margins": 0.062306541949510574, "rewards/margins_max": 0.18870343267917633, "rewards/margins_min": -0.06285803765058517, "rewards/margins_std": 0.1129065528512001, "rewards/rejected": 0.10010842233896255, "step": 3470 }, { "dpo_losses": 0.6731021404266357, "epoch": 0.91, "grad_norm": 2.0789325263747136, "learning_rate": 1.2038986838887127e-08, "logits/chosen": -2.820610523223877, "logits/rejected": -2.723703384399414, "logps/chosen": -212.0461883544922, "logps/rejected": -224.64724731445312, "loss": 0.6798, "positive_losses": 0.22083091735839844, "rewards/accuracies": 0.625, "rewards/chosen": 0.14202693104743958, "rewards/margins": 0.04587015509605408, "rewards/margins_max": 0.19555941224098206, "rewards/margins_min": -0.09572341293096542, "rewards/margins_std": 0.1279156655073166, "rewards/rejected": 0.0961567685008049, "step": 3480 }, { "dpo_losses": 0.6755630970001221, "epoch": 0.91, "grad_norm": 8.736703415606234, "learning_rate": 1.1348554977451131e-08, "logits/chosen": -2.7565951347351074, "logits/rejected": -2.7981178760528564, "logps/chosen": -238.7095947265625, "logps/rejected": -209.91738891601562, "loss": 0.6776, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15453873574733734, "rewards/margins": 0.03820282220840454, "rewards/margins_max": 0.14551648497581482, "rewards/margins_min": -0.07384388148784637, "rewards/margins_std": 0.09857748448848724, "rewards/rejected": 0.1163359060883522, "step": 3490 }, { "dpo_losses": 0.6538613438606262, "epoch": 0.92, "grad_norm": 5.676467087455956, "learning_rate": 1.06780504429958e-08, "logits/chosen": -2.84360933303833, "logits/rejected": -2.812001943588257, "logps/chosen": -345.8283996582031, "logps/rejected": -269.57781982421875, "loss": 0.6743, "positive_losses": 0.00730133056640625, "rewards/accuracies": 0.75, "rewards/chosen": 0.18043914437294006, "rewards/margins": 0.0834948718547821, "rewards/margins_max": 0.2217591553926468, "rewards/margins_min": -0.031042709946632385, "rewards/margins_std": 0.11449646949768066, "rewards/rejected": 0.09694425761699677, "step": 3500 }, { "epoch": 0.92, "eval_dpo_losses": 0.6627262830734253, "eval_logits/chosen": -2.78560471534729, "eval_logits/rejected": -2.7474148273468018, "eval_logps/chosen": -268.0489807128906, "eval_logps/rejected": -252.25411987304688, "eval_loss": 0.6805341243743896, "eval_positive_losses": 0.12150750309228897, "eval_rewards/accuracies": 0.7242063283920288, "eval_rewards/chosen": 0.16445913910865784, "eval_rewards/margins": 0.0655067041516304, "eval_rewards/margins_max": 0.2502119243144989, "eval_rewards/margins_min": -0.10156121850013733, "eval_rewards/margins_std": 0.1163892149925232, "eval_rewards/rejected": 0.09895242750644684, "eval_runtime": 389.4568, "eval_samples_per_second": 5.135, "eval_steps_per_second": 0.162, "step": 3500 }, { "dpo_losses": 0.6715716123580933, "epoch": 0.92, "grad_norm": 9.201369838734866, "learning_rate": 1.0027529222456754e-08, "logits/chosen": -2.766420364379883, "logits/rejected": -2.7141261100769043, "logps/chosen": -276.39501953125, "logps/rejected": -255.0765380859375, "loss": 0.6725, "positive_losses": 0.19459286332130432, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1418302208185196, "rewards/margins": 0.04641687124967575, "rewards/margins_max": 0.15345872938632965, "rewards/margins_min": -0.06363958865404129, "rewards/margins_std": 0.0970257893204689, "rewards/rejected": 0.09541334956884384, "step": 3510 }, { "dpo_losses": 0.6719012260437012, "epoch": 0.92, "grad_norm": 2.078586039634592, "learning_rate": 9.397045634168766e-09, "logits/chosen": -2.7967326641082764, "logits/rejected": -2.825498104095459, "logps/chosen": -251.73776245117188, "logps/rejected": -249.35696411132812, "loss": 0.6834, "positive_losses": 0.19272366166114807, "rewards/accuracies": 0.625, "rewards/chosen": 0.12054480612277985, "rewards/margins": 0.047685910016298294, "rewards/margins_max": 0.21180269122123718, "rewards/margins_min": -0.08542943000793457, "rewards/margins_std": 0.1332327276468277, "rewards/rejected": 0.07285889238119125, "step": 3520 }, { "dpo_losses": 0.6688061952590942, "epoch": 0.92, "grad_norm": 10.22580076780168, "learning_rate": 8.78665232332998e-09, "logits/chosen": -2.8747401237487793, "logits/rejected": -2.8032772541046143, "logps/chosen": -266.33514404296875, "logps/rejected": -234.31851196289062, "loss": 0.6704, "positive_losses": 0.07058105617761612, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15240497887134552, "rewards/margins": 0.05244448781013489, "rewards/margins_max": 0.1862691044807434, "rewards/margins_min": -0.039194732904434204, "rewards/margins_std": 0.09924636781215668, "rewards/rejected": 0.09996049106121063, "step": 3530 }, { "dpo_losses": 0.6617444157600403, "epoch": 0.93, "grad_norm": 15.43040346440166, "learning_rate": 8.196400257606206e-09, "logits/chosen": -2.764500379562378, "logits/rejected": -2.661292791366577, "logps/chosen": -265.88470458984375, "logps/rejected": -230.70852661132812, "loss": 0.6896, "positive_losses": 0.18745270371437073, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.15406164526939392, "rewards/margins": 0.06750769913196564, "rewards/margins_max": 0.20162923634052277, "rewards/margins_min": -0.05225520208477974, "rewards/margins_std": 0.11536550521850586, "rewards/rejected": 0.08655395358800888, "step": 3540 }, { "dpo_losses": 0.6632605791091919, "epoch": 0.93, "grad_norm": 2.1012023844137704, "learning_rate": 7.626338722875075e-09, "logits/chosen": -2.7544524669647217, "logits/rejected": -2.7378907203674316, "logps/chosen": -245.88211059570312, "logps/rejected": -251.2125244140625, "loss": 0.6689, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17252537608146667, "rewards/margins": 0.06453979015350342, "rewards/margins_max": 0.20910212397575378, "rewards/margins_min": -0.05815510079264641, "rewards/margins_std": 0.11868518590927124, "rewards/rejected": 0.10798557102680206, "step": 3550 }, { "dpo_losses": 0.648143470287323, "epoch": 0.93, "grad_norm": 10.053806126992908, "learning_rate": 7.0765153191106875e-09, "logits/chosen": -2.7637624740600586, "logits/rejected": -2.686310291290283, "logps/chosen": -264.9227600097656, "logps/rejected": -247.6665802001953, "loss": 0.6717, "positive_losses": 0.025927353650331497, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.15334150195121765, "rewards/margins": 0.09508029371500015, "rewards/margins_max": 0.22009754180908203, "rewards/margins_min": -0.00194616022054106, "rewards/margins_std": 0.09800895303487778, "rewards/rejected": 0.0582612045109272, "step": 3560 }, { "dpo_losses": 0.6595361828804016, "epoch": 0.93, "grad_norm": 14.661114306391521, "learning_rate": 6.54697595640899e-09, "logits/chosen": -2.7691988945007324, "logits/rejected": -2.7651801109313965, "logps/chosen": -305.5066223144531, "logps/rejected": -253.8525848388672, "loss": 0.6729, "positive_losses": 0.26647910475730896, "rewards/accuracies": 0.75, "rewards/chosen": 0.17720453441143036, "rewards/margins": 0.07246123254299164, "rewards/margins_max": 0.21851103007793427, "rewards/margins_min": -0.05463584512472153, "rewards/margins_std": 0.1229761391878128, "rewards/rejected": 0.10474331676959991, "step": 3570 }, { "dpo_losses": 0.6499595642089844, "epoch": 0.94, "grad_norm": 11.351711033310751, "learning_rate": 6.037764851154425e-09, "logits/chosen": -2.841740131378174, "logits/rejected": -2.756340742111206, "logps/chosen": -291.97027587890625, "logps/rejected": -261.6409912109375, "loss": 0.689, "positive_losses": 0.2165401428937912, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.16067275404930115, "rewards/margins": 0.09467877447605133, "rewards/margins_max": 0.243547722697258, "rewards/margins_min": -0.06149807572364807, "rewards/margins_std": 0.1369081288576126, "rewards/rejected": 0.06599397957324982, "step": 3580 }, { "dpo_losses": 0.6605626344680786, "epoch": 0.94, "grad_norm": 2.214755694002987, "learning_rate": 5.548924522327747e-09, "logits/chosen": -2.760728359222412, "logits/rejected": -2.7672464847564697, "logps/chosen": -192.11802673339844, "logps/rejected": -197.56478881835938, "loss": 0.6692, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.165755957365036, "rewards/margins": 0.0702081099152565, "rewards/margins_max": 0.18508026003837585, "rewards/margins_min": -0.05116415023803711, "rewards/margins_std": 0.10595384985208511, "rewards/rejected": 0.09554782509803772, "step": 3590 }, { "dpo_losses": 0.6780807971954346, "epoch": 0.94, "grad_norm": 2.3958579241831295, "learning_rate": 5.080495787955691e-09, "logits/chosen": -2.6624696254730225, "logits/rejected": -2.6513009071350098, "logps/chosen": -242.3583984375, "logps/rejected": -269.24737548828125, "loss": 0.6778, "positive_losses": 0.15529099106788635, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.14953303337097168, "rewards/margins": 0.034219689667224884, "rewards/margins_max": 0.16842004656791687, "rewards/margins_min": -0.10593481361865997, "rewards/margins_std": 0.12279149144887924, "rewards/rejected": 0.11531335115432739, "step": 3600 }, { "epoch": 0.94, "eval_dpo_losses": 0.6625929474830627, "eval_logits/chosen": -2.7853004932403564, "eval_logits/rejected": -2.7470314502716064, "eval_logps/chosen": -268.0681457519531, "eval_logps/rejected": -252.3022003173828, "eval_loss": 0.6809914708137512, "eval_positive_losses": 0.12787041068077087, "eval_rewards/accuracies": 0.7182539701461792, "eval_rewards/chosen": 0.16426753997802734, "eval_rewards/margins": 0.06579570472240448, "eval_rewards/margins_max": 0.25073686242103577, "eval_rewards/margins_min": -0.10168647021055222, "eval_rewards/margins_std": 0.11665406078100204, "eval_rewards/rejected": 0.09847183525562286, "eval_runtime": 389.461, "eval_samples_per_second": 5.135, "eval_steps_per_second": 0.162, "step": 3600 }, { "dpo_losses": 0.6752595901489258, "epoch": 0.94, "grad_norm": 1.9735634709955117, "learning_rate": 4.632517761702814e-09, "logits/chosen": -2.683030128479004, "logits/rejected": -2.70261812210083, "logps/chosen": -225.0770721435547, "logps/rejected": -216.30899047851562, "loss": 0.6706, "positive_losses": 0.007031249813735485, "rewards/accuracies": 0.625, "rewards/chosen": 0.13683125376701355, "rewards/margins": 0.03817175701260567, "rewards/margins_max": 0.14266720414161682, "rewards/margins_min": -0.048948802053928375, "rewards/margins_std": 0.08342118561267853, "rewards/rejected": 0.09865951538085938, "step": 3610 }, { "dpo_losses": 0.6676380038261414, "epoch": 0.95, "grad_norm": 3.130911461961453, "learning_rate": 4.205027849605358e-09, "logits/chosen": -2.7730565071105957, "logits/rejected": -2.718841552734375, "logps/chosen": -219.79776000976562, "logps/rejected": -233.9205322265625, "loss": 0.6702, "positive_losses": 0.02769775316119194, "rewards/accuracies": 0.75, "rewards/chosen": 0.15636537969112396, "rewards/margins": 0.0540301576256752, "rewards/margins_max": 0.16362106800079346, "rewards/margins_min": -0.02917725406587124, "rewards/margins_std": 0.08670753240585327, "rewards/rejected": 0.10233521461486816, "step": 3620 }, { "dpo_losses": 0.6682838797569275, "epoch": 0.95, "grad_norm": 6.827471599261377, "learning_rate": 3.798061746947995e-09, "logits/chosen": -2.680844783782959, "logits/rejected": -2.6313278675079346, "logps/chosen": -235.08230590820312, "logps/rejected": -297.84027099609375, "loss": 0.6741, "positive_losses": 0.3615362048149109, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13550007343292236, "rewards/margins": 0.05279763787984848, "rewards/margins_max": 0.15825259685516357, "rewards/margins_min": -0.054834891110658646, "rewards/margins_std": 0.09746996313333511, "rewards/rejected": 0.08270244300365448, "step": 3630 }, { "dpo_losses": 0.6689377427101135, "epoch": 0.95, "grad_norm": 5.3563624331270425, "learning_rate": 3.411653435283157e-09, "logits/chosen": -2.782923460006714, "logits/rejected": -2.7497828006744385, "logps/chosen": -250.45681762695312, "logps/rejected": -224.9089813232422, "loss": 0.6699, "positive_losses": 0.007960510440170765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15632639825344086, "rewards/margins": 0.05212927609682083, "rewards/margins_max": 0.17522598803043365, "rewards/margins_min": -0.055180471390485764, "rewards/margins_std": 0.10455608367919922, "rewards/rejected": 0.10419712215662003, "step": 3640 }, { "dpo_losses": 0.677409291267395, "epoch": 0.96, "grad_norm": 13.57210864894291, "learning_rate": 3.0458351795936698e-09, "logits/chosen": -2.7331697940826416, "logits/rejected": -2.759122848510742, "logps/chosen": -266.11138916015625, "logps/rejected": -315.8394470214844, "loss": 0.6718, "positive_losses": 0.013798522762954235, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.16741888225078583, "rewards/margins": 0.0366583950817585, "rewards/margins_max": 0.190229594707489, "rewards/margins_min": -0.10931988805532455, "rewards/margins_std": 0.1350203901529312, "rewards/rejected": 0.13076052069664001, "step": 3650 }, { "dpo_losses": 0.6810728311538696, "epoch": 0.96, "grad_norm": 2.2443964374976946, "learning_rate": 2.700637525598598e-09, "logits/chosen": -2.914287805557251, "logits/rejected": -2.8853492736816406, "logps/chosen": -264.8750305175781, "logps/rejected": -253.35372924804688, "loss": 0.6766, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14287258684635162, "rewards/margins": 0.025848830118775368, "rewards/margins_max": 0.1001816838979721, "rewards/margins_min": -0.05944544076919556, "rewards/margins_std": 0.07113735377788544, "rewards/rejected": 0.11702374368906021, "step": 3660 }, { "dpo_losses": 0.6558116674423218, "epoch": 0.96, "grad_norm": 1.832206668873853, "learning_rate": 2.3760892972027324e-09, "logits/chosen": -2.774958372116089, "logits/rejected": -2.752284049987793, "logps/chosen": -277.69671630859375, "logps/rejected": -295.549072265625, "loss": 0.6636, "positive_losses": 0.08751678466796875, "rewards/accuracies": 0.625, "rewards/chosen": 0.17725810408592224, "rewards/margins": 0.08058985322713852, "rewards/margins_max": 0.25112852454185486, "rewards/margins_min": -0.0600384883582592, "rewards/margins_std": 0.14340457320213318, "rewards/rejected": 0.09666825830936432, "step": 3670 }, { "dpo_losses": 0.655846357345581, "epoch": 0.96, "grad_norm": 9.385693364328457, "learning_rate": 2.0722175940897645e-09, "logits/chosen": -2.736213207244873, "logits/rejected": -2.713000535964966, "logps/chosen": -277.0622863769531, "logps/rejected": -266.38720703125, "loss": 0.6805, "positive_losses": 0.07905082404613495, "rewards/accuracies": 0.75, "rewards/chosen": 0.17580725252628326, "rewards/margins": 0.07951674610376358, "rewards/margins_max": 0.21051025390625, "rewards/margins_min": -0.039799876511096954, "rewards/margins_std": 0.11044758558273315, "rewards/rejected": 0.09629050642251968, "step": 3680 }, { "dpo_losses": 0.6505584716796875, "epoch": 0.97, "grad_norm": 1.9905331193688285, "learning_rate": 1.7890477894593748e-09, "logits/chosen": -2.752624273300171, "logits/rejected": -2.7264013290405273, "logps/chosen": -312.59149169921875, "logps/rejected": -239.69857788085938, "loss": 0.6652, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.18325236439704895, "rewards/margins": 0.09036721289157867, "rewards/margins_max": 0.20896823704242706, "rewards/margins_min": -0.037368323653936386, "rewards/margins_std": 0.10634877532720566, "rewards/rejected": 0.09288517385721207, "step": 3690 }, { "dpo_losses": 0.6630789637565613, "epoch": 0.97, "grad_norm": 7.024509844530663, "learning_rate": 1.5266035279088708e-09, "logits/chosen": -2.896160840988159, "logits/rejected": -2.8630728721618652, "logps/chosen": -314.116943359375, "logps/rejected": -273.3288879394531, "loss": 0.6788, "positive_losses": 0.15244026482105255, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16923435032367706, "rewards/margins": 0.0650079995393753, "rewards/margins_max": 0.19643534719944, "rewards/margins_min": -0.08276429772377014, "rewards/margins_std": 0.12063497304916382, "rewards/rejected": 0.10422635078430176, "step": 3700 }, { "epoch": 0.97, "eval_dpo_losses": 0.6626591682434082, "eval_logits/chosen": -2.781266212463379, "eval_logits/rejected": -2.7426860332489014, "eval_logps/chosen": -268.09796142578125, "eval_logps/rejected": -252.3194580078125, "eval_loss": 0.6810540556907654, "eval_positive_losses": 0.12859095633029938, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": 0.1639692187309265, "eval_rewards/margins": 0.06567002832889557, "eval_rewards/margins_max": 0.25067293643951416, "eval_rewards/margins_min": -0.10208174586296082, "eval_rewards/margins_std": 0.116749107837677, "eval_rewards/rejected": 0.09829918295145035, "eval_runtime": 390.2519, "eval_samples_per_second": 5.125, "eval_steps_per_second": 0.161, "step": 3700 }, { "dpo_losses": 0.6626378297805786, "epoch": 0.97, "grad_norm": 8.275436952239337, "learning_rate": 1.2849067234584621e-09, "logits/chosen": -2.679213762283325, "logits/rejected": -2.680332660675049, "logps/chosen": -285.9251708984375, "logps/rejected": -265.3730773925781, "loss": 0.6823, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1750296652317047, "rewards/margins": 0.06485885381698608, "rewards/margins_max": 0.17583505809307098, "rewards/margins_min": -0.04068106785416603, "rewards/margins_std": 0.09638006240129471, "rewards/rejected": 0.11017082631587982, "step": 3710 }, { "dpo_losses": 0.6638874411582947, "epoch": 0.97, "grad_norm": 9.97430246156224, "learning_rate": 1.0639775577218625e-09, "logits/chosen": -2.8001160621643066, "logits/rejected": -2.792501926422119, "logps/chosen": -281.8971252441406, "logps/rejected": -253.5721893310547, "loss": 0.6714, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1858047991991043, "rewards/margins": 0.06325621902942657, "rewards/margins_max": 0.20727744698524475, "rewards/margins_min": -0.06923703104257584, "rewards/margins_std": 0.12179337441921234, "rewards/rejected": 0.12254859507083893, "step": 3720 }, { "dpo_losses": 0.6496821641921997, "epoch": 0.98, "grad_norm": 8.743713736578215, "learning_rate": 8.638344782207485e-10, "logits/chosen": -2.887678861618042, "logits/rejected": -2.821990966796875, "logps/chosen": -320.71673583984375, "logps/rejected": -252.2602996826172, "loss": 0.6863, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.19121643900871277, "rewards/margins": 0.09353573620319366, "rewards/margins_max": 0.24803459644317627, "rewards/margins_min": -0.038599640130996704, "rewards/margins_std": 0.1312188357114792, "rewards/rejected": 0.09768068790435791, "step": 3730 }, { "dpo_losses": 0.6656588315963745, "epoch": 0.98, "grad_norm": 2.059055573345697, "learning_rate": 6.844941968447149e-10, "logits/chosen": -2.816688060760498, "logits/rejected": -2.7649848461151123, "logps/chosen": -317.15789794921875, "logps/rejected": -317.58514404296875, "loss": 0.6659, "positive_losses": 0.028494644910097122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17586681246757507, "rewards/margins": 0.06203201413154602, "rewards/margins_max": 0.19731295108795166, "rewards/margins_min": -0.06362716853618622, "rewards/margins_std": 0.11939724534749985, "rewards/rejected": 0.11383481323719025, "step": 3740 }, { "dpo_losses": 0.6846436262130737, "epoch": 0.98, "grad_norm": 2.128617293319985, "learning_rate": 5.25971688455612e-10, "logits/chosen": -2.7480340003967285, "logits/rejected": -2.7826037406921387, "logps/chosen": -279.6949157714844, "logps/rejected": -252.7490234375, "loss": 0.6753, "positive_losses": 0.13470391929149628, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.1422509104013443, "rewards/margins": 0.020876307040452957, "rewards/margins_max": 0.13941016793251038, "rewards/margins_min": -0.1157798022031784, "rewards/margins_std": 0.11612125486135483, "rewards/rejected": 0.12137460708618164, "step": 3750 }, { "dpo_losses": 0.6659666895866394, "epoch": 0.98, "grad_norm": 2.07576600608641, "learning_rate": 3.882801896372967e-10, "logits/chosen": -2.7559616565704346, "logits/rejected": -2.782864570617676, "logps/chosen": -219.6016082763672, "logps/rejected": -228.398681640625, "loss": 0.669, "positive_losses": 0.0055253030732274055, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15218304097652435, "rewards/margins": 0.058321546763181686, "rewards/margins_max": 0.17173783481121063, "rewards/margins_min": -0.03377734497189522, "rewards/margins_std": 0.09105263650417328, "rewards/rejected": 0.09386148303747177, "step": 3760 }, { "dpo_losses": 0.6622568964958191, "epoch": 0.99, "grad_norm": 15.114443791711706, "learning_rate": 2.714311975902661e-10, "logits/chosen": -2.7868869304656982, "logits/rejected": -2.738193988800049, "logps/chosen": -288.66583251953125, "logps/rejected": -321.9875183105469, "loss": 0.6696, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.1881106197834015, "rewards/margins": 0.06645546853542328, "rewards/margins_max": 0.19537228345870972, "rewards/margins_min": -0.06403455138206482, "rewards/margins_std": 0.11841548979282379, "rewards/rejected": 0.1216551661491394, "step": 3770 }, { "dpo_losses": 0.6676747798919678, "epoch": 0.99, "grad_norm": 8.209236934388109, "learning_rate": 1.754344691717591e-10, "logits/chosen": -2.7726235389709473, "logits/rejected": -2.696892023086548, "logps/chosen": -279.5736389160156, "logps/rejected": -267.17205810546875, "loss": 0.6801, "positive_losses": 0.09341277927160263, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.16620515286922455, "rewards/margins": 0.056923191994428635, "rewards/margins_max": 0.22022612392902374, "rewards/margins_min": -0.12907883524894714, "rewards/margins_std": 0.16013504564762115, "rewards/rejected": 0.10928195714950562, "step": 3780 }, { "dpo_losses": 0.6785107851028442, "epoch": 0.99, "grad_norm": 15.44886268948098, "learning_rate": 1.0029802008096333e-10, "logits/chosen": -2.8079333305358887, "logits/rejected": -2.8062872886657715, "logps/chosen": -235.77810668945312, "logps/rejected": -209.78030395507812, "loss": 0.7003, "positive_losses": 0.6782264709472656, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.11785753071308136, "rewards/margins": 0.033666886389255524, "rewards/margins_max": 0.16288526356220245, "rewards/margins_min": -0.07857248932123184, "rewards/margins_std": 0.10428880155086517, "rewards/rejected": 0.08419065177440643, "step": 3790 }, { "dpo_losses": 0.6620692014694214, "epoch": 0.99, "grad_norm": 1.7519493145513187, "learning_rate": 4.602812418974533e-11, "logits/chosen": -2.693044424057007, "logits/rejected": -2.674826145172119, "logps/chosen": -255.14208984375, "logps/rejected": -189.10543823242188, "loss": 0.6668, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.16355091333389282, "rewards/margins": 0.06598031520843506, "rewards/margins_max": 0.16940084099769592, "rewards/margins_min": -0.025763630867004395, "rewards/margins_std": 0.08741643279790878, "rewards/rejected": 0.09757061302661896, "step": 3800 }, { "epoch": 0.99, "eval_dpo_losses": 0.6626641154289246, "eval_logits/chosen": -2.782968044281006, "eval_logits/rejected": -2.7445266246795654, "eval_logps/chosen": -268.0970458984375, "eval_logps/rejected": -252.318603515625, "eval_loss": 0.6810625195503235, "eval_positive_losses": 0.12868142127990723, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": 0.16397827863693237, "eval_rewards/margins": 0.0656706765294075, "eval_rewards/margins_max": 0.25085288286209106, "eval_rewards/margins_min": -0.10247818380594254, "eval_rewards/margins_std": 0.11689701676368713, "eval_rewards/rejected": 0.09830759465694427, "eval_runtime": 389.263, "eval_samples_per_second": 5.138, "eval_steps_per_second": 0.162, "step": 3800 }, { "dpo_losses": 0.6744269132614136, "epoch": 1.0, "grad_norm": 1.7814713725344193, "learning_rate": 1.2629313018819309e-11, "logits/chosen": -2.815308094024658, "logits/rejected": -2.7851715087890625, "logps/chosen": -297.2845458984375, "logps/rejected": -259.3329162597656, "loss": 0.6728, "positive_losses": 0.046834565699100494, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15666639804840088, "rewards/margins": 0.04045479744672775, "rewards/margins_max": 0.15807831287384033, "rewards/margins_min": -0.07443277537822723, "rewards/margins_std": 0.10473154485225677, "rewards/rejected": 0.11621161550283432, "step": 3810 }, { "dpo_losses": 0.656509280204773, "epoch": 1.0, "grad_norm": 6.53868913211251, "learning_rate": 1.0437535929996855e-13, "logits/chosen": -2.748309373855591, "logits/rejected": -2.7362165451049805, "logps/chosen": -317.56524658203125, "logps/rejected": -208.669921875, "loss": 0.674, "positive_losses": 0.18830108642578125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1658371239900589, "rewards/margins": 0.07918829470872879, "rewards/margins_max": 0.22351901233196259, "rewards/margins_min": -0.048277318477630615, "rewards/margins_std": 0.11985959112644196, "rewards/rejected": 0.08664882928133011, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.6809778677109991, "train_runtime": 43303.5615, "train_samples_per_second": 1.412, "train_steps_per_second": 0.088 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }