diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,12 +10,12 @@ "log_history": [ { "epoch": 0.0, - "grad_norm": 1.65625, + "grad_norm": 1.75, "learning_rate": 2.617801047120419e-08, - "logits/chosen": -2.4349141120910645, - "logits/rejected": -2.305828332901001, - "logps/chosen": -259.81884765625, - "logps/rejected": -293.43365478515625, + "logits/chosen": -2.2731704711914062, + "logits/rejected": -2.1761367321014404, + "logps/chosen": -360.11749267578125, + "logps/rejected": -295.71942138671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -25,3181 +25,3181 @@ }, { "epoch": 0.01, - "grad_norm": 1.5078125, + "grad_norm": 1.6328125, "learning_rate": 2.617801047120419e-07, - "logits/chosen": -2.403059959411621, - "logits/rejected": -2.388718843460083, - "logps/chosen": -235.5987091064453, - "logps/rejected": -222.26573181152344, - "loss": 0.6928, - "rewards/accuracies": 0.4791666567325592, - "rewards/chosen": 0.0011952732456848025, - "rewards/margins": 0.0005173576646484435, - "rewards/rejected": 0.0006779157556593418, + "logits/chosen": -2.259983777999878, + "logits/rejected": -2.1553776264190674, + "logps/chosen": -303.7694091796875, + "logps/rejected": -241.65496826171875, + "loss": 0.6931, + "rewards/accuracies": 0.4097222089767456, + "rewards/chosen": 0.0003990587720181793, + "rewards/margins": 0.00018596058362163603, + "rewards/rejected": 0.00021309818839654326, "step": 10 }, { "epoch": 0.01, - "grad_norm": 1.703125, + "grad_norm": 1.5078125, "learning_rate": 5.235602094240838e-07, - "logits/chosen": -2.439892292022705, - "logits/rejected": -2.4342308044433594, - "logps/chosen": -253.2171173095703, - "logps/rejected": -252.88656616210938, - "loss": 0.6924, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 0.007272551767528057, - "rewards/margins": 0.0015017122495919466, - "rewards/rejected": 0.005770839285105467, + "logits/chosen": -2.2269904613494873, + "logits/rejected": -2.1716256141662598, + "logps/chosen": -262.0898132324219, + "logps/rejected": -249.70352172851562, + "loss": 0.6929, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 0.001905500190332532, + "rewards/margins": 0.00033334543695673347, + "rewards/rejected": 0.0015721546951681376, "step": 20 }, { "epoch": 0.02, - "grad_norm": 1.671875, + "grad_norm": 1.609375, "learning_rate": 7.853403141361258e-07, - "logits/chosen": -2.433501720428467, - "logits/rejected": -2.4373536109924316, - "logps/chosen": -257.414306640625, - "logps/rejected": -245.83837890625, - "loss": 0.6907, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.018377887085080147, - "rewards/margins": 0.0048246318474411964, - "rewards/rejected": 0.013553252443671227, + "logits/chosen": -2.1317379474639893, + "logits/rejected": -2.082672595977783, + "logps/chosen": -277.956298828125, + "logps/rejected": -239.60293579101562, + "loss": 0.692, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.005530746188014746, + "rewards/margins": 0.0024719357024878263, + "rewards/rejected": 0.003058810718357563, "step": 30 }, { "epoch": 0.02, - "grad_norm": 1.546875, + "grad_norm": 1.6484375, "learning_rate": 1.0471204188481676e-06, - "logits/chosen": -2.4690449237823486, - "logits/rejected": -2.4557018280029297, - "logps/chosen": -251.83187866210938, - "logps/rejected": -219.1208038330078, - "loss": 0.6889, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.03281437233090401, - "rewards/margins": 0.008931155316531658, - "rewards/rejected": 0.023883214220404625, + "logits/chosen": -2.241527795791626, + "logits/rejected": -2.1675925254821777, + "logps/chosen": -267.1982727050781, + "logps/rejected": -270.7894287109375, + "loss": 0.6901, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.010646065697073936, + "rewards/margins": 0.007375557906925678, + "rewards/rejected": 0.003270507324486971, "step": 40 }, { "epoch": 0.03, - "grad_norm": 1.453125, + "grad_norm": 1.578125, "learning_rate": 1.3089005235602096e-06, - "logits/chosen": -2.4257969856262207, - "logits/rejected": -2.50382661819458, - "logps/chosen": -251.3686981201172, - "logps/rejected": -234.69204711914062, - "loss": 0.6862, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.04148787260055542, - "rewards/margins": 0.013229536823928356, - "rewards/rejected": 0.02825833484530449, + "logits/chosen": -2.269123077392578, + "logits/rejected": -2.1638808250427246, + "logps/chosen": -285.50164794921875, + "logps/rejected": -254.6826629638672, + "loss": 0.6881, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.016856886446475983, + "rewards/margins": 0.012282918207347393, + "rewards/rejected": 0.004573967307806015, "step": 50 }, { "epoch": 0.03, - "grad_norm": 1.65625, + "grad_norm": 1.5625, "learning_rate": 1.5706806282722515e-06, - "logits/chosen": -2.389047622680664, - "logits/rejected": -2.4084537029266357, - "logps/chosen": -237.94482421875, - "logps/rejected": -256.0442199707031, - "loss": 0.6854, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.04300086200237274, - "rewards/margins": 0.017871979624032974, - "rewards/rejected": 0.025128880515694618, + "logits/chosen": -2.272505283355713, + "logits/rejected": -2.122178316116333, + "logps/chosen": -310.4421081542969, + "logps/rejected": -260.3993835449219, + "loss": 0.6838, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.027886558324098587, + "rewards/margins": 0.01857396773993969, + "rewards/rejected": 0.009312589652836323, "step": 60 }, { "epoch": 0.04, - "grad_norm": 1.640625, + "grad_norm": 1.71875, "learning_rate": 1.8324607329842933e-06, - "logits/chosen": -2.5107932090759277, - "logits/rejected": -2.441624402999878, - "logps/chosen": -270.041259765625, - "logps/rejected": -262.65423583984375, - "loss": 0.676, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.05541977286338806, - "rewards/margins": 0.03250492364168167, - "rewards/rejected": 0.02291484735906124, + "logits/chosen": -2.2799620628356934, + "logits/rejected": -2.126312494277954, + "logps/chosen": -266.8229064941406, + "logps/rejected": -234.4290313720703, + "loss": 0.68, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03913799300789833, + "rewards/margins": 0.0319330058991909, + "rewards/rejected": 0.0072049833834171295, "step": 70 }, { "epoch": 0.04, - "grad_norm": 1.4140625, + "grad_norm": 1.859375, "learning_rate": 2.094240837696335e-06, - "logits/chosen": -2.5124049186706543, - "logits/rejected": -2.4392712116241455, - "logps/chosen": -271.6557312011719, - "logps/rejected": -247.5332794189453, - "loss": 0.6779, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.048886366188526154, - "rewards/margins": 0.030922260135412216, - "rewards/rejected": 0.01796409860253334, + "logits/chosen": -2.2087223529815674, + "logits/rejected": -2.1472485065460205, + "logps/chosen": -279.0526123046875, + "logps/rejected": -270.34185791015625, + "loss": 0.6778, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.05015086010098457, + "rewards/margins": 0.03647767752408981, + "rewards/rejected": 0.013673178851604462, "step": 80 }, { "epoch": 0.05, - "grad_norm": 1.609375, + "grad_norm": 1.75, "learning_rate": 2.356020942408377e-06, - "logits/chosen": -2.437180995941162, - "logits/rejected": -2.4321937561035156, - "logps/chosen": -233.2532501220703, - "logps/rejected": -220.14111328125, - "loss": 0.6701, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.04492487758398056, - "rewards/margins": 0.04827447608113289, - "rewards/rejected": -0.003349601523950696, + "logits/chosen": -2.2452147006988525, + "logits/rejected": -2.178189992904663, + "logps/chosen": -250.22409057617188, + "logps/rejected": -241.08779907226562, + "loss": 0.676, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0462195985019207, + "rewards/margins": 0.03395666554570198, + "rewards/rejected": 0.012262934818863869, "step": 90 }, { "epoch": 0.05, - "grad_norm": 1.7734375, + "grad_norm": 1.9296875, "learning_rate": 2.617801047120419e-06, - "logits/chosen": -2.4792046546936035, - "logits/rejected": -2.489912748336792, - "logps/chosen": -261.0204772949219, - "logps/rejected": -243.6798858642578, - "loss": 0.6622, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.042254410684108734, - "rewards/margins": 0.06775885075330734, - "rewards/rejected": -0.025504430755972862, + "logits/chosen": -2.2681682109832764, + "logits/rejected": -2.121201515197754, + "logps/chosen": -244.41537475585938, + "logps/rejected": -206.2124481201172, + "loss": 0.6641, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.027621516957879066, + "rewards/margins": 0.07613887637853622, + "rewards/rejected": -0.048517368733882904, "step": 100 }, { "epoch": 0.05, - "eval_logits/chosen": -2.324150562286377, - "eval_logits/rejected": -2.297302484512329, - "eval_logps/chosen": -264.0424499511719, - "eval_logps/rejected": -247.6175537109375, - "eval_loss": 0.6637194156646729, - "eval_rewards/accuracies": 0.6840000152587891, - "eval_rewards/chosen": 0.012637840583920479, - "eval_rewards/margins": 0.07619453966617584, - "eval_rewards/rejected": -0.06355669349431992, - "eval_runtime": 451.4766, - "eval_samples_per_second": 4.43, - "eval_steps_per_second": 0.277, + "eval_logits/chosen": -2.136847972869873, + "eval_logits/rejected": -2.0436248779296875, + "eval_logps/chosen": -271.1658935546875, + "eval_logps/rejected": -254.5337371826172, + "eval_loss": 0.6636302471160889, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": 0.00537072541192174, + "eval_rewards/margins": 0.07350712269544601, + "eval_rewards/rejected": -0.06813640147447586, + "eval_runtime": 206.3535, + "eval_samples_per_second": 9.692, + "eval_steps_per_second": 0.606, "step": 100 }, { "epoch": 0.06, - "grad_norm": 2.15625, + "grad_norm": 2.125, "learning_rate": 2.8795811518324613e-06, - "logits/chosen": -2.4320034980773926, - "logits/rejected": -2.367145538330078, - "logps/chosen": -238.0926513671875, - "logps/rejected": -240.7525634765625, - "loss": 0.664, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.00833176914602518, - "rewards/margins": 0.0710856094956398, - "rewards/rejected": -0.07941737025976181, + "logits/chosen": -2.2474560737609863, + "logits/rejected": -2.1244096755981445, + "logps/chosen": -264.55267333984375, + "logps/rejected": -220.3800506591797, + "loss": 0.6544, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01394604705274105, + "rewards/margins": 0.09943284094333649, + "rewards/rejected": -0.08548679202795029, "step": 110 }, { "epoch": 0.06, - "grad_norm": 2.0, + "grad_norm": 2.21875, "learning_rate": 3.141361256544503e-06, - "logits/chosen": -2.33853816986084, - "logits/rejected": -2.331645965576172, - "logps/chosen": -263.82879638671875, - "logps/rejected": -240.96682739257812, - "loss": 0.6499, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": 0.019238661974668503, - "rewards/margins": 0.10400726646184921, - "rewards/rejected": -0.08476860821247101, + "logits/chosen": -2.269590139389038, + "logits/rejected": -2.1833536624908447, + "logps/chosen": -320.98858642578125, + "logps/rejected": -295.50213623046875, + "loss": 0.6587, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.03138069808483124, + "rewards/margins": 0.07552894949913025, + "rewards/rejected": -0.10690964758396149, "step": 120 }, { "epoch": 0.07, - "grad_norm": 2.5, + "grad_norm": 2.734375, "learning_rate": 3.403141361256545e-06, - "logits/chosen": -2.3021273612976074, - "logits/rejected": -2.2405173778533936, - "logps/chosen": -265.228515625, - "logps/rejected": -267.389892578125, - "loss": 0.6528, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.15649668872356415, - "rewards/margins": 0.10490702092647552, - "rewards/rejected": -0.2614037096500397, + "logits/chosen": -2.2066664695739746, + "logits/rejected": -2.1157479286193848, + "logps/chosen": -286.4569396972656, + "logps/rejected": -271.8417053222656, + "loss": 0.6471, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.026338139548897743, + "rewards/margins": 0.12607838213443756, + "rewards/rejected": -0.09974025189876556, "step": 130 }, { "epoch": 0.07, - "grad_norm": 2.109375, + "grad_norm": 3.296875, "learning_rate": 3.6649214659685865e-06, - "logits/chosen": -2.269350290298462, - "logits/rejected": -2.1657040119171143, - "logps/chosen": -266.2239685058594, - "logps/rejected": -277.36688232421875, - "loss": 0.6388, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.1390775889158249, - "rewards/margins": 0.1098862886428833, - "rewards/rejected": -0.2489638775587082, + "logits/chosen": -2.1494216918945312, + "logits/rejected": -2.084484815597534, + "logps/chosen": -266.2762756347656, + "logps/rejected": -270.8194580078125, + "loss": 0.6385, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.075466588139534, + "rewards/margins": 0.14162591099739075, + "rewards/rejected": -0.21709248423576355, "step": 140 }, { "epoch": 0.08, - "grad_norm": 2.953125, + "grad_norm": 3.0625, "learning_rate": 3.926701570680629e-06, - "logits/chosen": -2.291912317276001, - "logits/rejected": -2.2098731994628906, - "logps/chosen": -268.5712585449219, - "logps/rejected": -270.6773681640625, - "loss": 0.6356, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.11082954704761505, - "rewards/margins": 0.17063280940055847, - "rewards/rejected": -0.2814623713493347, + "logits/chosen": -2.189535617828369, + "logits/rejected": -2.0706353187561035, + "logps/chosen": -291.97283935546875, + "logps/rejected": -291.4541015625, + "loss": 0.6269, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.13837790489196777, + "rewards/margins": 0.16128641366958618, + "rewards/rejected": -0.29966431856155396, "step": 150 }, { "epoch": 0.08, - "grad_norm": 3.953125, + "grad_norm": 3.9375, "learning_rate": 4.18848167539267e-06, - "logits/chosen": -2.117079496383667, - "logits/rejected": -2.1376585960388184, - "logps/chosen": -268.0408020019531, - "logps/rejected": -271.73858642578125, - "loss": 0.6274, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.18969736993312836, - "rewards/margins": 0.20424196124076843, - "rewards/rejected": -0.39393937587738037, + "logits/chosen": -2.3002448081970215, + "logits/rejected": -2.1360554695129395, + "logps/chosen": -293.7611083984375, + "logps/rejected": -274.07025146484375, + "loss": 0.6296, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17077429592609406, + "rewards/margins": 0.15933959186077118, + "rewards/rejected": -0.3301139175891876, "step": 160 }, { "epoch": 0.09, - "grad_norm": 3.953125, + "grad_norm": 3.71875, "learning_rate": 4.450261780104713e-06, - "logits/chosen": -2.225733995437622, - "logits/rejected": -2.188385248184204, - "logps/chosen": -291.46209716796875, - "logps/rejected": -291.213134765625, - "loss": 0.6273, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.29510316252708435, - "rewards/margins": 0.19445903599262238, - "rewards/rejected": -0.48956218361854553, + "logits/chosen": -2.1448490619659424, + "logits/rejected": -2.123392105102539, + "logps/chosen": -263.1458740234375, + "logps/rejected": -278.94635009765625, + "loss": 0.6264, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.16839735209941864, + "rewards/margins": 0.17006024718284607, + "rewards/rejected": -0.3384575843811035, "step": 170 }, { "epoch": 0.09, - "grad_norm": 3.171875, + "grad_norm": 2.78125, "learning_rate": 4.712041884816754e-06, - "logits/chosen": -2.145782947540283, - "logits/rejected": -2.1198437213897705, - "logps/chosen": -309.0302429199219, - "logps/rejected": -312.53302001953125, - "loss": 0.6111, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.2840114235877991, - "rewards/margins": 0.23209214210510254, - "rewards/rejected": -0.5161035656929016, + "logits/chosen": -2.2578933238983154, + "logits/rejected": -2.1124982833862305, + "logps/chosen": -318.94476318359375, + "logps/rejected": -312.0294494628906, + "loss": 0.628, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.23213641345500946, + "rewards/margins": 0.20630350708961487, + "rewards/rejected": -0.43843990564346313, "step": 180 }, { "epoch": 0.1, - "grad_norm": 3.28125, + "grad_norm": 3.53125, "learning_rate": 4.9738219895287965e-06, - "logits/chosen": -2.164933204650879, - "logits/rejected": -1.9833053350448608, - "logps/chosen": -289.17510986328125, - "logps/rejected": -285.3663024902344, - "loss": 0.6033, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.20175080001354218, - "rewards/margins": 0.22905221581459045, - "rewards/rejected": -0.43080300092697144, + "logits/chosen": -2.2287371158599854, + "logits/rejected": -2.1092655658721924, + "logps/chosen": -303.0882873535156, + "logps/rejected": -299.7857360839844, + "loss": 0.6017, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.2376522272825241, + "rewards/margins": 0.2720206081867218, + "rewards/rejected": -0.5096728205680847, "step": 190 }, { "epoch": 0.1, - "grad_norm": 4.1875, + "grad_norm": 3.34375, "learning_rate": 4.999661831436499e-06, - "logits/chosen": -2.1845479011535645, - "logits/rejected": -2.1678760051727295, - "logps/chosen": -355.00604248046875, - "logps/rejected": -341.51348876953125, - "loss": 0.6069, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.5404707789421082, - "rewards/margins": 0.2492518424987793, - "rewards/rejected": -0.7897226214408875, + "logits/chosen": -2.1837334632873535, + "logits/rejected": -2.094944477081299, + "logps/chosen": -285.54425048828125, + "logps/rejected": -296.94720458984375, + "loss": 0.6105, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1814960539340973, + "rewards/margins": 0.24684414267539978, + "rewards/rejected": -0.4283401370048523, "step": 200 }, { "epoch": 0.1, - "eval_logits/chosen": -2.064443349838257, - "eval_logits/rejected": -1.9985278844833374, - "eval_logps/chosen": -319.2918395996094, - "eval_logps/rejected": -322.12091064453125, - "eval_loss": 0.6174576282501221, - "eval_rewards/accuracies": 0.671999990940094, - "eval_rewards/chosen": -0.5398561954498291, - "eval_rewards/margins": 0.2687341868877411, - "eval_rewards/rejected": -0.8085903525352478, - "eval_runtime": 449.2939, - "eval_samples_per_second": 4.451, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -2.0918540954589844, + "eval_logits/rejected": -2.003007173538208, + "eval_logps/chosen": -304.0613098144531, + "eval_logps/rejected": -307.0966796875, + "eval_loss": 0.607542872428894, + "eval_rewards/accuracies": 0.6890000104904175, + "eval_rewards/chosen": -0.32358381152153015, + "eval_rewards/margins": 0.2701820731163025, + "eval_rewards/rejected": -0.5937658548355103, + "eval_runtime": 205.7166, + "eval_samples_per_second": 9.722, + "eval_steps_per_second": 0.608, "step": 200 }, { "epoch": 0.11, - "grad_norm": 4.96875, + "grad_norm": 5.09375, "learning_rate": 4.9984929711403395e-06, - "logits/chosen": -2.1534504890441895, - "logits/rejected": -2.1233341693878174, - "logps/chosen": -343.18817138671875, - "logps/rejected": -340.7259216308594, - "loss": 0.6027, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.5040772557258606, - "rewards/margins": 0.2695787847042084, - "rewards/rejected": -0.7736560702323914, + "logits/chosen": -2.132068395614624, + "logits/rejected": -2.023831844329834, + "logps/chosen": -262.7835693359375, + "logps/rejected": -267.73046875, + "loss": 0.6256, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.24357381463050842, + "rewards/margins": 0.18603582680225372, + "rewards/rejected": -0.4296096861362457, "step": 210 }, { "epoch": 0.12, - "grad_norm": 5.09375, + "grad_norm": 3.671875, "learning_rate": 4.996489634487865e-06, - "logits/chosen": -2.0994200706481934, - "logits/rejected": -1.9647928476333618, - "logps/chosen": -295.4856262207031, - "logps/rejected": -336.73419189453125, - "loss": 0.6241, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.5017188787460327, - "rewards/margins": 0.269430547952652, - "rewards/rejected": -0.7711495161056519, + "logits/chosen": -2.1585371494293213, + "logits/rejected": -2.0781378746032715, + "logps/chosen": -319.03131103515625, + "logps/rejected": -319.93634033203125, + "loss": 0.5928, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.40161317586898804, + "rewards/margins": 0.3085588216781616, + "rewards/rejected": -0.7101720571517944, "step": 220 }, { "epoch": 0.12, - "grad_norm": 4.0, + "grad_norm": 5.65625, "learning_rate": 4.9936524905772466e-06, - "logits/chosen": -2.008256435394287, - "logits/rejected": -2.0621442794799805, - "logps/chosen": -281.52276611328125, - "logps/rejected": -306.1183166503906, - "loss": 0.6078, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.3256304860115051, - "rewards/margins": 0.3065679669380188, - "rewards/rejected": -0.6321984529495239, + "logits/chosen": -2.099738359451294, + "logits/rejected": -2.0198581218719482, + "logps/chosen": -287.83807373046875, + "logps/rejected": -313.92620849609375, + "loss": 0.5783, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.435781329870224, + "rewards/margins": 0.37423354387283325, + "rewards/rejected": -0.8100149035453796, "step": 230 }, { "epoch": 0.13, - "grad_norm": 5.90625, + "grad_norm": 4.28125, "learning_rate": 4.9899824869915e-06, - "logits/chosen": -1.9556798934936523, - "logits/rejected": -1.8381097316741943, - "logps/chosen": -316.3447265625, - "logps/rejected": -323.200439453125, - "loss": 0.6013, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.3776395618915558, - "rewards/margins": 0.2974005341529846, - "rewards/rejected": -0.675040066242218, + "logits/chosen": -2.1551759243011475, + "logits/rejected": -2.0810234546661377, + "logps/chosen": -326.7232360839844, + "logps/rejected": -348.8532409667969, + "loss": 0.5916, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7081143260002136, + "rewards/margins": 0.3044819235801697, + "rewards/rejected": -1.0125962495803833, "step": 240 }, { "epoch": 0.13, - "grad_norm": 10.25, + "grad_norm": 3.203125, "learning_rate": 4.985480849482012e-06, - "logits/chosen": -1.5747647285461426, - "logits/rejected": -1.4650704860687256, - "logps/chosen": -333.0284118652344, - "logps/rejected": -334.76806640625, - "loss": 0.5577, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.6769969463348389, - "rewards/margins": 0.387776643037796, - "rewards/rejected": -1.064773678779602, + "logits/chosen": -2.1337616443634033, + "logits/rejected": -2.0014939308166504, + "logps/chosen": -374.9393615722656, + "logps/rejected": -349.9031066894531, + "loss": 0.5716, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6511236429214478, + "rewards/margins": 0.35167860984802246, + "rewards/rejected": -1.0028022527694702, "step": 250 }, { "epoch": 0.14, - "grad_norm": 4.28125, + "grad_norm": 3.96875, "learning_rate": 4.980149081559142e-06, - "logits/chosen": -1.3563178777694702, - "logits/rejected": -1.1254017353057861, - "logps/chosen": -384.9044189453125, - "logps/rejected": -386.17791748046875, - "loss": 0.562, + "logits/chosen": -2.1727969646453857, + "logits/rejected": -2.108790874481201, + "logps/chosen": -314.8628234863281, + "logps/rejected": -321.34710693359375, + "loss": 0.5922, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.8524494171142578, - "rewards/margins": 0.5679513216018677, - "rewards/rejected": -1.420400857925415, + "rewards/chosen": -0.4089882969856262, + "rewards/margins": 0.38155826926231384, + "rewards/rejected": -0.7905465960502625, "step": 260 }, { "epoch": 0.14, - "grad_norm": 6.5, + "grad_norm": 3.421875, "learning_rate": 4.9739889639900655e-06, - "logits/chosen": -0.7365055084228516, - "logits/rejected": -0.5389373302459717, - "logps/chosen": -318.93756103515625, - "logps/rejected": -336.3540954589844, - "loss": 0.5668, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.6196847558021545, - "rewards/margins": 0.42324098944664, - "rewards/rejected": -1.0429257154464722, + "logits/chosen": -2.1201374530792236, + "logits/rejected": -2.074112892150879, + "logps/chosen": -317.30908203125, + "logps/rejected": -349.3733825683594, + "loss": 0.5602, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.4397730827331543, + "rewards/margins": 0.4018850326538086, + "rewards/rejected": -0.8416581153869629, "step": 270 }, { "epoch": 0.15, - "grad_norm": 5.8125, + "grad_norm": 3.3125, "learning_rate": 4.967002554204009e-06, - "logits/chosen": -0.35692664980888367, - "logits/rejected": -0.4370260238647461, - "logps/chosen": -348.50592041015625, - "logps/rejected": -375.6952209472656, - "loss": 0.5692, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.9018071889877319, - "rewards/margins": 0.5076521635055542, - "rewards/rejected": -1.4094593524932861, + "logits/chosen": -2.1125736236572266, + "logits/rejected": -2.088714122772217, + "logps/chosen": -330.7603759765625, + "logps/rejected": -371.1779479980469, + "loss": 0.5972, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5998188853263855, + "rewards/margins": 0.33086952567100525, + "rewards/rejected": -0.9306885004043579, "step": 280 }, { "epoch": 0.15, - "grad_norm": 5.3125, + "grad_norm": 3.765625, "learning_rate": 4.959192185605089e-06, - "logits/chosen": -0.597706139087677, - "logits/rejected": -0.4843805432319641, - "logps/chosen": -354.8358154296875, - "logps/rejected": -397.0480041503906, - "loss": 0.5257, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.7842257022857666, - "rewards/margins": 0.6368502378463745, - "rewards/rejected": -1.4210759401321411, + "logits/chosen": -2.266671657562256, + "logits/rejected": -2.168347120285034, + "logps/chosen": -372.093017578125, + "logps/rejected": -373.88555908203125, + "loss": 0.5818, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.5426467657089233, + "rewards/margins": 0.4017227590084076, + "rewards/rejected": -0.9443694353103638, "step": 290 }, { "epoch": 0.16, - "grad_norm": 6.4375, + "grad_norm": 7.375, "learning_rate": 4.950560466792969e-06, - "logits/chosen": -0.4317222237586975, - "logits/rejected": -0.48575448989868164, - "logps/chosen": -346.1421203613281, - "logps/rejected": -373.5080871582031, - "loss": 0.5858, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.8795005083084106, - "rewards/margins": 0.45564961433410645, - "rewards/rejected": -1.3351500034332275, + "logits/chosen": -2.1483025550842285, + "logits/rejected": -2.048117160797119, + "logps/chosen": -341.9796447753906, + "logps/rejected": -351.8575744628906, + "loss": 0.5883, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6638094782829285, + "rewards/margins": 0.44865164160728455, + "rewards/rejected": -1.1124610900878906, "step": 300 }, { "epoch": 0.16, - "eval_logits/chosen": 0.11951165646314621, - "eval_logits/rejected": 0.21957716345787048, - "eval_logps/chosen": -349.1536865234375, - "eval_logps/rejected": -377.4862976074219, - "eval_loss": 0.5706533193588257, - "eval_rewards/accuracies": 0.6930000185966492, - "eval_rewards/chosen": -0.8384745717048645, - "eval_rewards/margins": 0.523769736289978, - "eval_rewards/rejected": -1.3622443675994873, - "eval_runtime": 449.2894, - "eval_samples_per_second": 4.451, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -2.076050281524658, + "eval_logits/rejected": -1.9913941621780396, + "eval_logps/chosen": -342.9188232421875, + "eval_logps/rejected": -360.5767517089844, + "eval_loss": 0.5817497372627258, + "eval_rewards/accuracies": 0.7020000219345093, + "eval_rewards/chosen": -0.7121586203575134, + "eval_rewards/margins": 0.4164075553417206, + "eval_rewards/rejected": -1.1285661458969116, + "eval_runtime": 205.7512, + "eval_samples_per_second": 9.72, + "eval_steps_per_second": 0.608, "step": 300 }, { "epoch": 0.16, - "grad_norm": 6.34375, + "grad_norm": 5.09375, "learning_rate": 4.9411102806916185e-06, - "logits/chosen": -0.18695969879627228, - "logits/rejected": -0.09409158676862717, - "logps/chosen": -356.5097961425781, - "logps/rejected": -384.3717956542969, - "loss": 0.5979, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.946184515953064, - "rewards/margins": 0.5263864398002625, - "rewards/rejected": -1.4725710153579712, + "logits/chosen": -2.1432528495788574, + "logits/rejected": -2.0720839500427246, + "logps/chosen": -332.2557067871094, + "logps/rejected": -340.0525817871094, + "loss": 0.5635, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.576932966709137, + "rewards/margins": 0.40913018584251404, + "rewards/rejected": -0.9860631823539734, "step": 310 }, { "epoch": 0.17, - "grad_norm": 4.8125, + "grad_norm": 4.4375, "learning_rate": 4.930844783586424e-06, - "logits/chosen": -0.17142558097839355, - "logits/rejected": -0.16491857171058655, - "logps/chosen": -350.50146484375, - "logps/rejected": -370.40521240234375, - "loss": 0.5827, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.9111695289611816, - "rewards/margins": 0.42720574140548706, - "rewards/rejected": -1.3383753299713135, + "logits/chosen": -2.1997363567352295, + "logits/rejected": -2.0350875854492188, + "logps/chosen": -320.69720458984375, + "logps/rejected": -334.16217041015625, + "loss": 0.5567, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.5008949041366577, + "rewards/margins": 0.5016661882400513, + "rewards/rejected": -1.002561092376709, "step": 320 }, { "epoch": 0.17, - "grad_norm": 4.9375, + "grad_norm": 4.71875, "learning_rate": 4.919767404070033e-06, - "logits/chosen": -0.34910041093826294, - "logits/rejected": -0.32271599769592285, - "logps/chosen": -377.28887939453125, - "logps/rejected": -369.8608703613281, - "loss": 0.5684, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.7913080453872681, - "rewards/margins": 0.46660441160202026, - "rewards/rejected": -1.2579123973846436, + "logits/chosen": -2.1636505126953125, + "logits/rejected": -2.0376198291778564, + "logps/chosen": -330.29254150390625, + "logps/rejected": -358.37811279296875, + "loss": 0.5787, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6509741544723511, + "rewards/margins": 0.4970785677433014, + "rewards/rejected": -1.1480529308319092, "step": 330 }, { "epoch": 0.18, - "grad_norm": 4.34375, + "grad_norm": 8.5, "learning_rate": 4.907881841897216e-06, - "logits/chosen": -0.2663159966468811, - "logits/rejected": -0.3360903859138489, - "logps/chosen": -312.9281311035156, - "logps/rejected": -343.1236877441406, - "loss": 0.5597, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.6925094723701477, - "rewards/margins": 0.4805082380771637, - "rewards/rejected": -1.1730177402496338, + "logits/chosen": -2.1893808841705322, + "logits/rejected": -2.115548849105835, + "logps/chosen": -376.6805114746094, + "logps/rejected": -387.1194763183594, + "loss": 0.5545, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8470619320869446, + "rewards/margins": 0.41202712059020996, + "rewards/rejected": -1.2590891122817993, "step": 340 }, { "epoch": 0.18, - "grad_norm": 7.1875, + "grad_norm": 3.53125, "learning_rate": 4.89519206674919e-06, - "logits/chosen": -0.0721643716096878, - "logits/rejected": -0.15388105809688568, - "logps/chosen": -367.50555419921875, - "logps/rejected": -385.8136291503906, - "loss": 0.5695, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.8829039335250854, - "rewards/margins": 0.5089597702026367, - "rewards/rejected": -1.3918637037277222, + "logits/chosen": -2.1224799156188965, + "logits/rejected": -2.1183266639709473, + "logps/chosen": -311.69000244140625, + "logps/rejected": -391.626220703125, + "loss": 0.529, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6741515398025513, + "rewards/margins": 0.5650461912155151, + "rewards/rejected": -1.2391977310180664, "step": 350 }, { "epoch": 0.19, - "grad_norm": 4.8125, + "grad_norm": 4.9375, "learning_rate": 4.881702316907769e-06, - "logits/chosen": -0.3347160518169403, - "logits/rejected": -0.02113138698041439, - "logps/chosen": -392.6307678222656, - "logps/rejected": -411.7140197753906, - "loss": 0.552, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.851973831653595, - "rewards/margins": 0.5162093043327332, - "rewards/rejected": -1.3681831359863281, + "logits/chosen": -2.1981089115142822, + "logits/rejected": -2.0802032947540283, + "logps/chosen": -349.52618408203125, + "logps/rejected": -364.61285400390625, + "loss": 0.5662, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.5019327998161316, + "rewards/margins": 0.4679330885410309, + "rewards/rejected": -0.9698659181594849, "step": 360 }, { "epoch": 0.19, - "grad_norm": 8.5625, + "grad_norm": 6.4375, "learning_rate": 4.86741709783982e-06, - "logits/chosen": 0.3518471121788025, - "logits/rejected": 0.3347684442996979, - "logps/chosen": -370.5345764160156, - "logps/rejected": -374.44964599609375, - "loss": 0.5687, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9835584759712219, - "rewards/margins": 0.5378425717353821, - "rewards/rejected": -1.521401047706604, + "logits/chosen": -2.064434051513672, + "logits/rejected": -1.957092523574829, + "logps/chosen": -323.6076965332031, + "logps/rejected": -344.89971923828125, + "loss": 0.5741, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6532014608383179, + "rewards/margins": 0.5228798985481262, + "rewards/rejected": -1.1760812997817993, "step": 370 }, { "epoch": 0.2, - "grad_norm": 4.9375, + "grad_norm": 6.5, "learning_rate": 4.852341180692471e-06, - "logits/chosen": 0.45614609122276306, - "logits/rejected": 0.686242938041687, - "logps/chosen": -351.5860290527344, - "logps/rejected": -388.1460266113281, - "loss": 0.5242, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1243712902069092, - "rewards/margins": 0.5316659212112427, - "rewards/rejected": -1.6560373306274414, + "logits/chosen": -2.0200791358947754, + "logits/rejected": -1.9938948154449463, + "logps/chosen": -333.9423522949219, + "logps/rejected": -399.6837463378906, + "loss": 0.548, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8045186996459961, + "rewards/margins": 0.47954291105270386, + "rewards/rejected": -1.2840616703033447, "step": 380 }, { "epoch": 0.2, - "grad_norm": 8.625, + "grad_norm": 3.65625, "learning_rate": 4.836479600699579e-06, - "logits/chosen": 0.25028011202812195, - "logits/rejected": 0.527495265007019, - "logps/chosen": -387.6080322265625, - "logps/rejected": -400.1957092285156, - "loss": 0.5649, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1668118238449097, - "rewards/margins": 0.5952257513999939, - "rewards/rejected": -1.7620376348495483, + "logits/chosen": -2.0388081073760986, + "logits/rejected": -1.9344475269317627, + "logps/chosen": -365.44732666015625, + "logps/rejected": -364.44549560546875, + "loss": 0.5859, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.677007794380188, + "rewards/margins": 0.46105116605758667, + "rewards/rejected": -1.1380589008331299, "step": 390 }, { "epoch": 0.21, "grad_norm": 5.53125, "learning_rate": 4.819837655500014e-06, - "logits/chosen": 0.5255290269851685, - "logits/rejected": 0.35975736379623413, - "logps/chosen": -298.29302978515625, - "logps/rejected": -343.5097351074219, - "loss": 0.5518, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.8005388379096985, - "rewards/margins": 0.5582879781723022, - "rewards/rejected": -1.3588266372680664, + "logits/chosen": -2.023102283477783, + "logits/rejected": -1.9553874731063843, + "logps/chosen": -306.1051330566406, + "logps/rejected": -330.5825500488281, + "loss": 0.5651, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5321759581565857, + "rewards/margins": 0.4713428020477295, + "rewards/rejected": -1.00351881980896, "step": 400 }, { "epoch": 0.21, - "eval_logits/chosen": 0.7207584381103516, - "eval_logits/rejected": 0.8423263430595398, - "eval_logps/chosen": -346.00152587890625, - "eval_logps/rejected": -382.4471435546875, - "eval_loss": 0.5535975098609924, - "eval_rewards/accuracies": 0.7229999899864197, - "eval_rewards/chosen": -0.8069528937339783, - "eval_rewards/margins": 0.6048997640609741, - "eval_rewards/rejected": -1.4118527173995972, - "eval_runtime": 450.2977, - "eval_samples_per_second": 4.442, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.982036828994751, + "eval_logits/rejected": -1.9001150131225586, + "eval_logps/chosen": -350.7093200683594, + "eval_logps/rejected": -376.6873779296875, + "eval_loss": 0.566522479057312, + "eval_rewards/accuracies": 0.7250000238418579, + "eval_rewards/chosen": -0.7900638580322266, + "eval_rewards/margins": 0.4996088147163391, + "eval_rewards/rejected": -1.289672613143921, + "eval_runtime": 205.8214, + "eval_samples_per_second": 9.717, + "eval_steps_per_second": 0.607, "step": 400 }, { "epoch": 0.21, - "grad_norm": 6.21875, + "grad_norm": 4.71875, "learning_rate": 4.802420903368286e-06, - "logits/chosen": 0.4230332374572754, - "logits/rejected": 0.2597886025905609, - "logps/chosen": -347.322509765625, - "logps/rejected": -403.64508056640625, - "loss": 0.5539, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.9662551879882812, - "rewards/margins": 0.6025245785713196, - "rewards/rejected": -1.568779706954956, + "logits/chosen": -2.045619487762451, + "logits/rejected": -2.054408550262451, + "logps/chosen": -364.0269775390625, + "logps/rejected": -412.7276306152344, + "loss": 0.5841, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.106697678565979, + "rewards/margins": 0.3950948119163513, + "rewards/rejected": -1.501792311668396, "step": 410 }, { "epoch": 0.22, - "grad_norm": 5.15625, + "grad_norm": 5.34375, "learning_rate": 4.784235161358124e-06, - "logits/chosen": 0.7549188733100891, - "logits/rejected": 0.3922800123691559, - "logps/chosen": -349.7222595214844, - "logps/rejected": -404.5429992675781, - "loss": 0.5143, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0141912698745728, - "rewards/margins": 0.6538187265396118, - "rewards/rejected": -1.6680099964141846, + "logits/chosen": -2.0790481567382812, + "logits/rejected": -2.003018617630005, + "logps/chosen": -399.572998046875, + "logps/rejected": -433.9266662597656, + "loss": 0.583, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2603890895843506, + "rewards/margins": 0.41771477460861206, + "rewards/rejected": -1.6781038045883179, "step": 420 }, { "epoch": 0.23, - "grad_norm": 5.875, + "grad_norm": 5.75, "learning_rate": 4.765286503359632e-06, - "logits/chosen": 0.8512343168258667, - "logits/rejected": 0.7955237627029419, - "logps/chosen": -374.3222351074219, - "logps/rejected": -425.25408935546875, - "loss": 0.5631, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.1482092142105103, - "rewards/margins": 0.606701135635376, - "rewards/rejected": -1.7549104690551758, + "logits/chosen": -2.0433547496795654, + "logits/rejected": -1.968266248703003, + "logps/chosen": -357.1161193847656, + "logps/rejected": -381.88507080078125, + "loss": 0.5707, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8924205899238586, + "rewards/margins": 0.43997693061828613, + "rewards/rejected": -1.3323975801467896, "step": 430 }, { "epoch": 0.23, - "grad_norm": 7.21875, + "grad_norm": 4.1875, "learning_rate": 4.745581258070654e-06, - "logits/chosen": 0.484804630279541, - "logits/rejected": 0.20197534561157227, - "logps/chosen": -325.0903625488281, - "logps/rejected": -374.3199157714844, - "loss": 0.5117, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.72711580991745, - "rewards/margins": 0.6210989952087402, - "rewards/rejected": -1.3482147455215454, + "logits/chosen": -1.973879098892212, + "logits/rejected": -1.9118854999542236, + "logps/chosen": -329.8348693847656, + "logps/rejected": -366.04461669921875, + "loss": 0.5512, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.562414288520813, + "rewards/margins": 0.5114336013793945, + "rewards/rejected": -1.073848009109497, "step": 440 }, { "epoch": 0.24, - "grad_norm": 7.53125, + "grad_norm": 5.9375, "learning_rate": 4.725126006883047e-06, - "logits/chosen": 0.9636757969856262, - "logits/rejected": 0.9192026257514954, - "logps/chosen": -341.41998291015625, - "logps/rejected": -403.25836181640625, - "loss": 0.5154, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.9017482995986938, - "rewards/margins": 0.7085460424423218, - "rewards/rejected": -1.6102945804595947, + "logits/chosen": -1.8499130010604858, + "logits/rejected": -1.827254056930542, + "logps/chosen": -307.35150146484375, + "logps/rejected": -369.239990234375, + "loss": 0.5526, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7073386311531067, + "rewards/margins": 0.5170890092849731, + "rewards/rejected": -1.224427580833435, "step": 450 }, { "epoch": 0.24, - "grad_norm": 5.1875, + "grad_norm": 4.625, "learning_rate": 4.70392758168454e-06, - "logits/chosen": 1.3140493631362915, - "logits/rejected": 0.9456102252006531, - "logps/chosen": -385.99493408203125, - "logps/rejected": -433.5835876464844, - "loss": 0.549, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.943748950958252, - "rewards/margins": 0.8300511240959167, - "rewards/rejected": -1.7738001346588135, + "logits/chosen": -1.9882104396820068, + "logits/rejected": -1.8469337224960327, + "logps/chosen": -375.27935791015625, + "logps/rejected": -379.7239685058594, + "loss": 0.5734, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9050993919372559, + "rewards/margins": 0.47090214490890503, + "rewards/rejected": -1.3760015964508057, "step": 460 }, { "epoch": 0.25, - "grad_norm": 7.5, + "grad_norm": 4.03125, "learning_rate": 4.68199306257695e-06, - "logits/chosen": 1.4614379405975342, - "logits/rejected": 1.2364531755447388, - "logps/chosen": -397.2190856933594, - "logps/rejected": -452.3384704589844, - "loss": 0.5139, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2018821239471436, - "rewards/margins": 0.7513980865478516, - "rewards/rejected": -1.9532800912857056, + "logits/chosen": -1.9672870635986328, + "logits/rejected": -1.9157249927520752, + "logps/chosen": -396.733154296875, + "logps/rejected": -421.2018127441406, + "loss": 0.5354, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9322503805160522, + "rewards/margins": 0.5409501791000366, + "rewards/rejected": -1.4732005596160889, "step": 470 }, { "epoch": 0.25, - "grad_norm": 6.625, + "grad_norm": 5.1875, "learning_rate": 4.659329775511478e-06, - "logits/chosen": 1.343225121498108, - "logits/rejected": 1.212619423866272, - "logps/chosen": -399.1175537109375, - "logps/rejected": -448.16473388671875, - "loss": 0.5715, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.349334716796875, - "rewards/margins": 0.7152506709098816, - "rewards/rejected": -2.0645856857299805, + "logits/chosen": -1.973670244216919, + "logits/rejected": -1.9386718273162842, + "logps/chosen": -343.16387939453125, + "logps/rejected": -386.0146484375, + "loss": 0.556, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9054447412490845, + "rewards/margins": 0.559832751750946, + "rewards/rejected": -1.4652774333953857, "step": 480 }, { "epoch": 0.26, - "grad_norm": 4.78125, + "grad_norm": 5.0, "learning_rate": 4.635945289841902e-06, - "logits/chosen": 0.8150955438613892, - "logits/rejected": 0.598046600818634, - "logps/chosen": -360.7403869628906, - "logps/rejected": -376.80316162109375, - "loss": 0.5301, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.086352825164795, - "rewards/margins": 0.5136397480964661, - "rewards/rejected": -1.5999925136566162, + "logits/chosen": -1.9971214532852173, + "logits/rejected": -1.9401931762695312, + "logps/chosen": -368.4125061035156, + "logps/rejected": -396.1431579589844, + "loss": 0.5296, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8962229490280151, + "rewards/margins": 0.6276494264602661, + "rewards/rejected": -1.5238726139068604, "step": 490 }, { "epoch": 0.26, - "grad_norm": 4.34375, + "grad_norm": 5.25, "learning_rate": 4.611847415796476e-06, - "logits/chosen": 0.45536771416664124, - "logits/rejected": 0.9996572732925415, - "logps/chosen": -347.04986572265625, - "logps/rejected": -361.86285400390625, - "loss": 0.5953, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.7578734159469604, - "rewards/margins": 0.41181692481040955, - "rewards/rejected": -1.1696903705596924, + "logits/chosen": -2.0031449794769287, + "logits/rejected": -1.9153926372528076, + "logps/chosen": -377.18121337890625, + "logps/rejected": -411.65234375, + "loss": 0.5136, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.0275315046310425, + "rewards/margins": 0.6614707708358765, + "rewards/rejected": -1.6890023946762085, "step": 500 }, { "epoch": 0.26, - "eval_logits/chosen": 1.0708205699920654, - "eval_logits/rejected": 1.2557843923568726, - "eval_logps/chosen": -332.0845947265625, - "eval_logps/rejected": -359.5694580078125, - "eval_loss": 0.5575395226478577, - "eval_rewards/accuracies": 0.7110000252723694, - "eval_rewards/chosen": -0.6677836179733276, - "eval_rewards/margins": 0.515292227268219, - "eval_rewards/rejected": -1.1830756664276123, - "eval_runtime": 450.3565, - "eval_samples_per_second": 4.441, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.8879700899124146, + "eval_logits/rejected": -1.8081378936767578, + "eval_logps/chosen": -374.9992370605469, + "eval_logps/rejected": -414.1807556152344, + "eval_loss": 0.5519765019416809, + "eval_rewards/accuracies": 0.718999981880188, + "eval_rewards/chosen": -1.0329629182815552, + "eval_rewards/margins": 0.6316434741020203, + "eval_rewards/rejected": -1.6646064519882202, + "eval_runtime": 205.8805, + "eval_samples_per_second": 9.714, + "eval_steps_per_second": 0.607, "step": 500 }, { "epoch": 0.27, - "grad_norm": 7.46875, + "grad_norm": 5.3125, "learning_rate": 4.587044201869378e-06, - "logits/chosen": 1.2414617538452148, - "logits/rejected": 1.1634864807128906, - "logps/chosen": -321.0514831542969, - "logps/rejected": -368.89849853515625, - "loss": 0.544, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.7744086384773254, - "rewards/margins": 0.5061289668083191, - "rewards/rejected": -1.280537724494934, + "logits/chosen": -1.9341243505477905, + "logits/rejected": -1.8541990518569946, + "logps/chosen": -348.41650390625, + "logps/rejected": -377.76055908203125, + "loss": 0.5499, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9087320566177368, + "rewards/margins": 0.5346066355705261, + "rewards/rejected": -1.4433386325836182, "step": 510 }, { "epoch": 0.27, - "grad_norm": 5.875, + "grad_norm": 5.40625, "learning_rate": 4.561543932132574e-06, - "logits/chosen": 1.5618733167648315, - "logits/rejected": 1.069901466369629, - "logps/chosen": -353.8107604980469, - "logps/rejected": -423.22064208984375, - "loss": 0.5877, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.0526001453399658, - "rewards/margins": 0.6813434362411499, - "rewards/rejected": -1.7339435815811157, + "logits/chosen": -1.9719091653823853, + "logits/rejected": -1.8899688720703125, + "logps/chosen": -356.0486145019531, + "logps/rejected": -401.54852294921875, + "loss": 0.5157, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.79421067237854, + "rewards/margins": 0.6548107266426086, + "rewards/rejected": -1.449021577835083, "step": 520 }, { "epoch": 0.28, - "grad_norm": 6.71875, + "grad_norm": 6.5625, "learning_rate": 4.535355123469009e-06, - "logits/chosen": 1.3146389722824097, - "logits/rejected": 1.3446049690246582, - "logps/chosen": -429.43133544921875, - "logps/rejected": -472.72540283203125, - "loss": 0.5618, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.510373830795288, - "rewards/margins": 0.7111620903015137, - "rewards/rejected": -2.2215359210968018, + "logits/chosen": -1.938852071762085, + "logits/rejected": -1.8662010431289673, + "logps/chosen": -384.50897216796875, + "logps/rejected": -447.8426818847656, + "loss": 0.5328, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1384373903274536, + "rewards/margins": 0.725753664970398, + "rewards/rejected": -1.8641912937164307, "step": 530 }, { "epoch": 0.28, - "grad_norm": 5.4375, + "grad_norm": 5.625, "learning_rate": 4.508486522728037e-06, - "logits/chosen": 0.9349297285079956, - "logits/rejected": 1.2060667276382446, - "logps/chosen": -431.19757080078125, - "logps/rejected": -456.5882263183594, - "loss": 0.5189, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5400089025497437, - "rewards/margins": 0.7873141169548035, - "rewards/rejected": -2.3273231983184814, + "logits/chosen": -1.7794740200042725, + "logits/rejected": -1.712432861328125, + "logps/chosen": -402.6400146484375, + "logps/rejected": -434.7808532714844, + "loss": 0.5375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2654469013214111, + "rewards/margins": 0.6806719303131104, + "rewards/rejected": -1.9461190700531006, "step": 540 }, { "epoch": 0.29, - "grad_norm": 7.3125, + "grad_norm": 5.28125, "learning_rate": 4.480947103804044e-06, - "logits/chosen": 1.4042693376541138, - "logits/rejected": 1.3253014087677002, - "logps/chosen": -383.0726318359375, - "logps/rejected": -433.75079345703125, - "loss": 0.488, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2447248697280884, - "rewards/margins": 0.7737690806388855, - "rewards/rejected": -2.018493890762329, + "logits/chosen": -1.789461374282837, + "logits/rejected": -1.69761061668396, + "logps/chosen": -392.892578125, + "logps/rejected": -422.740478515625, + "loss": 0.5647, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.173139214515686, + "rewards/margins": 0.6945830583572388, + "rewards/rejected": -1.8677222728729248, "step": 550 }, { "epoch": 0.29, - "grad_norm": 4.75, + "grad_norm": 4.09375, "learning_rate": 4.452746064639239e-06, - "logits/chosen": 1.8145596981048584, - "logits/rejected": 1.7395979166030884, - "logps/chosen": -346.5498962402344, - "logps/rejected": -437.8565979003906, - "loss": 0.5433, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.9916859865188599, - "rewards/margins": 0.6973880529403687, - "rewards/rejected": -1.689074158668518, + "logits/chosen": -1.7166792154312134, + "logits/rejected": -1.6769170761108398, + "logps/chosen": -347.1570129394531, + "logps/rejected": -412.37762451171875, + "loss": 0.5468, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7677599787712097, + "rewards/margins": 0.697329580783844, + "rewards/rejected": -1.4650894403457642, "step": 560 }, { "epoch": 0.3, - "grad_norm": 7.40625, + "grad_norm": 5.15625, "learning_rate": 4.423892824151617e-06, - "logits/chosen": 2.161806344985962, - "logits/rejected": 2.135643243789673, - "logps/chosen": -366.948974609375, - "logps/rejected": -431.3831481933594, - "loss": 0.5404, + "logits/chosen": -1.834933876991272, + "logits/rejected": -1.6764202117919922, + "logps/chosen": -381.7269592285156, + "logps/rejected": -391.5668029785156, + "loss": 0.5411, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.1190184354782104, - "rewards/margins": 0.7804428935050964, - "rewards/rejected": -1.8994615077972412, + "rewards/chosen": -0.7797503471374512, + "rewards/margins": 0.558749794960022, + "rewards/rejected": -1.3385001420974731, "step": 570 }, { "epoch": 0.3, - "grad_norm": 8.375, + "grad_norm": 5.6875, "learning_rate": 4.3943970190891164e-06, - "logits/chosen": 1.9935728311538696, - "logits/rejected": 1.705012321472168, - "logps/chosen": -370.86553955078125, - "logps/rejected": -437.19647216796875, - "loss": 0.5531, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.0637662410736084, - "rewards/margins": 0.7619091272354126, - "rewards/rejected": -1.825675368309021, + "logits/chosen": -1.7892568111419678, + "logits/rejected": -1.662570595741272, + "logps/chosen": -390.51953125, + "logps/rejected": -397.81451416015625, + "loss": 0.5465, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8887346982955933, + "rewards/margins": 0.6098966002464294, + "rewards/rejected": -1.498631477355957, "step": 580 }, { "epoch": 0.31, - "grad_norm": 6.71875, + "grad_norm": 4.90625, "learning_rate": 4.364268500811025e-06, - "logits/chosen": 2.146028995513916, - "logits/rejected": 2.347506046295166, - "logps/chosen": -367.67535400390625, - "logps/rejected": -429.10919189453125, - "loss": 0.498, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3295910358428955, - "rewards/margins": 0.7955694794654846, - "rewards/rejected": -2.1251604557037354, + "logits/chosen": -1.6196047067642212, + "logits/rejected": -1.5555574893951416, + "logps/chosen": -402.47772216796875, + "logps/rejected": -446.379638671875, + "loss": 0.5403, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4109909534454346, + "rewards/margins": 0.6602510213851929, + "rewards/rejected": -2.071241855621338, "step": 590 }, { "epoch": 0.31, - "grad_norm": 6.96875, + "grad_norm": 6.0, "learning_rate": 4.333517331997704e-06, - "logits/chosen": 1.7965142726898193, - "logits/rejected": 1.3090850114822388, - "logps/chosen": -423.20477294921875, - "logps/rejected": -467.678955078125, - "loss": 0.5032, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.322430968284607, - "rewards/margins": 0.8503682017326355, - "rewards/rejected": -2.1727993488311768, + "logits/chosen": -1.6825096607208252, + "logits/rejected": -1.56454336643219, + "logps/chosen": -378.9342041015625, + "logps/rejected": -428.3062438964844, + "loss": 0.5587, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3179118633270264, + "rewards/margins": 0.6010549068450928, + "rewards/rejected": -1.9189666509628296, "step": 600 }, { "epoch": 0.31, - "eval_logits/chosen": 2.7062325477600098, - "eval_logits/rejected": 2.842660427093506, - "eval_logps/chosen": -400.8144836425781, - "eval_logps/rejected": -454.5939025878906, - "eval_loss": 0.5359104871749878, - "eval_rewards/accuracies": 0.7310000061988831, - "eval_rewards/chosen": -1.3550822734832764, - "eval_rewards/margins": 0.778237521648407, - "eval_rewards/rejected": -2.133319854736328, - "eval_runtime": 449.3131, - "eval_samples_per_second": 4.451, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.5609403848648071, + "eval_logits/rejected": -1.4665495157241821, + "eval_logps/chosen": -403.8534240722656, + "eval_logps/rejected": -448.6079406738281, + "eval_loss": 0.5327390432357788, + "eval_rewards/accuracies": 0.7319999933242798, + "eval_rewards/chosen": -1.3215045928955078, + "eval_rewards/margins": 0.6873735189437866, + "eval_rewards/rejected": -2.008878231048584, + "eval_runtime": 205.8124, + "eval_samples_per_second": 9.718, + "eval_steps_per_second": 0.607, "step": 600 }, { "epoch": 0.32, - "grad_norm": 5.09375, + "grad_norm": 5.53125, "learning_rate": 4.302153783289737e-06, - "logits/chosen": 1.6601619720458984, - "logits/rejected": 2.115678310394287, - "logps/chosen": -421.45556640625, - "logps/rejected": -427.3929138183594, - "loss": 0.551, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3307194709777832, - "rewards/margins": 0.6118677258491516, - "rewards/rejected": -1.94258713722229, + "logits/chosen": -1.657099962234497, + "logits/rejected": -1.5476830005645752, + "logps/chosen": -389.5664367675781, + "logps/rejected": -463.8951110839844, + "loss": 0.5259, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1841762065887451, + "rewards/margins": 0.7400022745132446, + "rewards/rejected": -1.9241783618927002, "step": 610 }, { "epoch": 0.32, - "grad_norm": 8.1875, + "grad_norm": 6.53125, "learning_rate": 4.270188329857613e-06, - "logits/chosen": 1.341740608215332, - "logits/rejected": 1.3879600763320923, - "logps/chosen": -374.7286071777344, - "logps/rejected": -436.7850646972656, - "loss": 0.5097, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.9597892761230469, - "rewards/margins": 0.8522161245346069, - "rewards/rejected": -1.8120054006576538, + "logits/chosen": -1.687072515487671, + "logits/rejected": -1.5819660425186157, + "logps/chosen": -392.8175354003906, + "logps/rejected": -414.7140197753906, + "loss": 0.531, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9409040212631226, + "rewards/margins": 0.7219224572181702, + "rewards/rejected": -1.6628265380859375, "step": 620 }, { "epoch": 0.33, - "grad_norm": 8.125, + "grad_norm": 6.53125, "learning_rate": 4.237631647903115e-06, - "logits/chosen": 2.013131856918335, - "logits/rejected": 1.7528884410858154, - "logps/chosen": -368.9488830566406, - "logps/rejected": -439.6729431152344, - "loss": 0.5056, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.0934112071990967, - "rewards/margins": 0.8812692761421204, - "rewards/rejected": -1.9746805429458618, + "logits/chosen": -1.6374574899673462, + "logits/rejected": -1.513041377067566, + "logps/chosen": -367.43792724609375, + "logps/rejected": -422.2757873535156, + "loss": 0.5751, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0590665340423584, + "rewards/margins": 0.6901532411575317, + "rewards/rejected": -1.749219536781311, "step": 630 }, { "epoch": 0.33, - "grad_norm": 5.71875, + "grad_norm": 5.5, "learning_rate": 4.204494611093548e-06, - "logits/chosen": 1.9838985204696655, - "logits/rejected": 2.020540714263916, - "logps/chosen": -377.3255920410156, - "logps/rejected": -431.0025329589844, - "loss": 0.5329, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.2972571849822998, - "rewards/margins": 0.6514188051223755, - "rewards/rejected": -1.9486758708953857, + "logits/chosen": -1.5762876272201538, + "logits/rejected": -1.439675211906433, + "logps/chosen": -353.2894592285156, + "logps/rejected": -415.69384765625, + "loss": 0.5084, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0523182153701782, + "rewards/margins": 0.8923817873001099, + "rewards/rejected": -1.944700002670288, "step": 640 }, { "epoch": 0.34, - "grad_norm": 7.59375, + "grad_norm": 5.75, "learning_rate": 4.170788286930024e-06, - "logits/chosen": 1.998337984085083, - "logits/rejected": 2.040975332260132, - "logps/chosen": -384.91455078125, - "logps/rejected": -467.4063415527344, - "loss": 0.5308, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.4080098867416382, - "rewards/margins": 0.744787335395813, - "rewards/rejected": -2.152797222137451, + "logits/chosen": -1.5430004596710205, + "logits/rejected": -1.5310301780700684, + "logps/chosen": -418.9070739746094, + "logps/rejected": -503.3553771972656, + "loss": 0.5448, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7990005016326904, + "rewards/margins": 0.6734641194343567, + "rewards/rejected": -2.4724647998809814, "step": 650 }, { "epoch": 0.35, - "grad_norm": 5.03125, + "grad_norm": 5.53125, "learning_rate": 4.136523933051005e-06, - "logits/chosen": 2.7154483795166016, - "logits/rejected": 2.629507541656494, - "logps/chosen": -394.7875061035156, - "logps/rejected": -470.86932373046875, - "loss": 0.5028, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.6549485921859741, - "rewards/margins": 0.7828361392021179, - "rewards/rejected": -2.4377849102020264, + "logits/chosen": -1.553636074066162, + "logits/rejected": -1.4528017044067383, + "logps/chosen": -416.9197692871094, + "logps/rejected": -488.38824462890625, + "loss": 0.5151, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6012948751449585, + "rewards/margins": 0.7516334652900696, + "rewards/rejected": -2.352928638458252, "step": 660 }, { "epoch": 0.35, - "grad_norm": 6.40625, + "grad_norm": 5.375, "learning_rate": 4.101712993472348e-06, - "logits/chosen": 2.115504026412964, - "logits/rejected": 2.440556049346924, - "logps/chosen": -407.92193603515625, - "logps/rejected": -464.93341064453125, - "loss": 0.5557, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.5494651794433594, - "rewards/margins": 0.6849027872085571, - "rewards/rejected": -2.234367847442627, + "logits/chosen": -1.5965683460235596, + "logits/rejected": -1.5354357957839966, + "logps/chosen": -373.5552978515625, + "logps/rejected": -417.95806884765625, + "loss": 0.5543, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0864415168762207, + "rewards/margins": 0.6252428293228149, + "rewards/rejected": -1.711684226989746, "step": 670 }, { "epoch": 0.36, - "grad_norm": 7.0625, + "grad_norm": 7.46875, "learning_rate": 4.066367094765091e-06, - "logits/chosen": 1.77793288230896, - "logits/rejected": 1.8030977249145508, - "logps/chosen": -431.06451416015625, - "logps/rejected": -489.83087158203125, - "loss": 0.4919, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.1827547550201416, - "rewards/margins": 0.8239189386367798, - "rewards/rejected": -2.006673812866211, + "logits/chosen": -1.5761507749557495, + "logits/rejected": -1.4767498970031738, + "logps/chosen": -377.71893310546875, + "logps/rejected": -418.2323303222656, + "loss": 0.536, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.113816261291504, + "rewards/margins": 0.6647173166275024, + "rewards/rejected": -1.7785335779190063, "step": 680 }, { "epoch": 0.36, - "grad_norm": 5.4375, + "grad_norm": 4.9375, "learning_rate": 4.030498042172277e-06, - "logits/chosen": 1.8615095615386963, - "logits/rejected": 2.4596309661865234, - "logps/chosen": -379.59381103515625, - "logps/rejected": -427.30242919921875, - "loss": 0.5394, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1986615657806396, - "rewards/margins": 0.7092850208282471, - "rewards/rejected": -1.9079463481903076, + "logits/chosen": -1.692792296409607, + "logits/rejected": -1.6168203353881836, + "logps/chosen": -386.3111267089844, + "logps/rejected": -467.1307678222656, + "loss": 0.5083, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1883788108825684, + "rewards/margins": 0.8070133328437805, + "rewards/rejected": -1.9953922033309937, "step": 690 }, { "epoch": 0.37, - "grad_norm": 6.5, + "grad_norm": 8.625, "learning_rate": 3.994117815666095e-06, - "logits/chosen": 1.8667644262313843, - "logits/rejected": 2.414881944656372, - "logps/chosen": -415.29058837890625, - "logps/rejected": -435.1332092285156, - "loss": 0.5741, + "logits/chosen": -1.5926947593688965, + "logits/rejected": -1.482301950454712, + "logps/chosen": -407.1866455078125, + "logps/rejected": -477.66632080078125, + "loss": 0.5167, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3635553121566772, - "rewards/margins": 0.7062723636627197, - "rewards/rejected": -2.0698277950286865, + "rewards/chosen": -1.3628543615341187, + "rewards/margins": 0.8543855547904968, + "rewards/rejected": -2.21724009513855, "step": 700 }, { "epoch": 0.37, - "eval_logits/chosen": 2.9904465675354004, - "eval_logits/rejected": 3.1370763778686523, - "eval_logps/chosen": -394.3450622558594, - "eval_logps/rejected": -445.326904296875, - "eval_loss": 0.531741201877594, - "eval_rewards/accuracies": 0.7260000109672546, - "eval_rewards/chosen": -1.2903878688812256, - "eval_rewards/margins": 0.7502626776695251, - "eval_rewards/rejected": -2.0406503677368164, - "eval_runtime": 449.2183, - "eval_samples_per_second": 4.452, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.4903253316879272, + "eval_logits/rejected": -1.3917725086212158, + "eval_logps/chosen": -399.6684265136719, + "eval_logps/rejected": -467.6413269042969, + "eval_loss": 0.5299228429794312, + "eval_rewards/accuracies": 0.7229999899864197, + "eval_rewards/chosen": -1.2796550989151, + "eval_rewards/margins": 0.9195566177368164, + "eval_rewards/rejected": -2.199211597442627, + "eval_runtime": 205.8617, + "eval_samples_per_second": 9.715, + "eval_steps_per_second": 0.607, "step": 700 }, { "epoch": 0.37, - "grad_norm": 6.4375, + "grad_norm": 7.125, "learning_rate": 3.957238565946672e-06, - "logits/chosen": 2.365293502807617, - "logits/rejected": 2.3199546337127686, - "logps/chosen": -411.216552734375, - "logps/rejected": -493.239501953125, - "loss": 0.514, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.5279957056045532, - "rewards/margins": 0.8807098269462585, - "rewards/rejected": -2.408705472946167, + "logits/chosen": -1.5940111875534058, + "logits/rejected": -1.4753977060317993, + "logps/chosen": -437.0880432128906, + "logps/rejected": -492.3658752441406, + "loss": 0.4612, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.347034215927124, + "rewards/margins": 0.9986494183540344, + "rewards/rejected": -2.345684051513672, "step": 710 }, { "epoch": 0.38, - "grad_norm": 5.15625, + "grad_norm": 6.96875, "learning_rate": 3.919872610383831e-06, - "logits/chosen": 2.7599196434020996, - "logits/rejected": 2.716507911682129, - "logps/chosen": -426.38134765625, - "logps/rejected": -493.5303649902344, - "loss": 0.5325, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.8960431814193726, - "rewards/margins": 0.8603929281234741, - "rewards/rejected": -2.756436347961426, + "logits/chosen": -1.4148577451705933, + "logits/rejected": -1.3983594179153442, + "logps/chosen": -392.7431640625, + "logps/rejected": -485.83819580078125, + "loss": 0.4803, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4197003841400146, + "rewards/margins": 1.0409051179885864, + "rewards/rejected": -2.4606053829193115, "step": 720 }, { "epoch": 0.38, - "grad_norm": 7.71875, + "grad_norm": 7.0, "learning_rate": 3.882032428903195e-06, - "logits/chosen": 2.7069733142852783, - "logits/rejected": 2.9215176105499268, - "logps/chosen": -451.400390625, - "logps/rejected": -510.039306640625, - "loss": 0.5345, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.9560165405273438, - "rewards/margins": 0.7693789005279541, - "rewards/rejected": -2.7253952026367188, + "logits/chosen": -1.5557122230529785, + "logits/rejected": -1.5083773136138916, + "logps/chosen": -366.6706848144531, + "logps/rejected": -449.88397216796875, + "loss": 0.5389, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3032705783843994, + "rewards/margins": 0.8904848098754883, + "rewards/rejected": -2.1937553882598877, "step": 730 }, { "epoch": 0.39, - "grad_norm": 7.28125, + "grad_norm": 4.8125, "learning_rate": 3.84373065981799e-06, - "logits/chosen": 2.783442258834839, - "logits/rejected": 3.478522777557373, - "logps/chosen": -454.78399658203125, - "logps/rejected": -508.5516052246094, - "loss": 0.5135, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.5752308368682861, - "rewards/margins": 0.9094041585922241, - "rewards/rejected": -2.4846348762512207, + "logits/chosen": -1.607391595840454, + "logits/rejected": -1.5484809875488281, + "logps/chosen": -386.6382751464844, + "logps/rejected": -422.96148681640625, + "loss": 0.4946, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2972227334976196, + "rewards/margins": 0.7347890734672546, + "rewards/rejected": -2.0320117473602295, "step": 740 }, { "epoch": 0.39, - "grad_norm": 4.40625, + "grad_norm": 9.625, "learning_rate": 3.8049800956079552e-06, - "logits/chosen": 2.476569890975952, - "logits/rejected": 2.471484661102295, - "logps/chosen": -381.59808349609375, - "logps/rejected": -432.3984375, - "loss": 0.5322, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.1650186777114868, - "rewards/margins": 0.6638428568840027, - "rewards/rejected": -1.8288615942001343, + "logits/chosen": -1.5829339027404785, + "logits/rejected": -1.4720735549926758, + "logps/chosen": -412.76531982421875, + "logps/rejected": -479.85272216796875, + "loss": 0.4811, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.502972960472107, + "rewards/margins": 0.8380988240242004, + "rewards/rejected": -2.341071605682373, "step": 750 }, { "epoch": 0.4, - "grad_norm": 7.15625, + "grad_norm": 8.375, "learning_rate": 3.765793678646753e-06, - "logits/chosen": 2.8157763481140137, - "logits/rejected": 2.5621135234832764, - "logps/chosen": -388.00347900390625, - "logps/rejected": -389.6156005859375, - "loss": 0.5869, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2750461101531982, - "rewards/margins": 0.4696625769138336, - "rewards/rejected": -1.7447086572647095, + "logits/chosen": -1.4679361581802368, + "logits/rejected": -1.4195685386657715, + "logps/chosen": -412.76763916015625, + "logps/rejected": -477.0049743652344, + "loss": 0.5676, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7044397592544556, + "rewards/margins": 0.8208500742912292, + "rewards/rejected": -2.525290012359619, "step": 760 }, { "epoch": 0.4, - "grad_norm": 5.46875, + "grad_norm": 6.125, "learning_rate": 3.726184496879323e-06, - "logits/chosen": 2.844412088394165, - "logits/rejected": 3.3043785095214844, - "logps/chosen": -435.10986328125, - "logps/rejected": -469.0890197753906, - "loss": 0.5539, + "logits/chosen": -1.4565179347991943, + "logits/rejected": -1.4047472476959229, + "logps/chosen": -431.69512939453125, + "logps/rejected": -482.6431579589844, + "loss": 0.5755, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.465742588043213, - "rewards/margins": 0.7117661237716675, - "rewards/rejected": -2.17750883102417, + "rewards/chosen": -1.7018753290176392, + "rewards/margins": 0.5642359852790833, + "rewards/rejected": -2.2661116123199463, "step": 770 }, { "epoch": 0.41, - "grad_norm": 5.625, + "grad_norm": 5.53125, "learning_rate": 3.686165779450619e-06, - "logits/chosen": 3.148547887802124, - "logits/rejected": 3.126634120941162, - "logps/chosen": -391.44647216796875, - "logps/rejected": -427.7322692871094, - "loss": 0.5171, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3907300233840942, - "rewards/margins": 0.693662703037262, - "rewards/rejected": -2.084392786026001, + "logits/chosen": -1.5907853841781616, + "logits/rejected": -1.5108683109283447, + "logps/chosen": -422.84735107421875, + "logps/rejected": -478.88580322265625, + "loss": 0.4939, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5510692596435547, + "rewards/margins": 0.7624626159667969, + "rewards/rejected": -2.3135318756103516, "step": 780 }, { "epoch": 0.41, - "grad_norm": 8.5, + "grad_norm": 4.8125, "learning_rate": 3.645750892287178e-06, - "logits/chosen": 3.375934600830078, - "logits/rejected": 2.9477076530456543, - "logps/chosen": -400.77655029296875, - "logps/rejected": -481.9225158691406, - "loss": 0.5271, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5433567762374878, - "rewards/margins": 0.8481906056404114, - "rewards/rejected": -2.391547441482544, + "logits/chosen": -1.51558518409729, + "logits/rejected": -1.4765126705169678, + "logps/chosen": -420.275634765625, + "logps/rejected": -512.0084838867188, + "loss": 0.5178, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6825615167617798, + "rewards/margins": 0.7326836585998535, + "rewards/rejected": -2.4152450561523438, "step": 790 }, { "epoch": 0.42, - "grad_norm": 6.46875, + "grad_norm": 5.59375, "learning_rate": 3.604953333633009e-06, - "logits/chosen": 3.302586317062378, - "logits/rejected": 2.6571342945098877, - "logps/chosen": -453.5899353027344, - "logps/rejected": -497.45440673828125, - "loss": 0.5318, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.8011194467544556, - "rewards/margins": 0.7497151494026184, - "rewards/rejected": -2.5508346557617188, + "logits/chosen": -1.5319992303848267, + "logits/rejected": -1.4801173210144043, + "logps/chosen": -434.16082763671875, + "logps/rejected": -493.60308837890625, + "loss": 0.5465, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7253410816192627, + "rewards/margins": 0.7062736749649048, + "rewards/rejected": -2.431614398956299, "step": 800 }, { "epoch": 0.42, - "eval_logits/chosen": 3.5383141040802, - "eval_logits/rejected": 3.714007616043091, - "eval_logps/chosen": -425.8876647949219, - "eval_logps/rejected": -488.1441650390625, - "eval_loss": 0.5149358510971069, - "eval_rewards/accuracies": 0.7450000047683716, - "eval_rewards/chosen": -1.6058142185211182, - "eval_rewards/margins": 0.8630084991455078, - "eval_rewards/rejected": -2.468822717666626, - "eval_runtime": 450.9184, - "eval_samples_per_second": 4.435, - "eval_steps_per_second": 0.277, + "eval_logits/chosen": -1.4641598463058472, + "eval_logits/rejected": -1.3684852123260498, + "eval_logps/chosen": -438.16168212890625, + "eval_logps/rejected": -494.5843505859375, + "eval_loss": 0.5188941359519958, + "eval_rewards/accuracies": 0.7200000286102295, + "eval_rewards/chosen": -1.664587140083313, + "eval_rewards/margins": 0.8040550947189331, + "eval_rewards/rejected": -2.468642234802246, + "eval_runtime": 205.9278, + "eval_samples_per_second": 9.712, + "eval_steps_per_second": 0.607, "step": 800 }, { "epoch": 0.42, - "grad_norm": 6.5625, + "grad_norm": 7.59375, "learning_rate": 3.56378672954129e-06, - "logits/chosen": 2.6567466259002686, - "logits/rejected": 2.880585193634033, - "logps/chosen": -404.9405822753906, - "logps/rejected": -479.71783447265625, - "loss": 0.4768, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.5296947956085205, - "rewards/margins": 0.8995970487594604, - "rewards/rejected": -2.4292919635772705, + "logits/chosen": -1.5462032556533813, + "logits/rejected": -1.4628247022628784, + "logps/chosen": -409.8141174316406, + "logps/rejected": -486.94940185546875, + "loss": 0.5401, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5370090007781982, + "rewards/margins": 0.756435751914978, + "rewards/rejected": -2.293444871902466, "step": 810 }, { "epoch": 0.43, - "grad_norm": 11.625, + "grad_norm": 6.34375, "learning_rate": 3.5222648293233806e-06, - "logits/chosen": 2.2279410362243652, - "logits/rejected": 2.345160961151123, - "logps/chosen": -459.04241943359375, - "logps/rejected": -514.1052856445312, - "loss": 0.5446, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.7574459314346313, - "rewards/margins": 0.675226092338562, - "rewards/rejected": -2.4326720237731934, + "logits/chosen": -1.492166519165039, + "logits/rejected": -1.4682246446609497, + "logps/chosen": -375.8512268066406, + "logps/rejected": -446.55914306640625, + "loss": 0.5386, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.162585973739624, + "rewards/margins": 0.9081439971923828, + "rewards/rejected": -2.070729970932007, "step": 820 }, { "epoch": 0.43, - "grad_norm": 7.03125, + "grad_norm": 7.25, "learning_rate": 3.4804015009566573e-06, - "logits/chosen": 1.985507607460022, - "logits/rejected": 2.084001302719116, - "logps/chosen": -481.2820739746094, - "logps/rejected": -523.9012451171875, - "loss": 0.5162, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.8201816082000732, - "rewards/margins": 0.7969433069229126, - "rewards/rejected": -2.6171250343322754, + "logits/chosen": -1.5281693935394287, + "logits/rejected": -1.4133872985839844, + "logps/chosen": -343.50860595703125, + "logps/rejected": -365.1877746582031, + "loss": 0.5756, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.016842246055603, + "rewards/margins": 0.5352990031242371, + "rewards/rejected": -1.5521411895751953, "step": 830 }, { "epoch": 0.44, - "grad_norm": 5.1875, + "grad_norm": 7.90625, "learning_rate": 3.4382107264527244e-06, - "logits/chosen": 2.884337902069092, - "logits/rejected": 2.2890899181365967, - "logps/chosen": -427.71044921875, - "logps/rejected": -492.051513671875, - "loss": 0.5213, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.8584165573120117, - "rewards/margins": 0.7980149984359741, - "rewards/rejected": -2.6564316749572754, + "logits/chosen": -1.5180078744888306, + "logits/rejected": -1.351276159286499, + "logps/chosen": -419.5958557128906, + "logps/rejected": -453.79693603515625, + "loss": 0.4943, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1950498819351196, + "rewards/margins": 0.8461320996284485, + "rewards/rejected": -2.041182041168213, "step": 840 }, { "epoch": 0.44, - "grad_norm": 5.9375, + "grad_norm": 7.0, "learning_rate": 3.3957065971875387e-06, - "logits/chosen": 2.7502715587615967, - "logits/rejected": 3.0479960441589355, - "logps/chosen": -470.54010009765625, - "logps/rejected": -530.3944091796875, - "loss": 0.5264, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.191692352294922, - "rewards/margins": 0.729444146156311, - "rewards/rejected": -2.9211363792419434, + "logits/chosen": -1.488877534866333, + "logits/rejected": -1.3683526515960693, + "logps/chosen": -446.36004638671875, + "logps/rejected": -501.3478088378906, + "loss": 0.5352, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.955243468284607, + "rewards/margins": 0.8272415399551392, + "rewards/rejected": -2.782485246658325, "step": 850 }, { "epoch": 0.45, - "grad_norm": 4.875, + "grad_norm": 5.59375, "learning_rate": 3.352903309194999e-06, - "logits/chosen": 2.7881550788879395, - "logits/rejected": 2.5179460048675537, - "logps/chosen": -482.68731689453125, - "logps/rejected": -551.6927490234375, - "loss": 0.4913, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.1960439682006836, - "rewards/margins": 0.8961966633796692, - "rewards/rejected": -3.092240810394287, + "logits/chosen": -1.5099315643310547, + "logits/rejected": -1.4115077257156372, + "logps/chosen": -438.7635803222656, + "logps/rejected": -531.4382934570312, + "loss": 0.5104, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.076658248901367, + "rewards/margins": 0.8972193598747253, + "rewards/rejected": -2.9738779067993164, "step": 860 }, { "epoch": 0.46, - "grad_norm": 8.25, + "grad_norm": 6.21875, "learning_rate": 3.309815158425591e-06, - "logits/chosen": 2.5673789978027344, - "logits/rejected": 2.7013182640075684, - "logps/chosen": -454.56280517578125, - "logps/rejected": -510.7538146972656, - "loss": 0.4807, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.938982367515564, - "rewards/margins": 0.9660106897354126, - "rewards/rejected": -2.9049932956695557, + "logits/chosen": -1.3534867763519287, + "logits/rejected": -1.33297598361969, + "logps/chosen": -432.561279296875, + "logps/rejected": -499.51153564453125, + "loss": 0.5148, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.744372010231018, + "rewards/margins": 0.8061767816543579, + "rewards/rejected": -2.550548791885376, "step": 870 }, { "epoch": 0.46, - "grad_norm": 12.6875, + "grad_norm": 8.25, "learning_rate": 3.266456535971654e-06, - "logits/chosen": 2.3261733055114746, - "logits/rejected": 2.8744044303894043, - "logps/chosen": -450.94287109375, - "logps/rejected": -544.7605590820312, - "loss": 0.4976, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.6397998332977295, - "rewards/margins": 1.0769449472427368, - "rewards/rejected": -2.716744899749756, + "logits/chosen": -1.5983731746673584, + "logits/rejected": -1.5390180349349976, + "logps/chosen": -411.49652099609375, + "logps/rejected": -469.626708984375, + "loss": 0.5347, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.632322907447815, + "rewards/margins": 0.7145469784736633, + "rewards/rejected": -2.346869945526123, "step": 880 }, { "epoch": 0.47, - "grad_norm": 7.84375, + "grad_norm": 5.15625, "learning_rate": 3.2228419232608692e-06, - "logits/chosen": 3.20257568359375, - "logits/rejected": 3.1647896766662598, - "logps/chosen": -459.48199462890625, - "logps/rejected": -522.6142578125, - "loss": 0.5218, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.9324430227279663, - "rewards/margins": 0.9745299220085144, - "rewards/rejected": -2.906972885131836, + "logits/chosen": -1.6237919330596924, + "logits/rejected": -1.5474942922592163, + "logps/chosen": -407.1629943847656, + "logps/rejected": -489.897705078125, + "loss": 0.502, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5020484924316406, + "rewards/margins": 0.8913022875785828, + "rewards/rejected": -2.393350839614868, "step": 890 }, { "epoch": 0.47, - "grad_norm": 7.125, + "grad_norm": 6.3125, "learning_rate": 3.1789858872195888e-06, - "logits/chosen": 2.845961809158325, - "logits/rejected": 3.169747829437256, - "logps/chosen": -498.2582092285156, - "logps/rejected": -551.0, - "loss": 0.5353, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.3102707862854004, - "rewards/margins": 0.9176058769226074, - "rewards/rejected": -3.227876663208008, + "logits/chosen": -1.4964396953582764, + "logits/rejected": -1.4070580005645752, + "logps/chosen": -410.7691345214844, + "logps/rejected": -528.3038330078125, + "loss": 0.5002, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6621105670928955, + "rewards/margins": 1.0146641731262207, + "rewards/rejected": -2.676774501800537, "step": 900 }, { "epoch": 0.47, - "eval_logits/chosen": 4.20645809173584, - "eval_logits/rejected": 4.417923450469971, - "eval_logps/chosen": -522.4096069335938, - "eval_logps/rejected": -595.375244140625, - "eval_loss": 0.5125031471252441, - "eval_rewards/accuracies": 0.7459999918937683, - "eval_rewards/chosen": -2.571033239364624, - "eval_rewards/margins": 0.9701002240180969, - "eval_rewards/rejected": -3.541133403778076, - "eval_runtime": 449.2414, - "eval_samples_per_second": 4.452, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.505366325378418, + "eval_logits/rejected": -1.4178627729415894, + "eval_logps/chosen": -450.1383056640625, + "eval_logps/rejected": -519.8884887695312, + "eval_loss": 0.5142305493354797, + "eval_rewards/accuracies": 0.7289999723434448, + "eval_rewards/chosen": -1.784353494644165, + "eval_rewards/margins": 0.9373300075531006, + "eval_rewards/rejected": -2.7216835021972656, + "eval_runtime": 205.8963, + "eval_samples_per_second": 9.714, + "eval_steps_per_second": 0.607, "step": 900 }, { "epoch": 0.48, - "grad_norm": 5.6875, + "grad_norm": 7.90625, "learning_rate": 3.1349030754075945e-06, - "logits/chosen": 3.1661219596862793, - "logits/rejected": 3.1258158683776855, - "logps/chosen": -524.2811889648438, - "logps/rejected": -597.2448120117188, - "loss": 0.5033, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.592829704284668, - "rewards/margins": 0.9196772575378418, - "rewards/rejected": -3.512507200241089, + "logits/chosen": -1.473548173904419, + "logits/rejected": -1.4354422092437744, + "logps/chosen": -455.54534912109375, + "logps/rejected": -518.474609375, + "loss": 0.5482, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.9718334674835205, + "rewards/margins": 0.7413331270217896, + "rewards/rejected": -2.7131667137145996, "step": 910 }, { "epoch": 0.48, - "grad_norm": 5.875, + "grad_norm": 5.78125, "learning_rate": 3.0906082111259313e-06, - "logits/chosen": 2.6309409141540527, - "logits/rejected": 2.872328758239746, - "logps/chosen": -511.0491638183594, - "logps/rejected": -565.0892333984375, - "loss": 0.4732, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.4000844955444336, - "rewards/margins": 1.0670232772827148, - "rewards/rejected": -3.4671077728271484, + "logits/chosen": -1.4893120527267456, + "logits/rejected": -1.4045579433441162, + "logps/chosen": -469.7232360839844, + "logps/rejected": -551.2777099609375, + "loss": 0.5324, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.048574209213257, + "rewards/margins": 1.0462497472763062, + "rewards/rejected": -3.0948235988616943, "step": 920 }, { "epoch": 0.49, - "grad_norm": 6.59375, + "grad_norm": 13.0, "learning_rate": 3.046116088499449e-06, - "logits/chosen": 2.71614408493042, - "logits/rejected": 2.5246307849884033, - "logps/chosen": -510.1669921875, - "logps/rejected": -567.8058471679688, - "loss": 0.5296, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.43577241897583, - "rewards/margins": 0.7605811953544617, - "rewards/rejected": -3.1963534355163574, + "logits/chosen": -1.518959403038025, + "logits/rejected": -1.447871208190918, + "logps/chosen": -473.65850830078125, + "logps/rejected": -532.4075927734375, + "loss": 0.514, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.0019118785858154, + "rewards/margins": 0.7329636216163635, + "rewards/rejected": -2.734875202178955, "step": 930 }, { "epoch": 0.49, - "grad_norm": 6.9375, + "grad_norm": 6.375, "learning_rate": 3.0014415675356813e-06, - "logits/chosen": 2.3216631412506104, - "logits/rejected": 2.393738031387329, - "logps/chosen": -490.48681640625, - "logps/rejected": -565.7747802734375, - "loss": 0.4591, + "logits/chosen": -1.5328662395477295, + "logits/rejected": -1.4063748121261597, + "logps/chosen": -478.6943359375, + "logps/rejected": -537.5908813476562, + "loss": 0.4481, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.194924831390381, - "rewards/margins": 1.1211020946502686, - "rewards/rejected": -3.3160271644592285, + "rewards/chosen": -1.9542016983032227, + "rewards/margins": 0.9170918464660645, + "rewards/rejected": -2.871293544769287, "step": 940 }, { "epoch": 0.5, - "grad_norm": 5.28125, + "grad_norm": 7.71875, "learning_rate": 2.9565995691617242e-06, - "logits/chosen": 2.1706225872039795, - "logits/rejected": 2.3366177082061768, - "logps/chosen": -508.671630859375, - "logps/rejected": -567.91552734375, - "loss": 0.5043, + "logits/chosen": -1.4631025791168213, + "logits/rejected": -1.3391731977462769, + "logps/chosen": -521.5445556640625, + "logps/rejected": -596.1942138671875, + "loss": 0.4863, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.3915352821350098, - "rewards/margins": 0.8179728388786316, - "rewards/rejected": -3.209507703781128, + "rewards/chosen": -2.2124335765838623, + "rewards/margins": 1.047732949256897, + "rewards/rejected": -3.260166645050049, "step": 950 }, { "epoch": 0.5, - "grad_norm": 6.625, + "grad_norm": 5.46875, "learning_rate": 2.9116050702407706e-06, - "logits/chosen": 2.606666088104248, - "logits/rejected": 2.1300926208496094, - "logps/chosen": -512.1990966796875, - "logps/rejected": -589.0401611328125, - "loss": 0.5121, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.3908963203430176, - "rewards/margins": 0.9071556925773621, - "rewards/rejected": -3.2980518341064453, + "logits/chosen": -1.5032353401184082, + "logits/rejected": -1.4071391820907593, + "logps/chosen": -507.3263244628906, + "logps/rejected": -564.1354370117188, + "loss": 0.5064, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2760252952575684, + "rewards/margins": 0.9310242533683777, + "rewards/rejected": -3.20704984664917, "step": 960 }, { "epoch": 0.51, - "grad_norm": 5.8125, + "grad_norm": 5.96875, "learning_rate": 2.8664730985699537e-06, - "logits/chosen": 2.07898211479187, - "logits/rejected": 2.355185031890869, - "logps/chosen": -526.248046875, - "logps/rejected": -580.1257934570312, - "loss": 0.4977, + "logits/chosen": -1.5140959024429321, + "logits/rejected": -1.4394776821136475, + "logps/chosen": -478.9674377441406, + "logps/rejected": -564.43212890625, + "loss": 0.5034, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.5468173027038574, - "rewards/margins": 0.7905376553535461, - "rewards/rejected": -3.337355136871338, + "rewards/chosen": -2.1803536415100098, + "rewards/margins": 0.8669061660766602, + "rewards/rejected": -3.04725980758667, "step": 970 }, { "epoch": 0.51, - "grad_norm": 7.65625, + "grad_norm": 4.15625, "learning_rate": 2.8212187278611907e-06, - "logits/chosen": 1.8555338382720947, - "logits/rejected": 1.950961709022522, - "logps/chosen": -461.7571716308594, - "logps/rejected": -560.3824462890625, - "loss": 0.5131, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.2809879779815674, - "rewards/margins": 0.9975978136062622, - "rewards/rejected": -3.278585910797119, + "logits/chosen": -1.5928372144699097, + "logits/rejected": -1.4567886590957642, + "logps/chosen": -487.82489013671875, + "logps/rejected": -544.0657348632812, + "loss": 0.4944, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9777450561523438, + "rewards/margins": 0.938134491443634, + "rewards/rejected": -2.915879726409912, "step": 980 }, { "epoch": 0.52, - "grad_norm": 5.1875, + "grad_norm": 7.125, "learning_rate": 2.7758570727066843e-06, - "logits/chosen": 2.0223584175109863, - "logits/rejected": 1.8026624917984009, - "logps/chosen": -522.4490966796875, - "logps/rejected": -621.7756958007812, - "loss": 0.4985, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.564455509185791, - "rewards/margins": 1.1646798849105835, - "rewards/rejected": -3.729135036468506, + "logits/chosen": -1.4225056171417236, + "logits/rejected": -1.3867155313491821, + "logps/chosen": -464.3641052246094, + "logps/rejected": -549.7391357421875, + "loss": 0.5201, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.0725159645080566, + "rewards/margins": 1.0133941173553467, + "rewards/rejected": -3.0859100818634033, "step": 990 }, { "epoch": 0.52, - "grad_norm": 8.0625, + "grad_norm": 7.34375, "learning_rate": 2.730403283530767e-06, - "logits/chosen": 2.3971285820007324, - "logits/rejected": 2.2188148498535156, - "logps/chosen": -491.7401428222656, - "logps/rejected": -557.7470703125, - "loss": 0.574, - "rewards/accuracies": 0.625, - "rewards/chosen": -2.730563163757324, - "rewards/margins": 0.6435677409172058, - "rewards/rejected": -3.374130964279175, + "logits/chosen": -1.4609169960021973, + "logits/rejected": -1.3873465061187744, + "logps/chosen": -514.2507934570312, + "logps/rejected": -596.9881591796875, + "loss": 0.5017, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.514453887939453, + "rewards/margins": 0.9918726682662964, + "rewards/rejected": -3.506326198577881, "step": 1000 }, { "epoch": 0.52, - "eval_logits/chosen": 2.4408442974090576, - "eval_logits/rejected": 2.651709794998169, - "eval_logps/chosen": -527.5897827148438, - "eval_logps/rejected": -608.1039428710938, - "eval_loss": 0.503455638885498, - "eval_rewards/accuracies": 0.7369999885559082, - "eval_rewards/chosen": -2.622835874557495, - "eval_rewards/margins": 1.0455843210220337, - "eval_rewards/rejected": -3.6684203147888184, - "eval_runtime": 449.3106, - "eval_samples_per_second": 4.451, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.3947843313217163, + "eval_logits/rejected": -1.2972902059555054, + "eval_logps/chosen": -533.4492797851562, + "eval_logps/rejected": -608.9218139648438, + "eval_loss": 0.5058408379554749, + "eval_rewards/accuracies": 0.7360000014305115, + "eval_rewards/chosen": -2.6174628734588623, + "eval_rewards/margins": 0.9945541024208069, + "eval_rewards/rejected": -3.6120169162750244, + "eval_runtime": 205.8968, + "eval_samples_per_second": 9.714, + "eval_steps_per_second": 0.607, "step": 1000 }, { "epoch": 0.53, - "grad_norm": 6.40625, + "grad_norm": 6.0, "learning_rate": 2.6848725415297888e-06, - "logits/chosen": 1.7300176620483398, - "logits/rejected": 2.1867687702178955, - "logps/chosen": -528.530517578125, - "logps/rejected": -599.1966552734375, - "loss": 0.4677, + "logits/chosen": -1.4565789699554443, + "logits/rejected": -1.4135055541992188, + "logps/chosen": -489.64874267578125, + "logps/rejected": -567.3189697265625, + "loss": 0.4895, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.534254550933838, - "rewards/margins": 0.9947689771652222, - "rewards/rejected": -3.5290236473083496, + "rewards/chosen": -2.4934046268463135, + "rewards/margins": 0.8419889211654663, + "rewards/rejected": -3.3353939056396484, "step": 1010 }, { "epoch": 0.53, - "grad_norm": 4.75, + "grad_norm": 7.375, "learning_rate": 2.639280053601719e-06, - "logits/chosen": 1.3660762310028076, - "logits/rejected": 1.5705347061157227, - "logps/chosen": -476.1238708496094, - "logps/rejected": -563.6986083984375, - "loss": 0.4872, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.1599488258361816, - "rewards/margins": 1.0557085275650024, - "rewards/rejected": -3.2156574726104736, + "logits/chosen": -1.573761224746704, + "logits/rejected": -1.4989092350006104, + "logps/chosen": -512.4972534179688, + "logps/rejected": -576.3961181640625, + "loss": 0.4665, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.162745952606201, + "rewards/margins": 1.0285950899124146, + "rewards/rejected": -3.191340923309326, "step": 1020 }, { "epoch": 0.54, - "grad_norm": 4.6875, + "grad_norm": 6.15625, "learning_rate": 2.59364104726716e-06, - "logits/chosen": 1.2051117420196533, - "logits/rejected": 1.490389347076416, - "logps/chosen": -445.2794494628906, - "logps/rejected": -525.7783813476562, - "loss": 0.4768, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.794106125831604, - "rewards/margins": 1.1164175271987915, - "rewards/rejected": -2.9105236530303955, + "logits/chosen": -1.484546184539795, + "logits/rejected": -1.3231149911880493, + "logps/chosen": -528.154541015625, + "logps/rejected": -576.6033325195312, + "loss": 0.512, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.636870861053467, + "rewards/margins": 0.9289640188217163, + "rewards/rejected": -3.5658352375030518, "step": 1030 }, { "epoch": 0.54, - "grad_norm": 6.0, + "grad_norm": 4.4375, "learning_rate": 2.547970765583491e-06, - "logits/chosen": 1.5332322120666504, - "logits/rejected": 1.4781954288482666, - "logps/chosen": -453.91705322265625, - "logps/rejected": -533.1343994140625, - "loss": 0.528, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.072399854660034, - "rewards/margins": 1.0115078687667847, - "rewards/rejected": -3.0839076042175293, + "logits/chosen": -1.4552520513534546, + "logits/rejected": -1.4171597957611084, + "logps/chosen": -519.861328125, + "logps/rejected": -591.8118896484375, + "loss": 0.5328, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.421581745147705, + "rewards/margins": 0.9120811223983765, + "rewards/rejected": -3.33366322517395, "step": 1040 }, { "epoch": 0.55, - "grad_norm": 6.34375, + "grad_norm": 7.3125, "learning_rate": 2.502284462053799e-06, - "logits/chosen": 1.4764400720596313, - "logits/rejected": 1.6051571369171143, - "logps/chosen": -488.93017578125, - "logps/rejected": -568.1224365234375, - "loss": 0.5088, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.2974724769592285, - "rewards/margins": 0.9695172309875488, - "rewards/rejected": -3.2669894695281982, + "logits/chosen": -1.4553632736206055, + "logits/rejected": -1.3192330598831177, + "logps/chosen": -525.8233642578125, + "logps/rejected": -599.904296875, + "loss": 0.4895, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.279545307159424, + "rewards/margins": 1.0983493328094482, + "rewards/rejected": -3.377894639968872, "step": 1050 }, { "epoch": 0.55, - "grad_norm": 6.0625, + "grad_norm": 7.3125, "learning_rate": 2.456597395532338e-06, - "logits/chosen": 1.6312462091445923, - "logits/rejected": 1.5368585586547852, - "logps/chosen": -527.2152099609375, - "logps/rejected": -628.8519897460938, - "loss": 0.459, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.6452157497406006, - "rewards/margins": 1.1508594751358032, - "rewards/rejected": -3.7960751056671143, + "logits/chosen": -1.4849998950958252, + "logits/rejected": -1.4006808996200562, + "logps/chosen": -488.8839416503906, + "logps/rejected": -538.5235595703125, + "loss": 0.499, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3017091751098633, + "rewards/margins": 0.8732277750968933, + "rewards/rejected": -3.1749370098114014, "step": 1060 }, { "epoch": 0.56, - "grad_norm": 8.125, + "grad_norm": 6.375, "learning_rate": 2.4109248251281953e-06, - "logits/chosen": 1.7708513736724854, - "logits/rejected": 2.067249298095703, - "logps/chosen": -618.783203125, - "logps/rejected": -723.3242797851562, - "loss": 0.4951, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -3.3241093158721924, - "rewards/margins": 1.1731927394866943, - "rewards/rejected": -4.497302055358887, + "logits/chosen": -1.5076830387115479, + "logits/rejected": -1.427677869796753, + "logps/chosen": -512.9411010742188, + "logps/rejected": -549.7650146484375, + "loss": 0.5241, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.252761125564575, + "rewards/margins": 0.8173268437385559, + "rewards/rejected": -3.0700876712799072, "step": 1070 }, { "epoch": 0.57, - "grad_norm": 7.34375, + "grad_norm": 6.03125, "learning_rate": 2.365282005108875e-06, - "logits/chosen": 2.138941526412964, - "logits/rejected": 2.0456833839416504, - "logps/chosen": -570.5878295898438, - "logps/rejected": -641.3853759765625, - "loss": 0.5073, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -3.1022305488586426, - "rewards/margins": 0.8927729725837708, - "rewards/rejected": -3.9950034618377686, + "logits/chosen": -1.5420407056808472, + "logits/rejected": -1.4118328094482422, + "logps/chosen": -469.3882751464844, + "logps/rejected": -503.13726806640625, + "loss": 0.5381, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.117811679840088, + "rewards/margins": 0.8648239970207214, + "rewards/rejected": -2.982635498046875, "step": 1080 }, { "epoch": 0.57, - "grad_norm": 6.0625, + "grad_norm": 10.9375, "learning_rate": 2.319684179805491e-06, - "logits/chosen": 1.5040104389190674, - "logits/rejected": 1.8883628845214844, - "logps/chosen": -540.3502197265625, - "logps/rejected": -629.1265869140625, - "loss": 0.4734, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.862849235534668, - "rewards/margins": 1.0459764003753662, - "rewards/rejected": -3.908825635910034, + "logits/chosen": -1.5114177465438843, + "logits/rejected": -1.377497673034668, + "logps/chosen": -492.01141357421875, + "logps/rejected": -511.5269470214844, + "loss": 0.5381, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3241496086120605, + "rewards/margins": 0.6907966732978821, + "rewards/rejected": -3.014946460723877, "step": 1090 }, { "epoch": 0.58, - "grad_norm": 7.5, + "grad_norm": 6.0, "learning_rate": 2.2741465785212905e-06, - "logits/chosen": 1.5514931678771973, - "logits/rejected": 1.7437639236450195, - "logps/chosen": -565.4725341796875, - "logps/rejected": -635.3278198242188, - "loss": 0.471, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.8363194465637207, - "rewards/margins": 1.0234676599502563, - "rewards/rejected": -3.8597874641418457, + "logits/chosen": -1.4633898735046387, + "logits/rejected": -1.3916144371032715, + "logps/chosen": -471.26141357421875, + "logps/rejected": -557.3035278320312, + "loss": 0.4966, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.141718626022339, + "rewards/margins": 0.8240246772766113, + "rewards/rejected": -2.96574330329895, "step": 1100 }, { "epoch": 0.58, - "eval_logits/chosen": 2.0694196224212646, - "eval_logits/rejected": 2.2637150287628174, - "eval_logps/chosen": -528.3989868164062, - "eval_logps/rejected": -612.6806030273438, - "eval_loss": 0.5027692914009094, - "eval_rewards/accuracies": 0.75, - "eval_rewards/chosen": -2.6309285163879395, - "eval_rewards/margins": 1.0832594633102417, - "eval_rewards/rejected": -3.7141873836517334, - "eval_runtime": 449.2444, - "eval_samples_per_second": 4.452, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.4740034341812134, + "eval_logits/rejected": -1.3782777786254883, + "eval_logps/chosen": -477.5079650878906, + "eval_logps/rejected": -545.9103393554688, + "eval_loss": 0.504342794418335, + "eval_rewards/accuracies": 0.7369999885559082, + "eval_rewards/chosen": -2.0580503940582275, + "eval_rewards/margins": 0.923851728439331, + "eval_rewards/rejected": -2.9819023609161377, + "eval_runtime": 205.8953, + "eval_samples_per_second": 9.714, + "eval_steps_per_second": 0.607, "step": 1100 }, { "epoch": 0.58, - "grad_norm": 7.3125, + "grad_norm": 5.75, "learning_rate": 2.2286844104451848e-06, - "logits/chosen": 1.3312523365020752, - "logits/rejected": 1.3924553394317627, - "logps/chosen": -527.3834228515625, - "logps/rejected": -586.1485595703125, - "loss": 0.5064, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.468048334121704, - "rewards/margins": 0.9593726992607117, - "rewards/rejected": -3.4274210929870605, + "logits/chosen": -1.4495588541030884, + "logits/rejected": -1.3826367855072021, + "logps/chosen": -462.7281799316406, + "logps/rejected": -548.97265625, + "loss": 0.5435, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.0098259449005127, + "rewards/margins": 0.95000821352005, + "rewards/rejected": -2.959834337234497, "step": 1110 }, { "epoch": 0.59, - "grad_norm": 5.875, + "grad_norm": 7.125, "learning_rate": 2.183312859572008e-06, - "logits/chosen": 1.220323920249939, - "logits/rejected": 1.4167616367340088, - "logps/chosen": -527.3272705078125, - "logps/rejected": -584.5949096679688, - "loss": 0.4823, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.417142868041992, - "rewards/margins": 0.9714761972427368, - "rewards/rejected": -3.3886189460754395, + "logits/chosen": -1.4757729768753052, + "logits/rejected": -1.3990167379379272, + "logps/chosen": -496.04339599609375, + "logps/rejected": -542.3977661132812, + "loss": 0.4874, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.0698461532592773, + "rewards/margins": 0.8941160440444946, + "rewards/rejected": -2.9639620780944824, "step": 1120 }, { "epoch": 0.59, - "grad_norm": 9.5, + "grad_norm": 9.125, "learning_rate": 2.1380470796311843e-06, - "logits/chosen": 1.8727235794067383, - "logits/rejected": 1.7719242572784424, - "logps/chosen": -512.21484375, - "logps/rejected": -586.517333984375, - "loss": 0.4865, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.514179229736328, - "rewards/margins": 1.0654077529907227, - "rewards/rejected": -3.5795867443084717, + "logits/chosen": -1.4388748407363892, + "logits/rejected": -1.3238605260849, + "logps/chosen": -490.1044006347656, + "logps/rejected": -552.4682006835938, + "loss": 0.5065, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.0447587966918945, + "rewards/margins": 0.98194420337677, + "rewards/rejected": -3.026703119277954, "step": 1130 }, { "epoch": 0.6, - "grad_norm": 8.5, + "grad_norm": 9.8125, "learning_rate": 2.092902189025507e-06, - "logits/chosen": 2.006112813949585, - "logits/rejected": 1.8822578191757202, - "logps/chosen": -504.0730895996094, - "logps/rejected": -584.8082885742188, - "loss": 0.56, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.6244325637817383, - "rewards/margins": 0.8445955514907837, - "rewards/rejected": -3.4690279960632324, + "logits/chosen": -1.579469919204712, + "logits/rejected": -1.4409890174865723, + "logps/chosen": -501.4720764160156, + "logps/rejected": -554.6583251953125, + "loss": 0.5142, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.0851852893829346, + "rewards/margins": 0.9953821301460266, + "rewards/rejected": -3.0805675983428955, "step": 1140 }, { "epoch": 0.6, - "grad_norm": 7.0, + "grad_norm": 6.71875, "learning_rate": 2.0478932657817105e-06, - "logits/chosen": 1.5363643169403076, - "logits/rejected": 1.6541494131088257, - "logps/chosen": -516.8738403320312, - "logps/rejected": -631.0724487304688, - "loss": 0.4873, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.3589491844177246, - "rewards/margins": 0.9803365468978882, - "rewards/rejected": -3.3392856121063232, + "logits/chosen": -1.5608408451080322, + "logits/rejected": -1.4242637157440186, + "logps/chosen": -509.3160705566406, + "logps/rejected": -564.5472412109375, + "loss": 0.4965, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.316391944885254, + "rewards/margins": 0.892681896686554, + "rewards/rejected": -3.209073543548584, "step": 1150 }, { "epoch": 0.61, - "grad_norm": 6.5, + "grad_norm": 6.375, "learning_rate": 2.0030353425145376e-06, - "logits/chosen": 1.454318642616272, - "logits/rejected": 1.3277199268341064, - "logps/chosen": -472.8223571777344, - "logps/rejected": -549.0642700195312, - "loss": 0.4997, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.163424015045166, - "rewards/margins": 0.8882259130477905, - "rewards/rejected": -3.051649808883667, + "logits/chosen": -1.4031821489334106, + "logits/rejected": -1.392533779144287, + "logps/chosen": -465.71734619140625, + "logps/rejected": -529.384033203125, + "loss": 0.5434, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.336087465286255, + "rewards/margins": 0.8120396733283997, + "rewards/rejected": -3.1481270790100098, "step": 1160 }, { "epoch": 0.61, - "grad_norm": 10.0, + "grad_norm": 7.75, "learning_rate": 1.958343401405964e-06, - "logits/chosen": 1.2178795337677002, - "logits/rejected": 1.890763521194458, - "logps/chosen": -465.97210693359375, - "logps/rejected": -528.733154296875, - "loss": 0.5568, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.076791286468506, - "rewards/margins": 0.8051401972770691, - "rewards/rejected": -2.881931781768799, + "logits/chosen": -1.4990739822387695, + "logits/rejected": -1.444461703300476, + "logps/chosen": -472.1663513183594, + "logps/rejected": -566.5687255859375, + "loss": 0.533, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.144223690032959, + "rewards/margins": 0.9988595843315125, + "rewards/rejected": -3.1430835723876953, "step": 1170 }, { "epoch": 0.62, - "grad_norm": 6.25, + "grad_norm": 6.65625, "learning_rate": 1.9138323692012734e-06, - "logits/chosen": 1.706913709640503, - "logits/rejected": 1.0627291202545166, - "logps/chosen": -456.10162353515625, - "logps/rejected": -510.69854736328125, - "loss": 0.5027, + "logits/chosen": -1.5581806898117065, + "logits/rejected": -1.4660371541976929, + "logps/chosen": -500.2381286621094, + "logps/rejected": -572.9973754882812, + "loss": 0.4747, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.9897725582122803, - "rewards/margins": 0.8308976888656616, - "rewards/rejected": -2.8206703662872314, + "rewards/chosen": -2.226621150970459, + "rewards/margins": 0.942787766456604, + "rewards/rejected": -3.1694092750549316, "step": 1180 }, { "epoch": 0.62, - "grad_norm": 5.71875, + "grad_norm": 5.375, "learning_rate": 1.8695171122236443e-06, - "logits/chosen": 1.7154747247695923, - "logits/rejected": 1.9419562816619873, - "logps/chosen": -424.7981872558594, - "logps/rejected": -494.462158203125, - "loss": 0.5191, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.0186305046081543, - "rewards/margins": 0.8082389831542969, - "rewards/rejected": -2.826869487762451, + "logits/chosen": -1.4782741069793701, + "logits/rejected": -1.3444011211395264, + "logps/chosen": -538.46923828125, + "logps/rejected": -594.9655151367188, + "loss": 0.4581, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3236842155456543, + "rewards/margins": 1.0366638898849487, + "rewards/rejected": -3.3603484630584717, "step": 1190 }, { "epoch": 0.63, - "grad_norm": 9.25, + "grad_norm": 5.5, "learning_rate": 1.8254124314089225e-06, - "logits/chosen": 1.758643388748169, - "logits/rejected": 1.7241748571395874, - "logps/chosen": -456.5091247558594, - "logps/rejected": -547.2735595703125, - "loss": 0.4888, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.281468152999878, - "rewards/margins": 0.8465303182601929, - "rewards/rejected": -3.1279983520507812, + "logits/chosen": -1.4107558727264404, + "logits/rejected": -1.2883371114730835, + "logps/chosen": -511.8663635253906, + "logps/rejected": -562.5142822265625, + "loss": 0.5087, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.319617748260498, + "rewards/margins": 0.765404224395752, + "rewards/rejected": -3.08502197265625, "step": 1200 }, { "epoch": 0.63, - "eval_logits/chosen": 2.2262675762176514, - "eval_logits/rejected": 2.4042489528656006, - "eval_logps/chosen": -509.4260559082031, - "eval_logps/rejected": -582.6142578125, - "eval_loss": 0.4964662492275238, - "eval_rewards/accuracies": 0.753000020980835, - "eval_rewards/chosen": -2.4411978721618652, - "eval_rewards/margins": 0.9723253846168518, - "eval_rewards/rejected": -3.4135231971740723, - "eval_runtime": 450.7796, - "eval_samples_per_second": 4.437, - "eval_steps_per_second": 0.277, + "eval_logits/chosen": -1.4261987209320068, + "eval_logits/rejected": -1.333054780960083, + "eval_logps/chosen": -508.8494567871094, + "eval_logps/rejected": -582.47119140625, + "eval_loss": 0.5040486454963684, + "eval_rewards/accuracies": 0.7450000047683716, + "eval_rewards/chosen": -2.371464729309082, + "eval_rewards/margins": 0.9760459661483765, + "eval_rewards/rejected": -3.347510814666748, + "eval_runtime": 205.8406, + "eval_samples_per_second": 9.716, + "eval_steps_per_second": 0.607, "step": 1200 }, { "epoch": 0.63, - "grad_norm": 7.0625, + "grad_norm": 6.21875, "learning_rate": 1.781533057362221e-06, - "logits/chosen": 2.2214999198913574, - "logits/rejected": 1.9609037637710571, - "logps/chosen": -481.3634338378906, - "logps/rejected": -556.6956176757812, - "loss": 0.4816, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.4214730262756348, - "rewards/margins": 1.1080448627471924, - "rewards/rejected": -3.5295181274414062, + "logits/chosen": -1.5876115560531616, + "logits/rejected": -1.5332571268081665, + "logps/chosen": -502.91046142578125, + "logps/rejected": -568.5400390625, + "loss": 0.5013, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2924888134002686, + "rewards/margins": 0.8915454745292664, + "rewards/rejected": -3.1840338706970215, "step": 1210 }, { "epoch": 0.64, - "grad_norm": 6.46875, + "grad_norm": 7.8125, "learning_rate": 1.7378936454380277e-06, - "logits/chosen": 2.2386937141418457, - "logits/rejected": 1.7253150939941406, - "logps/chosen": -516.8323364257812, - "logps/rejected": -622.4188232421875, - "loss": 0.4432, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.6058599948883057, - "rewards/margins": 1.1149176359176636, - "rewards/rejected": -3.7207775115966797, + "logits/chosen": -1.4910120964050293, + "logits/rejected": -1.3955504894256592, + "logps/chosen": -508.55474853515625, + "logps/rejected": -586.0122680664062, + "loss": 0.5113, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.406351089477539, + "rewards/margins": 1.08707594871521, + "rewards/rejected": -3.49342679977417, "step": 1220 }, { "epoch": 0.64, - "grad_norm": 6.1875, + "grad_norm": 6.90625, "learning_rate": 1.6945087708454273e-06, - "logits/chosen": 2.12276554107666, - "logits/rejected": 1.9039256572723389, - "logps/chosen": -525.0025634765625, - "logps/rejected": -638.1419067382812, - "loss": 0.4679, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.7907423973083496, - "rewards/margins": 1.0106900930404663, - "rewards/rejected": -3.8014328479766846, + "logits/chosen": -1.5146963596343994, + "logits/rejected": -1.401847243309021, + "logps/chosen": -513.2926635742188, + "logps/rejected": -559.9019775390625, + "loss": 0.4528, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.2331342697143555, + "rewards/margins": 0.9627434015274048, + "rewards/rejected": -3.19587779045105, "step": 1230 }, { "epoch": 0.65, - "grad_norm": 6.6875, + "grad_norm": 7.625, "learning_rate": 1.651392923780105e-06, - "logits/chosen": 1.8808460235595703, - "logits/rejected": 1.99996817111969, - "logps/chosen": -550.5272216796875, - "logps/rejected": -645.2981567382812, - "loss": 0.4703, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.9035916328430176, - "rewards/margins": 1.0486520528793335, - "rewards/rejected": -3.9522433280944824, + "logits/chosen": -1.5016862154006958, + "logits/rejected": -1.433712124824524, + "logps/chosen": -480.95703125, + "logps/rejected": -585.30712890625, + "loss": 0.4813, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.1668860912323, + "rewards/margins": 1.0968172550201416, + "rewards/rejected": -3.2637035846710205, "step": 1240 }, { "epoch": 0.65, - "grad_norm": 6.28125, + "grad_norm": 5.875, "learning_rate": 1.608560504584737e-06, - "logits/chosen": 2.11899733543396, - "logits/rejected": 1.9603424072265625, - "logps/chosen": -535.086181640625, - "logps/rejected": -645.9200439453125, - "loss": 0.4858, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.8361852169036865, - "rewards/margins": 1.132416009902954, - "rewards/rejected": -3.9686012268066406, + "logits/chosen": -1.435272455215454, + "logits/rejected": -1.3818395137786865, + "logps/chosen": -489.82293701171875, + "logps/rejected": -585.4735717773438, + "loss": 0.4916, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.3021085262298584, + "rewards/margins": 1.004734992980957, + "rewards/rejected": -3.3068439960479736, "step": 1250 }, { "epoch": 0.66, - "grad_norm": 7.59375, + "grad_norm": 5.59375, "learning_rate": 1.5660258189393945e-06, - "logits/chosen": 1.9444576501846313, - "logits/rejected": 1.9254415035247803, - "logps/chosen": -544.8070068359375, - "logps/rejected": -617.1060791015625, - "loss": 0.4611, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.795332431793213, - "rewards/margins": 1.0758298635482788, - "rewards/rejected": -3.871161937713623, + "logits/chosen": -1.527229905128479, + "logits/rejected": -1.4203643798828125, + "logps/chosen": -503.65716552734375, + "logps/rejected": -556.7635498046875, + "loss": 0.4766, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.143995523452759, + "rewards/margins": 1.0609943866729736, + "rewards/rejected": -3.2049896717071533, "step": 1260 }, { "epoch": 0.66, - "grad_norm": 7.40625, + "grad_norm": 7.5625, "learning_rate": 1.5238030730835578e-06, - "logits/chosen": 1.9484163522720337, - "logits/rejected": 1.8690261840820312, - "logps/chosen": -509.67449951171875, - "logps/rejected": -591.234375, - "loss": 0.4893, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.839465618133545, - "rewards/margins": 0.8926876187324524, - "rewards/rejected": -3.7321534156799316, + "logits/chosen": -1.4004570245742798, + "logits/rejected": -1.3315227031707764, + "logps/chosen": -484.2720642089844, + "logps/rejected": -577.0310668945312, + "loss": 0.4337, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3353257179260254, + "rewards/margins": 1.0654656887054443, + "rewards/rejected": -3.4007911682128906, "step": 1270 }, { "epoch": 0.67, - "grad_norm": 8.6875, + "grad_norm": 9.1875, "learning_rate": 1.4819063690713565e-06, - "logits/chosen": 2.2301783561706543, - "logits/rejected": 2.2617290019989014, - "logps/chosen": -532.890380859375, - "logps/rejected": -599.6547241210938, - "loss": 0.5125, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.8411078453063965, - "rewards/margins": 0.9625542759895325, - "rewards/rejected": -3.8036625385284424, + "logits/chosen": -1.44136381149292, + "logits/rejected": -1.3490862846374512, + "logps/chosen": -492.79620361328125, + "logps/rejected": -599.3836059570312, + "loss": 0.5165, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4487698078155518, + "rewards/margins": 1.134749174118042, + "rewards/rejected": -3.5835189819335938, "step": 1280 }, { "epoch": 0.68, - "grad_norm": 8.5625, + "grad_norm": 6.6875, "learning_rate": 1.4403497000615885e-06, - "logits/chosen": 1.2555204629898071, - "logits/rejected": 1.2910771369934082, - "logps/chosen": -531.36328125, - "logps/rejected": -612.5070190429688, - "loss": 0.4943, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5224602222442627, - "rewards/margins": 1.1016714572906494, - "rewards/rejected": -3.624131679534912, + "logits/chosen": -1.481796145439148, + "logits/rejected": -1.3834607601165771, + "logps/chosen": -502.568603515625, + "logps/rejected": -589.8240356445312, + "loss": 0.5391, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3244917392730713, + "rewards/margins": 0.9923933744430542, + "rewards/rejected": -3.316884994506836, "step": 1290 }, { "epoch": 0.68, - "grad_norm": 7.09375, + "grad_norm": 5.9375, "learning_rate": 1.3991469456441273e-06, - "logits/chosen": 1.42030668258667, - "logits/rejected": 1.6860134601593018, - "logps/chosen": -482.36065673828125, - "logps/rejected": -559.9107666015625, - "loss": 0.5204, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.3926775455474854, - "rewards/margins": 0.9813091158866882, - "rewards/rejected": -3.3739867210388184, + "logits/chosen": -1.5021679401397705, + "logits/rejected": -1.4300124645233154, + "logps/chosen": -498.32763671875, + "logps/rejected": -587.1395874023438, + "loss": 0.4799, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2161061763763428, + "rewards/margins": 1.0599734783172607, + "rewards/rejected": -3.2760796546936035, "step": 1300 }, { "epoch": 0.68, - "eval_logits/chosen": 2.0121309757232666, - "eval_logits/rejected": 2.206488609313965, - "eval_logps/chosen": -492.31475830078125, - "eval_logps/rejected": -570.6591186523438, - "eval_loss": 0.4941355586051941, - "eval_rewards/accuracies": 0.7480000257492065, - "eval_rewards/chosen": -2.270085096359253, - "eval_rewards/margins": 1.023887276649475, - "eval_rewards/rejected": -3.2939727306365967, - "eval_runtime": 449.2395, - "eval_samples_per_second": 4.452, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.4277222156524658, + "eval_logits/rejected": -1.3340253829956055, + "eval_logps/chosen": -502.36865234375, + "eval_logps/rejected": -582.15625, + "eval_loss": 0.5011327266693115, + "eval_rewards/accuracies": 0.7450000047683716, + "eval_rewards/chosen": -2.306657075881958, + "eval_rewards/margins": 1.0377042293548584, + "eval_rewards/rejected": -3.3443613052368164, + "eval_runtime": 205.8366, + "eval_samples_per_second": 9.716, + "eval_steps_per_second": 0.607, "step": 1300 }, { "epoch": 0.69, - "grad_norm": 5.625, + "grad_norm": 14.375, "learning_rate": 1.3583118672042441e-06, - "logits/chosen": 1.6364719867706299, - "logits/rejected": 1.8250305652618408, - "logps/chosen": -467.3515625, - "logps/rejected": -567.787841796875, - "loss": 0.4726, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.2239139080047607, - "rewards/margins": 1.1083593368530273, - "rewards/rejected": -3.332273483276367, + "logits/chosen": -1.5242735147476196, + "logits/rejected": -1.423452377319336, + "logps/chosen": -495.67608642578125, + "logps/rejected": -586.1196899414062, + "loss": 0.4927, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.3464579582214355, + "rewards/margins": 1.1166086196899414, + "rewards/rejected": -3.463066577911377, "step": 1310 }, { "epoch": 0.69, - "grad_norm": 8.5, + "grad_norm": 6.625, "learning_rate": 1.3178581033264218e-06, - "logits/chosen": 1.3394782543182373, - "logits/rejected": 1.563076138496399, - "logps/chosen": -541.9149169921875, - "logps/rejected": -596.292236328125, - "loss": 0.49, + "logits/chosen": -1.4835782051086426, + "logits/rejected": -1.394766092300415, + "logps/chosen": -526.1903076171875, + "logps/rejected": -599.1194458007812, + "loss": 0.464, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.4323418140411377, - "rewards/margins": 0.9971737861633301, - "rewards/rejected": -3.4295153617858887, + "rewards/chosen": -2.623629093170166, + "rewards/margins": 0.9394786953926086, + "rewards/rejected": -3.56310772895813, "step": 1320 }, { "epoch": 0.7, - "grad_norm": 6.15625, + "grad_norm": 9.8125, "learning_rate": 1.2777991652391757e-06, - "logits/chosen": 1.7200143337249756, - "logits/rejected": 1.6109498739242554, - "logps/chosen": -539.2000732421875, - "logps/rejected": -615.1435546875, - "loss": 0.5401, + "logits/chosen": -1.4715049266815186, + "logits/rejected": -1.3863550424575806, + "logps/chosen": -536.083984375, + "logps/rejected": -642.5689697265625, + "loss": 0.4522, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.6219089031219482, - "rewards/margins": 0.9337452054023743, - "rewards/rejected": -3.5556540489196777, + "rewards/chosen": -2.66798996925354, + "rewards/margins": 1.0482239723205566, + "rewards/rejected": -3.7162139415740967, "step": 1330 }, { "epoch": 0.7, - "grad_norm": 10.0, + "grad_norm": 7.28125, "learning_rate": 1.2381484323024178e-06, - "logits/chosen": 1.7607828378677368, - "logits/rejected": 1.8438133001327515, - "logps/chosen": -535.6881713867188, - "logps/rejected": -615.4736938476562, - "loss": 0.5102, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.7043251991271973, - "rewards/margins": 1.0787712335586548, - "rewards/rejected": -3.7830963134765625, + "logits/chosen": -1.418473482131958, + "logits/rejected": -1.34147047996521, + "logps/chosen": -537.9956665039062, + "logps/rejected": -605.8355102539062, + "loss": 0.4949, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.777344226837158, + "rewards/margins": 1.0527513027191162, + "rewards/rejected": -3.8300960063934326, "step": 1340 }, { "epoch": 0.71, - "grad_norm": 5.34375, + "grad_norm": 11.0, "learning_rate": 1.1989191475388518e-06, - "logits/chosen": 2.0806539058685303, - "logits/rejected": 1.8068488836288452, - "logps/chosen": -544.23291015625, - "logps/rejected": -627.465087890625, - "loss": 0.4753, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.6438353061676025, - "rewards/margins": 0.9974311590194702, - "rewards/rejected": -3.641266345977783, + "logits/chosen": -1.5058234930038452, + "logits/rejected": -1.4633159637451172, + "logps/chosen": -527.7548828125, + "logps/rejected": -597.4782104492188, + "loss": 0.5086, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.516946792602539, + "rewards/margins": 0.9122180938720703, + "rewards/rejected": -3.429164409637451, "step": 1350 }, { "epoch": 0.71, - "grad_norm": 10.4375, + "grad_norm": 6.03125, "learning_rate": 1.160124413210918e-06, - "logits/chosen": 1.998169183731079, - "logits/rejected": 1.928439736366272, - "logps/chosen": -522.8230590820312, - "logps/rejected": -579.4949340820312, - "loss": 0.4773, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.640355110168457, - "rewards/margins": 1.0438846349716187, - "rewards/rejected": -3.6842403411865234, + "logits/chosen": -1.416601538658142, + "logits/rejected": -1.351701021194458, + "logps/chosen": -547.7350463867188, + "logps/rejected": -607.982666015625, + "loss": 0.5249, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.456645965576172, + "rewards/margins": 0.9266567230224609, + "rewards/rejected": -3.383302688598633, "step": 1360 }, { "epoch": 0.72, - "grad_norm": 8.0625, + "grad_norm": 6.125, "learning_rate": 1.1217771864447396e-06, - "logits/chosen": 1.564211368560791, - "logits/rejected": 1.8740813732147217, - "logps/chosen": -525.1043090820312, - "logps/rejected": -598.16259765625, - "loss": 0.4682, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.6339075565338135, - "rewards/margins": 1.0636084079742432, - "rewards/rejected": -3.6975159645080566, + "logits/chosen": -1.5074869394302368, + "logits/rejected": -1.4494173526763916, + "logps/chosen": -547.4534912109375, + "logps/rejected": -611.69482421875, + "loss": 0.4353, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.41982102394104, + "rewards/margins": 1.221834659576416, + "rewards/rejected": -3.641655445098877, "step": 1370 }, { "epoch": 0.72, - "grad_norm": 7.71875, + "grad_norm": 6.59375, "learning_rate": 1.08389027490255e-06, - "logits/chosen": 1.5220708847045898, - "logits/rejected": 1.471215844154358, - "logps/chosen": -577.1598510742188, - "logps/rejected": -613.5559692382812, - "loss": 0.5242, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.865147829055786, - "rewards/margins": 0.7697300910949707, - "rewards/rejected": -3.6348776817321777, + "logits/chosen": -1.4987690448760986, + "logits/rejected": -1.3986529111862183, + "logps/chosen": -490.8663024902344, + "logps/rejected": -572.9545288085938, + "loss": 0.4849, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3217995166778564, + "rewards/margins": 0.8936141729354858, + "rewards/rejected": -3.215414047241211, "step": 1380 }, { "epoch": 0.73, - "grad_norm": 7.3125, + "grad_norm": 6.78125, "learning_rate": 1.046476332505036e-06, - "logits/chosen": 1.776266098022461, - "logits/rejected": 1.908911943435669, - "logps/chosen": -504.511962890625, - "logps/rejected": -586.9796142578125, - "loss": 0.5074, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.7093136310577393, - "rewards/margins": 0.8418199419975281, - "rewards/rejected": -3.551133632659912, + "logits/chosen": -1.4200499057769775, + "logits/rejected": -1.355092167854309, + "logps/chosen": -523.000244140625, + "logps/rejected": -604.7027587890625, + "loss": 0.4608, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.414511203765869, + "rewards/margins": 1.0894904136657715, + "rewards/rejected": -3.504002094268799, "step": 1390 }, { "epoch": 0.73, - "grad_norm": 8.0625, + "grad_norm": 6.875, "learning_rate": 1.0095478552050348e-06, - "logits/chosen": 1.7001237869262695, - "logits/rejected": 1.5393736362457275, - "logps/chosen": -551.8426513671875, - "logps/rejected": -619.8112182617188, - "loss": 0.5158, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.7164037227630615, - "rewards/margins": 1.046526312828064, - "rewards/rejected": -3.762930393218994, + "logits/chosen": -1.4363720417022705, + "logits/rejected": -1.4057958126068115, + "logps/chosen": -509.18304443359375, + "logps/rejected": -622.3807373046875, + "loss": 0.4606, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.538700580596924, + "rewards/margins": 1.1025012731552124, + "rewards/rejected": -3.6412017345428467, "step": 1400 }, { "epoch": 0.73, - "eval_logits/chosen": 2.27836275100708, - "eval_logits/rejected": 2.481665849685669, - "eval_logps/chosen": -527.249267578125, - "eval_logps/rejected": -611.9570922851562, - "eval_loss": 0.4924592673778534, - "eval_rewards/accuracies": 0.7540000081062317, - "eval_rewards/chosen": -2.6194300651550293, - "eval_rewards/margins": 1.0875214338302612, - "eval_rewards/rejected": -3.70695161819458, - "eval_runtime": 449.2776, - "eval_samples_per_second": 4.452, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.4218993186950684, + "eval_logits/rejected": -1.3291493654251099, + "eval_logps/chosen": -521.8630981445312, + "eval_logps/rejected": -603.546875, + "eval_loss": 0.4991470277309418, + "eval_rewards/accuracies": 0.7429999709129333, + "eval_rewards/chosen": -2.5016019344329834, + "eval_rewards/margins": 1.056665062904358, + "eval_rewards/rejected": -3.5582666397094727, + "eval_runtime": 206.8725, + "eval_samples_per_second": 9.668, + "eval_steps_per_second": 0.604, "step": 1400 }, { "epoch": 0.74, - "grad_norm": 6.09375, + "grad_norm": 8.5625, "learning_rate": 9.731171768139808e-07, - "logits/chosen": 1.8738467693328857, - "logits/rejected": 1.6231794357299805, - "logps/chosen": -510.99755859375, - "logps/rejected": -603.8427124023438, - "loss": 0.4857, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -2.6101596355438232, - "rewards/margins": 1.1430681943893433, - "rewards/rejected": -3.753227710723877, + "logits/chosen": -1.425769567489624, + "logits/rejected": -1.3510525226593018, + "logps/chosen": -534.87353515625, + "logps/rejected": -616.7877807617188, + "loss": 0.5568, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6500022411346436, + "rewards/margins": 0.9020141363143921, + "rewards/rejected": -3.552016496658325, "step": 1410 }, { "epoch": 0.74, - "grad_norm": 7.03125, + "grad_norm": 6.96875, "learning_rate": 9.371964648825221e-07, - "logits/chosen": 2.0445821285247803, - "logits/rejected": 2.0226082801818848, - "logps/chosen": -511.77435302734375, - "logps/rejected": -593.7333984375, - "loss": 0.4627, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.699057102203369, - "rewards/margins": 1.1369215250015259, - "rewards/rejected": -3.8359782695770264, + "logits/chosen": -1.497687578201294, + "logits/rejected": -1.411454439163208, + "logps/chosen": -542.4428100585938, + "logps/rejected": -576.1937255859375, + "loss": 0.5665, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.580232620239258, + "rewards/margins": 0.7872194051742554, + "rewards/rejected": -3.3674521446228027, "step": 1420 }, { "epoch": 0.75, - "grad_norm": 5.875, + "grad_norm": 6.75, "learning_rate": 9.017977166366445e-07, - "logits/chosen": 1.337143063545227, - "logits/rejected": 1.4125709533691406, - "logps/chosen": -547.0819091796875, - "logps/rejected": -636.6948852539062, - "loss": 0.4438, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.638824939727783, - "rewards/margins": 0.9935441017150879, - "rewards/rejected": -3.632368803024292, + "logits/chosen": -1.4858081340789795, + "logits/rejected": -1.3852221965789795, + "logps/chosen": -526.9737548828125, + "logps/rejected": -635.5704345703125, + "loss": 0.4595, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.443568468093872, + "rewards/margins": 1.2052720785140991, + "rewards/rejected": -3.6488404273986816, "step": 1430 }, { "epoch": 0.75, - "grad_norm": 7.90625, + "grad_norm": 7.40625, "learning_rate": 8.669327549707096e-07, - "logits/chosen": 2.07295298576355, - "logits/rejected": 1.8528960943222046, - "logps/chosen": -519.5410766601562, - "logps/rejected": -614.3350830078125, - "loss": 0.5055, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.67018461227417, - "rewards/margins": 1.1164127588272095, - "rewards/rejected": -3.7865970134735107, + "logits/chosen": -1.6181617975234985, + "logits/rejected": -1.5521459579467773, + "logps/chosen": -541.6924438476562, + "logps/rejected": -613.9597778320312, + "loss": 0.5011, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.4517698287963867, + "rewards/margins": 0.8626209497451782, + "rewards/rejected": -3.3143906593322754, "step": 1440 }, { "epoch": 0.76, - "grad_norm": 8.1875, + "grad_norm": 6.65625, "learning_rate": 8.326132244986932e-07, - "logits/chosen": 2.217745065689087, - "logits/rejected": 2.1313698291778564, - "logps/chosen": -498.91455078125, - "logps/rejected": -612.3049926757812, - "loss": 0.4805, + "logits/chosen": -1.511328935623169, + "logits/rejected": -1.432902216911316, + "logps/chosen": -508.4786071777344, + "logps/rejected": -612.4136962890625, + "loss": 0.488, "rewards/accuracies": 0.78125, - "rewards/chosen": -2.5271456241607666, - "rewards/margins": 1.1923385858535767, - "rewards/rejected": -3.7194838523864746, + "rewards/chosen": -2.4070212841033936, + "rewards/margins": 1.1571033000946045, + "rewards/rejected": -3.564124584197998, "step": 1450 }, { "epoch": 0.76, - "grad_norm": 8.0625, + "grad_norm": 5.75, "learning_rate": 7.988505876649863e-07, - "logits/chosen": 1.5606577396392822, - "logits/rejected": 1.8046897649765015, - "logps/chosen": -571.3505859375, - "logps/rejected": -620.4075927734375, - "loss": 0.5353, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.810910701751709, - "rewards/margins": 0.8456093668937683, - "rewards/rejected": -3.656520128250122, + "logits/chosen": -1.4642016887664795, + "logits/rejected": -1.353859543800354, + "logps/chosen": -487.744140625, + "logps/rejected": -559.90234375, + "loss": 0.5018, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3719143867492676, + "rewards/margins": 0.841119647026062, + "rewards/rejected": -3.213034152984619, "step": 1460 }, { "epoch": 0.77, - "grad_norm": 8.6875, + "grad_norm": 8.4375, "learning_rate": 7.656561209160248e-07, - "logits/chosen": 1.7562596797943115, - "logits/rejected": 1.6450916528701782, - "logps/chosen": -557.3902587890625, - "logps/rejected": -651.6214599609375, - "loss": 0.4805, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.7291982173919678, - "rewards/margins": 1.1322641372680664, - "rewards/rejected": -3.861462354660034, + "logits/chosen": -1.430325984954834, + "logits/rejected": -1.3274606466293335, + "logps/chosen": -524.908447265625, + "logps/rejected": -588.8642578125, + "loss": 0.4972, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.478081226348877, + "rewards/margins": 1.044504165649414, + "rewards/rejected": -3.522585391998291, "step": 1470 }, { "epoch": 0.77, - "grad_norm": 5.96875, + "grad_norm": 6.3125, "learning_rate": 7.330409109340563e-07, - "logits/chosen": 1.6876871585845947, - "logits/rejected": 1.8287250995635986, - "logps/chosen": -548.6978759765625, - "logps/rejected": -624.533203125, - "loss": 0.496, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.747478723526001, - "rewards/margins": 1.0232455730438232, - "rewards/rejected": -3.770724058151245, + "logits/chosen": -1.4950729608535767, + "logits/rejected": -1.399980902671814, + "logps/chosen": -539.3687133789062, + "logps/rejected": -593.13623046875, + "loss": 0.5227, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.505924701690674, + "rewards/margins": 1.0100219249725342, + "rewards/rejected": -3.515946626663208, "step": 1480 }, { "epoch": 0.78, - "grad_norm": 7.4375, + "grad_norm": 6.5625, "learning_rate": 7.010158509342682e-07, - "logits/chosen": 1.8295252323150635, - "logits/rejected": 1.6652030944824219, - "logps/chosen": -538.7413330078125, - "logps/rejected": -626.6610717773438, - "loss": 0.4822, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.6121177673339844, - "rewards/margins": 1.0102406740188599, - "rewards/rejected": -3.622358798980713, + "logits/chosen": -1.422590732574463, + "logits/rejected": -1.3758926391601562, + "logps/chosen": -523.3316650390625, + "logps/rejected": -640.554931640625, + "loss": 0.4806, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.6385467052459717, + "rewards/margins": 1.0877034664154053, + "rewards/rejected": -3.726250410079956, "step": 1490 }, { "epoch": 0.79, - "grad_norm": 5.375, + "grad_norm": 6.65625, "learning_rate": 6.695916370265529e-07, - "logits/chosen": 1.9342527389526367, - "logits/rejected": 1.8059526681900024, - "logps/chosen": -494.00677490234375, - "logps/rejected": -591.4176025390625, - "loss": 0.4677, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.608290195465088, - "rewards/margins": 1.0327186584472656, - "rewards/rejected": -3.6410088539123535, + "logits/chosen": -1.454978585243225, + "logits/rejected": -1.4019845724105835, + "logps/chosen": -488.15228271484375, + "logps/rejected": -599.3827514648438, + "loss": 0.4763, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.454293727874756, + "rewards/margins": 1.122016429901123, + "rewards/rejected": -3.576310396194458, "step": 1500 }, { "epoch": 0.79, - "eval_logits/chosen": 2.373870372772217, - "eval_logits/rejected": 2.584756851196289, - "eval_logps/chosen": -527.5073852539062, - "eval_logps/rejected": -612.5421142578125, - "eval_loss": 0.49223849177360535, - "eval_rewards/accuracies": 0.7540000081062317, - "eval_rewards/chosen": -2.622011661529541, - "eval_rewards/margins": 1.09079110622406, - "eval_rewards/rejected": -3.7128028869628906, - "eval_runtime": 450.8468, - "eval_samples_per_second": 4.436, - "eval_steps_per_second": 0.277, + "eval_logits/chosen": -1.4325312376022339, + "eval_logits/rejected": -1.3394147157669067, + "eval_logps/chosen": -521.4944458007812, + "eval_logps/rejected": -599.7630615234375, + "eval_loss": 0.4985138177871704, + "eval_rewards/accuracies": 0.746999979019165, + "eval_rewards/chosen": -2.497915267944336, + "eval_rewards/margins": 1.0225143432617188, + "eval_rewards/rejected": -3.5204296112060547, + "eval_runtime": 206.7979, + "eval_samples_per_second": 9.671, + "eval_steps_per_second": 0.604, "step": 1500 }, { "epoch": 0.79, - "grad_norm": 10.0, + "grad_norm": 7.3125, "learning_rate": 6.387787646430854e-07, - "logits/chosen": 1.389383316040039, - "logits/rejected": 1.57669198513031, - "logps/chosen": -528.4805297851562, - "logps/rejected": -606.1107177734375, - "loss": 0.4755, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -2.586696147918701, - "rewards/margins": 1.0107383728027344, - "rewards/rejected": -3.5974345207214355, + "logits/chosen": -1.4497127532958984, + "logits/rejected": -1.3799619674682617, + "logps/chosen": -531.990966796875, + "logps/rejected": -574.7440185546875, + "loss": 0.5299, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.597123861312866, + "rewards/margins": 0.8730913400650024, + "rewards/rejected": -3.470215320587158, "step": 1510 }, { "epoch": 0.8, - "grad_norm": 8.875, + "grad_norm": 5.6875, "learning_rate": 6.085875250329401e-07, - "logits/chosen": 1.4683406352996826, - "logits/rejected": 1.9843635559082031, - "logps/chosen": -538.0323486328125, - "logps/rejected": -633.2237548828125, - "loss": 0.4915, + "logits/chosen": -1.5374524593353271, + "logits/rejected": -1.4480254650115967, + "logps/chosen": -493.16351318359375, + "logps/rejected": -555.0206298828125, + "loss": 0.4998, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.590121269226074, - "rewards/margins": 1.0882453918457031, - "rewards/rejected": -3.6783668994903564, + "rewards/chosen": -2.448246717453003, + "rewards/margins": 0.842933177947998, + "rewards/rejected": -3.291179656982422, "step": 1520 }, { "epoch": 0.8, - "grad_norm": 6.28125, + "grad_norm": 7.21875, "learning_rate": 5.79028001824894e-07, - "logits/chosen": 2.1134185791015625, - "logits/rejected": 2.00264310836792, - "logps/chosen": -518.1842651367188, - "logps/rejected": -585.181640625, - "loss": 0.5058, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.550919532775879, - "rewards/margins": 1.059746503829956, - "rewards/rejected": -3.610666275024414, + "logits/chosen": -1.4127936363220215, + "logits/rejected": -1.4277909994125366, + "logps/chosen": -473.9107360839844, + "logps/rejected": -639.1129150390625, + "loss": 0.4427, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.282348155975342, + "rewards/margins": 1.2384718656539917, + "rewards/rejected": -3.520819902420044, "step": 1530 }, { "epoch": 0.81, - "grad_norm": 11.0625, + "grad_norm": 4.5625, "learning_rate": 5.501100676595761e-07, - "logits/chosen": 1.5053377151489258, - "logits/rejected": 1.8114296197891235, - "logps/chosen": -503.01202392578125, - "logps/rejected": -604.1715087890625, - "loss": 0.5031, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.5767312049865723, - "rewards/margins": 1.0214576721191406, - "rewards/rejected": -3.598188877105713, + "logits/chosen": -1.5411286354064941, + "logits/rejected": -1.4845304489135742, + "logps/chosen": -528.363525390625, + "logps/rejected": -610.4037475585938, + "loss": 0.4969, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.430098056793213, + "rewards/margins": 1.0298941135406494, + "rewards/rejected": -3.4599921703338623, "step": 1540 }, { "epoch": 0.81, - "grad_norm": 7.5625, + "grad_norm": 6.4375, "learning_rate": 5.218433808920884e-07, - "logits/chosen": 1.5970503091812134, - "logits/rejected": 2.1210780143737793, - "logps/chosen": -524.966796875, - "logps/rejected": -603.1199951171875, - "loss": 0.5213, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.6611809730529785, - "rewards/margins": 1.0476148128509521, - "rewards/rejected": -3.7087955474853516, + "logits/chosen": -1.5715194940567017, + "logits/rejected": -1.4276337623596191, + "logps/chosen": -515.0184326171875, + "logps/rejected": -564.2301025390625, + "loss": 0.4785, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.4059693813323975, + "rewards/margins": 0.9038017988204956, + "rewards/rejected": -3.3097712993621826, "step": 1550 }, { "epoch": 0.82, - "grad_norm": 7.6875, + "grad_norm": 6.28125, "learning_rate": 4.942373823661928e-07, - "logits/chosen": 1.453554391860962, - "logits/rejected": 1.9258426427841187, - "logps/chosen": -492.60064697265625, - "logps/rejected": -562.98095703125, - "loss": 0.4492, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.3589653968811035, - "rewards/margins": 1.078311562538147, - "rewards/rejected": -3.437276840209961, + "logits/chosen": -1.537407398223877, + "logits/rejected": -1.352263331413269, + "logps/chosen": -559.8363037109375, + "logps/rejected": -603.9837646484375, + "loss": 0.5263, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.3580124378204346, + "rewards/margins": 1.0659737586975098, + "rewards/rejected": -3.4239859580993652, "step": 1560 }, { "epoch": 0.82, - "grad_norm": 5.46875, + "grad_norm": 6.40625, "learning_rate": 4.6730129226114363e-07, - "logits/chosen": 1.5613019466400146, - "logits/rejected": 2.030231237411499, - "logps/chosen": -481.99029541015625, - "logps/rejected": -551.4531860351562, - "loss": 0.4448, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.3367221355438232, - "rewards/margins": 1.1135777235031128, - "rewards/rejected": -3.4502997398376465, + "logits/chosen": -1.4590742588043213, + "logits/rejected": -1.3711886405944824, + "logps/chosen": -499.3367614746094, + "logps/rejected": -608.9996337890625, + "loss": 0.4878, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5338335037231445, + "rewards/margins": 0.9903131723403931, + "rewards/rejected": -3.524146318435669, "step": 1570 }, { "epoch": 0.83, - "grad_norm": 8.4375, + "grad_norm": 6.28125, "learning_rate": 4.4104410701222703e-07, - "logits/chosen": 1.846142053604126, - "logits/rejected": 2.0114002227783203, - "logps/chosen": -539.5140380859375, - "logps/rejected": -590.5936279296875, - "loss": 0.4993, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.5811474323272705, - "rewards/margins": 1.007107138633728, - "rewards/rejected": -3.588254451751709, + "logits/chosen": -1.4868284463882446, + "logits/rejected": -1.4123101234436035, + "logps/chosen": -530.2494506835938, + "logps/rejected": -610.2762451171875, + "loss": 0.5491, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.6920907497406006, + "rewards/margins": 0.8161090612411499, + "rewards/rejected": -3.508200168609619, "step": 1580 }, { "epoch": 0.83, - "grad_norm": 4.125, + "grad_norm": 6.8125, "learning_rate": 4.154745963060197e-07, - "logits/chosen": 1.6992515325546265, - "logits/rejected": 1.6688053607940674, - "logps/chosen": -487.58135986328125, - "logps/rejected": -601.6525268554688, - "loss": 0.4819, + "logits/chosen": -1.4255352020263672, + "logits/rejected": -1.3225051164627075, + "logps/chosen": -488.10015869140625, + "logps/rejected": -570.8845825195312, + "loss": 0.4793, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.4629733562469482, - "rewards/margins": 1.129070520401001, - "rewards/rejected": -3.59204363822937, + "rewards/chosen": -2.4687325954437256, + "rewards/margins": 1.027092695236206, + "rewards/rejected": -3.4958252906799316, "step": 1590 }, { "epoch": 0.84, - "grad_norm": 8.625, + "grad_norm": 7.84375, "learning_rate": 3.9060130015138863e-07, - "logits/chosen": 1.7125355005264282, - "logits/rejected": 1.963112235069275, - "logps/chosen": -518.4107055664062, - "logps/rejected": -589.79541015625, - "loss": 0.5464, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.61944580078125, - "rewards/margins": 0.8665763735771179, - "rewards/rejected": -3.4860222339630127, + "logits/chosen": -1.4365253448486328, + "logits/rejected": -1.4320509433746338, + "logps/chosen": -510.7242126464844, + "logps/rejected": -598.7286987304688, + "loss": 0.5008, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.55981707572937, + "rewards/margins": 0.9268480539321899, + "rewards/rejected": -3.4866650104522705, "step": 1600 }, { "epoch": 0.84, - "eval_logits/chosen": 2.2803421020507812, - "eval_logits/rejected": 2.4955389499664307, - "eval_logps/chosen": -516.6763305664062, - "eval_logps/rejected": -600.9805297851562, - "eval_loss": 0.4925038516521454, - "eval_rewards/accuracies": 0.7509999871253967, - "eval_rewards/chosen": -2.5137012004852295, - "eval_rewards/margins": 1.0834852457046509, - "eval_rewards/rejected": -3.597186326980591, - "eval_runtime": 449.4345, - "eval_samples_per_second": 4.45, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.4415297508239746, + "eval_logits/rejected": -1.3491688966751099, + "eval_logps/chosen": -517.2503662109375, + "eval_logps/rejected": -594.9102172851562, + "eval_loss": 0.4976504147052765, + "eval_rewards/accuracies": 0.7480000257492065, + "eval_rewards/chosen": -2.4554741382598877, + "eval_rewards/margins": 1.016426682472229, + "eval_rewards/rejected": -3.471900701522827, + "eval_runtime": 206.6827, + "eval_samples_per_second": 9.677, + "eval_steps_per_second": 0.605, "step": 1600 }, { "epoch": 0.84, - "grad_norm": 6.96875, + "grad_norm": 6.5625, "learning_rate": 3.664325260271953e-07, - "logits/chosen": 1.7167911529541016, - "logits/rejected": 1.6851142644882202, - "logps/chosen": -496.5397033691406, - "logps/rejected": -586.4037475585938, - "loss": 0.4926, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.6186347007751465, - "rewards/margins": 0.9449771046638489, - "rewards/rejected": -3.5636115074157715, + "logits/chosen": -1.4340312480926514, + "logits/rejected": -1.381696105003357, + "logps/chosen": -529.0896606445312, + "logps/rejected": -568.31787109375, + "loss": 0.5177, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3479690551757812, + "rewards/margins": 0.7791107892990112, + "rewards/rejected": -3.127079963684082, "step": 1610 }, { "epoch": 0.85, - "grad_norm": 9.75, + "grad_norm": 6.6875, "learning_rate": 3.429763461076677e-07, - "logits/chosen": 1.7546532154083252, - "logits/rejected": 1.7746025323867798, - "logps/chosen": -486.4024353027344, - "logps/rejected": -575.5360717773438, - "loss": 0.529, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.5622782707214355, - "rewards/margins": 0.9538448452949524, - "rewards/rejected": -3.5161232948303223, + "logits/chosen": -1.5787631273269653, + "logits/rejected": -1.4651563167572021, + "logps/chosen": -544.9623413085938, + "logps/rejected": -591.4544677734375, + "loss": 0.4948, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.447205066680908, + "rewards/margins": 0.9208120107650757, + "rewards/rejected": -3.3680167198181152, "step": 1620 }, { "epoch": 0.85, - "grad_norm": 12.375, + "grad_norm": 6.34375, "learning_rate": 3.202405945663556e-07, - "logits/chosen": 1.7500112056732178, - "logits/rejected": 1.6894385814666748, - "logps/chosen": -503.01727294921875, - "logps/rejected": -568.9468994140625, - "loss": 0.5315, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.571544647216797, - "rewards/margins": 0.7718718647956848, - "rewards/rejected": -3.343416690826416, + "logits/chosen": -1.5388927459716797, + "logits/rejected": -1.435225248336792, + "logps/chosen": -527.3236694335938, + "logps/rejected": -591.0799560546875, + "loss": 0.4897, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.576470136642456, + "rewards/margins": 0.910003662109375, + "rewards/rejected": -3.486473798751831, "step": 1630 }, { "epoch": 0.86, - "grad_norm": 10.4375, + "grad_norm": 6.65625, "learning_rate": 2.982328649595856e-07, - "logits/chosen": 1.3868858814239502, - "logits/rejected": 1.7116378545761108, - "logps/chosen": -516.5772705078125, - "logps/rejected": -601.1525268554688, - "loss": 0.4942, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.530223846435547, - "rewards/margins": 0.9710138440132141, - "rewards/rejected": -3.5012378692626953, + "logits/chosen": -1.4623661041259766, + "logits/rejected": -1.418001651763916, + "logps/chosen": -497.0453186035156, + "logps/rejected": -582.2621459960938, + "loss": 0.5222, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.3540682792663574, + "rewards/margins": 0.9974225759506226, + "rewards/rejected": -3.3514907360076904, "step": 1640 }, { "epoch": 0.86, - "grad_norm": 7.3125, + "grad_norm": 5.1875, "learning_rate": 2.7696050769026954e-07, - "logits/chosen": 1.8670718669891357, - "logits/rejected": 2.1231729984283447, - "logps/chosen": -528.6818237304688, - "logps/rejected": -598.3047485351562, - "loss": 0.5072, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.472653388977051, - "rewards/margins": 1.0345518589019775, - "rewards/rejected": -3.5072052478790283, + "logits/chosen": -1.4713447093963623, + "logits/rejected": -1.4519026279449463, + "logps/chosen": -513.2701416015625, + "logps/rejected": -621.521484375, + "loss": 0.5271, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.4599874019622803, + "rewards/margins": 1.0760111808776855, + "rewards/rejected": -3.535998582839966, "step": 1650 }, { "epoch": 0.87, - "grad_norm": 5.0625, + "grad_norm": 7.34375, "learning_rate": 2.564306275529341e-07, - "logits/chosen": 1.5151128768920898, - "logits/rejected": 1.9168331623077393, - "logps/chosen": -532.9383544921875, - "logps/rejected": -606.9699096679688, - "loss": 0.4644, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.455781936645508, - "rewards/margins": 1.086609125137329, - "rewards/rejected": -3.542390823364258, + "logits/chosen": -1.4529814720153809, + "logits/rejected": -1.3669501543045044, + "logps/chosen": -488.9100646972656, + "logps/rejected": -551.0786743164062, + "loss": 0.5005, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.357375144958496, + "rewards/margins": 0.9981120824813843, + "rewards/rejected": -3.355487108230591, "step": 1660 }, { "epoch": 0.87, - "grad_norm": 6.625, + "grad_norm": 7.34375, "learning_rate": 2.3665008136077332e-07, - "logits/chosen": 1.9394729137420654, - "logits/rejected": 1.6315431594848633, - "logps/chosen": -506.2244567871094, - "logps/rejected": -598.549072265625, - "loss": 0.508, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.4361801147460938, - "rewards/margins": 0.9389766454696655, - "rewards/rejected": -3.3751564025878906, + "logits/chosen": -1.577505350112915, + "logits/rejected": -1.4699662923812866, + "logps/chosen": -525.9560546875, + "logps/rejected": -618.0115966796875, + "loss": 0.4965, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.5162107944488525, + "rewards/margins": 1.0870957374572754, + "rewards/rejected": -3.603306293487549, "step": 1670 }, { "epoch": 0.88, - "grad_norm": 8.1875, + "grad_norm": 7.125, "learning_rate": 2.1762547565553293e-07, - "logits/chosen": 1.3666921854019165, - "logits/rejected": 1.5760104656219482, - "logps/chosen": -513.62646484375, - "logps/rejected": -581.5886840820312, - "loss": 0.5356, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.499276638031006, - "rewards/margins": 0.931717574596405, - "rewards/rejected": -3.4309940338134766, + "logits/chosen": -1.4847347736358643, + "logits/rejected": -1.3919851779937744, + "logps/chosen": -512.091552734375, + "logps/rejected": -616.5901489257812, + "loss": 0.459, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.374174118041992, + "rewards/margins": 1.1540172100067139, + "rewards/rejected": -3.528191328048706, "step": 1680 }, { "epoch": 0.88, - "grad_norm": 6.3125, + "grad_norm": 7.03125, "learning_rate": 1.993631645009747e-07, - "logits/chosen": 1.6720809936523438, - "logits/rejected": 1.689178228378296, - "logps/chosen": -539.5501098632812, - "logps/rejected": -618.399169921875, - "loss": 0.4615, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.4603431224823, - "rewards/margins": 1.1464570760726929, - "rewards/rejected": -3.6068005561828613, + "logits/chosen": -1.4473907947540283, + "logits/rejected": -1.346949815750122, + "logps/chosen": -485.31512451171875, + "logps/rejected": -593.9915161132812, + "loss": 0.4396, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.3521013259887695, + "rewards/margins": 1.203741192817688, + "rewards/rejected": -3.555842161178589, "step": 1690 }, { "epoch": 0.89, - "grad_norm": 6.3125, + "grad_norm": 6.84375, "learning_rate": 1.818692473606748e-07, - "logits/chosen": 1.6739721298217773, - "logits/rejected": 1.4370070695877075, - "logps/chosen": -520.3096923828125, - "logps/rejected": -609.7615966796875, - "loss": 0.5078, + "logits/chosen": -1.4863381385803223, + "logits/rejected": -1.3555645942687988, + "logps/chosen": -513.4937744140625, + "logps/rejected": -581.9398803710938, + "loss": 0.4654, "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.5945487022399902, - "rewards/margins": 1.0985443592071533, - "rewards/rejected": -3.6930930614471436, + "rewards/chosen": -2.393968105316162, + "rewards/margins": 1.0673789978027344, + "rewards/rejected": -3.4613468647003174, "step": 1700 }, { "epoch": 0.89, - "eval_logits/chosen": 2.3031065464019775, - "eval_logits/rejected": 2.5159618854522705, - "eval_logps/chosen": -515.6121826171875, - "eval_logps/rejected": -599.6626586914062, - "eval_loss": 0.49199017882347107, - "eval_rewards/accuracies": 0.7549999952316284, - "eval_rewards/chosen": -2.5030598640441895, - "eval_rewards/margins": 1.0809484720230103, - "eval_rewards/rejected": -3.58400821685791, - "eval_runtime": 449.2811, - "eval_samples_per_second": 4.452, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.440211534500122, + "eval_logits/rejected": -1.347786784172058, + "eval_logps/chosen": -516.6851806640625, + "eval_logps/rejected": -594.441650390625, + "eval_loss": 0.49758249521255493, + "eval_rewards/accuracies": 0.7509999871253967, + "eval_rewards/chosen": -2.449821710586548, + "eval_rewards/margins": 1.0173932313919067, + "eval_rewards/rejected": -3.467214822769165, + "eval_runtime": 206.7023, + "eval_samples_per_second": 9.676, + "eval_steps_per_second": 0.605, "step": 1700 }, { "epoch": 0.9, - "grad_norm": 5.9375, + "grad_norm": 8.8125, "learning_rate": 1.6514956706084885e-07, - "logits/chosen": 1.7647104263305664, - "logits/rejected": 1.7470661401748657, - "logps/chosen": -510.3075256347656, - "logps/rejected": -584.53076171875, - "loss": 0.479, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.5620687007904053, - "rewards/margins": 0.9733829498291016, - "rewards/rejected": -3.5354514122009277, + "logits/chosen": -1.5661511421203613, + "logits/rejected": -1.444165825843811, + "logps/chosen": -521.919189453125, + "logps/rejected": -586.3562622070312, + "loss": 0.4662, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.438523769378662, + "rewards/margins": 1.0026253461837769, + "rewards/rejected": -3.4411492347717285, "step": 1710 }, { "epoch": 0.9, - "grad_norm": 7.28125, + "grad_norm": 6.75, "learning_rate": 1.4920970783889737e-07, - "logits/chosen": 2.056183338165283, - "logits/rejected": 1.862571358680725, - "logps/chosen": -492.4064025878906, - "logps/rejected": -592.354248046875, - "loss": 0.4619, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.4460253715515137, - "rewards/margins": 1.100265622138977, - "rewards/rejected": -3.5462913513183594, + "logits/chosen": -1.4804189205169678, + "logits/rejected": -1.3565007448196411, + "logps/chosen": -517.4879150390625, + "logps/rejected": -590.8859252929688, + "loss": 0.4738, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.4466044902801514, + "rewards/margins": 0.9578853845596313, + "rewards/rejected": -3.4044899940490723, "step": 1720 }, { "epoch": 0.91, - "grad_norm": 6.125, + "grad_norm": 7.4375, "learning_rate": 1.340549934783164e-07, - "logits/chosen": 2.063292980194092, - "logits/rejected": 1.9309425354003906, - "logps/chosen": -481.56402587890625, - "logps/rejected": -577.6758422851562, - "loss": 0.5047, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.4606540203094482, - "rewards/margins": 1.0394421815872192, - "rewards/rejected": -3.500096082687378, + "logits/chosen": -1.4345219135284424, + "logits/rejected": -1.398148775100708, + "logps/chosen": -510.354736328125, + "logps/rejected": -606.5594482421875, + "loss": 0.473, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.461632490158081, + "rewards/margins": 1.1440181732177734, + "rewards/rejected": -3.6056504249572754, "step": 1730 }, { "epoch": 0.91, - "grad_norm": 6.59375, + "grad_norm": 5.3125, "learning_rate": 1.196904855305961e-07, - "logits/chosen": 1.4203405380249023, - "logits/rejected": 2.2476806640625, - "logps/chosen": -489.14306640625, - "logps/rejected": -624.822265625, - "loss": 0.4847, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.6157355308532715, - "rewards/margins": 1.1121528148651123, - "rewards/rejected": -3.7278881072998047, + "logits/chosen": -1.4843945503234863, + "logits/rejected": -1.3764415979385376, + "logps/chosen": -511.3872985839844, + "logps/rejected": -572.77783203125, + "loss": 0.5174, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.429408311843872, + "rewards/margins": 1.009290099143982, + "rewards/rejected": -3.4386982917785645, "step": 1740 }, { "epoch": 0.92, - "grad_norm": 6.25, + "grad_norm": 5.3125, "learning_rate": 1.0612098162470302e-07, - "logits/chosen": 2.0200281143188477, - "logits/rejected": 2.246073007583618, - "logps/chosen": -531.7589111328125, - "logps/rejected": -600.7255859375, - "loss": 0.5183, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.6462159156799316, - "rewards/margins": 1.0438668727874756, - "rewards/rejected": -3.6900830268859863, + "logits/chosen": -1.5372560024261475, + "logits/rejected": -1.3980759382247925, + "logps/chosen": -542.35400390625, + "logps/rejected": -590.6364135742188, + "loss": 0.5084, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.490891933441162, + "rewards/margins": 0.8898002505302429, + "rewards/rejected": -3.38069224357605, "step": 1750 }, { "epoch": 0.92, - "grad_norm": 6.59375, + "grad_norm": 5.4375, "learning_rate": 9.335101386471285e-08, - "logits/chosen": 1.8874080181121826, - "logits/rejected": 1.6664152145385742, - "logps/chosen": -526.5037231445312, - "logps/rejected": -639.0104370117188, - "loss": 0.4508, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.4748384952545166, - "rewards/margins": 1.4183952808380127, - "rewards/rejected": -3.89323353767395, + "logits/chosen": -1.5029523372650146, + "logits/rejected": -1.4057238101959229, + "logps/chosen": -491.0376892089844, + "logps/rejected": -596.9283447265625, + "loss": 0.4623, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.436763286590576, + "rewards/margins": 1.0951788425445557, + "rewards/rejected": -3.5319418907165527, "step": 1760 }, { "epoch": 0.93, - "grad_norm": 8.625, + "grad_norm": 6.3125, "learning_rate": 8.138484731612273e-08, - "logits/chosen": 1.7972297668457031, - "logits/rejected": 1.9597629308700562, - "logps/chosen": -512.8606567382812, - "logps/rejected": -580.591796875, - "loss": 0.5182, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.616697311401367, - "rewards/margins": 0.8005778193473816, - "rewards/rejected": -3.4172751903533936, + "logits/chosen": -1.4329261779785156, + "logits/rejected": -1.3377002477645874, + "logps/chosen": -494.4960021972656, + "logps/rejected": -598.6796264648438, + "loss": 0.4788, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.4506049156188965, + "rewards/margins": 1.1435714960098267, + "rewards/rejected": -3.5941765308380127, "step": 1770 }, { "epoch": 0.93, - "grad_norm": 8.5625, + "grad_norm": 4.75, "learning_rate": 7.022647858135501e-08, - "logits/chosen": 1.7437160015106201, - "logits/rejected": 1.9117034673690796, - "logps/chosen": -505.4112243652344, - "logps/rejected": -593.5841674804688, - "loss": 0.4775, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.5735456943511963, - "rewards/margins": 1.0469127893447876, - "rewards/rejected": -3.6204586029052734, + "logits/chosen": -1.4736502170562744, + "logits/rejected": -1.451183557510376, + "logps/chosen": -493.92706298828125, + "logps/rejected": -590.5465087890625, + "loss": 0.5134, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.3842227458953857, + "rewards/margins": 1.1362553834915161, + "rewards/rejected": -3.5204784870147705, "step": 1780 }, { "epoch": 0.94, - "grad_norm": 7.09375, + "grad_norm": 7.5625, "learning_rate": 5.987963446492384e-08, - "logits/chosen": 1.6604769229888916, - "logits/rejected": 1.3407343626022339, - "logps/chosen": -500.91851806640625, - "logps/rejected": -603.554931640625, - "loss": 0.4941, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -2.477205514907837, - "rewards/margins": 1.0765531063079834, - "rewards/rejected": -3.553758144378662, + "logits/chosen": -1.4801084995269775, + "logits/rejected": -1.4299700260162354, + "logps/chosen": -557.2908325195312, + "logps/rejected": -669.438720703125, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4419429302215576, + "rewards/margins": 1.1474285125732422, + "rewards/rejected": -3.5893714427948, "step": 1790 }, { "epoch": 0.94, - "grad_norm": 8.3125, + "grad_norm": 6.90625, "learning_rate": 5.034777072871394e-08, - "logits/chosen": 1.9356000423431396, - "logits/rejected": 1.387791395187378, - "logps/chosen": -526.5845947265625, - "logps/rejected": -615.2680053710938, - "loss": 0.4864, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.5586600303649902, - "rewards/margins": 1.0848100185394287, - "rewards/rejected": -3.643470048904419, + "logits/chosen": -1.4195510149002075, + "logits/rejected": -1.336871862411499, + "logps/chosen": -495.6595153808594, + "logps/rejected": -588.1336669921875, + "loss": 0.4854, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.4461569786071777, + "rewards/margins": 1.0101310014724731, + "rewards/rejected": -3.4562880992889404, "step": 1800 }, { "epoch": 0.94, - "eval_logits/chosen": 2.298227548599243, - "eval_logits/rejected": 2.511512517929077, - "eval_logps/chosen": -516.33203125, - "eval_logps/rejected": -600.2826538085938, - "eval_loss": 0.49213770031929016, - "eval_rewards/accuracies": 0.7549999952316284, - "eval_rewards/chosen": -2.510258197784424, - "eval_rewards/margins": 1.0799494981765747, - "eval_rewards/rejected": -3.590207576751709, - "eval_runtime": 449.189, - "eval_samples_per_second": 4.452, - "eval_steps_per_second": 0.278, + "eval_logits/chosen": -1.4365895986557007, + "eval_logits/rejected": -1.34409499168396, + "eval_logps/chosen": -516.9639892578125, + "eval_logps/rejected": -595.033935546875, + "eval_loss": 0.49747607111930847, + "eval_rewards/accuracies": 0.7480000257492065, + "eval_rewards/chosen": -2.4526102542877197, + "eval_rewards/margins": 1.0205274820327759, + "eval_rewards/rejected": -3.473137855529785, + "eval_runtime": 206.9741, + "eval_samples_per_second": 9.663, + "eval_steps_per_second": 0.604, "step": 1800 }, { "epoch": 0.95, - "grad_norm": 6.0625, + "grad_norm": 6.21875, "learning_rate": 4.163407093778243e-08, - "logits/chosen": 1.7006988525390625, - "logits/rejected": 1.82578444480896, - "logps/chosen": -488.0613708496094, - "logps/rejected": -576.953857421875, - "loss": 0.4602, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.424391984939575, - "rewards/margins": 1.1633926630020142, - "rewards/rejected": -3.5877845287323, + "logits/chosen": -1.4668951034545898, + "logits/rejected": -1.3559987545013428, + "logps/chosen": -506.38153076171875, + "logps/rejected": -566.1290283203125, + "loss": 0.5032, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.4424426555633545, + "rewards/margins": 1.029581904411316, + "rewards/rejected": -3.472024440765381, "step": 1810 }, { "epoch": 0.95, - "grad_norm": 7.28125, + "grad_norm": 7.46875, "learning_rate": 3.37414453970758e-08, - "logits/chosen": 1.462934136390686, - "logits/rejected": 1.820743203163147, - "logps/chosen": -547.2937622070312, - "logps/rejected": -571.7078857421875, - "loss": 0.5663, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.642662286758423, - "rewards/margins": 0.6700358986854553, - "rewards/rejected": -3.3126978874206543, + "logits/chosen": -1.5458420515060425, + "logits/rejected": -1.3794844150543213, + "logps/chosen": -514.1174926757812, + "logps/rejected": -563.9821166992188, + "loss": 0.5023, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.361403226852417, + "rewards/margins": 1.0842626094818115, + "rewards/rejected": -3.4456658363342285, "step": 1820 }, { "epoch": 0.96, - "grad_norm": 6.0625, + "grad_norm": 7.75, "learning_rate": 2.6672530179410183e-08, - "logits/chosen": 1.7907625436782837, - "logits/rejected": 2.133568048477173, - "logps/chosen": -494.36236572265625, - "logps/rejected": -579.52587890625, - "loss": 0.4597, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.377741813659668, - "rewards/margins": 1.148186445236206, - "rewards/rejected": -3.525928020477295, + "logits/chosen": -1.4805978536605835, + "logits/rejected": -1.4541041851043701, + "logps/chosen": -495.7486877441406, + "logps/rejected": -565.6021118164062, + "loss": 0.5193, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.410522699356079, + "rewards/margins": 0.8961440324783325, + "rewards/rejected": -3.306666612625122, "step": 1830 }, { "epoch": 0.96, - "grad_norm": 7.90625, + "grad_norm": 8.125, "learning_rate": 2.04296862450451e-08, - "logits/chosen": 1.5523009300231934, - "logits/rejected": 1.7978283166885376, - "logps/chosen": -545.3399658203125, - "logps/rejected": -616.2483520507812, - "loss": 0.5025, + "logits/chosen": -1.4646061658859253, + "logits/rejected": -1.4149842262268066, + "logps/chosen": -559.3178100585938, + "logps/rejected": -612.7825927734375, + "loss": 0.4901, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.570333957672119, - "rewards/margins": 1.0563600063323975, - "rewards/rejected": -3.6266937255859375, + "rewards/chosen": -2.692322254180908, + "rewards/margins": 0.8889071345329285, + "rewards/rejected": -3.5812296867370605, "step": 1840 }, { "epoch": 0.97, - "grad_norm": 4.40625, + "grad_norm": 5.6875, "learning_rate": 1.501499865314171e-08, - "logits/chosen": 1.7202790975570679, - "logits/rejected": 1.7891725301742554, - "logps/chosen": -553.4903564453125, - "logps/rejected": -618.0564575195312, - "loss": 0.4786, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.56032395362854, - "rewards/margins": 1.0462162494659424, - "rewards/rejected": -3.6065402030944824, + "logits/chosen": -1.4032434225082397, + "logits/rejected": -1.3378710746765137, + "logps/chosen": -570.8396606445312, + "logps/rejected": -627.8958129882812, + "loss": 0.4925, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5015716552734375, + "rewards/margins": 0.943825364112854, + "rewards/rejected": -3.445397138595581, "step": 1850 }, { "epoch": 0.97, - "grad_norm": 6.71875, + "grad_norm": 6.21875, "learning_rate": 1.0430275865371265e-08, - "logits/chosen": 1.5807321071624756, - "logits/rejected": 1.6664190292358398, - "logps/chosen": -529.1962890625, - "logps/rejected": -612.3154907226562, - "loss": 0.4764, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.5308566093444824, - "rewards/margins": 1.0107314586639404, - "rewards/rejected": -3.541588306427002, + "logits/chosen": -1.3695303201675415, + "logits/rejected": -1.251265525817871, + "logps/chosen": -500.8570251464844, + "logps/rejected": -570.6333618164062, + "loss": 0.4912, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.518383502960205, + "rewards/margins": 1.0404582023620605, + "rewards/rejected": -3.5588417053222656, "step": 1860 }, { "epoch": 0.98, - "grad_norm": 8.6875, + "grad_norm": 8.8125, "learning_rate": 6.677049141901315e-09, - "logits/chosen": 1.7959083318710327, - "logits/rejected": 1.607792854309082, - "logps/chosen": -529.5757446289062, - "logps/rejected": -606.5616455078125, - "loss": 0.5168, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -2.5840096473693848, - "rewards/margins": 0.845914363861084, - "rewards/rejected": -3.4299240112304688, + "logits/chosen": -1.4154746532440186, + "logits/rejected": -1.3926751613616943, + "logps/chosen": -495.06915283203125, + "logps/rejected": -597.0353393554688, + "loss": 0.4822, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.457148313522339, + "rewards/margins": 0.9585596919059753, + "rewards/rejected": -3.415708065032959, "step": 1870 }, { "epoch": 0.98, - "grad_norm": 10.1875, + "grad_norm": 5.3125, "learning_rate": 3.756572029968708e-09, - "logits/chosen": 1.670596718788147, - "logits/rejected": 1.6237586736679077, - "logps/chosen": -534.3974609375, - "logps/rejected": -604.6273803710938, - "loss": 0.4933, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.6218812465667725, - "rewards/margins": 0.8884350061416626, - "rewards/rejected": -3.5103163719177246, + "logits/chosen": -1.5453914403915405, + "logits/rejected": -1.4504231214523315, + "logps/chosen": -514.4291381835938, + "logps/rejected": -585.8607177734375, + "loss": 0.4866, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.3940634727478027, + "rewards/margins": 1.0368168354034424, + "rewards/rejected": -3.430880308151245, "step": 1880 }, { "epoch": 0.99, - "grad_norm": 6.5, + "grad_norm": 6.1875, "learning_rate": 1.6698199452053199e-09, - "logits/chosen": 1.671463966369629, - "logits/rejected": 1.5730221271514893, - "logps/chosen": -530.3741455078125, - "logps/rejected": -586.624755859375, - "loss": 0.4768, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -2.395474672317505, - "rewards/margins": 1.0592764616012573, - "rewards/rejected": -3.4547507762908936, + "logits/chosen": -1.4837844371795654, + "logits/rejected": -1.3669326305389404, + "logps/chosen": -526.9951171875, + "logps/rejected": -609.9420166015625, + "loss": 0.4737, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.426424503326416, + "rewards/margins": 1.0135810375213623, + "rewards/rejected": -3.4400055408477783, "step": 1890 }, { "epoch": 0.99, - "grad_norm": 7.0625, + "grad_norm": 6.875, "learning_rate": 4.1748984585560094e-10, - "logits/chosen": 1.872854232788086, - "logits/rejected": 1.524829626083374, - "logps/chosen": -533.9439697265625, - "logps/rejected": -585.8349609375, - "loss": 0.5211, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.592789649963379, - "rewards/margins": 0.8945242762565613, - "rewards/rejected": -3.487313747406006, + "logits/chosen": -1.4921762943267822, + "logits/rejected": -1.3556934595108032, + "logps/chosen": -516.9928588867188, + "logps/rejected": -590.8782958984375, + "loss": 0.4879, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5034995079040527, + "rewards/margins": 1.0504939556121826, + "rewards/rejected": -3.5539937019348145, "step": 1900 }, { "epoch": 0.99, - "eval_logits/chosen": 2.297112464904785, - "eval_logits/rejected": 2.5098061561584473, - "eval_logps/chosen": -516.2830810546875, - "eval_logps/rejected": -600.2637939453125, - "eval_loss": 0.49207010865211487, - "eval_rewards/accuracies": 0.7549999952316284, - "eval_rewards/chosen": -2.509768486022949, - "eval_rewards/margins": 1.0802510976791382, - "eval_rewards/rejected": -3.590019464492798, - "eval_runtime": 451.0621, - "eval_samples_per_second": 4.434, - "eval_steps_per_second": 0.277, + "eval_logits/chosen": -1.4358515739440918, + "eval_logits/rejected": -1.3432255983352661, + "eval_logps/chosen": -517.0148315429688, + "eval_logps/rejected": -595.1220703125, + "eval_loss": 0.49738776683807373, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -2.4531185626983643, + "eval_rewards/margins": 1.020900845527649, + "eval_rewards/rejected": -3.474019765853882, + "eval_runtime": 206.2288, + "eval_samples_per_second": 9.698, + "eval_steps_per_second": 0.606, "step": 1900 }, { "epoch": 1.0, - "grad_norm": 7.0, + "grad_norm": 6.90625, "learning_rate": 0.0, - "logits/chosen": 1.4962704181671143, - "logits/rejected": 1.806806206703186, - "logps/chosen": -554.9898681640625, - "logps/rejected": -627.3148803710938, - "loss": 0.486, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.5435588359832764, - "rewards/margins": 1.075156569480896, - "rewards/rejected": -3.618715763092041, + "logits/chosen": -1.5044982433319092, + "logits/rejected": -1.4231712818145752, + "logps/chosen": -519.6868896484375, + "logps/rejected": -603.6512451171875, + "loss": 0.4895, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5049242973327637, + "rewards/margins": 0.8985089063644409, + "rewards/rejected": -3.403432846069336, "step": 1910 }, { "epoch": 1.0, "step": 1910, "total_flos": 0.0, - "train_loss": 0.528951391499704, - "train_runtime": 37787.7404, - "train_samples_per_second": 1.618, - "train_steps_per_second": 0.051 + "train_loss": 0.5312882837824796, + "train_runtime": 19005.9968, + "train_samples_per_second": 3.217, + "train_steps_per_second": 0.1 } ], "logging_steps": 10,