diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,12 +10,12 @@ "log_history": [ { "epoch": 0.0, - "grad_norm": 50.348802908690914, + "grad_norm": 71.7172276228935, "learning_rate": 5.208333333333333e-09, - "logits/chosen": -1.8382859230041504, - "logits/rejected": -1.788834810256958, - "logps/chosen": -119.0692138671875, - "logps/rejected": -76.35714721679688, + "logits/chosen": -1.4981693029403687, + "logits/rejected": -1.5654948949813843, + "logps/chosen": -257.4665222167969, + "logps/rejected": -99.09321594238281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -25,1581 +25,1581 @@ }, { "epoch": 0.01, - "grad_norm": 46.56256136532365, + "grad_norm": 72.96425106358538, "learning_rate": 5.208333333333333e-08, - "logits/chosen": -1.6623330116271973, - "logits/rejected": -1.5512146949768066, - "logps/chosen": -129.29541015625, - "logps/rejected": -82.82817840576172, - "loss": 0.6931, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.00018554604321252555, - "rewards/margins": 0.00036421266850084066, - "rewards/rejected": -0.0005497586098499596, + "logits/chosen": -1.875927209854126, + "logits/rejected": -1.8456478118896484, + "logps/chosen": -326.4024658203125, + "logps/rejected": -141.96922302246094, + "loss": 0.6926, + "rewards/accuracies": 0.5138888955116272, + "rewards/chosen": 4.941297447658144e-05, + "rewards/margins": 0.001581083401106298, + "rewards/rejected": -0.0015316703356802464, "step": 10 }, { "epoch": 0.02, - "grad_norm": 47.9759657771044, + "grad_norm": 67.94798943175728, "learning_rate": 1.0416666666666667e-07, - "logits/chosen": -1.6969773769378662, - "logits/rejected": -1.5268709659576416, - "logps/chosen": -140.09909057617188, - "logps/rejected": -80.9607162475586, - "loss": 0.6866, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.013138635084033012, - "rewards/margins": 0.015350555069744587, - "rewards/rejected": -0.002211919752880931, + "logits/chosen": -1.8996143341064453, + "logits/rejected": -1.8235642910003662, + "logps/chosen": -323.2792663574219, + "logps/rejected": -135.71456909179688, + "loss": 0.6788, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.013530316762626171, + "rewards/margins": 0.030556190758943558, + "rewards/rejected": -0.01702587492763996, "step": 20 }, { "epoch": 0.03, - "grad_norm": 37.210502761575135, + "grad_norm": 50.35196678274095, "learning_rate": 1.5624999999999999e-07, - "logits/chosen": -1.7122924327850342, - "logits/rejected": -1.6340242624282837, - "logps/chosen": -119.13212585449219, - "logps/rejected": -84.83552551269531, - "loss": 0.657, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.05337844043970108, - "rewards/margins": 0.0730198472738266, - "rewards/rejected": -0.01964140310883522, + "logits/chosen": -1.9016050100326538, + "logits/rejected": -1.8865772485733032, + "logps/chosen": -306.1601257324219, + "logps/rejected": -138.81069946289062, + "loss": 0.6075, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.08184734731912613, + "rewards/margins": 0.19270142912864685, + "rewards/rejected": -0.11085411161184311, "step": 30 }, { "epoch": 0.04, - "grad_norm": 38.29612001708921, + "grad_norm": 24.96243157624135, "learning_rate": 2.0833333333333333e-07, - "logits/chosen": -1.8123095035552979, - "logits/rejected": -1.716663122177124, - "logps/chosen": -130.39593505859375, - "logps/rejected": -98.6071548461914, - "loss": 0.5915, - "rewards/accuracies": 0.90625, - "rewards/chosen": 0.1293005496263504, - "rewards/margins": 0.2863259017467499, - "rewards/rejected": -0.15702535212039948, + "logits/chosen": -1.9789104461669922, + "logits/rejected": -2.000354766845703, + "logps/chosen": -269.0250549316406, + "logps/rejected": -152.91973876953125, + "loss": 0.5009, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.12138372659683228, + "rewards/margins": 0.45498305559158325, + "rewards/rejected": -0.3335992991924286, "step": 40 }, { "epoch": 0.05, - "grad_norm": 25.169270167372616, + "grad_norm": 14.96717932132009, "learning_rate": 2.604166666666667e-07, - "logits/chosen": -1.6766433715820312, - "logits/rejected": -1.6306852102279663, - "logps/chosen": -127.97342681884766, - "logps/rejected": -136.70712280273438, - "loss": 0.484, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": 0.0137257669121027, - "rewards/margins": 0.6059251427650452, - "rewards/rejected": -0.5921992063522339, + "logits/chosen": -2.158332109451294, + "logits/rejected": -2.16444993019104, + "logps/chosen": -280.4437255859375, + "logps/rejected": -197.60487365722656, + "loss": 0.4282, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.15608879923820496, + "rewards/margins": 0.867284893989563, + "rewards/rejected": -0.7111960053443909, "step": 50 }, { "epoch": 0.06, - "grad_norm": 26.650306729386905, + "grad_norm": 10.138930051409568, "learning_rate": 3.1249999999999997e-07, - "logits/chosen": -1.6139761209487915, - "logits/rejected": -1.610743761062622, - "logps/chosen": -173.83758544921875, - "logps/rejected": -218.09707641601562, - "loss": 0.3946, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -0.24665935337543488, - "rewards/margins": 1.0333704948425293, - "rewards/rejected": -1.280029535293579, + "logits/chosen": -2.3020527362823486, + "logits/rejected": -2.289559841156006, + "logps/chosen": -255.1080780029297, + "logps/rejected": -206.7790069580078, + "loss": 0.3883, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.01837761141359806, + "rewards/margins": 0.9224237203598022, + "rewards/rejected": -0.9408015012741089, "step": 60 }, { "epoch": 0.07, - "grad_norm": 32.401140738769385, + "grad_norm": 9.627361782454214, "learning_rate": 3.645833333333333e-07, - "logits/chosen": -1.4975354671478271, - "logits/rejected": -1.4718661308288574, - "logps/chosen": -191.6230010986328, - "logps/rejected": -297.1890563964844, - "loss": 0.3474, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.6469367146492004, - "rewards/margins": 1.586439847946167, - "rewards/rejected": -2.2333762645721436, + "logits/chosen": -2.3991897106170654, + "logits/rejected": -2.4038243293762207, + "logps/chosen": -295.4974670410156, + "logps/rejected": -273.0234375, + "loss": 0.3499, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15266235172748566, + "rewards/margins": 1.2677773237228394, + "rewards/rejected": -1.4204397201538086, "step": 70 }, { "epoch": 0.08, - "grad_norm": 31.488475681745037, + "grad_norm": 14.313296662619264, "learning_rate": 4.1666666666666667e-07, - "logits/chosen": -1.4592185020446777, - "logits/rejected": -1.3416706323623657, - "logps/chosen": -249.4734649658203, - "logps/rejected": -405.06207275390625, - "loss": 0.2966, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -1.1794850826263428, - "rewards/margins": 2.0616517066955566, - "rewards/rejected": -3.2411365509033203, + "logits/chosen": -2.5073745250701904, + "logits/rejected": -2.553334951400757, + "logps/chosen": -332.49945068359375, + "logps/rejected": -319.8727722167969, + "loss": 0.3368, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.5055555105209351, + "rewards/margins": 1.468584418296814, + "rewards/rejected": -1.9741401672363281, "step": 80 }, { "epoch": 0.09, - "grad_norm": 34.896761047309674, + "grad_norm": 23.431306250567218, "learning_rate": 4.6874999999999996e-07, - "logits/chosen": -1.2935254573822021, - "logits/rejected": -1.215850591659546, - "logps/chosen": -236.42935180664062, - "logps/rejected": -427.88079833984375, - "loss": 0.2792, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.0514061450958252, - "rewards/margins": 2.4284262657165527, - "rewards/rejected": -3.479832410812378, + "logits/chosen": -2.611389636993408, + "logits/rejected": -2.643505334854126, + "logps/chosen": -355.7128601074219, + "logps/rejected": -361.992431640625, + "loss": 0.309, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.61676025390625, + "rewards/margins": 1.7021185159683228, + "rewards/rejected": -2.318878412246704, "step": 90 }, { "epoch": 0.1, - "grad_norm": 40.95979792917679, + "grad_norm": 12.356735796295311, "learning_rate": 4.999732492681437e-07, - "logits/chosen": -1.425644040107727, - "logits/rejected": -1.2574630975723267, - "logps/chosen": -236.2029571533203, - "logps/rejected": -470.1878967285156, - "loss": 0.2555, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -1.0542919635772705, - "rewards/margins": 2.8619871139526367, - "rewards/rejected": -3.916278839111328, + "logits/chosen": -2.621342420578003, + "logits/rejected": -2.6476845741271973, + "logps/chosen": -319.81988525390625, + "logps/rejected": -322.2472229003906, + "loss": 0.3047, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.6059430241584778, + "rewards/margins": 1.4836266040802002, + "rewards/rejected": -2.089569568634033, "step": 100 }, { "epoch": 0.1, - "eval_logits/chosen": -1.394241213798523, - "eval_logits/rejected": -1.3358198404312134, - "eval_logps/chosen": -800.6121215820312, - "eval_logps/rejected": -884.5335083007812, - "eval_loss": 1.4172171354293823, - "eval_rewards/accuracies": 0.58984375, - "eval_rewards/chosen": -4.888356685638428, - "eval_rewards/margins": 0.7817266583442688, - "eval_rewards/rejected": -5.670083045959473, - "eval_runtime": 97.8793, - "eval_samples_per_second": 20.433, - "eval_steps_per_second": 0.327, + "eval_logits/chosen": -2.44050931930542, + "eval_logits/rejected": -2.3863425254821777, + "eval_logps/chosen": -361.08013916015625, + "eval_logps/rejected": -337.77484130859375, + "eval_loss": 0.8551393747329712, + "eval_rewards/accuracies": 0.3203125, + "eval_rewards/chosen": -0.49303656816482544, + "eval_rewards/margins": -0.29053980112075806, + "eval_rewards/rejected": -0.20249679684638977, + "eval_runtime": 97.3683, + "eval_samples_per_second": 20.541, + "eval_steps_per_second": 0.329, "step": 100 }, { "epoch": 0.12, - "grad_norm": 32.26210278499666, + "grad_norm": 12.540254444715838, "learning_rate": 4.996723692767926e-07, - "logits/chosen": -1.300011157989502, - "logits/rejected": -1.1855316162109375, - "logps/chosen": -253.3606719970703, - "logps/rejected": -546.6198120117188, - "loss": 0.2322, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.3233938217163086, - "rewards/margins": 3.418271541595459, - "rewards/rejected": -4.741665840148926, + "logits/chosen": -2.7024824619293213, + "logits/rejected": -2.732452154159546, + "logps/chosen": -409.3314208984375, + "logps/rejected": -413.891357421875, + "loss": 0.2958, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.829090416431427, + "rewards/margins": 2.046325206756592, + "rewards/rejected": -2.875415325164795, "step": 110 }, { "epoch": 0.13, - "grad_norm": 47.80125591905201, + "grad_norm": 15.24853315289221, "learning_rate": 4.990375746213598e-07, - "logits/chosen": -1.4404656887054443, - "logits/rejected": -1.311632513999939, - "logps/chosen": -286.67169189453125, - "logps/rejected": -644.265625, - "loss": 0.2299, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -1.5356190204620361, - "rewards/margins": 4.171111583709717, - "rewards/rejected": -5.706730842590332, + "logits/chosen": -2.661564350128174, + "logits/rejected": -2.7124838829040527, + "logps/chosen": -378.96533203125, + "logps/rejected": -433.1641540527344, + "loss": 0.2884, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9071281552314758, + "rewards/margins": 2.1523900032043457, + "rewards/rejected": -3.059518337249756, "step": 120 }, { "epoch": 0.14, - "grad_norm": 103.21873941787233, + "grad_norm": 20.41295055769396, "learning_rate": 4.980697142834314e-07, - "logits/chosen": -1.4422317743301392, - "logits/rejected": -1.295238733291626, - "logps/chosen": -258.7148742675781, - "logps/rejected": -593.8055419921875, - "loss": 0.2288, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.2843689918518066, - "rewards/margins": 3.8400402069091797, - "rewards/rejected": -5.1244096755981445, + "logits/chosen": -2.693155527114868, + "logits/rejected": -2.738525390625, + "logps/chosen": -401.2513122558594, + "logps/rejected": -464.12353515625, + "loss": 0.2528, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.0387437343597412, + "rewards/margins": 2.259091854095459, + "rewards/rejected": -3.2978355884552, "step": 130 }, { "epoch": 0.15, - "grad_norm": 40.83439235144051, + "grad_norm": 27.736891370306594, "learning_rate": 4.967700826904229e-07, - "logits/chosen": -1.4604966640472412, - "logits/rejected": -1.289422631263733, - "logps/chosen": -291.0477600097656, - "logps/rejected": -753.3111572265625, - "loss": 0.2219, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.6053016185760498, - "rewards/margins": 5.150030612945557, - "rewards/rejected": -6.755332946777344, + "logits/chosen": -2.6829416751861572, + "logits/rejected": -2.743159770965576, + "logps/chosen": -405.8708801269531, + "logps/rejected": -488.6322326660156, + "loss": 0.2529, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.1785838603973389, + "rewards/margins": 2.3818531036376953, + "rewards/rejected": -3.560436964035034, "step": 140 }, { "epoch": 0.16, - "grad_norm": 35.47272090007365, + "grad_norm": 31.193716735224292, "learning_rate": 4.951404179843962e-07, - "logits/chosen": -1.439012885093689, - "logits/rejected": -1.197693109512329, - "logps/chosen": -317.4605407714844, - "logps/rejected": -829.1416015625, - "loss": 0.1987, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.6784883737564087, - "rewards/margins": 5.721285820007324, - "rewards/rejected": -7.399774074554443, + "logits/chosen": -2.4764091968536377, + "logits/rejected": -2.481703281402588, + "logps/chosen": -504.5975036621094, + "logps/rejected": -567.6092529296875, + "loss": 0.2358, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.8161567449569702, + "rewards/margins": 2.5039477348327637, + "rewards/rejected": -4.320104598999023, "step": 150 }, { "epoch": 0.17, - "grad_norm": 41.95405531315011, + "grad_norm": 16.84967305265633, "learning_rate": 4.931828996974498e-07, - "logits/chosen": -1.3928442001342773, - "logits/rejected": -1.1220872402191162, - "logps/chosen": -289.9758605957031, - "logps/rejected": -711.052001953125, - "loss": 0.21, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.6032127141952515, - "rewards/margins": 4.786148548126221, - "rewards/rejected": -6.389361381530762, + "logits/chosen": -2.4284980297088623, + "logits/rejected": -2.4476141929626465, + "logps/chosen": -411.03546142578125, + "logps/rejected": -520.0714721679688, + "loss": 0.2377, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.3872876167297363, + "rewards/margins": 2.5429630279541016, + "rewards/rejected": -3.930250883102417, "step": 160 }, { "epoch": 0.18, - "grad_norm": 44.30975250132851, + "grad_norm": 25.1927910072169, "learning_rate": 4.909001458367866e-07, - "logits/chosen": -1.4432841539382935, - "logits/rejected": -1.237818956375122, - "logps/chosen": -394.95751953125, - "logps/rejected": -859.7473754882812, - "loss": 0.1885, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.7817580699920654, - "rewards/margins": 5.132468223571777, - "rewards/rejected": -7.914226531982422, + "logits/chosen": -2.2736942768096924, + "logits/rejected": -2.292642831802368, + "logps/chosen": -452.697998046875, + "logps/rejected": -563.0865478515625, + "loss": 0.2128, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.6114835739135742, + "rewards/margins": 2.781120538711548, + "rewards/rejected": -4.392604351043701, "step": 170 }, { "epoch": 0.19, - "grad_norm": 42.31623927536496, + "grad_norm": 18.107740978814483, "learning_rate": 4.882952093833627e-07, - "logits/chosen": -1.450234055519104, - "logits/rejected": -1.2738139629364014, - "logps/chosen": -342.06048583984375, - "logps/rejected": -785.0755004882812, - "loss": 0.2045, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.051614999771118, - "rewards/margins": 4.953377723693848, - "rewards/rejected": -7.0049920082092285, + "logits/chosen": -2.1593220233917236, + "logits/rejected": -2.1532785892486572, + "logps/chosen": -509.1954650878906, + "logps/rejected": -644.2286376953125, + "loss": 0.2037, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.7663100957870483, + "rewards/margins": 3.2817070484161377, + "rewards/rejected": -5.0480170249938965, "step": 180 }, { "epoch": 0.2, - "grad_norm": 39.855754537956905, + "grad_norm": 22.004607228534194, "learning_rate": 4.853715742087946e-07, - "logits/chosen": -1.3248379230499268, - "logits/rejected": -1.09066903591156, - "logps/chosen": -368.9568786621094, - "logps/rejected": -883.1036987304688, - "loss": 0.1906, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.428023099899292, - "rewards/margins": 5.649343967437744, - "rewards/rejected": -8.077366828918457, + "logits/chosen": -2.1366734504699707, + "logits/rejected": -2.1492080688476562, + "logps/chosen": -469.94976806640625, + "logps/rejected": -587.4523315429688, + "loss": 0.2059, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6838188171386719, + "rewards/margins": 2.9172627925872803, + "rewards/rejected": -4.601081848144531, "step": 190 }, { "epoch": 0.21, - "grad_norm": 43.44905869134368, + "grad_norm": 23.839892866837005, "learning_rate": 4.821331504159906e-07, - "logits/chosen": -1.122195839881897, - "logits/rejected": -0.9366437196731567, - "logps/chosen": -330.4066467285156, - "logps/rejected": -748.525634765625, - "loss": 0.1854, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.157914161682129, - "rewards/margins": 4.577654838562012, - "rewards/rejected": -6.735569000244141, + "logits/chosen": -2.0906856060028076, + "logits/rejected": -2.1124696731567383, + "logps/chosen": -466.5874938964844, + "logps/rejected": -608.2426147460938, + "loss": 0.1861, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.8450790643692017, + "rewards/margins": 3.033769130706787, + "rewards/rejected": -4.878848552703857, "step": 200 }, { "epoch": 0.21, - "eval_logits/chosen": -1.1852705478668213, - "eval_logits/rejected": -1.108778953552246, - "eval_logps/chosen": -926.8517456054688, - "eval_logps/rejected": -1050.1199951171875, - "eval_loss": 1.6753935813903809, - "eval_rewards/accuracies": 0.62109375, - "eval_rewards/chosen": -6.15075159072876, - "eval_rewards/margins": 1.1751970052719116, - "eval_rewards/rejected": -7.325948715209961, - "eval_runtime": 97.5501, - "eval_samples_per_second": 20.502, - "eval_steps_per_second": 0.328, + "eval_logits/chosen": -1.5417730808258057, + "eval_logits/rejected": -1.5107452869415283, + "eval_logps/chosen": -450.2716369628906, + "eval_logps/rejected": -421.0933532714844, + "eval_loss": 1.0494639873504639, + "eval_rewards/accuracies": 0.38671875, + "eval_rewards/chosen": -1.3849513530731201, + "eval_rewards/margins": -0.3492693305015564, + "eval_rewards/rejected": -1.0356820821762085, + "eval_runtime": 97.2226, + "eval_samples_per_second": 20.571, + "eval_steps_per_second": 0.329, "step": 200 }, { "epoch": 0.22, - "grad_norm": 36.690007199237776, + "grad_norm": 21.766108201350445, "learning_rate": 4.785842691097342e-07, - "logits/chosen": -1.129880666732788, - "logits/rejected": -0.8643050193786621, - "logps/chosen": -371.864013671875, - "logps/rejected": -938.4737548828125, - "loss": 0.174, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.416229009628296, - "rewards/margins": 6.226431369781494, - "rewards/rejected": -8.642660140991211, + "logits/chosen": -1.914615273475647, + "logits/rejected": -1.9393634796142578, + "logps/chosen": -541.3153076171875, + "logps/rejected": -766.5374145507812, + "loss": 0.1892, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.552523374557495, + "rewards/margins": 3.7902703285217285, + "rewards/rejected": -6.342793941497803, "step": 210 }, { "epoch": 0.23, - "grad_norm": 50.37285904600086, + "grad_norm": 37.7465298870174, "learning_rate": 4.7472967660421603e-07, - "logits/chosen": -1.2257055044174194, - "logits/rejected": -0.964257538318634, - "logps/chosen": -339.07196044921875, - "logps/rejected": -897.4797973632812, - "loss": 0.1776, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.065497875213623, - "rewards/margins": 6.163290023803711, - "rewards/rejected": -8.228787422180176, + "logits/chosen": -2.004246234893799, + "logits/rejected": -2.0251355171203613, + "logps/chosen": -528.6552734375, + "logps/rejected": -743.7687377929688, + "loss": 0.1798, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.033097743988037, + "rewards/margins": 4.051178932189941, + "rewards/rejected": -6.0842766761779785, "step": 220 }, { "epoch": 0.24, - "grad_norm": 41.9657054722659, + "grad_norm": 37.93991066707489, "learning_rate": 4.705745280752585e-07, - "logits/chosen": -1.2483174800872803, - "logits/rejected": -0.9970757365226746, - "logps/chosen": -328.69775390625, - "logps/rejected": -740.644775390625, - "loss": 0.1898, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.9448823928833008, - "rewards/margins": 4.71053409576416, - "rewards/rejected": -6.655416965484619, + "logits/chosen": -1.9027751684188843, + "logits/rejected": -1.8593933582305908, + "logps/chosen": -495.2493591308594, + "logps/rejected": -688.589599609375, + "loss": 0.1765, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -1.9664674997329712, + "rewards/margins": 3.703148603439331, + "rewards/rejected": -5.66961669921875, "step": 230 }, { "epoch": 0.25, - "grad_norm": 44.41737576885688, + "grad_norm": 19.738306178674453, "learning_rate": 4.6612438066572555e-07, - "logits/chosen": -1.1642967462539673, - "logits/rejected": -0.9903414845466614, - "logps/chosen": -318.5704650878906, - "logps/rejected": -773.8700561523438, - "loss": 0.1919, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.9761199951171875, - "rewards/margins": 5.03547477722168, - "rewards/rejected": -7.011595249176025, + "logits/chosen": -1.7326488494873047, + "logits/rejected": -1.585479497909546, + "logps/chosen": -627.2045288085938, + "logps/rejected": -877.15234375, + "loss": 0.164, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -3.1001508235931396, + "rewards/margins": 4.346390247344971, + "rewards/rejected": -7.446540832519531, "step": 240 }, { "epoch": 0.26, - "grad_norm": 35.762754087270444, + "grad_norm": 20.99257000654717, "learning_rate": 4.6138518605333664e-07, - "logits/chosen": -1.106542706489563, - "logits/rejected": -0.8413636088371277, - "logps/chosen": -434.7865295410156, - "logps/rejected": -1112.9827880859375, - "loss": 0.1773, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.952258825302124, - "rewards/margins": 7.3310394287109375, - "rewards/rejected": -10.283297538757324, + "logits/chosen": -1.7733532190322876, + "logits/rejected": -1.5429089069366455, + "logps/chosen": -534.9285888671875, + "logps/rejected": -766.8218383789062, + "loss": 0.1779, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.3250820636749268, + "rewards/margins": 4.1490068435668945, + "rewards/rejected": -6.4740891456604, "step": 250 }, { "epoch": 0.27, - "grad_norm": 35.92606291569474, + "grad_norm": 24.636105883207083, "learning_rate": 4.5636328249082514e-07, - "logits/chosen": -0.869143009185791, - "logits/rejected": -0.5741620063781738, - "logps/chosen": -467.0962829589844, - "logps/rejected": -1112.403564453125, - "loss": 0.162, - "rewards/accuracies": 0.875, - "rewards/chosen": -3.3054141998291016, - "rewards/margins": 7.030813694000244, - "rewards/rejected": -10.336227416992188, + "logits/chosen": -1.7743794918060303, + "logits/rejected": -1.6784734725952148, + "logps/chosen": -502.83929443359375, + "logps/rejected": -686.4469604492188, + "loss": 0.1666, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.020374298095703, + "rewards/margins": 3.5390701293945312, + "rewards/rejected": -5.559444427490234, "step": 260 }, { "epoch": 0.28, - "grad_norm": 53.50950070995327, + "grad_norm": 26.442049627508165, "learning_rate": 4.510653863290871e-07, - "logits/chosen": -0.9721959829330444, - "logits/rejected": -0.6126078367233276, - "logps/chosen": -439.41436767578125, - "logps/rejected": -1052.373291015625, - "loss": 0.1814, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.9460256099700928, - "rewards/margins": 6.7303972244262695, - "rewards/rejected": -9.676422119140625, + "logits/chosen": -1.8089313507080078, + "logits/rejected": -1.504288673400879, + "logps/chosen": -553.7681884765625, + "logps/rejected": -773.63671875, + "loss": 0.163, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3301241397857666, + "rewards/margins": 4.031432628631592, + "rewards/rejected": -6.361557483673096, "step": 270 }, { "epoch": 0.29, - "grad_norm": 39.535885045091504, + "grad_norm": 21.752850388248913, "learning_rate": 4.4549858303465737e-07, - "logits/chosen": -1.0113680362701416, - "logits/rejected": -0.6861897706985474, - "logps/chosen": -475.51251220703125, - "logps/rejected": -1126.9036865234375, - "loss": 0.1758, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -3.39562726020813, - "rewards/margins": 7.080619812011719, - "rewards/rejected": -10.47624683380127, + "logits/chosen": -1.792066216468811, + "logits/rejected": -1.5596303939819336, + "logps/chosen": -521.0010986328125, + "logps/rejected": -791.414794921875, + "loss": 0.1514, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.1834633350372314, + "rewards/margins": 4.415928363800049, + "rewards/rejected": -6.599390983581543, "step": 280 }, { "epoch": 0.3, - "grad_norm": 27.716897646836834, + "grad_norm": 21.62051159722618, "learning_rate": 4.396703177135261e-07, - "logits/chosen": -0.9551790952682495, - "logits/rejected": -0.6880333423614502, - "logps/chosen": -410.09857177734375, - "logps/rejected": -904.8678588867188, - "loss": 0.1956, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.7216389179229736, - "rewards/margins": 5.489422798156738, - "rewards/rejected": -8.21106243133545, + "logits/chosen": -1.573697805404663, + "logits/rejected": -1.1857482194900513, + "logps/chosen": -611.6795654296875, + "logps/rejected": -855.2744140625, + "loss": 0.1493, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.884495258331299, + "rewards/margins": 4.296299934387207, + "rewards/rejected": -7.180795192718506, "step": 290 }, { "epoch": 0.31, - "grad_norm": 39.15676652312449, + "grad_norm": 16.812667194402408, "learning_rate": 4.335883851539693e-07, - "logits/chosen": -1.0925099849700928, - "logits/rejected": -0.763215184211731, - "logps/chosen": -293.86553955078125, - "logps/rejected": -759.0989990234375, - "loss": 0.1799, + "logits/chosen": -1.738490343093872, + "logits/rejected": -1.7563718557357788, + "logps/chosen": -494.66253662109375, + "logps/rejected": -731.6812744140625, + "loss": 0.1608, "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -1.6122547388076782, - "rewards/margins": 5.217433452606201, - "rewards/rejected": -6.82968807220459, + "rewards/chosen": -1.9538425207138062, + "rewards/margins": 3.9912962913513184, + "rewards/rejected": -5.945138931274414, "step": 300 }, { "epoch": 0.31, - "eval_logits/chosen": -1.1109943389892578, - "eval_logits/rejected": -1.0192608833312988, - "eval_logps/chosen": -903.3418579101562, - "eval_logps/rejected": -1015.4615478515625, - "eval_loss": 1.5589935779571533, - "eval_rewards/accuracies": 0.59765625, - "eval_rewards/chosen": -5.915654182434082, - "eval_rewards/margins": 1.0637093782424927, - "eval_rewards/rejected": -6.979363441467285, - "eval_runtime": 97.4807, - "eval_samples_per_second": 20.517, - "eval_steps_per_second": 0.328, + "eval_logits/chosen": -1.4367032051086426, + "eval_logits/rejected": -1.4021562337875366, + "eval_logps/chosen": -454.9446105957031, + "eval_logps/rejected": -422.9683532714844, + "eval_loss": 1.0910040140151978, + "eval_rewards/accuracies": 0.39453125, + "eval_rewards/chosen": -1.4316813945770264, + "eval_rewards/margins": -0.37724918127059937, + "eval_rewards/rejected": -1.0544321537017822, + "eval_runtime": 97.2015, + "eval_samples_per_second": 20.576, + "eval_steps_per_second": 0.329, "step": 300 }, { "epoch": 0.32, - "grad_norm": 31.761072657280515, + "grad_norm": 23.495796172679743, "learning_rate": 4.272609194017105e-07, - "logits/chosen": -1.161056637763977, - "logits/rejected": -0.8523654937744141, - "logps/chosen": -387.35833740234375, - "logps/rejected": -977.5528564453125, - "loss": 0.1806, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.436070442199707, - "rewards/margins": 6.559242248535156, - "rewards/rejected": -8.99531364440918, + "logits/chosen": -1.7330036163330078, + "logits/rejected": -1.6350265741348267, + "logps/chosen": -519.0903930664062, + "logps/rejected": -720.5287475585938, + "loss": 0.1552, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7445989847183228, + "rewards/margins": 4.112780570983887, + "rewards/rejected": -5.857378959655762, "step": 310 }, { "epoch": 0.33, - "grad_norm": 36.03487432587838, + "grad_norm": 16.298880424996614, "learning_rate": 4.2069638288135547e-07, - "logits/chosen": -0.8736389875411987, - "logits/rejected": -0.6802955865859985, - "logps/chosen": -391.59564208984375, - "logps/rejected": -1047.556884765625, - "loss": 0.1698, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.664311170578003, - "rewards/margins": 7.013079643249512, - "rewards/rejected": -9.677392959594727, + "logits/chosen": -1.7347389459609985, + "logits/rejected": -1.7288787364959717, + "logps/chosen": -510.26483154296875, + "logps/rejected": -713.36572265625, + "loss": 0.157, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -1.7514524459838867, + "rewards/margins": 3.96923828125, + "rewards/rejected": -5.720690727233887, "step": 320 }, { "epoch": 0.35, - "grad_norm": 31.572371493306342, + "grad_norm": 19.47125814373079, "learning_rate": 4.139035550786494e-07, - "logits/chosen": -1.1484278440475464, - "logits/rejected": -0.7356959581375122, - "logps/chosen": -385.1824645996094, - "logps/rejected": -1041.6712646484375, - "loss": 0.1467, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.5319342613220215, - "rewards/margins": 7.115927219390869, - "rewards/rejected": -9.647860527038574, + "logits/chosen": -1.1601016521453857, + "logits/rejected": -0.8797602653503418, + "logps/chosen": -555.39990234375, + "logps/rejected": -796.0065307617188, + "loss": 0.1561, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.3634750843048096, + "rewards/margins": 4.329623222351074, + "rewards/rejected": -6.693098545074463, "step": 330 }, { "epoch": 0.36, - "grad_norm": 30.493158942186962, + "grad_norm": 28.613677806731356, "learning_rate": 4.0689152079869306e-07, - "logits/chosen": -1.111301064491272, - "logits/rejected": -0.7459093332290649, - "logps/chosen": -419.9774475097656, - "logps/rejected": -1145.664306640625, - "loss": 0.1589, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.8832128047943115, - "rewards/margins": 7.796321868896484, - "rewards/rejected": -10.679533958435059, + "logits/chosen": -1.2572317123413086, + "logits/rejected": -0.9607254862785339, + "logps/chosen": -606.7630615234375, + "logps/rejected": -846.2305908203125, + "loss": 0.1416, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.5160038471221924, + "rewards/margins": 4.53468132019043, + "rewards/rejected": -7.050685882568359, "step": 340 }, { "epoch": 0.37, - "grad_norm": 35.73091881437617, + "grad_norm": 17.707474904024725, "learning_rate": 3.99669658015821e-07, - "logits/chosen": -0.9885336756706238, - "logits/rejected": -0.6827311515808105, - "logps/chosen": -417.18621826171875, - "logps/rejected": -1244.3375244140625, - "loss": 0.1607, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.9024839401245117, - "rewards/margins": 8.765107154846191, - "rewards/rejected": -11.667591094970703, + "logits/chosen": -1.3649070262908936, + "logits/rejected": -0.9824110269546509, + "logps/chosen": -558.0640869140625, + "logps/rejected": -775.701171875, + "loss": 0.1563, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.48624849319458, + "rewards/margins": 4.034348964691162, + "rewards/rejected": -6.5205979347229, "step": 350 }, { "epoch": 0.38, - "grad_norm": 26.448274353945774, + "grad_norm": 17.329304967511433, "learning_rate": 3.92247625331392e-07, - "logits/chosen": -0.9021160006523132, - "logits/rejected": -0.46636825799942017, - "logps/chosen": -396.3219299316406, - "logps/rejected": -1072.203857421875, - "loss": 0.168, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.622156858444214, - "rewards/margins": 7.3311333656311035, - "rewards/rejected": -9.953289985656738, + "logits/chosen": -1.5912964344024658, + "logits/rejected": -1.5140416622161865, + "logps/chosen": -473.7013244628906, + "logps/rejected": -705.4163818359375, + "loss": 0.1391, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.0327391624450684, + "rewards/margins": 3.767451047897339, + "rewards/rejected": -5.800190448760986, "step": 360 }, { "epoch": 0.39, - "grad_norm": 26.57225676522975, + "grad_norm": 21.039638710046997, "learning_rate": 3.846353490562664e-07, - "logits/chosen": -0.7423251867294312, - "logits/rejected": -0.3315241038799286, - "logps/chosen": -403.2605285644531, - "logps/rejected": -1123.38623046875, - "loss": 0.1524, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.6948583126068115, - "rewards/margins": 7.7378740310668945, - "rewards/rejected": -10.432731628417969, + "logits/chosen": -1.3319923877716064, + "logits/rejected": -1.090010404586792, + "logps/chosen": -520.6598510742188, + "logps/rejected": -788.8143310546875, + "loss": 0.1403, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.3732516765594482, + "rewards/margins": 4.287914276123047, + "rewards/rejected": -6.661165714263916, "step": 370 }, { "epoch": 0.4, - "grad_norm": 31.253155533057107, + "grad_norm": 19.41739908042072, "learning_rate": 3.768430099352445e-07, - "logits/chosen": -0.6284547448158264, - "logits/rejected": -0.2557411789894104, - "logps/chosen": -438.34454345703125, - "logps/rejected": -1187.157958984375, - "loss": 0.161, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -3.113786220550537, - "rewards/margins": 7.986392974853516, - "rewards/rejected": -11.100178718566895, + "logits/chosen": -1.2049062252044678, + "logits/rejected": -0.9995657205581665, + "logps/chosen": -531.4547119140625, + "logps/rejected": -863.2029418945312, + "loss": 0.1359, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5220770835876465, + "rewards/margins": 4.725777626037598, + "rewards/rejected": -7.247855186462402, "step": 380 }, { "epoch": 0.41, - "grad_norm": 30.747112252887895, + "grad_norm": 30.846420024935966, "learning_rate": 3.6888102953122304e-07, - "logits/chosen": -0.888546347618103, - "logits/rejected": -0.5933178663253784, - "logps/chosen": -410.141357421875, - "logps/rejected": -1144.3177490234375, - "loss": 0.1578, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.877619981765747, - "rewards/margins": 7.820855617523193, - "rewards/rejected": -10.69847583770752, + "logits/chosen": -1.4293053150177002, + "logits/rejected": -1.129652976989746, + "logps/chosen": -509.592529296875, + "logps/rejected": -834.6383056640625, + "loss": 0.1217, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1002399921417236, + "rewards/margins": 4.87478494644165, + "rewards/rejected": -6.975025177001953, "step": 390 }, { "epoch": 0.42, - "grad_norm": 46.52274946683974, + "grad_norm": 26.99256302878051, "learning_rate": 3.607600562872785e-07, - "logits/chosen": -0.6739174127578735, - "logits/rejected": -0.4188503324985504, - "logps/chosen": -447.11083984375, - "logps/rejected": -1131.9605712890625, - "loss": 0.1679, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -3.236891508102417, - "rewards/margins": 7.345515251159668, - "rewards/rejected": -10.582406997680664, + "logits/chosen": -1.3965580463409424, + "logits/rejected": -0.9727011919021606, + "logps/chosen": -591.9832763671875, + "logps/rejected": -926.43701171875, + "loss": 0.1368, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.8865580558776855, + "rewards/margins": 5.026923656463623, + "rewards/rejected": -7.913481712341309, "step": 400 }, { "epoch": 0.42, - "eval_logits/chosen": -0.7096253037452698, - "eval_logits/rejected": -0.5753335356712341, - "eval_logps/chosen": -1096.810791015625, - "eval_logps/rejected": -1238.125244140625, - "eval_loss": 2.102952241897583, - "eval_rewards/accuracies": 0.609375, - "eval_rewards/chosen": -7.850342750549316, - "eval_rewards/margins": 1.355657696723938, - "eval_rewards/rejected": -9.206000328063965, - "eval_runtime": 97.4138, - "eval_samples_per_second": 20.531, - "eval_steps_per_second": 0.328, + "eval_logits/chosen": -1.053754210472107, + "eval_logits/rejected": -1.013121485710144, + "eval_logps/chosen": -520.169921875, + "eval_logps/rejected": -479.64556884765625, + "eval_loss": 1.3009716272354126, + "eval_rewards/accuracies": 0.41015625, + "eval_rewards/chosen": -2.0839340686798096, + "eval_rewards/margins": -0.46272987127304077, + "eval_rewards/rejected": -1.6212042570114136, + "eval_runtime": 97.208, + "eval_samples_per_second": 20.574, + "eval_steps_per_second": 0.329, "step": 400 }, { "epoch": 0.43, - "grad_norm": 39.27703884155879, + "grad_norm": 18.264964860581, "learning_rate": 3.5249095128531856e-07, - "logits/chosen": -0.6762361526489258, - "logits/rejected": -0.2780495882034302, - "logps/chosen": -492.8828125, - "logps/rejected": -1225.805419921875, - "loss": 0.1705, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -3.5533318519592285, - "rewards/margins": 7.891423225402832, - "rewards/rejected": -11.444755554199219, + "logits/chosen": -1.6575086116790771, + "logits/rejected": -1.4926228523254395, + "logps/chosen": -505.78448486328125, + "logps/rejected": -805.1259155273438, + "loss": 0.151, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.203605890274048, + "rewards/margins": 4.460430145263672, + "rewards/rejected": -6.664034843444824, "step": 410 }, { "epoch": 0.44, - "grad_norm": 29.296123751306272, + "grad_norm": 19.09047976589624, "learning_rate": 3.4408477372034736e-07, - "logits/chosen": -0.6263514757156372, - "logits/rejected": -0.36908912658691406, - "logps/chosen": -436.956298828125, - "logps/rejected": -1203.846923828125, - "loss": 0.1542, - "rewards/accuracies": 0.90625, - "rewards/chosen": -3.0062294006347656, - "rewards/margins": 8.188032150268555, - "rewards/rejected": -11.19426155090332, + "logits/chosen": -1.6263633966445923, + "logits/rejected": -1.4956812858581543, + "logps/chosen": -479.7705993652344, + "logps/rejected": -766.1605224609375, + "loss": 0.1448, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.093071937561035, + "rewards/margins": 4.394181728363037, + "rewards/rejected": -6.4872541427612305, "step": 420 }, { "epoch": 0.45, - "grad_norm": 38.53939686623868, + "grad_norm": 26.836781627013515, "learning_rate": 3.3555276610977276e-07, - "logits/chosen": -0.7492274045944214, - "logits/rejected": -0.3440536856651306, - "logps/chosen": -440.6358337402344, - "logps/rejected": -1202.001953125, - "loss": 0.1624, - "rewards/accuracies": 0.9375, - "rewards/chosen": -3.0986926555633545, - "rewards/margins": 8.180191040039062, - "rewards/rejected": -11.278883934020996, + "logits/chosen": -1.662540078163147, + "logits/rejected": -1.4674957990646362, + "logps/chosen": -527.6272583007812, + "logps/rejected": -827.05224609375, + "loss": 0.1477, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.207035541534424, + "rewards/margins": 4.785927772521973, + "rewards/rejected": -6.9929633140563965, "step": 430 }, { "epoch": 0.46, - "grad_norm": 44.81056957165417, + "grad_norm": 24.71697296764681, "learning_rate": 3.269063392575352e-07, - "logits/chosen": -0.7927631139755249, - "logits/rejected": -0.4414951801300049, - "logps/chosen": -404.3497009277344, - "logps/rejected": -1111.9443359375, - "loss": 0.1883, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -2.7665016651153564, - "rewards/margins": 7.598056793212891, - "rewards/rejected": -10.364558219909668, + "logits/chosen": -1.682995080947876, + "logits/rejected": -1.625732660293579, + "logps/chosen": -521.5386352539062, + "logps/rejected": -861.8421630859375, + "loss": 0.1431, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1410486698150635, + "rewards/margins": 5.073235511779785, + "rewards/rejected": -7.214284420013428, "step": 440 }, { "epoch": 0.47, - "grad_norm": 46.720798060289724, + "grad_norm": 35.22220664866878, "learning_rate": 3.1815705699316964e-07, - "logits/chosen": -0.7731117010116577, - "logits/rejected": -0.4938638210296631, - "logps/chosen": -400.8499755859375, - "logps/rejected": -1148.82177734375, - "loss": 0.1656, - "rewards/accuracies": 0.84375, - "rewards/chosen": -2.6965749263763428, - "rewards/margins": 8.003564834594727, - "rewards/rejected": -10.700139045715332, + "logits/chosen": -1.578249216079712, + "logits/rejected": -1.4307564496994019, + "logps/chosen": -618.6419067382812, + "logps/rejected": -917.3260498046875, + "loss": 0.1288, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.068641185760498, + "rewards/margins": 4.769894599914551, + "rewards/rejected": -7.838536262512207, "step": 450 }, { "epoch": 0.48, - "grad_norm": 35.01981360851454, + "grad_norm": 15.368374650378705, "learning_rate": 3.0931662070620794e-07, - "logits/chosen": -0.787259578704834, - "logits/rejected": -0.5048869848251343, - "logps/chosen": -389.9725036621094, - "logps/rejected": -1115.496337890625, - "loss": 0.1644, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.5107271671295166, - "rewards/margins": 7.763253688812256, - "rewards/rejected": -10.273981094360352, + "logits/chosen": -1.8674280643463135, + "logits/rejected": -1.8419768810272217, + "logps/chosen": -574.4186401367188, + "logps/rejected": -899.6975708007812, + "loss": 0.1317, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.381072998046875, + "rewards/margins": 5.151331424713135, + "rewards/rejected": -7.53240442276001, "step": 460 }, { "epoch": 0.49, - "grad_norm": 32.88620144282501, + "grad_norm": 32.45699366987169, "learning_rate": 3.003968536966078e-07, - "logits/chosen": -0.8425847887992859, - "logits/rejected": -0.507870614528656, - "logps/chosen": -377.3370666503906, - "logps/rejected": -1063.682861328125, - "loss": 0.1581, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.452817440032959, - "rewards/margins": 7.454998970031738, - "rewards/rejected": -9.907815933227539, + "logits/chosen": -1.779637098312378, + "logits/rejected": -1.7716691493988037, + "logps/chosen": -548.851806640625, + "logps/rejected": -895.9444580078125, + "loss": 0.1467, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.4585914611816406, + "rewards/margins": 5.207429885864258, + "rewards/rejected": -7.666021823883057, "step": 470 }, { "epoch": 0.5, - "grad_norm": 55.426706045689585, + "grad_norm": 20.390767179507876, "learning_rate": 2.9140968536213693e-07, - "logits/chosen": -0.915787398815155, - "logits/rejected": -0.6263198852539062, - "logps/chosen": -401.2126770019531, - "logps/rejected": -1090.992431640625, - "loss": 0.176, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.7542808055877686, - "rewards/margins": 7.400801181793213, - "rewards/rejected": -10.155081748962402, + "logits/chosen": -1.919952630996704, + "logits/rejected": -1.9492056369781494, + "logps/chosen": -536.4271240234375, + "logps/rejected": -884.7716064453125, + "loss": 0.1285, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.535160541534424, + "rewards/margins": 5.057857990264893, + "rewards/rejected": -7.593018531799316, "step": 480 }, { "epoch": 0.51, - "grad_norm": 22.799610213421833, + "grad_norm": 23.067489784018093, "learning_rate": 2.823671352438608e-07, - "logits/chosen": -0.8439705967903137, - "logits/rejected": -0.6034272313117981, - "logps/chosen": -354.22772216796875, - "logps/rejected": -956.9412841796875, - "loss": 0.1624, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.377206802368164, - "rewards/margins": 6.480790615081787, - "rewards/rejected": -8.85799789428711, + "logits/chosen": -1.9916563034057617, + "logits/rejected": -2.021616220474243, + "logps/chosen": -518.0189208984375, + "logps/rejected": -824.4202880859375, + "loss": 0.1367, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.1029324531555176, + "rewards/margins": 4.754405498504639, + "rewards/rejected": -6.857337951660156, "step": 490 }, { "epoch": 0.52, - "grad_norm": 39.88720282221281, + "grad_norm": 25.320584902838384, "learning_rate": 2.73281296951072e-07, - "logits/chosen": -1.0180326700210571, - "logits/rejected": -0.7310226559638977, - "logps/chosen": -385.81866455078125, - "logps/rejected": -1014.95458984375, - "loss": 0.1693, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.5694196224212646, - "rewards/margins": 6.7894439697265625, - "rewards/rejected": -9.358863830566406, + "logits/chosen": -1.8573198318481445, + "logits/rejected": -1.8858684301376343, + "logps/chosen": -541.9103393554688, + "logps/rejected": -806.6531372070312, + "loss": 0.1364, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.396160840988159, + "rewards/margins": 4.32682466506958, + "rewards/rejected": -6.722985744476318, "step": 500 }, { "epoch": 0.52, - "eval_logits/chosen": -0.987293541431427, - "eval_logits/rejected": -0.8597699403762817, - "eval_logps/chosen": -945.861083984375, - "eval_logps/rejected": -1084.707763671875, - "eval_loss": 1.6562931537628174, - "eval_rewards/accuracies": 0.625, - "eval_rewards/chosen": -6.340845108032227, - "eval_rewards/margins": 1.3309805393218994, - "eval_rewards/rejected": -7.671825885772705, - "eval_runtime": 97.6234, - "eval_samples_per_second": 20.487, - "eval_steps_per_second": 0.328, + "eval_logits/chosen": -1.6466461420059204, + "eval_logits/rejected": -1.6090116500854492, + "eval_logps/chosen": -470.0933532714844, + "eval_logps/rejected": -430.8614196777344, + "eval_loss": 1.1773370504379272, + "eval_rewards/accuracies": 0.37109375, + "eval_rewards/chosen": -1.5831681489944458, + "eval_rewards/margins": -0.44980528950691223, + "eval_rewards/rejected": -1.133362889289856, + "eval_runtime": 97.142, + "eval_samples_per_second": 20.588, + "eval_steps_per_second": 0.329, "step": 500 }, { "epoch": 0.53, - "grad_norm": 36.2723513365496, + "grad_norm": 22.389152396843905, "learning_rate": 2.641643219871597e-07, - "logits/chosen": -1.0296061038970947, - "logits/rejected": -0.6009940505027771, - "logps/chosen": -367.12713623046875, - "logps/rejected": -957.3023681640625, - "loss": 0.1488, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.196636199951172, - "rewards/margins": 6.570919990539551, - "rewards/rejected": -8.767557144165039, + "logits/chosen": -1.9270741939544678, + "logits/rejected": -1.8253233432769775, + "logps/chosen": -544.8509521484375, + "logps/rejected": -834.5119018554688, + "loss": 0.1273, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5420873165130615, + "rewards/margins": 4.583073616027832, + "rewards/rejected": -7.125161170959473, "step": 510 }, { "epoch": 0.54, - "grad_norm": 41.00764971947524, + "grad_norm": 29.91148010858245, "learning_rate": 2.550284034980507e-07, - "logits/chosen": -0.9317744970321655, - "logits/rejected": -0.4665905833244324, - "logps/chosen": -389.6575012207031, - "logps/rejected": -1017.7044067382812, - "loss": 0.1731, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.4715576171875, - "rewards/margins": 6.838615417480469, - "rewards/rejected": -9.310173034667969, + "logits/chosen": -1.8461729288101196, + "logits/rejected": -1.9183883666992188, + "logps/chosen": -536.2467041015625, + "logps/rejected": -846.56689453125, + "loss": 0.1301, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3210761547088623, + "rewards/margins": 4.834765911102295, + "rewards/rejected": -7.1558427810668945, "step": 520 }, { "epoch": 0.55, - "grad_norm": 24.760088451093488, + "grad_norm": 24.244708458502263, "learning_rate": 2.4588575996495794e-07, - "logits/chosen": -1.0027564764022827, - "logits/rejected": -0.5636709928512573, - "logps/chosen": -361.3106384277344, - "logps/rejected": -1032.2296142578125, - "loss": 0.1409, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.317450761795044, - "rewards/margins": 7.240323066711426, - "rewards/rejected": -9.557774543762207, + "logits/chosen": -1.80156672000885, + "logits/rejected": -1.815596580505371, + "logps/chosen": -494.86798095703125, + "logps/rejected": -798.2820434570312, + "loss": 0.1237, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.3462131023406982, + "rewards/margins": 4.387810230255127, + "rewards/rejected": -6.7340240478515625, "step": 530 }, { "epoch": 0.57, - "grad_norm": 29.45673267405302, + "grad_norm": 19.910786436303987, "learning_rate": 2.367486188632446e-07, - "logits/chosen": -0.6549164652824402, - "logits/rejected": -0.2623385787010193, - "logps/chosen": -400.9425048828125, - "logps/rejected": -1065.4869384765625, - "loss": 0.1447, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.686591863632202, - "rewards/margins": 7.183202266693115, - "rewards/rejected": -9.869793891906738, + "logits/chosen": -1.9172916412353516, + "logits/rejected": -1.9127076864242554, + "logps/chosen": -525.8649291992188, + "logps/rejected": -845.7303466796875, + "loss": 0.1138, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.059016704559326, + "rewards/margins": 4.951017379760742, + "rewards/rejected": -7.010035037994385, "step": 540 }, { "epoch": 0.58, - "grad_norm": 28.112202270911702, + "grad_norm": 26.12771211985874, "learning_rate": 2.276292003092593e-07, - "logits/chosen": -0.7201762199401855, - "logits/rejected": -0.20686273276805878, - "logps/chosen": -414.7852478027344, - "logps/rejected": -1124.4974365234375, - "loss": 0.154, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.9022722244262695, - "rewards/margins": 7.623268127441406, - "rewards/rejected": -10.525540351867676, + "logits/chosen": -1.8340644836425781, + "logits/rejected": -1.8532531261444092, + "logps/chosen": -547.7066650390625, + "logps/rejected": -828.9231567382812, + "loss": 0.1309, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.3669962882995605, + "rewards/margins": 4.561534404754639, + "rewards/rejected": -6.928530216217041, "step": 550 }, { "epoch": 0.59, - "grad_norm": 39.46687358645063, + "grad_norm": 23.903525728545898, "learning_rate": 2.185397007170141e-07, - "logits/chosen": -0.40733662247657776, - "logits/rejected": 0.02392803505063057, - "logps/chosen": -366.5719299316406, - "logps/rejected": -931.62841796875, - "loss": 0.1679, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.599259853363037, - "rewards/margins": 6.089108467102051, - "rewards/rejected": -8.68836784362793, + "logits/chosen": -1.7198997735977173, + "logits/rejected": -1.5677220821380615, + "logps/chosen": -525.4856567382812, + "logps/rejected": -855.2352294921875, + "loss": 0.1157, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.4956746101379395, + "rewards/margins": 4.861161708831787, + "rewards/rejected": -7.356836795806885, "step": 560 }, { "epoch": 0.6, - "grad_norm": 45.19451118355015, + "grad_norm": 24.886700178362616, "learning_rate": 2.094922764865619e-07, - "logits/chosen": -0.6102726459503174, - "logits/rejected": -0.25020402669906616, - "logps/chosen": -418.000732421875, - "logps/rejected": -1118.455322265625, - "loss": 0.1527, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.813910722732544, - "rewards/margins": 7.487911224365234, - "rewards/rejected": -10.3018217086792, + "logits/chosen": -1.7758210897445679, + "logits/rejected": -1.836538314819336, + "logps/chosen": -506.49053955078125, + "logps/rejected": -846.2745971679688, + "loss": 0.1366, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.2135753631591797, + "rewards/margins": 4.973353385925293, + "rewards/rejected": -7.186928749084473, "step": 570 }, { "epoch": 0.61, - "grad_norm": 27.296281473923543, + "grad_norm": 21.212825758656418, "learning_rate": 2.0049902774588797e-07, - "logits/chosen": -0.5131772756576538, - "logits/rejected": -0.07118790596723557, - "logps/chosen": -358.7304992675781, - "logps/rejected": -981.1790161132812, - "loss": 0.1379, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.248889923095703, - "rewards/margins": 6.8233489990234375, - "rewards/rejected": -9.072239875793457, + "logits/chosen": -1.7731355428695679, + "logits/rejected": -1.777611494064331, + "logps/chosen": -581.9010009765625, + "logps/rejected": -871.5733642578125, + "loss": 0.123, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.585883378982544, + "rewards/margins": 4.742143154144287, + "rewards/rejected": -7.32802677154541, "step": 580 }, { "epoch": 0.62, - "grad_norm": 28.400791802256467, + "grad_norm": 24.501331083650943, "learning_rate": 1.9157198216806238e-07, - "logits/chosen": -0.5218924283981323, - "logits/rejected": -0.11522813141345978, - "logps/chosen": -385.2206115722656, - "logps/rejected": -1059.663330078125, - "loss": 0.1526, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.5596184730529785, - "rewards/margins": 7.243247032165527, - "rewards/rejected": -9.802865028381348, + "logits/chosen": -1.7602859735488892, + "logits/rejected": -1.84162175655365, + "logps/chosen": -452.37420654296875, + "logps/rejected": -707.836181640625, + "loss": 0.1305, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.025653600692749, + "rewards/margins": 3.844447374343872, + "rewards/rejected": -5.870100975036621, "step": 590 }, { "epoch": 0.63, - "grad_norm": 36.01204411609108, + "grad_norm": 31.082842710792463, "learning_rate": 1.8272307888529274e-07, - "logits/chosen": -0.6575510501861572, - "logits/rejected": -0.13078925013542175, - "logps/chosen": -393.07891845703125, - "logps/rejected": -1083.641357421875, - "loss": 0.1609, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.5931639671325684, - "rewards/margins": 7.458055019378662, - "rewards/rejected": -10.05121898651123, + "logits/chosen": -1.5975382328033447, + "logits/rejected": -1.4109210968017578, + "logps/chosen": -539.9990234375, + "logps/rejected": -849.1795654296875, + "loss": 0.1223, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.722317934036255, + "rewards/margins": 4.622250556945801, + "rewards/rejected": -7.344568729400635, "step": 600 }, { "epoch": 0.63, - "eval_logits/chosen": -0.616359293460846, - "eval_logits/rejected": -0.45154044032096863, - "eval_logps/chosen": -959.7227172851562, - "eval_logps/rejected": -1097.447998046875, - "eval_loss": 1.6818441152572632, - "eval_rewards/accuracies": 0.62109375, - "eval_rewards/chosen": -6.479461669921875, - "eval_rewards/margins": 1.3197669982910156, - "eval_rewards/rejected": -7.799228668212891, - "eval_runtime": 97.4083, - "eval_samples_per_second": 20.532, + "eval_logits/chosen": -1.1879667043685913, + "eval_logits/rejected": -1.1541048288345337, + "eval_logps/chosen": -541.48828125, + "eval_logps/rejected": -500.4929504394531, + "eval_loss": 1.3205512762069702, + "eval_rewards/accuracies": 0.4140625, + "eval_rewards/chosen": -2.2971181869506836, + "eval_rewards/margins": -0.46744003891944885, + "eval_rewards/rejected": -1.829677939414978, + "eval_runtime": 97.2287, + "eval_samples_per_second": 20.57, "eval_steps_per_second": 0.329, "step": 600 }, { "epoch": 0.64, - "grad_norm": 25.93993016029263, + "grad_norm": 25.74237035768959, "learning_rate": 1.7396415252139288e-07, - "logits/chosen": -0.9318272471427917, - "logits/rejected": -0.5557786226272583, - "logps/chosen": -372.4070129394531, - "logps/rejected": -1039.4412841796875, - "loss": 0.1523, + "logits/chosen": -1.5792185068130493, + "logits/rejected": -1.6271593570709229, + "logps/chosen": -576.2874755859375, + "logps/rejected": -917.5999755859375, + "loss": 0.1228, "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.3948473930358887, - "rewards/margins": 7.164391994476318, - "rewards/rejected": -9.559239387512207, + "rewards/chosen": -2.802475929260254, + "rewards/margins": 5.048992156982422, + "rewards/rejected": -7.851468563079834, "step": 610 }, { "epoch": 0.65, - "grad_norm": 43.56570182026958, + "grad_norm": 29.519623894853613, "learning_rate": 1.6530691736402316e-07, - "logits/chosen": -0.6166467070579529, - "logits/rejected": -0.15916576981544495, - "logps/chosen": -382.71112060546875, - "logps/rejected": -1044.164794921875, - "loss": 0.1514, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.561720848083496, - "rewards/margins": 7.1611833572387695, - "rewards/rejected": -9.722905158996582, + "logits/chosen": -1.6389156579971313, + "logits/rejected": -1.6191425323486328, + "logps/chosen": -530.8707275390625, + "logps/rejected": -890.0936279296875, + "loss": 0.1254, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3094658851623535, + "rewards/margins": 5.2504377365112305, + "rewards/rejected": -7.559903144836426, "step": 620 }, { "epoch": 0.66, - "grad_norm": 28.928080380488158, + "grad_norm": 50.42769593704326, "learning_rate": 1.5676295169786864e-07, - "logits/chosen": -0.5803557634353638, - "logits/rejected": -0.2489413022994995, - "logps/chosen": -383.3107604980469, - "logps/rejected": -997.8870239257812, - "loss": 0.164, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.578981876373291, - "rewards/margins": 6.61221170425415, - "rewards/rejected": -9.191193580627441, + "logits/chosen": -1.7326772212982178, + "logits/rejected": -1.5138975381851196, + "logps/chosen": -561.8060302734375, + "logps/rejected": -941.2083740234375, + "loss": 0.1236, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.4162166118621826, + "rewards/margins": 5.7043328285217285, + "rewards/rejected": -8.120549201965332, "step": 630 }, { "epoch": 0.67, - "grad_norm": 20.471962287027644, + "grad_norm": 27.38270007193824, "learning_rate": 1.483436823197092e-07, - "logits/chosen": -0.6895365118980408, - "logits/rejected": -0.1990533173084259, - "logps/chosen": -362.12237548828125, - "logps/rejected": -1061.60400390625, - "loss": 0.1475, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -2.2892990112304688, - "rewards/margins": 7.556910514831543, - "rewards/rejected": -9.846209526062012, + "logits/chosen": -1.5837154388427734, + "logits/rejected": -1.4245226383209229, + "logps/chosen": -562.3756103515625, + "logps/rejected": -949.3406982421875, + "loss": 0.1163, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -3.040762186050415, + "rewards/margins": 5.183028697967529, + "rewards/rejected": -8.223790168762207, "step": 640 }, { "epoch": 0.68, - "grad_norm": 26.665412290672624, + "grad_norm": 35.27601401211646, "learning_rate": 1.4006036925609243e-07, - "logits/chosen": -0.6053385734558105, - "logits/rejected": -0.19510070979595184, - "logps/chosen": -381.46514892578125, - "logps/rejected": -1129.877685546875, - "loss": 0.1432, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.4206106662750244, - "rewards/margins": 7.994618892669678, - "rewards/rejected": -10.415228843688965, + "logits/chosen": -1.7516834735870361, + "logits/rejected": -1.7616602182388306, + "logps/chosen": -558.3771362304688, + "logps/rejected": -879.00341796875, + "loss": 0.1139, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.68957257270813, + "rewards/margins": 4.818731784820557, + "rewards/rejected": -7.508304595947266, "step": 650 }, { "epoch": 0.69, - "grad_norm": 33.292289854748574, + "grad_norm": 22.77394327537456, "learning_rate": 1.319240907040458e-07, - "logits/chosen": -0.5973755717277527, - "logits/rejected": -0.16952477395534515, - "logps/chosen": -410.3275451660156, - "logps/rejected": -1123.840087890625, - "loss": 0.1388, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.748767852783203, - "rewards/margins": 7.701995849609375, - "rewards/rejected": -10.450764656066895, + "logits/chosen": -1.7446712255477905, + "logits/rejected": -1.6567370891571045, + "logps/chosen": -559.461669921875, + "logps/rejected": -941.3621826171875, + "loss": 0.1117, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.522426128387451, + "rewards/margins": 5.564127445220947, + "rewards/rejected": -8.086553573608398, "step": 660 }, { "epoch": 0.7, - "grad_norm": 30.896865825928337, + "grad_norm": 30.36694985683218, "learning_rate": 1.239457282149695e-07, - "logits/chosen": -0.5566490292549133, - "logits/rejected": -0.23316040635108948, - "logps/chosen": -428.1212463378906, - "logps/rejected": -1241.2476806640625, - "loss": 0.1475, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -3.0948407649993896, - "rewards/margins": 8.58587646484375, - "rewards/rejected": -11.680717468261719, + "logits/chosen": -1.4815702438354492, + "logits/rejected": -1.3673474788665771, + "logps/chosen": -548.4630737304688, + "logps/rejected": -930.783203125, + "loss": 0.1092, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.9377596378326416, + "rewards/margins": 5.2314453125, + "rewards/rejected": -8.169205665588379, "step": 670 }, { "epoch": 0.71, - "grad_norm": 35.84504872343951, + "grad_norm": 28.757425831930576, "learning_rate": 1.1613595214152711e-07, - "logits/chosen": -0.5821831822395325, - "logits/rejected": -0.07458965480327606, - "logps/chosen": -399.4475402832031, - "logps/rejected": -1166.2537841796875, - "loss": 0.1518, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -2.7263970375061035, - "rewards/margins": 8.200857162475586, - "rewards/rejected": -10.927252769470215, + "logits/chosen": -1.5986506938934326, + "logits/rejected": -1.5326309204101562, + "logps/chosen": -513.4288940429688, + "logps/rejected": -861.1954345703125, + "loss": 0.1105, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.518094062805176, + "rewards/margins": 4.961692810058594, + "rewards/rejected": -7.4797868728637695, "step": 680 }, { "epoch": 0.72, - "grad_norm": 33.84684500335279, + "grad_norm": 23.459702688906162, "learning_rate": 1.0850520736699362e-07, - "logits/chosen": -0.5655028223991394, - "logits/rejected": -0.15712787210941315, - "logps/chosen": -377.57208251953125, - "logps/rejected": -1100.81884765625, - "loss": 0.1454, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.5599417686462402, - "rewards/margins": 7.693644046783447, - "rewards/rejected": -10.253584861755371, + "logits/chosen": -1.6289697885513306, + "logits/rejected": -1.527553677558899, + "logps/chosen": -537.8997192382812, + "logps/rejected": -878.1143798828125, + "loss": 0.113, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.5290098190307617, + "rewards/margins": 4.979551315307617, + "rewards/rejected": -7.508561611175537, "step": 690 }, { "epoch": 0.73, - "grad_norm": 47.388927709457874, + "grad_norm": 25.531163769236304, "learning_rate": 1.0106369933615042e-07, - "logits/chosen": -0.2697560787200928, - "logits/rejected": 0.21001645922660828, - "logps/chosen": -389.8284606933594, - "logps/rejected": -1081.4793701171875, - "loss": 0.1559, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.6655356884002686, - "rewards/margins": 7.382843971252441, - "rewards/rejected": -10.048379898071289, + "logits/chosen": -1.5353871583938599, + "logits/rejected": -1.3200981616973877, + "logps/chosen": -662.8204956054688, + "logps/rejected": -1062.7938232421875, + "loss": 0.0971, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -3.5107734203338623, + "rewards/margins": 5.729162693023682, + "rewards/rejected": -9.239934921264648, "step": 700 }, { "epoch": 0.73, - "eval_logits/chosen": -0.5852116346359253, - "eval_logits/rejected": -0.4166191518306732, - "eval_logps/chosen": -1046.6217041015625, - "eval_logps/rejected": -1197.0731201171875, - "eval_loss": 1.9277653694152832, - "eval_rewards/accuracies": 0.61328125, - "eval_rewards/chosen": -7.348450660705566, - "eval_rewards/margins": 1.447028398513794, - "eval_rewards/rejected": -8.795478820800781, - "eval_runtime": 97.4242, - "eval_samples_per_second": 20.529, - "eval_steps_per_second": 0.328, + "eval_logits/chosen": -0.9711934924125671, + "eval_logits/rejected": -0.9391850829124451, + "eval_logps/chosen": -577.3128051757812, + "eval_logps/rejected": -533.4666748046875, + "eval_loss": 1.463817834854126, + "eval_rewards/accuracies": 0.390625, + "eval_rewards/chosen": -2.6553633213043213, + "eval_rewards/margins": -0.4959481358528137, + "eval_rewards/rejected": -2.1594154834747314, + "eval_runtime": 97.1589, + "eval_samples_per_second": 20.585, + "eval_steps_per_second": 0.329, "step": 700 }, { "epoch": 0.74, - "grad_norm": 21.54892454805239, + "grad_norm": 43.118837668077326, "learning_rate": 9.382138040640714e-08, - "logits/chosen": -0.6082885265350342, - "logits/rejected": 0.03385084122419357, - "logps/chosen": -389.99542236328125, - "logps/rejected": -1186.9339599609375, - "loss": 0.1358, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -2.5867698192596436, - "rewards/margins": 8.532285690307617, - "rewards/rejected": -11.11905574798584, + "logits/chosen": -1.6447147130966187, + "logits/rejected": -1.4446865320205688, + "logps/chosen": -644.3197021484375, + "logps/rejected": -1070.079345703125, + "loss": 0.11, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -3.044210910797119, + "rewards/margins": 6.259350776672363, + "rewards/rejected": -9.303561210632324, "step": 710 }, { "epoch": 0.75, - "grad_norm": 45.770131366856944, + "grad_norm": 32.87349411722058, "learning_rate": 8.678793653740632e-08, - "logits/chosen": -0.45069313049316406, - "logits/rejected": -0.01569805108010769, - "logps/chosen": -418.41015625, - "logps/rejected": -1100.5294189453125, - "loss": 0.1518, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.9045753479003906, - "rewards/margins": 7.348256587982178, - "rewards/rejected": -10.252832412719727, + "logits/chosen": -1.7681348323822021, + "logits/rejected": -1.6581518650054932, + "logps/chosen": -622.3300170898438, + "logps/rejected": -1050.2943115234375, + "loss": 0.1069, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.925344944000244, + "rewards/margins": 6.121405601501465, + "rewards/rejected": -9.04675006866455, "step": 720 }, { "epoch": 0.76, - "grad_norm": 26.24794912329421, + "grad_norm": 36.992204810711705, "learning_rate": 7.997277433690983e-08, - "logits/chosen": -0.6921511292457581, - "logits/rejected": -0.1756693720817566, - "logps/chosen": -402.31463623046875, - "logps/rejected": -1098.74951171875, - "loss": 0.1545, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.674762725830078, - "rewards/margins": 7.446639060974121, - "rewards/rejected": -10.1214017868042, + "logits/chosen": -1.623159646987915, + "logits/rejected": -1.4811782836914062, + "logps/chosen": -612.2554931640625, + "logps/rejected": -995.8507690429688, + "loss": 0.118, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.1586315631866455, + "rewards/margins": 5.526102066040039, + "rewards/rejected": -8.684733390808105, "step": 730 }, { "epoch": 0.77, - "grad_norm": 42.21104117559166, + "grad_norm": 31.945634867150687, "learning_rate": 7.338500848029602e-08, - "logits/chosen": -0.5230437517166138, - "logits/rejected": -0.08038869500160217, - "logps/chosen": -374.2379150390625, - "logps/rejected": -1173.2691650390625, - "loss": 0.1524, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.3602004051208496, - "rewards/margins": 8.614425659179688, - "rewards/rejected": -10.974625587463379, + "logits/chosen": -1.51155686378479, + "logits/rejected": -1.349172592163086, + "logps/chosen": -575.31103515625, + "logps/rejected": -1048.4927978515625, + "loss": 0.1135, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.0343711376190186, + "rewards/margins": 6.1768388748168945, + "rewards/rejected": -9.211209297180176, "step": 740 }, { "epoch": 0.78, - "grad_norm": 45.35711154178482, + "grad_norm": 26.030459327741653, "learning_rate": 6.70334495204884e-08, - "logits/chosen": -0.4201095998287201, - "logits/rejected": 0.112357497215271, - "logps/chosen": -399.91766357421875, - "logps/rejected": -1149.41357421875, - "loss": 0.1425, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.750382900238037, - "rewards/margins": 8.02344799041748, - "rewards/rejected": -10.773832321166992, + "logits/chosen": -1.6075376272201538, + "logits/rejected": -1.2717111110687256, + "logps/chosen": -635.0384521484375, + "logps/rejected": -1016.2355346679688, + "loss": 0.0957, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -3.0112128257751465, + "rewards/margins": 5.901360511779785, + "rewards/rejected": -8.912572860717773, "step": 750 }, { "epoch": 0.8, - "grad_norm": 26.89122562285564, + "grad_norm": 34.627238331382905, "learning_rate": 6.092659210462231e-08, - "logits/chosen": -0.6679720878601074, - "logits/rejected": -0.1197366863489151, - "logps/chosen": -417.52996826171875, - "logps/rejected": -1188.0615234375, - "loss": 0.1345, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.673654079437256, - "rewards/margins": 8.336530685424805, - "rewards/rejected": -11.010185241699219, + "logits/chosen": -1.448529839515686, + "logits/rejected": -1.4409586191177368, + "logps/chosen": -625.353271484375, + "logps/rejected": -1024.951416015625, + "loss": 0.1095, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.2976653575897217, + "rewards/margins": 5.690510272979736, + "rewards/rejected": -8.988176345825195, "step": 760 }, { "epoch": 0.81, - "grad_norm": 34.7731803926411, + "grad_norm": 29.968791403480516, "learning_rate": 5.507260361320737e-08, - "logits/chosen": -0.6739757657051086, - "logits/rejected": -0.05404149740934372, - "logps/chosen": -402.13238525390625, - "logps/rejected": -1078.5826416015625, - "loss": 0.1516, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.7083559036254883, - "rewards/margins": 7.324588775634766, - "rewards/rejected": -10.032943725585938, + "logits/chosen": -1.5798556804656982, + "logits/rejected": -1.5097671747207642, + "logps/chosen": -600.1854248046875, + "logps/rejected": -997.0427856445312, + "loss": 0.1033, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.1436641216278076, + "rewards/margins": 5.432265281677246, + "rewards/rejected": -8.575929641723633, "step": 770 }, { "epoch": 0.82, - "grad_norm": 25.35847618120489, + "grad_norm": 23.15982358200479, "learning_rate": 4.947931323697982e-08, - "logits/chosen": -0.5962761044502258, - "logits/rejected": -0.05018671602010727, - "logps/chosen": -392.66363525390625, - "logps/rejected": -1066.8021240234375, - "loss": 0.1551, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.6160168647766113, - "rewards/margins": 7.251564025878906, - "rewards/rejected": -9.86758041381836, + "logits/chosen": -1.4408910274505615, + "logits/rejected": -1.3072535991668701, + "logps/chosen": -556.1472778320312, + "logps/rejected": -968.1648559570312, + "loss": 0.0969, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2009377479553223, + "rewards/margins": 5.246421813964844, + "rewards/rejected": -8.447359085083008, "step": 780 }, { "epoch": 0.83, - "grad_norm": 27.140797859113846, + "grad_norm": 23.663576108460163, "learning_rate": 4.415420150605398e-08, - "logits/chosen": -0.6300855278968811, - "logits/rejected": -0.1746017038822174, - "logps/chosen": -400.11431884765625, - "logps/rejected": -1069.177001953125, - "loss": 0.1616, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.7479443550109863, - "rewards/margins": 7.171761989593506, - "rewards/rejected": -9.919707298278809, + "logits/chosen": -1.5656832456588745, + "logits/rejected": -1.3935534954071045, + "logps/chosen": -601.1109008789062, + "logps/rejected": -999.2574462890625, + "loss": 0.1023, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -3.033841609954834, + "rewards/margins": 5.674118995666504, + "rewards/rejected": -8.70796012878418, "step": 790 }, { "epoch": 0.84, - "grad_norm": 20.145953031877735, + "grad_norm": 23.462966354133016, "learning_rate": 3.9104390285376374e-08, - "logits/chosen": -0.44818106293678284, - "logits/rejected": -0.035945743322372437, - "logps/chosen": -396.0670471191406, - "logps/rejected": -1147.34619140625, - "loss": 0.1433, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.6665611267089844, - "rewards/margins": 8.064103126525879, - "rewards/rejected": -10.730664253234863, + "logits/chosen": -1.466312289237976, + "logits/rejected": -1.2542184591293335, + "logps/chosen": -619.60009765625, + "logps/rejected": -1041.6527099609375, + "loss": 0.1035, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.135887861251831, + "rewards/margins": 5.890833854675293, + "rewards/rejected": -9.026721954345703, "step": 800 }, { "epoch": 0.84, - "eval_logits/chosen": -0.6744760870933533, - "eval_logits/rejected": -0.5140590667724609, - "eval_logps/chosen": -1026.7318115234375, - "eval_logps/rejected": -1180.040283203125, - "eval_loss": 1.9049861431121826, - "eval_rewards/accuracies": 0.6171875, - "eval_rewards/chosen": -7.149553298950195, - "eval_rewards/margins": 1.4755982160568237, - "eval_rewards/rejected": -8.625151634216309, - "eval_runtime": 97.364, - "eval_samples_per_second": 20.541, + "eval_logits/chosen": -0.9231772422790527, + "eval_logits/rejected": -0.8901588916778564, + "eval_logps/chosen": -569.3816528320312, + "eval_logps/rejected": -532.9067993164062, + "eval_loss": 1.4475353956222534, + "eval_rewards/accuracies": 0.39453125, + "eval_rewards/chosen": -2.5760514736175537, + "eval_rewards/margins": -0.42223480343818665, + "eval_rewards/rejected": -2.1538166999816895, + "eval_runtime": 97.1382, + "eval_samples_per_second": 20.589, "eval_steps_per_second": 0.329, "step": 800 }, { "epoch": 0.85, - "grad_norm": 21.539181167828147, + "grad_norm": 30.508507386848592, "learning_rate": 3.433663324986208e-08, - "logits/chosen": -0.5423880815505981, - "logits/rejected": -0.08916589617729187, - "logps/chosen": -378.41876220703125, - "logps/rejected": -1105.344970703125, - "loss": 0.156, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.467062473297119, - "rewards/margins": 7.7889909744262695, - "rewards/rejected": -10.256052017211914, + "logits/chosen": -1.4120627641677856, + "logits/rejected": -1.3580305576324463, + "logps/chosen": -642.4114990234375, + "logps/rejected": -1046.66357421875, + "loss": 0.1086, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.5207436084747314, + "rewards/margins": 5.544968605041504, + "rewards/rejected": -9.065712928771973, "step": 810 }, { "epoch": 0.86, - "grad_norm": 25.60967931137277, + "grad_norm": 15.450308092025814, "learning_rate": 2.9857306851953897e-08, - "logits/chosen": -0.5412378907203674, - "logits/rejected": -0.05382275581359863, - "logps/chosen": -373.0191345214844, - "logps/rejected": -1065.0948486328125, - "loss": 0.1524, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.4381911754608154, - "rewards/margins": 7.450464725494385, - "rewards/rejected": -9.888655662536621, + "logits/chosen": -1.5075926780700684, + "logits/rejected": -1.286818504333496, + "logps/chosen": -577.96533203125, + "logps/rejected": -1041.372802734375, + "loss": 0.0844, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.9870734214782715, + "rewards/margins": 6.117280960083008, + "rewards/rejected": -9.104352951049805, "step": 820 }, { "epoch": 0.87, - "grad_norm": 49.176125880409266, + "grad_norm": 32.428690976585585, "learning_rate": 2.567240179368185e-08, - "logits/chosen": -0.5491518378257751, - "logits/rejected": 0.00046962351188994944, - "logps/chosen": -376.9706115722656, - "logps/rejected": -1090.8355712890625, - "loss": 0.1429, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.460528612136841, - "rewards/margins": 7.668572902679443, - "rewards/rejected": -10.129101753234863, + "logits/chosen": -1.5189392566680908, + "logits/rejected": -1.3501880168914795, + "logps/chosen": -603.621337890625, + "logps/rejected": -1007.1226806640625, + "loss": 0.0982, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -3.046630620956421, + "rewards/margins": 5.773746967315674, + "rewards/rejected": -8.820378303527832, "step": 830 }, { "epoch": 0.88, - "grad_norm": 37.134305931663185, + "grad_norm": 35.07586716910879, "learning_rate": 2.1787515014630357e-08, - "logits/chosen": -0.8858461380004883, - "logits/rejected": -0.30200493335723877, - "logps/chosen": -372.07769775390625, - "logps/rejected": -1134.0087890625, - "loss": 0.1405, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -2.2380013465881348, - "rewards/margins": 8.35414981842041, - "rewards/rejected": -10.592150688171387, + "logits/chosen": -1.406670331954956, + "logits/rejected": -1.1134653091430664, + "logps/chosen": -552.0911254882812, + "logps/rejected": -1012.5986328125, + "loss": 0.1141, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7988593578338623, + "rewards/margins": 6.124402046203613, + "rewards/rejected": -8.923261642456055, "step": 840 }, { "epoch": 0.89, - "grad_norm": 44.332126805939915, + "grad_norm": 35.12000542720853, "learning_rate": 1.820784220652766e-08, - "logits/chosen": -0.44481903314590454, - "logits/rejected": -0.00442737340927124, - "logps/chosen": -393.5993957519531, - "logps/rejected": -1024.645751953125, - "loss": 0.1641, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -2.714064121246338, - "rewards/margins": 6.799878120422363, - "rewards/rejected": -9.513941764831543, + "logits/chosen": -1.5011646747589111, + "logits/rejected": -1.4681178331375122, + "logps/chosen": -629.8143310546875, + "logps/rejected": -1053.095703125, + "loss": 0.0947, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -3.13895583152771, + "rewards/margins": 5.925333499908447, + "rewards/rejected": -9.064289093017578, "step": 850 }, { "epoch": 0.9, - "grad_norm": 27.781110281391676, + "grad_norm": 37.505156397985395, "learning_rate": 1.4938170864468636e-08, - "logits/chosen": -0.713314414024353, - "logits/rejected": -0.28505033254623413, - "logps/chosen": -372.9283447265625, - "logps/rejected": -1127.688232421875, - "loss": 0.151, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -2.429081678390503, - "rewards/margins": 8.078619003295898, - "rewards/rejected": -10.507699966430664, + "logits/chosen": -1.5276473760604858, + "logits/rejected": -1.3087430000305176, + "logps/chosen": -611.999267578125, + "logps/rejected": -982.8528442382812, + "loss": 0.1089, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.328166961669922, + "rewards/margins": 5.248373031616211, + "rewards/rejected": -8.576539993286133, "step": 860 }, { "epoch": 0.91, - "grad_norm": 40.33670958727325, + "grad_norm": 33.59787382725122, "learning_rate": 1.1982873884064465e-08, - "logits/chosen": -0.642200767993927, - "logits/rejected": -0.2615371346473694, - "logps/chosen": -382.0066223144531, - "logps/rejected": -1205.7874755859375, - "loss": 0.1425, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.4844939708709717, - "rewards/margins": 8.8142728805542, - "rewards/rejected": -11.298765182495117, + "logits/chosen": -1.5041674375534058, + "logits/rejected": -1.186408281326294, + "logps/chosen": -543.9867553710938, + "logps/rejected": -960.0665893554688, + "loss": 0.1023, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.6006431579589844, + "rewards/margins": 5.71505069732666, + "rewards/rejected": -8.315693855285645, "step": 870 }, { "epoch": 0.92, - "grad_norm": 41.06397869005653, + "grad_norm": 24.198197782455043, "learning_rate": 9.345903713082304e-09, - "logits/chosen": -0.6282533407211304, - "logits/rejected": -0.19878247380256653, - "logps/chosen": -400.76605224609375, - "logps/rejected": -1277.481689453125, - "loss": 0.1417, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.5621111392974854, - "rewards/margins": 9.376588821411133, - "rewards/rejected": -11.938699722290039, + "logits/chosen": -1.5898497104644775, + "logits/rejected": -1.4362118244171143, + "logps/chosen": -581.5143432617188, + "logps/rejected": -1014.4615478515625, + "loss": 0.0837, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.9443726539611816, + "rewards/margins": 5.875336647033691, + "rewards/rejected": -8.819709777832031, "step": 880 }, { "epoch": 0.93, - "grad_norm": 59.83522561090434, + "grad_norm": 32.4653792997969, "learning_rate": 7.030787065396865e-09, - "logits/chosen": -0.5553107857704163, - "logits/rejected": -0.15107165277004242, - "logps/chosen": -375.3813781738281, - "logps/rejected": -1131.446533203125, - "loss": 0.1561, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.4943184852600098, - "rewards/margins": 8.074764251708984, - "rewards/rejected": -10.56908130645752, + "logits/chosen": -1.5886590480804443, + "logits/rejected": -1.2676339149475098, + "logps/chosen": -612.6450805664062, + "logps/rejected": -982.3916015625, + "loss": 0.1056, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -3.1008591651916504, + "rewards/margins": 5.476729393005371, + "rewards/rejected": -8.57758903503418, "step": 890 }, { "epoch": 0.94, - "grad_norm": 20.123097535281502, + "grad_norm": 28.89242092867656, "learning_rate": 5.04062020432286e-09, - "logits/chosen": -0.5520753860473633, - "logits/rejected": -0.19162021577358246, - "logps/chosen": -359.1523742675781, - "logps/rejected": -1050.653076171875, - "loss": 0.1479, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.378551959991455, - "rewards/margins": 7.357401371002197, - "rewards/rejected": -9.735953330993652, + "logits/chosen": -1.5399090051651, + "logits/rejected": -1.42655611038208, + "logps/chosen": -595.1466064453125, + "logps/rejected": -1033.9515380859375, + "loss": 0.088, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.967661142349243, + "rewards/margins": 6.027240753173828, + "rewards/rejected": -8.994901657104492, "step": 900 }, { "epoch": 0.94, - "eval_logits/chosen": -0.7198767066001892, - "eval_logits/rejected": -0.5643453598022461, - "eval_logps/chosen": -1010.4668579101562, - "eval_logps/rejected": -1164.5386962890625, - "eval_loss": 1.8978877067565918, - "eval_rewards/accuracies": 0.609375, - "eval_rewards/chosen": -6.986903190612793, - "eval_rewards/margins": 1.483232855796814, - "eval_rewards/rejected": -8.470136642456055, - "eval_runtime": 97.3726, - "eval_samples_per_second": 20.54, + "eval_logits/chosen": -1.1238834857940674, + "eval_logits/rejected": -1.0823395252227783, + "eval_logps/chosen": -554.9180297851562, + "eval_logps/rejected": -517.7515869140625, + "eval_loss": 1.3947275876998901, + "eval_rewards/accuracies": 0.38671875, + "eval_rewards/chosen": -2.43141508102417, + "eval_rewards/margins": -0.4291508197784424, + "eval_rewards/rejected": -2.0022642612457275, + "eval_runtime": 97.126, + "eval_samples_per_second": 20.592, "eval_steps_per_second": 0.329, "step": 900 }, { "epoch": 0.95, - "grad_norm": 27.829801809728036, + "grad_norm": 26.806730695970433, "learning_rate": 3.3780648016376866e-09, - "logits/chosen": -0.5426167249679565, - "logits/rejected": -0.20517143607139587, - "logps/chosen": -367.1676330566406, - "logps/rejected": -1091.75390625, - "loss": 0.1338, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.459343910217285, - "rewards/margins": 7.6989898681640625, - "rewards/rejected": -10.158334732055664, + "logits/chosen": -1.658847451210022, + "logits/rejected": -1.3762922286987305, + "logps/chosen": -603.6043701171875, + "logps/rejected": -1017.3605346679688, + "loss": 0.0898, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.103484630584717, + "rewards/margins": 5.833575248718262, + "rewards/rejected": -8.93705940246582, "step": 910 }, { "epoch": 0.96, - "grad_norm": 28.33961947218242, + "grad_norm": 15.136819931212546, "learning_rate": 2.0453443778310766e-09, - "logits/chosen": -0.6683856844902039, - "logits/rejected": -0.24923817813396454, - "logps/chosen": -397.66192626953125, - "logps/rejected": -1160.3267822265625, - "loss": 0.1607, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.5613341331481934, - "rewards/margins": 8.2199125289917, - "rewards/rejected": -10.781246185302734, + "logits/chosen": -1.627918004989624, + "logits/rejected": -1.4132583141326904, + "logps/chosen": -563.746337890625, + "logps/rejected": -1013.9581909179688, + "loss": 0.0936, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.8101205825805664, + "rewards/margins": 6.073071479797363, + "rewards/rejected": -8.88319206237793, "step": 920 }, { "epoch": 0.97, - "grad_norm": 31.245120668993923, + "grad_norm": 36.0925338348187, "learning_rate": 1.0442413283435758e-09, - "logits/chosen": -0.5892433524131775, - "logits/rejected": -0.16971439123153687, - "logps/chosen": -382.60394287109375, - "logps/rejected": -1098.789306640625, - "loss": 0.1559, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.5811736583709717, - "rewards/margins": 7.667700290679932, - "rewards/rejected": -10.248873710632324, + "logits/chosen": -1.5613000392913818, + "logits/rejected": -1.3666173219680786, + "logps/chosen": -583.2642822265625, + "logps/rejected": -1070.8175048828125, + "loss": 0.0947, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.103260040283203, + "rewards/margins": 6.282073020935059, + "rewards/rejected": -9.385333061218262, "step": 930 }, { "epoch": 0.98, - "grad_norm": 34.32124556738926, + "grad_norm": 30.829530604495584, "learning_rate": 3.760945397705828e-10, - "logits/chosen": -0.6785596013069153, - "logits/rejected": -0.16107648611068726, - "logps/chosen": -382.91876220703125, - "logps/rejected": -1063.984375, - "loss": 0.152, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.6222147941589355, - "rewards/margins": 7.256742000579834, - "rewards/rejected": -9.878957748413086, + "logits/chosen": -1.6098768711090088, + "logits/rejected": -1.4137991666793823, + "logps/chosen": -652.3986206054688, + "logps/rejected": -1045.8623046875, + "loss": 0.1041, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.067474365234375, + "rewards/margins": 6.041169166564941, + "rewards/rejected": -9.108643531799316, "step": 940 }, { "epoch": 0.99, - "grad_norm": 33.81539431852237, + "grad_norm": 28.95439781379653, "learning_rate": 4.17975992204056e-11, - "logits/chosen": -0.4908576011657715, - "logits/rejected": -0.08039870113134384, - "logps/chosen": -358.55047607421875, - "logps/rejected": -1048.7579345703125, - "loss": 0.1541, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.3112599849700928, - "rewards/margins": 7.406725883483887, - "rewards/rejected": -9.717985153198242, + "logits/chosen": -1.6131422519683838, + "logits/rejected": -1.4869133234024048, + "logps/chosen": -605.0281372070312, + "logps/rejected": -1031.686279296875, + "loss": 0.0954, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.1735126972198486, + "rewards/margins": 5.8094305992126465, + "rewards/rejected": -8.982942581176758, "step": 950 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, - "train_loss": 0.1961859940234279, - "train_runtime": 15468.9338, - "train_samples_per_second": 7.904, - "train_steps_per_second": 0.062 + "train_loss": 0.05187356284775659, + "train_runtime": 7314.4586, + "train_samples_per_second": 16.716, + "train_steps_per_second": 0.131 } ], "logging_steps": 10,