diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,7 +10,7 @@ "log_history": [ { "epoch": 0.0, - "grad_norm": 50.34250255302009, + "grad_norm": 50.34450486049318, "learning_rate": 5.208333333333333e-09, "logits/chosen": -1.8382859230041504, "logits/rejected": -1.788834810256958, @@ -25,1581 +25,1581 @@ }, { "epoch": 0.01, - "grad_norm": 46.63620247813633, + "grad_norm": 46.59550900548801, "learning_rate": 5.208333333333333e-08, - "logits/chosen": -1.6616814136505127, - "logits/rejected": -1.5508421659469604, - "logps/chosen": -129.22186279296875, - "logps/rejected": -82.81047821044922, + "logits/chosen": -1.661623477935791, + "logits/rejected": -1.5508946180343628, + "logps/chosen": -129.29275512695312, + "logps/rejected": -82.80602264404297, "loss": 0.6931, - "rewards/accuracies": 0.375, - "rewards/chosen": 0.0005498434184119105, - "rewards/margins": 0.0009226154070347548, - "rewards/rejected": -0.00037277190131135285, + "rewards/accuracies": 0.4305555522441864, + "rewards/chosen": -0.0001589156745467335, + "rewards/margins": 0.00016934690938796848, + "rewards/rejected": -0.00032826262759044766, "step": 10 }, { "epoch": 0.02, - "grad_norm": 48.137796577108965, + "grad_norm": 47.96697514664308, "learning_rate": 1.0416666666666667e-07, - "logits/chosen": -1.6961771249771118, - "logits/rejected": -1.5263831615447998, - "logps/chosen": -140.1017608642578, - "logps/rejected": -81.02197265625, - "loss": 0.6866, - "rewards/accuracies": 0.78125, - "rewards/chosen": 0.013111919164657593, - "rewards/margins": 0.01593630015850067, - "rewards/rejected": -0.0028243791311979294, + "logits/chosen": -1.695336103439331, + "logits/rejected": -1.5260241031646729, + "logps/chosen": -140.20523071289062, + "logps/rejected": -81.0006103515625, + "loss": 0.6864, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.012077380903065205, + "rewards/margins": 0.014688198454678059, + "rewards/rejected": -0.0026108173187822104, "step": 20 }, { "epoch": 0.03, - "grad_norm": 37.06492370153689, + "grad_norm": 36.99293097573757, "learning_rate": 1.5624999999999999e-07, - "logits/chosen": -1.7120641469955444, - "logits/rejected": -1.6340694427490234, - "logps/chosen": -119.171875, - "logps/rejected": -84.84639739990234, - "loss": 0.6569, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.05298091098666191, - "rewards/margins": 0.07273116707801819, - "rewards/rejected": -0.019750254228711128, + "logits/chosen": -1.7122951745986938, + "logits/rejected": -1.6335628032684326, + "logps/chosen": -119.1540756225586, + "logps/rejected": -84.78346252441406, + "loss": 0.6572, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": 0.053159065544605255, + "rewards/margins": 0.07227995246648788, + "rewards/rejected": -0.01912088319659233, "step": 30 }, { "epoch": 0.04, - "grad_norm": 38.277547537833144, + "grad_norm": 38.24696396088281, "learning_rate": 2.0833333333333333e-07, - "logits/chosen": -1.8135931491851807, - "logits/rejected": -1.7179806232452393, - "logps/chosen": -130.3672332763672, - "logps/rejected": -98.59812927246094, - "loss": 0.5913, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.1295873373746872, - "rewards/margins": 0.2865225374698639, - "rewards/rejected": -0.1569352000951767, + "logits/chosen": -1.8129314184188843, + "logits/rejected": -1.716965913772583, + "logps/chosen": -130.36550903320312, + "logps/rejected": -98.51797485351562, + "loss": 0.5915, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.12960486114025116, + "rewards/margins": 0.28573840856552124, + "rewards/rejected": -0.15613357722759247, "step": 40 }, { "epoch": 0.05, - "grad_norm": 25.0126190198897, + "grad_norm": 25.14816746021617, "learning_rate": 2.604166666666667e-07, - "logits/chosen": -1.67790949344635, - "logits/rejected": -1.6322219371795654, - "logps/chosen": -128.07374572753906, - "logps/rejected": -136.76470947265625, - "loss": 0.4837, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": 0.012722499668598175, - "rewards/margins": 0.6054978370666504, - "rewards/rejected": -0.5927752256393433, + "logits/chosen": -1.6774137020111084, + "logits/rejected": -1.631121039390564, + "logps/chosen": -127.99775695800781, + "logps/rejected": -136.59368896484375, + "loss": 0.484, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": 0.013482272624969482, + "rewards/margins": 0.6045473217964172, + "rewards/rejected": -0.5910650491714478, "step": 50 }, { "epoch": 0.06, - "grad_norm": 24.458659174156402, + "grad_norm": 23.977796197260762, "learning_rate": 3.1249999999999997e-07, - "logits/chosen": -1.6139692068099976, - "logits/rejected": -1.611717939376831, - "logps/chosen": -173.83407592773438, - "logps/rejected": -218.3336944580078, - "loss": 0.3944, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -0.2466244399547577, - "rewards/margins": 1.035771369934082, - "rewards/rejected": -1.282395839691162, + "logits/chosen": -1.612322211265564, + "logits/rejected": -1.6097100973129272, + "logps/chosen": -173.88418579101562, + "logps/rejected": -217.969482421875, + "loss": 0.3947, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -0.24712534248828888, + "rewards/margins": 1.0316286087036133, + "rewards/rejected": -1.2787538766860962, "step": 60 }, { "epoch": 0.07, - "grad_norm": 32.37910330977062, + "grad_norm": 30.66248348091119, "learning_rate": 3.645833333333333e-07, - "logits/chosen": -1.4961981773376465, - "logits/rejected": -1.4705344438552856, - "logps/chosen": -191.92495727539062, - "logps/rejected": -297.68841552734375, + "logits/chosen": -1.4964826107025146, + "logits/rejected": -1.4709423780441284, + "logps/chosen": -192.11148071289062, + "logps/rejected": -297.772705078125, "loss": 0.3475, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.6499562859535217, - "rewards/margins": 1.588413953781128, - "rewards/rejected": -2.238370418548584, + "rewards/chosen": -0.6518217325210571, + "rewards/margins": 1.5873914957046509, + "rewards/rejected": -2.23921275138855, "step": 70 }, { "epoch": 0.08, - "grad_norm": 31.121018022045003, + "grad_norm": 31.503070208040914, "learning_rate": 4.1666666666666667e-07, - "logits/chosen": -1.4610167741775513, - "logits/rejected": -1.3438676595687866, - "logps/chosen": -249.2657470703125, - "logps/rejected": -405.2835693359375, - "loss": 0.2961, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.1774078607559204, - "rewards/margins": 2.065943717956543, - "rewards/rejected": -3.243351697921753, + "logits/chosen": -1.4591631889343262, + "logits/rejected": -1.3411717414855957, + "logps/chosen": -248.903076171875, + "logps/rejected": -403.9586181640625, + "loss": 0.2969, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.1737810373306274, + "rewards/margins": 2.056320905685425, + "rewards/rejected": -3.230102062225342, "step": 80 }, { "epoch": 0.09, - "grad_norm": 34.84421918271163, + "grad_norm": 32.69447381027626, "learning_rate": 4.6874999999999996e-07, - "logits/chosen": -1.295830488204956, - "logits/rejected": -1.2198989391326904, - "logps/chosen": -235.9141082763672, - "logps/rejected": -426.62005615234375, - "loss": 0.2782, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.0462534427642822, - "rewards/margins": 2.420970916748047, - "rewards/rejected": -3.467224597930908, + "logits/chosen": -1.2863261699676514, + "logits/rejected": -1.206099271774292, + "logps/chosen": -229.5722198486328, + "logps/rejected": -416.52410888671875, + "loss": 0.2791, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -0.9828346967697144, + "rewards/margins": 2.383430242538452, + "rewards/rejected": -3.366265058517456, "step": 90 }, { "epoch": 0.1, - "grad_norm": 38.18824950345103, + "grad_norm": 41.9692201061944, "learning_rate": 4.999732492681437e-07, - "logits/chosen": -1.4374310970306396, - "logits/rejected": -1.2723599672317505, - "logps/chosen": -237.91232299804688, - "logps/rejected": -481.5489807128906, - "loss": 0.254, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.0713858604431152, - "rewards/margins": 2.9585039615631104, - "rewards/rejected": -4.0298895835876465, + "logits/chosen": -1.3805668354034424, + "logits/rejected": -1.2119754552841187, + "logps/chosen": -233.46530151367188, + "logps/rejected": -480.0712890625, + "loss": 0.2553, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.0269155502319336, + "rewards/margins": 2.9881973266601562, + "rewards/rejected": -4.01511287689209, "step": 100 }, { "epoch": 0.1, - "eval_logits/chosen": -1.520336627960205, - "eval_logits/rejected": -1.4872528314590454, - "eval_logps/chosen": -814.5384521484375, - "eval_logps/rejected": -908.8818359375, - "eval_loss": 1.4761102199554443, - "eval_rewards/accuracies": 0.58984375, - "eval_rewards/chosen": -5.329049110412598, - "eval_rewards/margins": 0.8821536302566528, - "eval_rewards/rejected": -6.211202144622803, - "eval_runtime": 97.4159, - "eval_samples_per_second": 20.531, - "eval_steps_per_second": 0.328, + "eval_logits/chosen": -1.4310617446899414, + "eval_logits/rejected": -1.3999046087265015, + "eval_logps/chosen": -810.39892578125, + "eval_logps/rejected": -903.4208984375, + "eval_loss": 1.4884623289108276, + "eval_rewards/accuracies": 0.59375, + "eval_rewards/chosen": -5.28765344619751, + "eval_rewards/margins": 0.8689396977424622, + "eval_rewards/rejected": -6.156593322753906, + "eval_runtime": 98.2178, + "eval_samples_per_second": 20.363, + "eval_steps_per_second": 0.326, "step": 100 }, { "epoch": 0.12, - "grad_norm": 32.08336034758986, + "grad_norm": 30.202206469792845, "learning_rate": 4.996723692767926e-07, - "logits/chosen": -1.320703148841858, - "logits/rejected": -1.2061922550201416, - "logps/chosen": -245.25057983398438, - "logps/rejected": -540.9496459960938, - "loss": 0.2321, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -1.2422926425933838, - "rewards/margins": 3.4426703453063965, - "rewards/rejected": -4.684963226318359, + "logits/chosen": -1.2375271320343018, + "logits/rejected": -1.1157643795013428, + "logps/chosen": -244.78817749023438, + "logps/rejected": -537.5816650390625, + "loss": 0.2327, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.237668752670288, + "rewards/margins": 3.4136147499084473, + "rewards/rejected": -4.651283264160156, "step": 110 }, { "epoch": 0.13, - "grad_norm": 42.593718204224416, + "grad_norm": 43.15400681196086, "learning_rate": 4.990375746213598e-07, - "logits/chosen": -1.4631527662277222, - "logits/rejected": -1.3092104196548462, - "logps/chosen": -291.78375244140625, - "logps/rejected": -656.6674194335938, - "loss": 0.2292, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -1.5867398977279663, - "rewards/margins": 4.244009017944336, - "rewards/rejected": -5.830749034881592, + "logits/chosen": -1.3515715599060059, + "logits/rejected": -1.1862618923187256, + "logps/chosen": -273.26910400390625, + "logps/rejected": -611.77587890625, + "loss": 0.2309, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.401593565940857, + "rewards/margins": 3.9802393913269043, + "rewards/rejected": -5.381833076477051, "step": 120 }, { "epoch": 0.14, - "grad_norm": 34.153643966408474, + "grad_norm": 38.15525323839439, "learning_rate": 4.980697142834314e-07, - "logits/chosen": -1.5025428533554077, - "logits/rejected": -1.3630611896514893, - "logps/chosen": -250.1748046875, - "logps/rejected": -576.2555541992188, - "loss": 0.2284, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.1989684104919434, - "rewards/margins": 3.749941349029541, - "rewards/rejected": -4.948909759521484, + "logits/chosen": -1.364355206489563, + "logits/rejected": -1.2210981845855713, + "logps/chosen": -254.6910858154297, + "logps/rejected": -594.147705078125, + "loss": 0.227, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2441309690475464, + "rewards/margins": 3.8836989402770996, + "rewards/rejected": -5.127829551696777, "step": 130 }, { "epoch": 0.15, - "grad_norm": 38.62955098354431, + "grad_norm": 36.393564074980986, "learning_rate": 4.967700826904229e-07, - "logits/chosen": -1.5546451807022095, - "logits/rejected": -1.388270616531372, - "logps/chosen": -287.029541015625, - "logps/rejected": -771.34033203125, - "loss": 0.2221, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.5651198625564575, - "rewards/margins": 5.370504379272461, - "rewards/rejected": -6.935624599456787, + "logits/chosen": -1.4510648250579834, + "logits/rejected": -1.264467716217041, + "logps/chosen": -268.52435302734375, + "logps/rejected": -723.8096923828125, + "loss": 0.2227, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.380068063735962, + "rewards/margins": 5.080250263214111, + "rewards/rejected": -6.460318088531494, "step": 140 }, { "epoch": 0.16, - "grad_norm": 45.61220950393248, + "grad_norm": 38.04347531297831, "learning_rate": 4.951404179843962e-07, - "logits/chosen": -1.5795482397079468, - "logits/rejected": -1.3580058813095093, - "logps/chosen": -292.9833068847656, - "logps/rejected": -779.7533569335938, - "loss": 0.2009, - "rewards/accuracies": 0.90625, - "rewards/chosen": -1.4337162971496582, - "rewards/margins": 5.4721760749816895, - "rewards/rejected": -6.905892848968506, + "logits/chosen": -1.4860388040542603, + "logits/rejected": -1.2345077991485596, + "logps/chosen": -277.77691650390625, + "logps/rejected": -736.511962890625, + "loss": 0.2023, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2816520929336548, + "rewards/margins": 5.191825866699219, + "rewards/rejected": -6.473477363586426, "step": 150 }, { "epoch": 0.17, - "grad_norm": 40.221682931286466, + "grad_norm": 32.12468091191168, "learning_rate": 4.931828996974498e-07, - "logits/chosen": -1.5406692028045654, - "logits/rejected": -1.3174465894699097, - "logps/chosen": -287.9991760253906, - "logps/rejected": -712.6685791015625, - "loss": 0.2152, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5834462642669678, - "rewards/margins": 4.822080135345459, - "rewards/rejected": -6.405526161193848, + "logits/chosen": -1.3758165836334229, + "logits/rejected": -1.1255621910095215, + "logps/chosen": -280.8691101074219, + "logps/rejected": -739.5682983398438, + "loss": 0.2119, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.5121452808380127, + "rewards/margins": 5.162380218505859, + "rewards/rejected": -6.674524784088135, "step": 160 }, { "epoch": 0.18, - "grad_norm": 52.489731314855526, + "grad_norm": 47.564638833055, "learning_rate": 4.909001458367866e-07, - "logits/chosen": -1.5561904907226562, - "logits/rejected": -1.3703653812408447, - "logps/chosen": -361.0309753417969, - "logps/rejected": -811.6133422851562, - "loss": 0.1911, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.442492961883545, - "rewards/margins": 4.990392208099365, - "rewards/rejected": -7.432885646820068, + "logits/chosen": -1.48409104347229, + "logits/rejected": -1.2840547561645508, + "logps/chosen": -345.6038513183594, + "logps/rejected": -792.1544189453125, + "loss": 0.1882, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.288221597671509, + "rewards/margins": 4.950075626373291, + "rewards/rejected": -7.2382965087890625, "step": 170 }, { "epoch": 0.19, - "grad_norm": 44.88732370136967, + "grad_norm": 47.35879767054759, "learning_rate": 4.882952093833627e-07, - "logits/chosen": -1.5479228496551514, - "logits/rejected": -1.402629017829895, - "logps/chosen": -337.51568603515625, - "logps/rejected": -855.9022216796875, - "loss": 0.2064, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.006166458129883, - "rewards/margins": 5.707094192504883, - "rewards/rejected": -7.713259696960449, + "logits/chosen": -1.5535860061645508, + "logits/rejected": -1.3911080360412598, + "logps/chosen": -306.82275390625, + "logps/rejected": -747.5235595703125, + "loss": 0.1986, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6992374658584595, + "rewards/margins": 4.930235385894775, + "rewards/rejected": -6.6294732093811035, "step": 180 }, { "epoch": 0.2, - "grad_norm": 39.308543148166194, + "grad_norm": 45.79057257125377, "learning_rate": 4.853715742087946e-07, - "logits/chosen": -1.4547832012176514, - "logits/rejected": -1.2370269298553467, - "logps/chosen": -310.5637512207031, - "logps/rejected": -798.4898071289062, - "loss": 0.1865, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.8440923690795898, - "rewards/margins": 5.3871355056762695, - "rewards/rejected": -7.231228828430176, + "logits/chosen": -1.381561279296875, + "logits/rejected": -1.1339786052703857, + "logps/chosen": -373.6202087402344, + "logps/rejected": -895.4216918945312, + "loss": 0.1816, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.474656820297241, + "rewards/margins": 5.725890159606934, + "rewards/rejected": -8.20054817199707, "step": 190 }, { "epoch": 0.21, - "grad_norm": 37.85441526280708, + "grad_norm": 47.26352047075587, "learning_rate": 4.821331504159906e-07, - "logits/chosen": -1.3665611743927002, - "logits/rejected": -1.1807641983032227, - "logps/chosen": -266.3150939941406, - "logps/rejected": -686.4867553710938, - "loss": 0.1844, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.5169986486434937, - "rewards/margins": 4.59818172454834, - "rewards/rejected": -6.115180015563965, + "logits/chosen": -1.174945592880249, + "logits/rejected": -0.9508455395698547, + "logps/chosen": -356.3412780761719, + "logps/rejected": -835.17041015625, + "loss": 0.1835, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.4172606468200684, + "rewards/margins": 5.184757232666016, + "rewards/rejected": -7.602017402648926, "step": 200 }, { "epoch": 0.21, - "eval_logits/chosen": -1.4664057493209839, - "eval_logits/rejected": -1.4103434085845947, - "eval_logps/chosen": -897.182373046875, - "eval_logps/rejected": -1032.5726318359375, - "eval_loss": 1.7253305912017822, - "eval_rewards/accuracies": 0.61328125, - "eval_rewards/chosen": -6.15548849105835, - "eval_rewards/margins": 1.2926223278045654, - "eval_rewards/rejected": -7.448111534118652, - "eval_runtime": 97.4439, - "eval_samples_per_second": 20.525, - "eval_steps_per_second": 0.328, + "eval_logits/chosen": -1.1731781959533691, + "eval_logits/rejected": -1.0838268995285034, + "eval_logps/chosen": -985.7893676757812, + "eval_logps/rejected": -1125.40234375, + "eval_loss": 1.7331193685531616, + "eval_rewards/accuracies": 0.640625, + "eval_rewards/chosen": -7.041557312011719, + "eval_rewards/margins": 1.3348509073257446, + "eval_rewards/rejected": -8.376408576965332, + "eval_runtime": 98.103, + "eval_samples_per_second": 20.387, + "eval_steps_per_second": 0.326, "step": 200 }, { "epoch": 0.22, - "grad_norm": 35.05458176508214, + "grad_norm": 36.03939088009213, "learning_rate": 4.785842691097342e-07, - "logits/chosen": -1.35039484500885, - "logits/rejected": -1.0796902179718018, - "logps/chosen": -383.1498107910156, - "logps/rejected": -1000.6253662109375, - "loss": 0.1702, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.5290865898132324, - "rewards/margins": 6.735090732574463, - "rewards/rejected": -9.264177322387695, + "logits/chosen": -1.1594005823135376, + "logits/rejected": -0.8251422047615051, + "logps/chosen": -394.606201171875, + "logps/rejected": -985.1688232421875, + "loss": 0.1713, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.6436514854431152, + "rewards/margins": 6.4659600257873535, + "rewards/rejected": -9.109611511230469, "step": 210 }, { "epoch": 0.23, - "grad_norm": 48.27429845185424, + "grad_norm": 43.43156507705205, "learning_rate": 4.7472967660421603e-07, - "logits/chosen": -1.0133236646652222, - "logits/rejected": -0.6695674657821655, - "logps/chosen": -290.65142822265625, - "logps/rejected": -783.7594604492188, - "loss": 0.1754, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.5812921524047852, - "rewards/margins": 5.510292053222656, - "rewards/rejected": -7.0915846824646, + "logits/chosen": -1.2681281566619873, + "logits/rejected": -0.9658529162406921, + "logps/chosen": -329.4635314941406, + "logps/rejected": -877.4290161132812, + "loss": 0.1768, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.9694128036499023, + "rewards/margins": 6.058867454528809, + "rewards/rejected": -8.028280258178711, "step": 220 }, { "epoch": 0.24, - "grad_norm": 41.00379708475208, + "grad_norm": 39.5007085264266, "learning_rate": 4.705745280752585e-07, - "logits/chosen": -0.7725291848182678, - "logits/rejected": -0.4129869341850281, - "logps/chosen": -351.3033752441406, - "logps/rejected": -841.9267578125, - "loss": 0.2017, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -2.1709389686584473, - "rewards/margins": 5.4972968101501465, - "rewards/rejected": -7.668235778808594, + "logits/chosen": -1.160872220993042, + "logits/rejected": -0.8863369822502136, + "logps/chosen": -309.687255859375, + "logps/rejected": -741.9949951171875, + "loss": 0.1887, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7547776699066162, + "rewards/margins": 4.914141654968262, + "rewards/rejected": -6.668919563293457, "step": 230 }, { "epoch": 0.25, - "grad_norm": 38.12279956674338, + "grad_norm": 46.06005320900485, "learning_rate": 4.6612438066572555e-07, - "logits/chosen": -0.4965124726295471, - "logits/rejected": -0.0673198476433754, - "logps/chosen": -287.70379638671875, - "logps/rejected": -758.7793579101562, - "loss": 0.1843, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.6674537658691406, - "rewards/margins": 5.193233966827393, - "rewards/rejected": -6.860687255859375, + "logits/chosen": -1.1971551179885864, + "logits/rejected": -0.9917539358139038, + "logps/chosen": -311.27239990234375, + "logps/rejected": -785.2099609375, + "loss": 0.1888, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.9031397104263306, + "rewards/margins": 5.221855163574219, + "rewards/rejected": -7.12499475479126, "step": 240 }, { "epoch": 0.26, - "grad_norm": 41.10699952348945, + "grad_norm": 33.36869122705678, "learning_rate": 4.6138518605333664e-07, - "logits/chosen": -0.6868407726287842, - "logits/rejected": -0.13767887651920319, - "logps/chosen": -342.7250671386719, - "logps/rejected": -987.6883544921875, - "loss": 0.1737, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.031644344329834, - "rewards/margins": 6.998709678649902, - "rewards/rejected": -9.030354499816895, + "logits/chosen": -1.2847325801849365, + "logits/rejected": -1.0283781290054321, + "logps/chosen": -380.08062744140625, + "logps/rejected": -1023.7361450195312, + "loss": 0.1712, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.4051995277404785, + "rewards/margins": 6.98563289642334, + "rewards/rejected": -9.39083194732666, "step": 250 }, { "epoch": 0.27, - "grad_norm": 36.89441241984481, + "grad_norm": 38.28924317534368, "learning_rate": 4.5636328249082514e-07, - "logits/chosen": -0.46804434061050415, - "logits/rejected": 0.16666679084300995, - "logps/chosen": -385.1826171875, - "logps/rejected": -1048.572021484375, - "loss": 0.1683, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.4862778186798096, - "rewards/margins": 7.211634635925293, - "rewards/rejected": -9.697912216186523, + "logits/chosen": -1.1341345310211182, + "logits/rejected": -0.7906533479690552, + "logps/chosen": -371.534912109375, + "logps/rejected": -1009.8132934570312, + "loss": 0.1647, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.3498005867004395, + "rewards/margins": 6.960524082183838, + "rewards/rejected": -9.310324668884277, "step": 260 }, { "epoch": 0.28, - "grad_norm": 33.10469086445788, + "grad_norm": 49.757488424994264, "learning_rate": 4.510653863290871e-07, - "logits/chosen": -0.6870437860488892, - "logits/rejected": -0.09238968789577484, - "logps/chosen": -350.16400146484375, - "logps/rejected": -1004.6405029296875, - "loss": 0.1688, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.0535225868225098, - "rewards/margins": 7.145571708679199, - "rewards/rejected": -9.199094772338867, + "logits/chosen": -1.1997982263565063, + "logits/rejected": -0.7962481379508972, + "logps/chosen": -366.2351379394531, + "logps/rejected": -1031.2001953125, + "loss": 0.1742, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.214233636856079, + "rewards/margins": 7.250457763671875, + "rewards/rejected": -9.464692115783691, "step": 270 }, { "epoch": 0.29, - "grad_norm": 36.36855269897335, + "grad_norm": 42.70778586845113, "learning_rate": 4.4549858303465737e-07, - "logits/chosen": -0.6977792382240295, - "logits/rejected": -0.09684981405735016, - "logps/chosen": -377.00201416015625, - "logps/rejected": -1076.9189453125, - "loss": 0.1621, + "logits/chosen": -0.9916412234306335, + "logits/rejected": -0.41027259826660156, + "logps/chosen": -407.8173828125, + "logps/rejected": -1142.300537109375, + "loss": 0.1684, "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.410522222518921, - "rewards/margins": 7.565877437591553, - "rewards/rejected": -9.976398468017578, + "rewards/chosen": -2.7186763286590576, + "rewards/margins": 7.9115400314331055, + "rewards/rejected": -10.630216598510742, "step": 280 }, { "epoch": 0.3, - "grad_norm": 31.447925234202767, + "grad_norm": 36.24189996324408, "learning_rate": 4.396703177135261e-07, - "logits/chosen": -0.9086171984672546, - "logits/rejected": -0.4163902699947357, - "logps/chosen": -320.157958984375, - "logps/rejected": -847.1726684570312, - "loss": 0.1843, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.8222324848175049, - "rewards/margins": 5.811877250671387, - "rewards/rejected": -7.6341094970703125, + "logits/chosen": -0.9031866192817688, + "logits/rejected": -0.3197212815284729, + "logps/chosen": -304.7436218261719, + "logps/rejected": -793.80126953125, + "loss": 0.1836, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.6680891513824463, + "rewards/margins": 5.43230676651001, + "rewards/rejected": -7.100395202636719, "step": 290 }, { "epoch": 0.31, - "grad_norm": 29.489990961921738, + "grad_norm": 37.54329943450868, "learning_rate": 4.335883851539693e-07, - "logits/chosen": -0.8754051327705383, - "logits/rejected": -0.39787793159484863, - "logps/chosen": -303.1324157714844, - "logps/rejected": -911.4088745117188, - "loss": 0.1635, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -1.7049236297607422, - "rewards/margins": 6.647863864898682, - "rewards/rejected": -8.352787017822266, + "logits/chosen": -0.6530806422233582, + "logits/rejected": -0.020840564742684364, + "logps/chosen": -314.2895202636719, + "logps/rejected": -932.5090942382812, + "loss": 0.1686, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.8164947032928467, + "rewards/margins": 6.747295379638672, + "rewards/rejected": -8.563791275024414, "step": 300 }, { "epoch": 0.31, - "eval_logits/chosen": -0.7514631748199463, - "eval_logits/rejected": -0.6256809234619141, - "eval_logps/chosen": -899.3142700195312, - "eval_logps/rejected": -1026.9749755859375, - "eval_loss": 1.6676901578903198, - "eval_rewards/accuracies": 0.59375, - "eval_rewards/chosen": -6.176807403564453, - "eval_rewards/margins": 1.215327262878418, - "eval_rewards/rejected": -7.392133712768555, - "eval_runtime": 97.2926, - "eval_samples_per_second": 20.557, - "eval_steps_per_second": 0.329, + "eval_logits/chosen": -0.4381692707538605, + "eval_logits/rejected": -0.26312655210494995, + "eval_logps/chosen": -962.072021484375, + "eval_logps/rejected": -1124.1737060546875, + "eval_loss": 1.840578556060791, + "eval_rewards/accuracies": 0.62109375, + "eval_rewards/chosen": -6.804385185241699, + "eval_rewards/margins": 1.5597366094589233, + "eval_rewards/rejected": -8.364120483398438, + "eval_runtime": 98.1242, + "eval_samples_per_second": 20.382, + "eval_steps_per_second": 0.326, "step": 300 }, { "epoch": 0.32, - "grad_norm": 36.71426987222159, + "grad_norm": 41.15609667911649, "learning_rate": 4.272609194017105e-07, - "logits/chosen": -0.7960424423217773, - "logits/rejected": -0.22463683784008026, - "logps/chosen": -326.64764404296875, - "logps/rejected": -895.4781494140625, - "loss": 0.1774, + "logits/chosen": -0.6619779467582703, + "logits/rejected": 0.0384717658162117, + "logps/chosen": -322.21038818359375, + "logps/rejected": -913.73095703125, + "loss": 0.183, "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.8289636373519897, - "rewards/margins": 6.345601558685303, - "rewards/rejected": -8.174566268920898, + "rewards/chosen": -1.7845910787582397, + "rewards/margins": 6.572502136230469, + "rewards/rejected": -8.357094764709473, "step": 310 }, { "epoch": 0.33, - "grad_norm": 29.01401480971697, + "grad_norm": 37.28649615999031, "learning_rate": 4.2069638288135547e-07, - "logits/chosen": -0.2821110785007477, - "logits/rejected": 0.2019030600786209, - "logps/chosen": -327.86187744140625, - "logps/rejected": -967.6357421875, - "loss": 0.1667, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.0269737243652344, - "rewards/margins": 6.851207733154297, - "rewards/rejected": -8.878181457519531, + "logits/chosen": -0.23517127335071564, + "logits/rejected": 0.2959689497947693, + "logps/chosen": -391.95355224609375, + "logps/rejected": -1064.50634765625, + "loss": 0.1752, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.667890787124634, + "rewards/margins": 7.178997039794922, + "rewards/rejected": -9.846887588500977, "step": 320 }, { "epoch": 0.35, - "grad_norm": 27.12838321931205, + "grad_norm": 31.072373142049294, "learning_rate": 4.139035550786494e-07, - "logits/chosen": -0.5025689005851746, - "logits/rejected": 0.3045334815979004, - "logps/chosen": -339.73321533203125, - "logps/rejected": -999.3941650390625, - "loss": 0.1479, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.077441453933716, - "rewards/margins": 7.147647857666016, - "rewards/rejected": -9.225088119506836, + "logits/chosen": -0.7624028921127319, + "logits/rejected": -0.01777508296072483, + "logps/chosen": -289.4612731933594, + "logps/rejected": -823.2060546875, + "loss": 0.155, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5747220516204834, + "rewards/margins": 5.888485431671143, + "rewards/rejected": -7.463207244873047, "step": 330 }, { "epoch": 0.36, - "grad_norm": 38.66513102667531, + "grad_norm": 36.47785538155409, "learning_rate": 4.0689152079869306e-07, - "logits/chosen": -0.48522940278053284, - "logits/rejected": 0.2592470049858093, - "logps/chosen": -318.987060546875, - "logps/rejected": -1009.4090576171875, - "loss": 0.1602, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -1.873308539390564, - "rewards/margins": 7.443674564361572, - "rewards/rejected": -9.316983222961426, + "logits/chosen": -0.49213218688964844, + "logits/rejected": 0.3138001263141632, + "logps/chosen": -371.17645263671875, + "logps/rejected": -1086.1993408203125, + "loss": 0.1564, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.39520263671875, + "rewards/margins": 7.689682960510254, + "rewards/rejected": -10.08488655090332, "step": 340 }, { "epoch": 0.37, - "grad_norm": 33.031074442796964, + "grad_norm": 31.35948083223695, "learning_rate": 3.99669658015821e-07, - "logits/chosen": -0.2326478660106659, - "logits/rejected": 0.3859787583351135, - "logps/chosen": -323.79840087890625, - "logps/rejected": -1037.3876953125, - "loss": 0.1571, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -1.968605637550354, - "rewards/margins": 7.629488468170166, - "rewards/rejected": -9.59809398651123, + "logits/chosen": -0.6193244457244873, + "logits/rejected": -0.00405901949852705, + "logps/chosen": -344.1478271484375, + "logps/rejected": -1085.7281494140625, + "loss": 0.1596, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.172100305557251, + "rewards/margins": 7.909397125244141, + "rewards/rejected": -10.081499099731445, "step": 350 }, { "epoch": 0.38, - "grad_norm": 22.066600953398503, + "grad_norm": 30.302711657458016, "learning_rate": 3.92247625331392e-07, - "logits/chosen": -0.21493081748485565, - "logits/rejected": 0.5706053972244263, - "logps/chosen": -320.01806640625, - "logps/rejected": -916.8917846679688, - "loss": 0.1613, + "logits/chosen": -0.5307037234306335, + "logits/rejected": 0.21387667953968048, + "logps/chosen": -415.412109375, + "logps/rejected": -1183.3756103515625, + "loss": 0.1566, "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -1.859118103981018, - "rewards/margins": 6.541051387786865, - "rewards/rejected": -8.40017032623291, + "rewards/chosen": -2.8130581378936768, + "rewards/margins": 8.25195026397705, + "rewards/rejected": -11.065008163452148, "step": 360 }, { "epoch": 0.39, - "grad_norm": 22.111157147625544, + "grad_norm": 26.81531438398846, "learning_rate": 3.846353490562664e-07, - "logits/chosen": -0.13797323405742645, - "logits/rejected": 0.591605544090271, - "logps/chosen": -353.294921875, - "logps/rejected": -1085.5618896484375, - "loss": 0.1497, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.195202112197876, - "rewards/margins": 7.859287261962891, - "rewards/rejected": -10.054490089416504, + "logits/chosen": -0.48204341530799866, + "logits/rejected": 0.22626984119415283, + "logps/chosen": -382.4269104003906, + "logps/rejected": -1170.801025390625, + "loss": 0.1601, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4865219593048096, + "rewards/margins": 8.420357704162598, + "rewards/rejected": -10.906880378723145, "step": 370 }, { "epoch": 0.4, - "grad_norm": 28.957108905206724, + "grad_norm": 30.212433709316432, "learning_rate": 3.768430099352445e-07, - "logits/chosen": -0.25790831446647644, - "logits/rejected": 0.3344423174858093, - "logps/chosen": -380.2027282714844, - "logps/rejected": -1103.0699462890625, - "loss": 0.1628, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.5323684215545654, - "rewards/margins": 7.7269287109375, - "rewards/rejected": -10.259297370910645, + "logits/chosen": -0.6095079183578491, + "logits/rejected": 0.011866395361721516, + "logps/chosen": -380.66131591796875, + "logps/rejected": -1092.6241455078125, + "loss": 0.1648, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.536954402923584, + "rewards/margins": 7.617884159088135, + "rewards/rejected": -10.154839515686035, "step": 380 }, { "epoch": 0.41, - "grad_norm": 28.445219714861263, + "grad_norm": 29.660841044507983, "learning_rate": 3.6888102953122304e-07, - "logits/chosen": -0.47374868392944336, - "logits/rejected": 0.09608779847621918, - "logps/chosen": -365.1648254394531, - "logps/rejected": -1052.339111328125, - "loss": 0.1582, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.427854061126709, - "rewards/margins": 7.350833892822266, - "rewards/rejected": -9.778688430786133, + "logits/chosen": -0.9982369542121887, + "logits/rejected": -0.5453655123710632, + "logps/chosen": -331.37579345703125, + "logps/rejected": -994.7216796875, + "loss": 0.1609, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.089963674545288, + "rewards/margins": 7.112550258636475, + "rewards/rejected": -9.2025146484375, "step": 390 }, { "epoch": 0.42, - "grad_norm": 26.329510138760806, + "grad_norm": 32.29658762698521, "learning_rate": 3.607600562872785e-07, - "logits/chosen": -0.4803605079650879, - "logits/rejected": -0.029133472591638565, - "logps/chosen": -345.6582336425781, - "logps/rejected": -932.9094848632812, - "loss": 0.1606, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.222365379333496, - "rewards/margins": 6.369531154632568, - "rewards/rejected": -8.591897010803223, + "logits/chosen": -0.9000040292739868, + "logits/rejected": -0.48531150817871094, + "logps/chosen": -376.4362487792969, + "logps/rejected": -1037.1690673828125, + "loss": 0.1652, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.5301454067230225, + "rewards/margins": 7.104345798492432, + "rewards/rejected": -9.634490966796875, "step": 400 }, { "epoch": 0.42, - "eval_logits/chosen": -0.6142900586128235, - "eval_logits/rejected": -0.4797673225402832, - "eval_logps/chosen": -989.371826171875, - "eval_logps/rejected": -1133.77001953125, - "eval_loss": 2.030726194381714, - "eval_rewards/accuracies": 0.6015625, - "eval_rewards/chosen": -7.077382564544678, - "eval_rewards/margins": 1.3827017545700073, - "eval_rewards/rejected": -8.460084915161133, - "eval_runtime": 97.3894, - "eval_samples_per_second": 20.536, - "eval_steps_per_second": 0.329, + "eval_logits/chosen": -0.9962400794029236, + "eval_logits/rejected": -0.8431026935577393, + "eval_logps/chosen": -1031.8543701171875, + "eval_logps/rejected": -1193.3768310546875, + "eval_loss": 2.2100112438201904, + "eval_rewards/accuracies": 0.61328125, + "eval_rewards/chosen": -7.502209663391113, + "eval_rewards/margins": 1.5539427995681763, + "eval_rewards/rejected": -9.056151390075684, + "eval_runtime": 98.118, + "eval_samples_per_second": 20.384, + "eval_steps_per_second": 0.326, "step": 400 }, { "epoch": 0.43, - "grad_norm": 30.32229808234622, + "grad_norm": 34.0414070298422, "learning_rate": 3.5249095128531856e-07, - "logits/chosen": -0.6582576036453247, - "logits/rejected": -0.05353846028447151, - "logps/chosen": -382.0814514160156, - "logps/rejected": -1017.8347778320312, - "loss": 0.1701, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -2.4453189373016357, - "rewards/margins": 6.919729709625244, - "rewards/rejected": -9.365047454833984, + "logits/chosen": -1.0619244575500488, + "logits/rejected": -0.5218402743339539, + "logps/chosen": -394.0702819824219, + "logps/rejected": -1090.901611328125, + "loss": 0.1732, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.565207004547119, + "rewards/margins": 7.530509948730469, + "rewards/rejected": -10.09571647644043, "step": 410 }, { "epoch": 0.44, - "grad_norm": 26.65710867508508, + "grad_norm": 27.417114292284587, "learning_rate": 3.4408477372034736e-07, - "logits/chosen": -0.6557037830352783, - "logits/rejected": -0.18612375855445862, - "logps/chosen": -369.58209228515625, - "logps/rejected": -1136.3447265625, - "loss": 0.154, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.332486867904663, - "rewards/margins": 8.186752319335938, - "rewards/rejected": -10.51923942565918, + "logits/chosen": -1.0055277347564697, + "logits/rejected": -0.6057981848716736, + "logps/chosen": -365.41180419921875, + "logps/rejected": -1116.855224609375, + "loss": 0.1549, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.2907841205596924, + "rewards/margins": 8.033559799194336, + "rewards/rejected": -10.324345588684082, "step": 420 }, { "epoch": 0.45, - "grad_norm": 26.24483385496738, + "grad_norm": 32.7011062848805, "learning_rate": 3.3555276610977276e-07, - "logits/chosen": -0.7492531538009644, - "logits/rejected": -0.0845475047826767, - "logps/chosen": -396.35797119140625, - "logps/rejected": -1169.997802734375, - "loss": 0.1583, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.655913829803467, - "rewards/margins": 8.302931785583496, - "rewards/rejected": -10.958845138549805, + "logits/chosen": -1.022843837738037, + "logits/rejected": -0.4059663712978363, + "logps/chosen": -383.0908203125, + "logps/rejected": -1129.064208984375, + "loss": 0.1608, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.523242235183716, + "rewards/margins": 8.026267051696777, + "rewards/rejected": -10.549509048461914, "step": 430 }, { "epoch": 0.46, - "grad_norm": 30.73000357585953, + "grad_norm": 42.12067109994041, "learning_rate": 3.269063392575352e-07, - "logits/chosen": -0.9397071599960327, - "logits/rejected": -0.30021554231643677, - "logps/chosen": -316.59466552734375, - "logps/rejected": -935.5177001953125, - "loss": 0.1863, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.8889516592025757, - "rewards/margins": 6.711340427398682, - "rewards/rejected": -8.600292205810547, + "logits/chosen": -1.0300745964050293, + "logits/rejected": -0.3939720690250397, + "logps/chosen": -334.4612731933594, + "logps/rejected": -978.0406494140625, + "loss": 0.1887, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.067617654800415, + "rewards/margins": 6.957903861999512, + "rewards/rejected": -9.025522232055664, "step": 440 }, { "epoch": 0.47, - "grad_norm": 37.85434716946158, + "grad_norm": 35.849772356982186, "learning_rate": 3.1815705699316964e-07, - "logits/chosen": -0.8938137292861938, - "logits/rejected": -0.3564215898513794, - "logps/chosen": -316.93023681640625, - "logps/rejected": -954.3201904296875, - "loss": 0.1637, + "logits/chosen": -0.9388716816902161, + "logits/rejected": -0.35359424352645874, + "logps/chosen": -347.19891357421875, + "logps/rejected": -1034.7232666015625, + "loss": 0.161, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.857377290725708, - "rewards/margins": 6.8977460861206055, - "rewards/rejected": -8.75512409210205, + "rewards/chosen": -2.1600635051727295, + "rewards/margins": 7.3990912437438965, + "rewards/rejected": -9.55915355682373, "step": 450 }, { "epoch": 0.48, - "grad_norm": 47.07671687902545, + "grad_norm": 39.151807389239345, "learning_rate": 3.0931662070620794e-07, - "logits/chosen": -0.8714531660079956, - "logits/rejected": -0.3571350872516632, - "logps/chosen": -362.056396484375, - "logps/rejected": -1101.175048828125, - "loss": 0.166, + "logits/chosen": -0.8643101453781128, + "logits/rejected": -0.27845874428749084, + "logps/chosen": -354.6743469238281, + "logps/rejected": -1072.4947509765625, + "loss": 0.1681, "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.2315657138824463, - "rewards/margins": 7.899203300476074, - "rewards/rejected": -10.130769729614258, + "rewards/chosen": -2.157745599746704, + "rewards/margins": 7.686221122741699, + "rewards/rejected": -9.843966484069824, "step": 460 }, { "epoch": 0.49, - "grad_norm": 41.83209910846531, + "grad_norm": 32.7011429368001, "learning_rate": 3.003968536966078e-07, - "logits/chosen": -0.987261176109314, - "logits/rejected": -0.44738197326660156, - "logps/chosen": -340.4277648925781, - "logps/rejected": -1015.7706909179688, - "loss": 0.1612, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.083724021911621, - "rewards/margins": 7.344973087310791, - "rewards/rejected": -9.428696632385254, + "logits/chosen": -1.019325613975525, + "logits/rejected": -0.4210268557071686, + "logps/chosen": -332.55950927734375, + "logps/rejected": -982.16064453125, + "loss": 0.1643, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.0050415992736816, + "rewards/margins": 7.087553977966309, + "rewards/rejected": -9.092596054077148, "step": 470 }, { "epoch": 0.5, - "grad_norm": 70.09766166369198, + "grad_norm": 52.104303825235235, "learning_rate": 2.9140968536213693e-07, - "logits/chosen": -0.9282326698303223, - "logits/rejected": -0.4145265519618988, - "logps/chosen": -369.92498779296875, - "logps/rejected": -1071.494384765625, - "loss": 0.1731, + "logits/chosen": -0.9672222137451172, + "logits/rejected": -0.40181833505630493, + "logps/chosen": -371.7021789550781, + "logps/rejected": -1079.9622802734375, + "loss": 0.1754, "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -2.441403865814209, - "rewards/margins": 7.5186967849731445, - "rewards/rejected": -9.960100173950195, + "rewards/chosen": -2.4591755867004395, + "rewards/margins": 7.585604667663574, + "rewards/rejected": -10.044779777526855, "step": 480 }, { "epoch": 0.51, - "grad_norm": 31.072373563077146, + "grad_norm": 26.60910675405901, "learning_rate": 2.823671352438608e-07, - "logits/chosen": -0.7097938656806946, - "logits/rejected": -0.2332497388124466, - "logps/chosen": -376.07647705078125, - "logps/rejected": -1070.494873046875, - "loss": 0.1513, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.5956947803497314, - "rewards/margins": 7.397839546203613, - "rewards/rejected": -9.993532180786133, + "logits/chosen": -0.7346346378326416, + "logits/rejected": -0.19799475371837616, + "logps/chosen": -343.676025390625, + "logps/rejected": -995.9855346679688, + "loss": 0.155, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.2716901302337646, + "rewards/margins": 6.976748466491699, + "rewards/rejected": -9.24843978881836, "step": 490 }, { "epoch": 0.52, - "grad_norm": 39.48800882374922, + "grad_norm": 33.38193493130752, "learning_rate": 2.73281296951072e-07, - "logits/chosen": -1.0332391262054443, - "logits/rejected": -0.47291478514671326, - "logps/chosen": -354.30419921875, - "logps/rejected": -1030.0706787109375, - "loss": 0.163, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -2.254275321960449, - "rewards/margins": 7.255748748779297, - "rewards/rejected": -9.51002311706543, + "logits/chosen": -0.8303977847099304, + "logits/rejected": -0.23081405460834503, + "logps/chosen": -360.92291259765625, + "logps/rejected": -1006.4913940429688, + "loss": 0.1641, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.320462703704834, + "rewards/margins": 6.953768730163574, + "rewards/rejected": -9.274230003356934, "step": 500 }, { "epoch": 0.52, - "eval_logits/chosen": -0.9380449652671814, - "eval_logits/rejected": -0.8136107921600342, - "eval_logps/chosen": -936.579345703125, - "eval_logps/rejected": -1091.4378662109375, - "eval_loss": 1.8216179609298706, - "eval_rewards/accuracies": 0.58984375, - "eval_rewards/chosen": -6.549457550048828, - "eval_rewards/margins": 1.487306833267212, - "eval_rewards/rejected": -8.036764144897461, - "eval_runtime": 97.3225, - "eval_samples_per_second": 20.55, - "eval_steps_per_second": 0.329, + "eval_logits/chosen": -0.6576467752456665, + "eval_logits/rejected": -0.4577823877334595, + "eval_logps/chosen": -968.23291015625, + "eval_logps/rejected": -1137.78076171875, + "eval_loss": 1.7548068761825562, + "eval_rewards/accuracies": 0.62109375, + "eval_rewards/chosen": -6.865993499755859, + "eval_rewards/margins": 1.6341992616653442, + "eval_rewards/rejected": -8.500192642211914, + "eval_runtime": 98.2473, + "eval_samples_per_second": 20.357, + "eval_steps_per_second": 0.326, "step": 500 }, { "epoch": 0.53, - "grad_norm": 36.00441662885456, + "grad_norm": 31.189737216776532, "learning_rate": 2.641643219871597e-07, - "logits/chosen": -1.147862195968628, - "logits/rejected": -0.4513426721096039, - "logps/chosen": -346.061767578125, - "logps/rejected": -974.8058471679688, - "loss": 0.1448, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -1.9859822988510132, - "rewards/margins": 6.95660924911499, - "rewards/rejected": -8.942591667175293, + "logits/chosen": -0.9037526249885559, + "logits/rejected": -0.13355329632759094, + "logps/chosen": -379.1105041503906, + "logps/rejected": -1011.2286987304688, + "loss": 0.1503, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.316470146179199, + "rewards/margins": 6.990350246429443, + "rewards/rejected": -9.306819915771484, "step": 510 }, { "epoch": 0.54, - "grad_norm": 32.51878519513302, + "grad_norm": 31.41583601772438, "learning_rate": 2.550284034980507e-07, - "logits/chosen": -1.13883638381958, - "logits/rejected": -0.4823761582374573, - "logps/chosen": -372.8052062988281, - "logps/rejected": -1022.6721801757812, - "loss": 0.1752, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.303035020828247, - "rewards/margins": 7.056814670562744, - "rewards/rejected": -9.35984992980957, + "logits/chosen": -1.062774896621704, + "logits/rejected": -0.34700798988342285, + "logps/chosen": -379.5553283691406, + "logps/rejected": -1013.3961791992188, + "loss": 0.1702, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.3705356121063232, + "rewards/margins": 6.896553039550781, + "rewards/rejected": -9.267087936401367, "step": 520 }, { "epoch": 0.55, - "grad_norm": 22.31604773564865, + "grad_norm": 31.070313923011998, "learning_rate": 2.4588575996495794e-07, - "logits/chosen": -1.2034275531768799, - "logits/rejected": -0.5457441210746765, - "logps/chosen": -309.73455810546875, - "logps/rejected": -966.7014770507812, - "loss": 0.1395, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -1.8016897439956665, - "rewards/margins": 7.100803375244141, - "rewards/rejected": -8.902493476867676, + "logits/chosen": -1.191855788230896, + "logits/rejected": -0.5275939702987671, + "logps/chosen": -335.30316162109375, + "logps/rejected": -1002.8668212890625, + "loss": 0.14, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.05737566947937, + "rewards/margins": 7.206770896911621, + "rewards/rejected": -9.264147758483887, "step": 530 }, { "epoch": 0.57, - "grad_norm": 25.730552994224812, + "grad_norm": 30.413248217420676, "learning_rate": 2.367486188632446e-07, - "logits/chosen": -0.8954168558120728, - "logits/rejected": -0.3190861642360687, - "logps/chosen": -355.24444580078125, - "logps/rejected": -993.5911865234375, - "loss": 0.1404, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.2296109199523926, - "rewards/margins": 6.92122745513916, - "rewards/rejected": -9.150837898254395, + "logits/chosen": -0.7969384789466858, + "logits/rejected": -0.15588393807411194, + "logps/chosen": -376.9305725097656, + "logps/rejected": -1036.6219482421875, + "loss": 0.1454, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.446472644805908, + "rewards/margins": 7.134673118591309, + "rewards/rejected": -9.581145286560059, "step": 540 }, { "epoch": 0.58, - "grad_norm": 31.70801335762852, + "grad_norm": 32.84402177800055, "learning_rate": 2.276292003092593e-07, - "logits/chosen": -0.8856201171875, - "logits/rejected": -0.2036731243133545, - "logps/chosen": -384.66900634765625, - "logps/rejected": -1095.9716796875, - "loss": 0.1507, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.6011099815368652, - "rewards/margins": 7.6391730308532715, - "rewards/rejected": -10.24028205871582, + "logits/chosen": -0.8962065577507019, + "logits/rejected": -0.1975441575050354, + "logps/chosen": -391.3857727050781, + "logps/rejected": -1091.1324462890625, + "loss": 0.1598, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.6682772636413574, + "rewards/margins": 7.523612976074219, + "rewards/rejected": -10.191889762878418, "step": 550 }, { "epoch": 0.59, - "grad_norm": 33.368612321336464, + "grad_norm": 41.82646456808658, "learning_rate": 2.185397007170141e-07, - "logits/chosen": -0.6942281126976013, - "logits/rejected": -0.06129706650972366, - "logps/chosen": -311.1067810058594, - "logps/rejected": -868.57275390625, - "loss": 0.1601, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.0446078777313232, - "rewards/margins": 6.013201713562012, - "rewards/rejected": -8.057809829711914, + "logits/chosen": -0.6622243523597717, + "logits/rejected": -0.07754392921924591, + "logps/chosen": -343.5663757324219, + "logps/rejected": -914.6266479492188, + "loss": 0.1702, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.369204044342041, + "rewards/margins": 6.149145603179932, + "rewards/rejected": -8.518350601196289, "step": 560 }, { "epoch": 0.6, - "grad_norm": 37.62845344818987, + "grad_norm": 42.50944771180096, "learning_rate": 2.094922764865619e-07, - "logits/chosen": -0.7505870461463928, - "logits/rejected": -0.11658618599176407, - "logps/chosen": -386.23590087890625, - "logps/rejected": -1095.0679931640625, - "loss": 0.1543, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.496262311935425, - "rewards/margins": 7.5716872215271, - "rewards/rejected": -10.067950248718262, + "logits/chosen": -0.8091244697570801, + "logits/rejected": -0.2209562510251999, + "logps/chosen": -386.20660400390625, + "logps/rejected": -1064.3209228515625, + "loss": 0.1556, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.495969533920288, + "rewards/margins": 7.264508247375488, + "rewards/rejected": -9.760478973388672, "step": 570 }, { "epoch": 0.61, - "grad_norm": 26.026316709252203, + "grad_norm": 24.503945707028254, "learning_rate": 2.0049902774588797e-07, - "logits/chosen": -0.7590332627296448, - "logits/rejected": -0.031172871589660645, - "logps/chosen": -312.64642333984375, - "logps/rejected": -921.24560546875, - "loss": 0.1337, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.7880489826202393, - "rewards/margins": 6.684856414794922, - "rewards/rejected": -8.472906112670898, + "logits/chosen": -0.6946332454681396, + "logits/rejected": -0.022548992186784744, + "logps/chosen": -333.82281494140625, + "logps/rejected": -950.623046875, + "loss": 0.1383, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.9998124837875366, + "rewards/margins": 6.766866207122803, + "rewards/rejected": -8.766678810119629, "step": 580 }, { "epoch": 0.62, - "grad_norm": 25.651974922486566, + "grad_norm": 36.833597785790786, "learning_rate": 1.9157198216806238e-07, - "logits/chosen": -0.49569645524024963, - "logits/rejected": 0.2907310724258423, - "logps/chosen": -359.0349426269531, - "logps/rejected": -1026.7982177734375, - "loss": 0.1546, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.2977614402770996, - "rewards/margins": 7.176451206207275, - "rewards/rejected": -9.474211692810059, + "logits/chosen": -0.5168381333351135, + "logits/rejected": 0.11856722831726074, + "logps/chosen": -379.43206787109375, + "logps/rejected": -1062.462646484375, + "loss": 0.1541, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.5017330646514893, + "rewards/margins": 7.32912540435791, + "rewards/rejected": -9.83085823059082, "step": 590 }, { "epoch": 0.63, - "grad_norm": 29.13172550353607, + "grad_norm": 33.9778246131948, "learning_rate": 1.8272307888529274e-07, - "logits/chosen": -0.5350311398506165, - "logits/rejected": 0.32648250460624695, - "logps/chosen": -355.0140380859375, - "logps/rejected": -1063.1131591796875, - "loss": 0.1656, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.212515354156494, - "rewards/margins": 7.633421897888184, - "rewards/rejected": -9.845937728881836, + "logits/chosen": -0.5734451413154602, + "logits/rejected": 0.1789693534374237, + "logps/chosen": -380.5990295410156, + "logps/rejected": -1105.986572265625, + "loss": 0.1644, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.46836519241333, + "rewards/margins": 7.8063063621521, + "rewards/rejected": -10.27467155456543, "step": 600 }, { "epoch": 0.63, - "eval_logits/chosen": -0.5360411405563354, - "eval_logits/rejected": -0.37004101276397705, - "eval_logps/chosen": -934.728515625, - "eval_logps/rejected": -1083.6920166015625, - "eval_loss": 1.8090689182281494, - "eval_rewards/accuracies": 0.625, - "eval_rewards/chosen": -6.530948162078857, - "eval_rewards/margins": 1.4283562898635864, - "eval_rewards/rejected": -7.9593048095703125, - "eval_runtime": 97.4659, - "eval_samples_per_second": 20.52, - "eval_steps_per_second": 0.328, + "eval_logits/chosen": -0.6600009799003601, + "eval_logits/rejected": -0.48613861203193665, + "eval_logps/chosen": -931.4107055664062, + "eval_logps/rejected": -1085.6712646484375, + "eval_loss": 1.773822546005249, + "eval_rewards/accuracies": 0.6328125, + "eval_rewards/chosen": -6.497769832611084, + "eval_rewards/margins": 1.4813272953033447, + "eval_rewards/rejected": -7.979097843170166, + "eval_runtime": 98.1346, + "eval_samples_per_second": 20.38, + "eval_steps_per_second": 0.326, "step": 600 }, { "epoch": 0.64, - "grad_norm": 38.21490518689014, + "grad_norm": 23.195547630615426, "learning_rate": 1.7396415252139288e-07, - "logits/chosen": -0.7486557960510254, - "logits/rejected": -0.10511207580566406, - "logps/chosen": -321.5223693847656, - "logps/rejected": -989.8853759765625, - "loss": 0.1515, + "logits/chosen": -0.8410640954971313, + "logits/rejected": -0.25543102622032166, + "logps/chosen": -344.79241943359375, + "logps/rejected": -1009.0732421875, + "loss": 0.1522, "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -1.8860008716583252, - "rewards/margins": 7.177679538726807, - "rewards/rejected": -9.063680648803711, + "rewards/chosen": -2.118701457977295, + "rewards/margins": 7.136859893798828, + "rewards/rejected": -9.255559921264648, "step": 610 }, { "epoch": 0.65, - "grad_norm": 37.606116637861646, + "grad_norm": 33.06598787788393, "learning_rate": 1.6530691736402316e-07, - "logits/chosen": -0.5896056294441223, - "logits/rejected": 0.24608202278614044, - "logps/chosen": -332.99127197265625, - "logps/rejected": -969.2781982421875, - "loss": 0.1476, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.0645222663879395, - "rewards/margins": 6.90951681137085, - "rewards/rejected": -8.974038124084473, + "logits/chosen": -0.6116012334823608, + "logits/rejected": 0.1189853698015213, + "logps/chosen": -357.3853454589844, + "logps/rejected": -984.2900390625, + "loss": 0.1538, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.3084633350372314, + "rewards/margins": 6.815694332122803, + "rewards/rejected": -9.124157905578613, "step": 620 }, { "epoch": 0.66, - "grad_norm": 29.30177416360563, + "grad_norm": 28.610200901705788, "learning_rate": 1.5676295169786864e-07, - "logits/chosen": -0.5280860662460327, - "logits/rejected": 0.11714713275432587, - "logps/chosen": -344.58740234375, - "logps/rejected": -951.2314453125, - "loss": 0.1637, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.1917474269866943, - "rewards/margins": 6.532890319824219, - "rewards/rejected": -8.724637985229492, + "logits/chosen": -0.6708418130874634, + "logits/rejected": -0.08200804889202118, + "logps/chosen": -341.0719909667969, + "logps/rejected": -920.1785278320312, + "loss": 0.1649, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1565937995910645, + "rewards/margins": 6.257513999938965, + "rewards/rejected": -8.414108276367188, "step": 630 }, { "epoch": 0.67, - "grad_norm": 20.00745545125351, + "grad_norm": 21.997087740272892, "learning_rate": 1.483436823197092e-07, - "logits/chosen": -0.7493909597396851, - "logits/rejected": 0.14372903108596802, - "logps/chosen": -323.84112548828125, - "logps/rejected": -1045.17333984375, - "loss": 0.1427, + "logits/chosen": -0.9185328483581543, + "logits/rejected": -0.1707996129989624, + "logps/chosen": -317.12139892578125, + "logps/rejected": -994.1339721679688, + "loss": 0.1445, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.9064871072769165, - "rewards/margins": 7.77541446685791, - "rewards/rejected": -9.681901931762695, + "rewards/chosen": -1.8392893075942993, + "rewards/margins": 7.332219123840332, + "rewards/rejected": -9.1715087890625, "step": 640 }, { "epoch": 0.68, - "grad_norm": 28.980496182255987, + "grad_norm": 29.646982459242487, "learning_rate": 1.4006036925609243e-07, - "logits/chosen": -0.6482175588607788, - "logits/rejected": 0.0625019520521164, - "logps/chosen": -341.03228759765625, - "logps/rejected": -1090.4324951171875, - "loss": 0.1431, + "logits/chosen": -0.7660094499588013, + "logits/rejected": -0.13168807327747345, + "logps/chosen": -343.16827392578125, + "logps/rejected": -1070.11083984375, + "loss": 0.143, "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.016282320022583, - "rewards/margins": 8.004495620727539, - "rewards/rejected": -10.020777702331543, + "rewards/chosen": -2.037642240524292, + "rewards/margins": 7.779918670654297, + "rewards/rejected": -9.817561149597168, "step": 650 }, { "epoch": 0.69, - "grad_norm": 28.837458672914796, + "grad_norm": 50.15165321129396, "learning_rate": 1.319240907040458e-07, - "logits/chosen": -0.7208763360977173, - "logits/rejected": 0.032781489193439484, - "logps/chosen": -354.48773193359375, - "logps/rejected": -1058.132080078125, - "loss": 0.135, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.1903700828552246, - "rewards/margins": 7.6033148765563965, - "rewards/rejected": -9.793684005737305, + "logits/chosen": -0.6865934133529663, + "logits/rejected": -0.06396986544132233, + "logps/chosen": -390.01690673828125, + "logps/rejected": -1111.46240234375, + "loss": 0.1377, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.5456621646881104, + "rewards/margins": 7.7813262939453125, + "rewards/rejected": -10.326987266540527, "step": 660 }, { "epoch": 0.7, - "grad_norm": 32.56630053751295, + "grad_norm": 35.59674126711792, "learning_rate": 1.239457282149695e-07, - "logits/chosen": -0.6535686254501343, - "logits/rejected": 0.021148119121789932, - "logps/chosen": -366.6623229980469, - "logps/rejected": -1171.2142333984375, - "loss": 0.1405, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.4802515506744385, - "rewards/margins": 8.500134468078613, - "rewards/rejected": -10.980386734008789, + "logits/chosen": -0.5814694166183472, + "logits/rejected": 0.02095670998096466, + "logps/chosen": -407.71063232421875, + "logps/rejected": -1218.3812255859375, + "loss": 0.1442, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.890734910964966, + "rewards/margins": 8.561320304870605, + "rewards/rejected": -11.452054977416992, "step": 670 }, { "epoch": 0.71, - "grad_norm": 36.66974740741856, + "grad_norm": 33.97518986004043, "learning_rate": 1.1613595214152711e-07, - "logits/chosen": -0.7827145457267761, - "logits/rejected": 0.08258621394634247, - "logps/chosen": -360.87493896484375, - "logps/rejected": -1151.84521484375, - "loss": 0.153, + "logits/chosen": -0.7169137001037598, + "logits/rejected": 0.04466833546757698, + "logps/chosen": -364.97601318359375, + "logps/rejected": -1123.3226318359375, + "loss": 0.1528, "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.3406710624694824, - "rewards/margins": 8.442497253417969, - "rewards/rejected": -10.783166885375977, + "rewards/chosen": -2.3816819190979004, + "rewards/margins": 8.11626148223877, + "rewards/rejected": -10.497942924499512, "step": 680 }, { "epoch": 0.72, - "grad_norm": 32.97786121900684, + "grad_norm": 30.078102150025952, "learning_rate": 1.0850520736699362e-07, - "logits/chosen": -0.7693961262702942, - "logits/rejected": 0.005597646348178387, - "logps/chosen": -345.9762268066406, - "logps/rejected": -1089.049072265625, - "loss": 0.1393, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.2439839839935303, - "rewards/margins": 7.891903877258301, - "rewards/rejected": -10.135889053344727, + "logits/chosen": -0.6698465347290039, + "logits/rejected": -0.047953587025403976, + "logps/chosen": -349.53106689453125, + "logps/rejected": -1050.202392578125, + "loss": 0.1418, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.2795326709747314, + "rewards/margins": 7.467889308929443, + "rewards/rejected": -9.747421264648438, "step": 690 }, { "epoch": 0.73, - "grad_norm": 45.53691199690897, + "grad_norm": 50.93260348353266, "learning_rate": 1.0106369933615042e-07, - "logits/chosen": -0.47497862577438354, - "logits/rejected": 0.2753020226955414, - "logps/chosen": -350.84698486328125, - "logps/rejected": -1033.717041015625, - "loss": 0.1552, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.2757205963134766, - "rewards/margins": 7.295037269592285, - "rewards/rejected": -9.570757865905762, + "logits/chosen": -0.38736292719841003, + "logits/rejected": 0.30570071935653687, + "logps/chosen": -358.25897216796875, + "logps/rejected": -1047.8114013671875, + "loss": 0.1568, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.349841356277466, + "rewards/margins": 7.361859321594238, + "rewards/rejected": -9.711699485778809, "step": 700 }, { "epoch": 0.73, - "eval_logits/chosen": -0.5351493954658508, - "eval_logits/rejected": -0.358839213848114, - "eval_logps/chosen": -1044.81787109375, - "eval_logps/rejected": -1206.419677734375, - "eval_loss": 2.0767157077789307, - "eval_rewards/accuracies": 0.59765625, - "eval_rewards/chosen": -7.631843090057373, - "eval_rewards/margins": 1.5547385215759277, - "eval_rewards/rejected": -9.186580657958984, - "eval_runtime": 97.3655, - "eval_samples_per_second": 20.541, - "eval_steps_per_second": 0.329, + "eval_logits/chosen": -0.5415219068527222, + "eval_logits/rejected": -0.35287633538246155, + "eval_logps/chosen": -1035.0318603515625, + "eval_logps/rejected": -1212.30859375, + "eval_loss": 2.0363433361053467, + "eval_rewards/accuracies": 0.63671875, + "eval_rewards/chosen": -7.53398323059082, + "eval_rewards/margins": 1.71148681640625, + "eval_rewards/rejected": -9.24547004699707, + "eval_runtime": 98.1372, + "eval_samples_per_second": 20.38, + "eval_steps_per_second": 0.326, "step": 700 }, { "epoch": 0.74, - "grad_norm": 20.912820446414077, + "grad_norm": 29.555000338233505, "learning_rate": 9.382138040640714e-08, - "logits/chosen": -0.7432624101638794, - "logits/rejected": 0.19342878460884094, - "logps/chosen": -351.33599853515625, - "logps/rejected": -1151.90625, - "loss": 0.134, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.2001760005950928, - "rewards/margins": 8.568601608276367, - "rewards/rejected": -10.768776893615723, + "logits/chosen": -0.7881192564964294, + "logits/rejected": 0.08800046145915985, + "logps/chosen": -361.5810852050781, + "logps/rejected": -1154.0849609375, + "loss": 0.1333, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.302626371383667, + "rewards/margins": 8.48793888092041, + "rewards/rejected": -10.79056453704834, "step": 710 }, { "epoch": 0.75, - "grad_norm": 43.893306649250164, + "grad_norm": 38.4818506115184, "learning_rate": 8.678793653740632e-08, - "logits/chosen": -0.6166023015975952, - "logits/rejected": 0.08570433408021927, - "logps/chosen": -365.81402587890625, - "logps/rejected": -1052.766357421875, - "loss": 0.1483, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.3786137104034424, - "rewards/margins": 7.3965888023376465, - "rewards/rejected": -9.775201797485352, + "logits/chosen": -0.6763113737106323, + "logits/rejected": -0.06647912412881851, + "logps/chosen": -368.46966552734375, + "logps/rejected": -1021.611328125, + "loss": 0.1492, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.4051706790924072, + "rewards/margins": 7.058481693267822, + "rewards/rejected": -9.463651657104492, "step": 720 }, { "epoch": 0.76, - "grad_norm": 27.044416498643887, + "grad_norm": 36.63051407727175, "learning_rate": 7.997277433690983e-08, - "logits/chosen": -0.8449182510375977, - "logits/rejected": -0.04736803472042084, - "logps/chosen": -356.64935302734375, - "logps/rejected": -1065.8873291015625, - "loss": 0.1552, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.2181100845336914, - "rewards/margins": 7.574668884277344, - "rewards/rejected": -9.792778968811035, + "logits/chosen": -0.9104700088500977, + "logits/rejected": -0.22938068211078644, + "logps/chosen": -364.0018615722656, + "logps/rejected": -1037.9288330078125, + "loss": 0.1596, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.291635036468506, + "rewards/margins": 7.221558570861816, + "rewards/rejected": -9.513193130493164, "step": 730 }, { "epoch": 0.77, - "grad_norm": 44.06234339270125, + "grad_norm": 43.735869951508946, "learning_rate": 7.338500848029602e-08, - "logits/chosen": -0.7003141641616821, - "logits/rejected": 0.08363965153694153, - "logps/chosen": -341.93157958984375, - "logps/rejected": -1166.3671875, - "loss": 0.1518, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.0371365547180176, - "rewards/margins": 8.86846923828125, - "rewards/rejected": -10.905606269836426, + "logits/chosen": -0.7912919521331787, + "logits/rejected": -0.14580750465393066, + "logps/chosen": -346.33514404296875, + "logps/rejected": -1128.053466796875, + "loss": 0.1546, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.081172466278076, + "rewards/margins": 8.441293716430664, + "rewards/rejected": -10.522466659545898, "step": 740 }, { "epoch": 0.78, - "grad_norm": 43.35684912820975, + "grad_norm": 41.046590518066616, "learning_rate": 6.70334495204884e-08, - "logits/chosen": -0.568864643573761, - "logits/rejected": 0.3128158748149872, - "logps/chosen": -353.63616943359375, - "logps/rejected": -1124.0401611328125, - "loss": 0.1381, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.2875678539276123, - "rewards/margins": 8.232528686523438, - "rewards/rejected": -10.520097732543945, + "logits/chosen": -0.6530207395553589, + "logits/rejected": 0.07747974246740341, + "logps/chosen": -362.73638916015625, + "logps/rejected": -1119.141357421875, + "loss": 0.1367, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.378570079803467, + "rewards/margins": 8.092538833618164, + "rewards/rejected": -10.471108436584473, "step": 750 }, { "epoch": 0.8, - "grad_norm": 26.263047158523968, + "grad_norm": 32.343949612453194, "learning_rate": 6.092659210462231e-08, - "logits/chosen": -0.7811049222946167, - "logits/rejected": 0.1209399476647377, - "logps/chosen": -373.8915710449219, - "logps/rejected": -1145.06396484375, - "loss": 0.1361, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.237269878387451, - "rewards/margins": 8.342939376831055, - "rewards/rejected": -10.580209732055664, + "logits/chosen": -0.8776817321777344, + "logits/rejected": -0.11572667211294174, + "logps/chosen": -385.73626708984375, + "logps/rejected": -1134.219970703125, + "loss": 0.1375, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3557167053222656, + "rewards/margins": 8.116053581237793, + "rewards/rejected": -10.471770286560059, "step": 760 }, { "epoch": 0.81, - "grad_norm": 33.38453699325153, + "grad_norm": 34.162011887537034, "learning_rate": 5.507260361320737e-08, - "logits/chosen": -0.6777874231338501, - "logits/rejected": 0.20796041190624237, - "logps/chosen": -355.58575439453125, - "logps/rejected": -1019.7391357421875, - "loss": 0.1459, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.242889642715454, - "rewards/margins": 7.201622009277344, - "rewards/rejected": -9.444511413574219, + "logits/chosen": -0.8442201614379883, + "logits/rejected": -0.04310985282063484, + "logps/chosen": -379.82598876953125, + "logps/rejected": -1050.9014892578125, + "loss": 0.1475, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.4852917194366455, + "rewards/margins": 7.2708420753479, + "rewards/rejected": -9.756132125854492, "step": 770 }, { "epoch": 0.82, - "grad_norm": 27.52737122861175, + "grad_norm": 25.498709571788076, "learning_rate": 4.947931323697982e-08, - "logits/chosen": -0.7127518653869629, - "logits/rejected": 0.13608665764331818, - "logps/chosen": -345.1900939941406, - "logps/rejected": -1019.2976684570312, - "loss": 0.1525, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.1412816047668457, - "rewards/margins": 7.251255989074707, - "rewards/rejected": -9.392538070678711, + "logits/chosen": -0.7225035429000854, + "logits/rejected": -0.010953882709145546, + "logps/chosen": -381.2359313964844, + "logps/rejected": -1071.4010009765625, + "loss": 0.1532, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.501739978790283, + "rewards/margins": 7.411829471588135, + "rewards/rejected": -9.913569450378418, "step": 780 }, { "epoch": 0.83, - "grad_norm": 28.47492588939245, + "grad_norm": 25.875216505837162, "learning_rate": 4.415420150605398e-08, - "logits/chosen": -0.7804344892501831, - "logits/rejected": -0.030652623623609543, - "logps/chosen": -339.2191162109375, - "logps/rejected": -1011.9390869140625, - "loss": 0.1619, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.1389918327331543, - "rewards/margins": 7.208333492279053, - "rewards/rejected": -9.347326278686523, + "logits/chosen": -0.785541832447052, + "logits/rejected": -0.13479521870613098, + "logps/chosen": -381.22271728515625, + "logps/rejected": -1061.69482421875, + "loss": 0.1581, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.559027910232544, + "rewards/margins": 7.285855293273926, + "rewards/rejected": -9.844882011413574, "step": 790 }, { "epoch": 0.84, - "grad_norm": 21.149940049259552, + "grad_norm": 19.242215161672252, "learning_rate": 3.9104390285376374e-08, - "logits/chosen": -0.6159350872039795, - "logits/rejected": 0.16070613265037537, - "logps/chosen": -341.08917236328125, - "logps/rejected": -1105.7109375, - "loss": 0.1377, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.1167824268341064, - "rewards/margins": 8.197529792785645, - "rewards/rejected": -10.314311027526855, + "logits/chosen": -0.6329415440559387, + "logits/rejected": 0.04099388048052788, + "logps/chosen": -369.5647888183594, + "logps/rejected": -1150.815185546875, + "loss": 0.1394, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.401538610458374, + "rewards/margins": 8.36381721496582, + "rewards/rejected": -10.765356063842773, "step": 800 }, { "epoch": 0.84, - "eval_logits/chosen": -0.588712751865387, - "eval_logits/rejected": -0.41556617617607117, - "eval_logps/chosen": -1010.3356323242188, - "eval_logps/rejected": -1168.1900634765625, - "eval_loss": 2.030700206756592, - "eval_rewards/accuracies": 0.60546875, - "eval_rewards/chosen": -7.287020683288574, - "eval_rewards/margins": 1.5172640085220337, - "eval_rewards/rejected": -8.804285049438477, - "eval_runtime": 97.2124, - "eval_samples_per_second": 20.573, - "eval_steps_per_second": 0.329, + "eval_logits/chosen": -0.670225203037262, + "eval_logits/rejected": -0.48720771074295044, + "eval_logps/chosen": -1030.3460693359375, + "eval_logps/rejected": -1205.7071533203125, + "eval_loss": 2.0375936031341553, + "eval_rewards/accuracies": 0.62890625, + "eval_rewards/chosen": -7.487125873565674, + "eval_rewards/margins": 1.6923303604125977, + "eval_rewards/rejected": -9.179455757141113, + "eval_runtime": 98.1828, + "eval_samples_per_second": 20.37, + "eval_steps_per_second": 0.326, "step": 800 }, { "epoch": 0.85, - "grad_norm": 24.57939572568816, + "grad_norm": 27.860467320433408, "learning_rate": 3.433663324986208e-08, - "logits/chosen": -0.662031352519989, - "logits/rejected": 0.13034440577030182, - "logps/chosen": -334.8114318847656, - "logps/rejected": -1060.559814453125, - "loss": 0.158, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.030988931655884, - "rewards/margins": 7.7772111892700195, - "rewards/rejected": -9.808199882507324, + "logits/chosen": -0.7032958269119263, + "logits/rejected": -0.039611607789993286, + "logps/chosen": -372.03179931640625, + "logps/rejected": -1109.5592041015625, + "loss": 0.1578, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.4031922817230225, + "rewards/margins": 7.895002841949463, + "rewards/rejected": -10.298196792602539, "step": 810 }, { "epoch": 0.86, - "grad_norm": 44.23966241882351, + "grad_norm": 25.24048108497769, "learning_rate": 2.9857306851953897e-08, - "logits/chosen": -0.6313251256942749, - "logits/rejected": 0.16264604032039642, - "logps/chosen": -337.66143798828125, - "logps/rejected": -1053.348388671875, - "loss": 0.1489, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.0846145153045654, - "rewards/margins": 7.6865739822387695, - "rewards/rejected": -9.771188735961914, + "logits/chosen": -0.7006738185882568, + "logits/rejected": -0.0007687866454944015, + "logps/chosen": -365.3314514160156, + "logps/rejected": -1100.225830078125, + "loss": 0.1513, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.361314535140991, + "rewards/margins": 7.878649711608887, + "rewards/rejected": -10.23996353149414, "step": 820 }, { "epoch": 0.87, - "grad_norm": 45.06145430483538, + "grad_norm": 44.97714986455337, "learning_rate": 2.567240179368185e-08, - "logits/chosen": -0.6046188473701477, - "logits/rejected": 0.2691899836063385, - "logps/chosen": -344.50836181640625, - "logps/rejected": -1100.190185546875, - "loss": 0.1406, + "logits/chosen": -0.720068097114563, + "logits/rejected": 0.043493710458278656, + "logps/chosen": -367.99725341796875, + "logps/rejected": -1123.581787109375, + "loss": 0.1378, "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.13590669631958, - "rewards/margins": 8.086740493774414, - "rewards/rejected": -10.222646713256836, + "rewards/chosen": -2.370795488357544, + "rewards/margins": 8.085766792297363, + "rewards/rejected": -10.456562995910645, "step": 830 }, { "epoch": 0.88, - "grad_norm": 44.72639985506377, + "grad_norm": 44.41948671122666, "learning_rate": 2.1787515014630357e-08, - "logits/chosen": -0.9460281133651733, - "logits/rejected": 0.00803391169756651, - "logps/chosen": -338.85980224609375, - "logps/rejected": -1107.776611328125, - "loss": 0.139, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -1.9058221578598022, - "rewards/margins": 8.424005508422852, - "rewards/rejected": -10.329828262329102, + "logits/chosen": -1.0506001710891724, + "logits/rejected": -0.2063721865415573, + "logps/chosen": -365.94390869140625, + "logps/rejected": -1148.2757568359375, + "loss": 0.1357, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1766631603240967, + "rewards/margins": 8.558156967163086, + "rewards/rejected": -10.734820365905762, "step": 840 }, { "epoch": 0.89, - "grad_norm": 33.16503245142525, + "grad_norm": 42.10277256318548, "learning_rate": 1.820784220652766e-08, - "logits/chosen": -0.46998101472854614, - "logits/rejected": 0.2729955017566681, - "logps/chosen": -349.9148254394531, - "logps/rejected": -1000.7681884765625, - "loss": 0.1575, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.2772185802459717, - "rewards/margins": 6.997946262359619, - "rewards/rejected": -9.275164604187012, + "logits/chosen": -0.6166629195213318, + "logits/rejected": 0.04001082107424736, + "logps/chosen": -383.52716064453125, + "logps/rejected": -1040.70849609375, + "loss": 0.16, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.613341808319092, + "rewards/margins": 7.061226844787598, + "rewards/rejected": -9.674568176269531, "step": 850 }, { "epoch": 0.9, - "grad_norm": 34.5662684913937, + "grad_norm": 31.329260800118423, "learning_rate": 1.4938170864468636e-08, - "logits/chosen": -0.7951310276985168, - "logits/rejected": 0.0224321149289608, - "logps/chosen": -330.31744384765625, - "logps/rejected": -1079.454345703125, - "loss": 0.1483, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -2.00297212600708, - "rewards/margins": 8.02238941192627, - "rewards/rejected": -10.025362014770508, + "logits/chosen": -0.9054363369941711, + "logits/rejected": -0.24197959899902344, + "logps/chosen": -361.8520812988281, + "logps/rejected": -1125.2762451171875, + "loss": 0.146, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.3183188438415527, + "rewards/margins": 8.165262222290039, + "rewards/rejected": -10.483580589294434, "step": 860 }, { "epoch": 0.91, - "grad_norm": 39.90580157192396, + "grad_norm": 38.47485051939267, "learning_rate": 1.1982873884064465e-08, - "logits/chosen": -0.7433925867080688, - "logits/rejected": 0.03819055110216141, - "logps/chosen": -343.10076904296875, - "logps/rejected": -1177.3214111328125, - "loss": 0.1454, + "logits/chosen": -0.8719817996025085, + "logits/rejected": -0.21304121613502502, + "logps/chosen": -369.63983154296875, + "logps/rejected": -1203.4268798828125, + "loss": 0.1429, "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.095435619354248, - "rewards/margins": 8.91866683959961, - "rewards/rejected": -11.0141019821167, + "rewards/chosen": -2.360826015472412, + "rewards/margins": 8.914331436157227, + "rewards/rejected": -11.27515697479248, "step": 870 }, { "epoch": 0.92, - "grad_norm": 33.335658583470845, + "grad_norm": 35.666238345982485, "learning_rate": 9.345903713082304e-09, - "logits/chosen": -0.7445138692855835, - "logits/rejected": 0.06519349664449692, - "logps/chosen": -360.95751953125, - "logps/rejected": -1240.344970703125, - "loss": 0.1403, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.1640257835388184, - "rewards/margins": 9.403306007385254, - "rewards/rejected": -11.567331314086914, + "logits/chosen": -0.8036599159240723, + "logits/rejected": -0.10646633803844452, + "logps/chosen": -394.9730224609375, + "logps/rejected": -1276.0770263671875, + "loss": 0.1397, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.504180908203125, + "rewards/margins": 9.420472145080566, + "rewards/rejected": -11.924654006958008, "step": 880 }, { "epoch": 0.93, - "grad_norm": 34.256779785154734, + "grad_norm": 48.95679032310491, "learning_rate": 7.030787065396865e-09, - "logits/chosen": -0.5979295969009399, - "logits/rejected": 0.12500345706939697, - "logps/chosen": -324.71563720703125, - "logps/rejected": -1090.809814453125, - "loss": 0.1529, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -1.9876606464385986, - "rewards/margins": 8.175052642822266, - "rewards/rejected": -10.162714004516602, + "logits/chosen": -0.6822945475578308, + "logits/rejected": -0.068966343998909, + "logps/chosen": -355.93572998046875, + "logps/rejected": -1121.0040283203125, + "loss": 0.1593, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.2998616695404053, + "rewards/margins": 8.164793968200684, + "rewards/rejected": -10.464655876159668, "step": 890 }, { "epoch": 0.94, - "grad_norm": 21.350853836162187, + "grad_norm": 23.544657517060433, "learning_rate": 5.04062020432286e-09, - "logits/chosen": -0.6045975685119629, - "logits/rejected": 0.057004231959581375, - "logps/chosen": -314.7482604980469, - "logps/rejected": -1017.1994018554688, - "loss": 0.1462, + "logits/chosen": -0.7034295797348022, + "logits/rejected": -0.13987545669078827, + "logps/chosen": -338.71307373046875, + "logps/rejected": -1044.3170166015625, + "loss": 0.1421, "rewards/accuracies": 0.90625, - "rewards/chosen": -1.934511423110962, - "rewards/margins": 7.4669060707092285, - "rewards/rejected": -9.401416778564453, + "rewards/chosen": -2.174159526824951, + "rewards/margins": 7.498434543609619, + "rewards/rejected": -9.672593116760254, "step": 900 }, { "epoch": 0.94, - "eval_logits/chosen": -0.5892084240913391, - "eval_logits/rejected": -0.4189561903476715, - "eval_logps/chosen": -1001.0963134765625, - "eval_logps/rejected": -1160.1461181640625, - "eval_loss": 2.023190498352051, - "eval_rewards/accuracies": 0.61328125, - "eval_rewards/chosen": -7.1946282386779785, - "eval_rewards/margins": 1.5292174816131592, - "eval_rewards/rejected": -8.723845481872559, - "eval_runtime": 97.2614, - "eval_samples_per_second": 20.563, - "eval_steps_per_second": 0.329, + "eval_logits/chosen": -0.7534744143486023, + "eval_logits/rejected": -0.5762284994125366, + "eval_logps/chosen": -1012.8751831054688, + "eval_logps/rejected": -1186.8892822265625, + "eval_loss": 2.0172135829925537, + "eval_rewards/accuracies": 0.62109375, + "eval_rewards/chosen": -7.312416076660156, + "eval_rewards/margins": 1.6788610219955444, + "eval_rewards/rejected": -8.991276741027832, + "eval_runtime": 98.135, + "eval_samples_per_second": 20.38, + "eval_steps_per_second": 0.326, "step": 900 }, { "epoch": 0.95, - "grad_norm": 23.95340887933029, + "grad_norm": 893.5723100340648, "learning_rate": 3.3780648016376866e-09, - "logits/chosen": -0.6600544452667236, - "logits/rejected": 0.006202346179634333, - "logps/chosen": -325.9316101074219, - "logps/rejected": -1070.564208984375, - "loss": 0.1294, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.046983480453491, - "rewards/margins": 7.899454593658447, - "rewards/rejected": -9.94643783569336, + "logits/chosen": -0.7067383527755737, + "logits/rejected": -0.755331814289093, + "logps/chosen": -923.3671875, + "logps/rejected": -311.7925109863281, + "loss": 4.6731, + "rewards/accuracies": 0.08749999850988388, + "rewards/chosen": -6.301113128662109, + "rewards/margins": -4.41973352432251, + "rewards/rejected": -1.8813793659210205, "step": 910 }, { "epoch": 0.96, - "grad_norm": 27.425291781296558, + "grad_norm": 1005.3276230330481, "learning_rate": 2.0453443778310766e-09, - "logits/chosen": -0.7382737994194031, - "logits/rejected": -0.02898242510855198, - "logps/chosen": -355.5832824707031, - "logps/rejected": -1111.990234375, - "loss": 0.159, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.140547275543213, - "rewards/margins": 8.157334327697754, - "rewards/rejected": -10.297882080078125, + "logits/chosen": -0.8030230402946472, + "logits/rejected": -0.8817335963249207, + "logps/chosen": -822.5032958984375, + "logps/rejected": -278.01202392578125, + "loss": 3.5691, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -5.397690773010254, + "rewards/margins": -3.8739609718322754, + "rewards/rejected": -1.5237300395965576, "step": 920 }, { "epoch": 0.97, - "grad_norm": 34.03050925733607, + "grad_norm": 817.1440449371347, "learning_rate": 1.0442413283435758e-09, - "logits/chosen": -0.6262997388839722, - "logits/rejected": 0.14048056304454803, - "logps/chosen": -344.0566101074219, - "logps/rejected": -1069.548583984375, - "loss": 0.1613, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.1957008838653564, - "rewards/margins": 7.760766506195068, - "rewards/rejected": -9.956467628479004, + "logits/chosen": -0.8951903581619263, + "logits/rejected": -1.0553044080734253, + "logps/chosen": -670.3468017578125, + "logps/rejected": -246.47073364257812, + "loss": 3.1789, + "rewards/accuracies": 0.20624999701976776, + "rewards/chosen": -3.9740867614746094, + "rewards/margins": -2.8322198390960693, + "rewards/rejected": -1.1418668031692505, "step": 930 }, { "epoch": 0.98, - "grad_norm": 34.773688515138254, + "grad_norm": 796.4972175026098, "learning_rate": 3.760945397705828e-10, - "logits/chosen": -0.7389376163482666, - "logits/rejected": 0.0326564684510231, - "logps/chosen": -334.1388854980469, - "logps/rejected": -1029.4984130859375, - "loss": 0.1497, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.1344165802001953, - "rewards/margins": 7.39968204498291, - "rewards/rejected": -9.534098625183105, + "logits/chosen": -1.0590890645980835, + "logits/rejected": -1.0901000499725342, + "logps/chosen": -723.5357666015625, + "logps/rejected": -248.53427124023438, + "loss": 2.7666, + "rewards/accuracies": 0.11874999850988388, + "rewards/chosen": -3.7788455486297607, + "rewards/margins": -2.6434824466705322, + "rewards/rejected": -1.1353634595870972, "step": 940 }, { "epoch": 0.99, - "grad_norm": 25.746566103316148, + "grad_norm": 958.5472940739504, "learning_rate": 4.17975992204056e-11, - "logits/chosen": -0.5864154696464539, - "logits/rejected": 0.14644786715507507, - "logps/chosen": -315.15032958984375, - "logps/rejected": -1021.39453125, - "loss": 0.1545, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -1.8772588968276978, - "rewards/margins": 7.5670928955078125, - "rewards/rejected": -9.444352149963379, + "logits/chosen": -0.9802714586257935, + "logits/rejected": -1.1157965660095215, + "logps/chosen": -605.3143310546875, + "logps/rejected": -239.96054077148438, + "loss": 2.7492, + "rewards/accuracies": 0.10625000298023224, + "rewards/chosen": -3.1763741970062256, + "rewards/margins": -2.1106882095336914, + "rewards/rejected": -1.0656859874725342, "step": 950 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, - "train_loss": 0.194384015168195, - "train_runtime": 15417.2553, - "train_samples_per_second": 7.931, - "train_steps_per_second": 0.062 + "train_loss": 0.1926751920690087, + "train_runtime": 814.3751, + "train_samples_per_second": 150.137, + "train_steps_per_second": 1.173 } ], "logging_steps": 10,