diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6240 @@ +{ + "best_metric": 0.38143062591552734, + "best_model_checkpoint": "./models/checkpoint-405", + "epoch": 1.8, + "eval_steps": 45, + "global_step": 405, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0044444444444444444, + "grad_norm": 3.2011494636535645, + "learning_rate": 2.173913043478261e-06, + "logits/chosen": 1.6946959495544434, + "logits/rejected": 1.7046217918395996, + "logps/chosen": -123.91139221191406, + "logps/rejected": -152.06222534179688, + "loss": 0.5724, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14388123154640198, + "rewards/margins": 0.2681159973144531, + "rewards/rejected": -0.12423478066921234, + "step": 1 + }, + { + "epoch": 0.008888888888888889, + "grad_norm": 2.7101495265960693, + "learning_rate": 4.347826086956522e-06, + "logits/chosen": 2.161226749420166, + "logits/rejected": 2.1654703617095947, + "logps/chosen": -257.9621276855469, + "logps/rejected": -336.0558776855469, + "loss": 0.6413, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.051631927490234375, + "rewards/margins": 0.10870284587144852, + "rewards/rejected": -0.05707092583179474, + "step": 2 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 2.1376404762268066, + "learning_rate": 6.521739130434783e-06, + "logits/chosen": 1.9573543071746826, + "logits/rejected": 1.8775691986083984, + "logps/chosen": -253.7610626220703, + "logps/rejected": -210.71412658691406, + "loss": 0.527, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1437244415283203, + "rewards/margins": 0.37279435992240906, + "rewards/rejected": -0.22906990349292755, + "step": 3 + }, + { + "epoch": 0.017777777777777778, + "grad_norm": 2.6771838665008545, + "learning_rate": 8.695652173913044e-06, + "logits/chosen": 2.2943520545959473, + "logits/rejected": 2.242229461669922, + "logps/chosen": -384.8254089355469, + "logps/rejected": -270.86602783203125, + "loss": 0.5007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3624267578125, + "rewards/margins": 0.435385137796402, + "rewards/rejected": -0.07295837253332138, + "step": 4 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 3.228928565979004, + "learning_rate": 1.0869565217391305e-05, + "logits/chosen": 2.2134103775024414, + "logits/rejected": 2.145387887954712, + "logps/chosen": -316.5057373046875, + "logps/rejected": -360.7799377441406, + "loss": 0.6288, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0028488151729106903, + "rewards/margins": 0.1330413818359375, + "rewards/rejected": -0.1301925629377365, + "step": 5 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 3.082205057144165, + "learning_rate": 1.3043478260869566e-05, + "logits/chosen": 2.1260976791381836, + "logits/rejected": 2.1222031116485596, + "logps/chosen": -358.46337890625, + "logps/rejected": -424.693359375, + "loss": 0.5927, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11278533935546875, + "rewards/margins": 0.21288147568702698, + "rewards/rejected": -0.10009613633155823, + "step": 6 + }, + { + "epoch": 0.03111111111111111, + "grad_norm": 3.3314132690429688, + "learning_rate": 1.5217391304347828e-05, + "logits/chosen": 2.2984189987182617, + "logits/rejected": 2.247058391571045, + "logps/chosen": -534.6464233398438, + "logps/rejected": -502.7433776855469, + "loss": 0.5904, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02135010063648224, + "rewards/margins": 0.21751099824905396, + "rewards/rejected": -0.19616088271141052, + "step": 7 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 3.316230058670044, + "learning_rate": 1.739130434782609e-05, + "logits/chosen": 2.123837947845459, + "logits/rejected": 2.181354284286499, + "logps/chosen": -245.4013671875, + "logps/rejected": -403.6361083984375, + "loss": 0.6805, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11298942565917969, + "rewards/margins": 0.027771372348070145, + "rewards/rejected": 0.08521804958581924, + "step": 8 + }, + { + "epoch": 0.04, + "grad_norm": 2.0176970958709717, + "learning_rate": 1.956521739130435e-05, + "logits/chosen": 1.7723705768585205, + "logits/rejected": 1.846294641494751, + "logps/chosen": -183.6702423095703, + "logps/rejected": -227.79495239257812, + "loss": 0.7198, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1414192169904709, + "rewards/margins": -0.05180053412914276, + "rewards/rejected": -0.08961868286132812, + "step": 9 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 2.7427902221679688, + "learning_rate": 2.173913043478261e-05, + "logits/chosen": 2.143216848373413, + "logits/rejected": 2.135941982269287, + "logps/chosen": -362.2386169433594, + "logps/rejected": -326.8141174316406, + "loss": 0.6138, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03693237528204918, + "rewards/margins": 0.16980285942554474, + "rewards/rejected": -0.13287048041820526, + "step": 10 + }, + { + "epoch": 0.04888888888888889, + "grad_norm": 2.896284580230713, + "learning_rate": 2.391304347826087e-05, + "logits/chosen": 2.1498055458068848, + "logits/rejected": 2.200744152069092, + "logps/chosen": -248.22348022460938, + "logps/rejected": -351.1915283203125, + "loss": 0.6633, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.033481597900390625, + "rewards/margins": 0.09698867797851562, + "rewards/rejected": -0.13047027587890625, + "step": 11 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 3.3868470191955566, + "learning_rate": 2.608695652173913e-05, + "logits/chosen": 2.2540273666381836, + "logits/rejected": 2.0217299461364746, + "logps/chosen": -290.06707763671875, + "logps/rejected": -291.1870422363281, + "loss": 0.6204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.057250212877988815, + "rewards/margins": 0.15169525146484375, + "rewards/rejected": -0.20894546806812286, + "step": 12 + }, + { + "epoch": 0.057777777777777775, + "grad_norm": 2.541471004486084, + "learning_rate": 2.826086956521739e-05, + "logits/chosen": 2.0605201721191406, + "logits/rejected": 1.9781224727630615, + "logps/chosen": -280.5556335449219, + "logps/rejected": -206.835693359375, + "loss": 0.6807, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02671203762292862, + "rewards/margins": 0.0282897986471653, + "rewards/rejected": -0.001577761024236679, + "step": 13 + }, + { + "epoch": 0.06222222222222222, + "grad_norm": 3.5865259170532227, + "learning_rate": 3.0434782608695656e-05, + "logits/chosen": 2.4662587642669678, + "logits/rejected": 2.4787802696228027, + "logps/chosen": -318.6181945800781, + "logps/rejected": -371.00048828125, + "loss": 0.7615, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.21838226914405823, + "rewards/margins": -0.1308029294013977, + "rewards/rejected": -0.08757934719324112, + "step": 14 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.8042571544647217, + "learning_rate": 3.260869565217392e-05, + "logits/chosen": 1.8834528923034668, + "logits/rejected": 1.8412845134735107, + "logps/chosen": -160.182861328125, + "logps/rejected": -134.62167358398438, + "loss": 0.6405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.048540499061346054, + "rewards/margins": 0.10904045403003693, + "rewards/rejected": -0.15758095681667328, + "step": 15 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 3.6051087379455566, + "learning_rate": 3.478260869565218e-05, + "logits/chosen": 2.4191336631774902, + "logits/rejected": 2.4424967765808105, + "logps/chosen": -330.76373291015625, + "logps/rejected": -360.3594970703125, + "loss": 0.7277, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3086807429790497, + "rewards/margins": -0.06518251448869705, + "rewards/rejected": -0.24349823594093323, + "step": 16 + }, + { + "epoch": 0.07555555555555556, + "grad_norm": 2.667231321334839, + "learning_rate": 3.695652173913043e-05, + "logits/chosen": 2.087791919708252, + "logits/rejected": 2.067237615585327, + "logps/chosen": -260.45025634765625, + "logps/rejected": -310.5743713378906, + "loss": 0.6497, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17707443237304688, + "rewards/margins": 0.09263762831687927, + "rewards/rejected": -0.26971206068992615, + "step": 17 + }, + { + "epoch": 0.08, + "grad_norm": 2.471524477005005, + "learning_rate": 3.91304347826087e-05, + "logits/chosen": 1.9968055486679077, + "logits/rejected": 1.9818394184112549, + "logps/chosen": -148.7676239013672, + "logps/rejected": -160.41592407226562, + "loss": 0.6841, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12331848591566086, + "rewards/margins": 0.021195977926254272, + "rewards/rejected": -0.14451447129249573, + "step": 18 + }, + { + "epoch": 0.08444444444444445, + "grad_norm": 3.896228790283203, + "learning_rate": 4.130434782608696e-05, + "logits/chosen": 2.1896002292633057, + "logits/rejected": 2.2027523517608643, + "logps/chosen": -280.221923828125, + "logps/rejected": -345.39849853515625, + "loss": 0.6961, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23584365844726562, + "rewards/margins": -0.005173489451408386, + "rewards/rejected": -0.23067016899585724, + "step": 19 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.9211912155151367, + "learning_rate": 4.347826086956522e-05, + "logits/chosen": 1.9726223945617676, + "logits/rejected": 1.9995529651641846, + "logps/chosen": -223.56761169433594, + "logps/rejected": -288.2007141113281, + "loss": 0.6075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14491653442382812, + "rewards/margins": 0.18477173149585724, + "rewards/rejected": -0.32968828082084656, + "step": 20 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 3.5073137283325195, + "learning_rate": 4.565217391304348e-05, + "logits/chosen": 2.084686517715454, + "logits/rejected": 2.1801323890686035, + "logps/chosen": -302.4353332519531, + "logps/rejected": -421.34222412109375, + "loss": 0.6006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2602279782295227, + "rewards/margins": 0.19529570639133453, + "rewards/rejected": -0.45552366971969604, + "step": 21 + }, + { + "epoch": 0.09777777777777778, + "grad_norm": 4.747559070587158, + "learning_rate": 4.782608695652174e-05, + "logits/chosen": 2.384913682937622, + "logits/rejected": 2.304309368133545, + "logps/chosen": -486.0771484375, + "logps/rejected": -395.3549499511719, + "loss": 0.7814, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.5777496099472046, + "rewards/margins": -0.1678207516670227, + "rewards/rejected": -0.4099288880825043, + "step": 22 + }, + { + "epoch": 0.10222222222222223, + "grad_norm": 3.5170698165893555, + "learning_rate": 5e-05, + "logits/chosen": 2.422769546508789, + "logits/rejected": 2.377617597579956, + "logps/chosen": -356.8462219238281, + "logps/rejected": -335.63311767578125, + "loss": 0.7371, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5601089596748352, + "rewards/margins": -0.08314056694507599, + "rewards/rejected": -0.4769684076309204, + "step": 23 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 1.9441181421279907, + "learning_rate": 4.999932336875371e-05, + "logits/chosen": 1.8213062286376953, + "logits/rejected": 1.8396917581558228, + "logps/chosen": -134.66082763671875, + "logps/rejected": -149.367431640625, + "loss": 0.65, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14266128838062286, + "rewards/margins": 0.0897216796875, + "rewards/rejected": -0.23238298296928406, + "step": 24 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 3.068272352218628, + "learning_rate": 4.9997293511641216e-05, + "logits/chosen": 2.278895139694214, + "logits/rejected": 2.275456190109253, + "logps/chosen": -333.77490234375, + "logps/rejected": -472.25897216796875, + "loss": 0.4571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23233337700366974, + "rewards/margins": 0.5613830089569092, + "rewards/rejected": -0.7937164306640625, + "step": 25 + }, + { + "epoch": 0.11555555555555555, + "grad_norm": 1.8619799613952637, + "learning_rate": 4.999391053853971e-05, + "logits/chosen": 1.8031303882598877, + "logits/rejected": 1.8489115238189697, + "logps/chosen": -96.40797424316406, + "logps/rejected": -161.97528076171875, + "loss": 0.5789, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05866394191980362, + "rewards/margins": 0.2442375123500824, + "rewards/rejected": -0.3029014468193054, + "step": 26 + }, + { + "epoch": 0.12, + "grad_norm": 5.386238098144531, + "learning_rate": 4.998917463257121e-05, + "logits/chosen": 2.5145506858825684, + "logits/rejected": 2.4547314643859863, + "logps/chosen": -413.3758239746094, + "logps/rejected": -371.0469055175781, + "loss": 0.7603, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5716583728790283, + "rewards/margins": -0.12001956254243851, + "rewards/rejected": -0.4516388177871704, + "step": 27 + }, + { + "epoch": 0.12444444444444444, + "grad_norm": 4.0328145027160645, + "learning_rate": 4.998308605009268e-05, + "logits/chosen": 1.9604880809783936, + "logits/rejected": 1.9736435413360596, + "logps/chosen": -294.1231689453125, + "logps/rejected": -225.6836700439453, + "loss": 0.7918, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.390707403421402, + "rewards/margins": -0.17286226153373718, + "rewards/rejected": -0.217845156788826, + "step": 28 + }, + { + "epoch": 0.1288888888888889, + "grad_norm": 2.2388572692871094, + "learning_rate": 4.997564512068212e-05, + "logits/chosen": 1.7558441162109375, + "logits/rejected": 1.728846549987793, + "logps/chosen": -262.6331787109375, + "logps/rejected": -276.0820617675781, + "loss": 0.5986, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.36693495512008667, + "rewards/margins": 0.3257931172847748, + "rewards/rejected": -0.6927281022071838, + "step": 29 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 2.462900400161743, + "learning_rate": 4.9966852247120764e-05, + "logits/chosen": 2.112412214279175, + "logits/rejected": 2.067960739135742, + "logps/chosen": -276.7638244628906, + "logps/rejected": -449.4617919921875, + "loss": 0.3334, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44448322057724, + "rewards/margins": 0.9420753121376038, + "rewards/rejected": -1.3865585327148438, + "step": 30 + }, + { + "epoch": 0.13777777777777778, + "grad_norm": 2.836688280105591, + "learning_rate": 4.995670790537125e-05, + "logits/chosen": 1.9061617851257324, + "logits/rejected": 1.8595614433288574, + "logps/chosen": -160.72055053710938, + "logps/rejected": -129.44073486328125, + "loss": 0.7916, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3917423486709595, + "rewards/margins": -0.17064018547534943, + "rewards/rejected": -0.22110214829444885, + "step": 31 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 2.559312343597412, + "learning_rate": 4.994521264455187e-05, + "logits/chosen": 2.1637351512908936, + "logits/rejected": 2.2050771713256836, + "logps/chosen": -304.7850341796875, + "logps/rejected": -348.8180847167969, + "loss": 0.4367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3886520266532898, + "rewards/margins": 0.71138995885849, + "rewards/rejected": -1.1000419855117798, + "step": 32 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 4.237148761749268, + "learning_rate": 4.993236708690683e-05, + "logits/chosen": 2.109586238861084, + "logits/rejected": 2.0614817142486572, + "logps/chosen": -375.04638671875, + "logps/rejected": -299.8951110839844, + "loss": 0.7802, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5753310918807983, + "rewards/margins": -0.14784467220306396, + "rewards/rejected": -0.4274864196777344, + "step": 33 + }, + { + "epoch": 0.1511111111111111, + "grad_norm": 2.7111451625823975, + "learning_rate": 4.991817192777259e-05, + "logits/chosen": 2.2655739784240723, + "logits/rejected": 2.2684638500213623, + "logps/chosen": -313.1495056152344, + "logps/rejected": -275.92181396484375, + "loss": 0.5089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40856704115867615, + "rewards/margins": 0.41301044821739197, + "rewards/rejected": -0.8215774893760681, + "step": 34 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 3.8092589378356934, + "learning_rate": 4.9902627935540205e-05, + "logits/chosen": 2.179189682006836, + "logits/rejected": 2.120750904083252, + "logps/chosen": -387.86395263671875, + "logps/rejected": -440.36138916015625, + "loss": 0.799, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1031348705291748, + "rewards/margins": 0.2254989743232727, + "rewards/rejected": -1.3286339044570923, + "step": 35 + }, + { + "epoch": 0.16, + "grad_norm": 3.2711353302001953, + "learning_rate": 4.9885735951613745e-05, + "logits/chosen": 2.114718198776245, + "logits/rejected": 2.0954365730285645, + "logps/chosen": -358.84710693359375, + "logps/rejected": -392.369384765625, + "loss": 0.5344, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9632622003555298, + "rewards/margins": 0.3675689697265625, + "rewards/rejected": -1.3308311700820923, + "step": 36 + }, + { + "epoch": 0.16444444444444445, + "grad_norm": 5.44711446762085, + "learning_rate": 4.9867496890364726e-05, + "logits/chosen": 2.1442081928253174, + "logits/rejected": 2.1060421466827393, + "logps/chosen": -323.090087890625, + "logps/rejected": -316.7653503417969, + "loss": 0.6798, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6822533011436462, + "rewards/margins": 0.13051298260688782, + "rewards/rejected": -0.8127662539482117, + "step": 37 + }, + { + "epoch": 0.1688888888888889, + "grad_norm": 3.415454626083374, + "learning_rate": 4.984791173908267e-05, + "logits/chosen": 2.2119979858398438, + "logits/rejected": 2.155428409576416, + "logps/chosen": -411.5396728515625, + "logps/rejected": -440.5595703125, + "loss": 0.3268, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0235031843185425, + "rewards/margins": 1.1276824474334717, + "rewards/rejected": -2.1511855125427246, + "step": 38 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 2.106764554977417, + "learning_rate": 4.982698155792159e-05, + "logits/chosen": 1.676947832107544, + "logits/rejected": 1.8609204292297363, + "logps/chosen": -207.95248413085938, + "logps/rejected": -255.15560913085938, + "loss": 0.4656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20581628382205963, + "rewards/margins": 0.5979617834091187, + "rewards/rejected": -0.8037780523300171, + "step": 39 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 2.561739444732666, + "learning_rate": 4.980470747984265e-05, + "logits/chosen": 1.903275489807129, + "logits/rejected": 1.878113031387329, + "logps/chosen": -231.56222534179688, + "logps/rejected": -203.46058654785156, + "loss": 0.4862, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5600662231445312, + "rewards/margins": 0.49753645062446594, + "rewards/rejected": -1.0576026439666748, + "step": 40 + }, + { + "epoch": 0.18222222222222223, + "grad_norm": 3.4072999954223633, + "learning_rate": 4.9781090710552835e-05, + "logits/chosen": 2.3119935989379883, + "logits/rejected": 2.174755334854126, + "logps/chosen": -306.8345947265625, + "logps/rejected": -353.8470764160156, + "loss": 0.5132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6789085865020752, + "rewards/margins": 0.40302202105522156, + "rewards/rejected": -1.0819306373596191, + "step": 41 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 2.7821035385131836, + "learning_rate": 4.975613252843966e-05, + "logits/chosen": 1.9496957063674927, + "logits/rejected": 1.9538969993591309, + "logps/chosen": -228.1984100341797, + "logps/rejected": -221.22930908203125, + "loss": 0.5304, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.608843982219696, + "rewards/margins": 0.46152499318122864, + "rewards/rejected": -1.070369005203247, + "step": 42 + }, + { + "epoch": 0.19111111111111112, + "grad_norm": 2.4308483600616455, + "learning_rate": 4.9729834284501995e-05, + "logits/chosen": 1.9656260013580322, + "logits/rejected": 1.9733545780181885, + "logps/chosen": -214.34481811523438, + "logps/rejected": -260.02227783203125, + "loss": 0.4267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5842536687850952, + "rewards/margins": 0.8209755420684814, + "rewards/rejected": -1.4052292108535767, + "step": 43 + }, + { + "epoch": 0.19555555555555557, + "grad_norm": 3.7162206172943115, + "learning_rate": 4.970219740227693e-05, + "logits/chosen": 2.2411859035491943, + "logits/rejected": 2.2703304290771484, + "logps/chosen": -327.88653564453125, + "logps/rejected": -440.0887145996094, + "loss": 0.4777, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2694365978240967, + "rewards/margins": 0.9213591814041138, + "rewards/rejected": -2.1907958984375, + "step": 44 + }, + { + "epoch": 0.2, + "grad_norm": 2.9216010570526123, + "learning_rate": 4.9673223377762715e-05, + "logits/chosen": 2.17927885055542, + "logits/rejected": 2.1836795806884766, + "logps/chosen": -384.8567199707031, + "logps/rejected": -417.8768310546875, + "loss": 0.3178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5690780878067017, + "rewards/margins": 1.0208160877227783, + "rewards/rejected": -1.5898940563201904, + "step": 45 + }, + { + "epoch": 0.2, + "eval_logits/chosen": 2.1410844326019287, + "eval_logits/rejected": 2.0819036960601807, + "eval_logps/chosen": -302.23443603515625, + "eval_logps/rejected": -335.14215087890625, + "eval_loss": 0.590552568435669, + "eval_rewards/accuracies": 0.6964285969734192, + "eval_rewards/chosen": -1.194185495376587, + "eval_rewards/margins": 0.738350510597229, + "eval_rewards/rejected": -1.932536244392395, + "eval_runtime": 17.8289, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 0.393, + "step": 45 + }, + { + "epoch": 0.20444444444444446, + "grad_norm": 5.059287071228027, + "learning_rate": 4.9642913779337757e-05, + "logits/chosen": 1.6329092979431152, + "logits/rejected": 1.6129851341247559, + "logps/chosen": -348.64971923828125, + "logps/rejected": -319.07080078125, + "loss": 0.6636, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3844482898712158, + "rewards/margins": 0.060698702931404114, + "rewards/rejected": -1.445146918296814, + "step": 46 + }, + { + "epoch": 0.2088888888888889, + "grad_norm": 2.8684635162353516, + "learning_rate": 4.9611270247675776e-05, + "logits/chosen": 1.4863775968551636, + "logits/rejected": 1.5527057647705078, + "logps/chosen": -104.7228775024414, + "logps/rejected": -144.18276977539062, + "loss": 0.7198, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2717903256416321, + "rewards/margins": -0.05223694443702698, + "rewards/rejected": -0.2195533812046051, + "step": 47 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 4.421427249908447, + "learning_rate": 4.9578294495656965e-05, + "logits/chosen": 2.095689296722412, + "logits/rejected": 2.1000843048095703, + "logps/chosen": -379.61566162109375, + "logps/rejected": -351.688232421875, + "loss": 0.5805, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1613686084747314, + "rewards/margins": 0.24983596801757812, + "rewards/rejected": -1.4112045764923096, + "step": 48 + }, + { + "epoch": 0.21777777777777776, + "grad_norm": 2.3262526988983154, + "learning_rate": 4.954398830827524e-05, + "logits/chosen": 1.5170578956604004, + "logits/rejected": 1.41743004322052, + "logps/chosen": -141.56716918945312, + "logps/rejected": -152.45985412597656, + "loss": 0.6136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.250314325094223, + "rewards/margins": 0.1659536361694336, + "rewards/rejected": -0.4162679612636566, + "step": 49 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 1.6063750982284546, + "learning_rate": 4.950835354254167e-05, + "logits/chosen": 2.113161087036133, + "logits/rejected": 2.027552843093872, + "logps/chosen": -283.05487060546875, + "logps/rejected": -338.911865234375, + "loss": 0.202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8912537097930908, + "rewards/margins": 1.6376852989196777, + "rewards/rejected": -2.5289390087127686, + "step": 50 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 1.9353735446929932, + "learning_rate": 4.947139212738395e-05, + "logits/chosen": 2.0660266876220703, + "logits/rejected": 1.9209285974502563, + "logps/chosen": -392.48199462890625, + "logps/rejected": -411.1744079589844, + "loss": 0.1246, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.723092794418335, + "rewards/margins": 2.101870536804199, + "rewards/rejected": -3.8249635696411133, + "step": 51 + }, + { + "epoch": 0.2311111111111111, + "grad_norm": 3.488070011138916, + "learning_rate": 4.943310606354192e-05, + "logits/chosen": 2.190558433532715, + "logits/rejected": 2.1188831329345703, + "logps/chosen": -325.7753601074219, + "logps/rejected": -416.0865783691406, + "loss": 0.4128, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3961410522460938, + "rewards/margins": 1.5372390747070312, + "rewards/rejected": -2.933380126953125, + "step": 52 + }, + { + "epoch": 0.23555555555555555, + "grad_norm": 9.58780574798584, + "learning_rate": 4.9393497423459376e-05, + "logits/chosen": 2.139993667602539, + "logits/rejected": 2.1976380348205566, + "logps/chosen": -336.98150634765625, + "logps/rejected": -287.92987060546875, + "loss": 1.3144, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.950069546699524, + "rewards/margins": -0.7764908075332642, + "rewards/rejected": -1.1735787391662598, + "step": 53 + }, + { + "epoch": 0.24, + "grad_norm": 2.2417423725128174, + "learning_rate": 4.935256835117179e-05, + "logits/chosen": 2.2602908611297607, + "logits/rejected": 2.23490047454834, + "logps/chosen": -410.2571105957031, + "logps/rejected": -527.3067016601562, + "loss": 0.212, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7630027532577515, + "rewards/margins": 3.0140035152435303, + "rewards/rejected": -4.77700662612915, + "step": 54 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 2.4229862689971924, + "learning_rate": 4.931032106219029e-05, + "logits/chosen": 1.751630425453186, + "logits/rejected": 1.79133939743042, + "logps/chosen": -251.48580932617188, + "logps/rejected": -306.2901611328125, + "loss": 0.248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.854656994342804, + "rewards/margins": 1.2701172828674316, + "rewards/rejected": -2.124774217605591, + "step": 55 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 7.835116386413574, + "learning_rate": 4.926675784338174e-05, + "logits/chosen": 1.9964067935943604, + "logits/rejected": 2.117382287979126, + "logps/chosen": -287.7210388183594, + "logps/rejected": -260.3846740722656, + "loss": 0.9769, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8229798078536987, + "rewards/margins": -0.4610947072505951, + "rewards/rejected": -1.3618850708007812, + "step": 56 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 3.7002341747283936, + "learning_rate": 4.922188105284495e-05, + "logits/chosen": 2.1905529499053955, + "logits/rejected": 2.094609260559082, + "logps/chosen": -429.6041259765625, + "logps/rejected": -498.65740966796875, + "loss": 0.2605, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7014801502227783, + "rewards/margins": 1.3251266479492188, + "rewards/rejected": -4.026606559753418, + "step": 57 + }, + { + "epoch": 0.2577777777777778, + "grad_norm": 1.7439998388290405, + "learning_rate": 4.9175693119783013e-05, + "logits/chosen": 1.88455069065094, + "logits/rejected": 1.802990198135376, + "logps/chosen": -438.767578125, + "logps/rejected": -430.30194091796875, + "loss": 0.1209, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.552638292312622, + "rewards/margins": 2.1732101440429688, + "rewards/rejected": -4.725848197937012, + "step": 58 + }, + { + "epoch": 0.26222222222222225, + "grad_norm": 2.541234254837036, + "learning_rate": 4.912819654437182e-05, + "logits/chosen": 2.0551681518554688, + "logits/rejected": 1.987473726272583, + "logps/chosen": -397.0820617675781, + "logps/rejected": -455.72442626953125, + "loss": 0.1633, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4276413917541504, + "rewards/margins": 1.9974074363708496, + "rewards/rejected": -3.425048828125, + "step": 59 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.0597288608551025, + "learning_rate": 4.9079393897624745e-05, + "logits/chosen": 1.7409842014312744, + "logits/rejected": 1.764671802520752, + "logps/chosen": -259.36578369140625, + "logps/rejected": -400.5860595703125, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.98876953125, + "rewards/margins": 3.264120578765869, + "rewards/rejected": -4.252890110015869, + "step": 60 + }, + { + "epoch": 0.27111111111111114, + "grad_norm": 3.1185505390167236, + "learning_rate": 4.9029287821253445e-05, + "logits/chosen": 2.0401980876922607, + "logits/rejected": 2.0045888423919678, + "logps/chosen": -233.93736267089844, + "logps/rejected": -272.4693603515625, + "loss": 0.4837, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8766113519668579, + "rewards/margins": 0.49696657061576843, + "rewards/rejected": -1.3735778331756592, + "step": 61 + }, + { + "epoch": 0.27555555555555555, + "grad_norm": 0.503414511680603, + "learning_rate": 4.897788102752485e-05, + "logits/chosen": 2.0203349590301514, + "logits/rejected": 1.9820505380630493, + "logps/chosen": -306.28814697265625, + "logps/rejected": -438.020263671875, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1499321460723877, + "rewards/margins": 4.276589393615723, + "rewards/rejected": -5.426521301269531, + "step": 62 + }, + { + "epoch": 0.28, + "grad_norm": 5.02221155166626, + "learning_rate": 4.8925176299114416e-05, + "logits/chosen": 1.7009226083755493, + "logits/rejected": 1.5652942657470703, + "logps/chosen": -425.44415283203125, + "logps/rejected": -476.310546875, + "loss": 0.2624, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0838661193847656, + "rewards/margins": 2.905472993850708, + "rewards/rejected": -5.989339351654053, + "step": 63 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 10.18476390838623, + "learning_rate": 4.8871176488955415e-05, + "logits/chosen": 1.9149291515350342, + "logits/rejected": 1.852341651916504, + "logps/chosen": -378.48138427734375, + "logps/rejected": -236.2688751220703, + "loss": 2.0384, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.0230088233947754, + "rewards/margins": -1.4521996974945068, + "rewards/rejected": -1.5708091259002686, + "step": 64 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 2.800845146179199, + "learning_rate": 4.881588452008456e-05, + "logits/chosen": 1.3878483772277832, + "logits/rejected": 1.4146528244018555, + "logps/chosen": -115.78246307373047, + "logps/rejected": -137.14707946777344, + "loss": 0.6799, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.47142869234085083, + "rewards/margins": 0.036671459674835205, + "rewards/rejected": -0.508100152015686, + "step": 65 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 4.6413726806640625, + "learning_rate": 4.875930338548376e-05, + "logits/chosen": 1.9736688137054443, + "logits/rejected": 1.8488330841064453, + "logps/chosen": -233.1080322265625, + "logps/rejected": -270.9902038574219, + "loss": 0.4431, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.331383466720581, + "rewards/margins": 0.5846099853515625, + "rewards/rejected": -1.9159934520721436, + "step": 66 + }, + { + "epoch": 0.29777777777777775, + "grad_norm": 8.721001625061035, + "learning_rate": 4.87014361479181e-05, + "logits/chosen": 1.8805785179138184, + "logits/rejected": 1.8359558582305908, + "logps/chosen": -321.511962890625, + "logps/rejected": -275.58209228515625, + "loss": 1.3317, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.167226552963257, + "rewards/margins": -0.7222648859024048, + "rewards/rejected": -2.4449615478515625, + "step": 67 + }, + { + "epoch": 0.3022222222222222, + "grad_norm": 3.038731098175049, + "learning_rate": 4.864228593977006e-05, + "logits/chosen": 2.2448015213012695, + "logits/rejected": 2.306088447570801, + "logps/chosen": -347.45849609375, + "logps/rejected": -423.48297119140625, + "loss": 0.229, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2222633361816406, + "rewards/margins": 2.896373748779297, + "rewards/rejected": -5.1186370849609375, + "step": 68 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 4.552579879760742, + "learning_rate": 4.858185596286997e-05, + "logits/chosen": 2.1001484394073486, + "logits/rejected": 1.979229211807251, + "logps/chosen": -295.586181640625, + "logps/rejected": -303.2032470703125, + "loss": 0.3926, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.715203881263733, + "rewards/margins": 2.0468697547912598, + "rewards/rejected": -3.762073516845703, + "step": 69 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 3.6529836654663086, + "learning_rate": 4.852014948832268e-05, + "logits/chosen": 1.9788322448730469, + "logits/rejected": 1.9580605030059814, + "logps/chosen": -213.86343383789062, + "logps/rejected": -261.7784118652344, + "loss": 0.524, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2392014265060425, + "rewards/margins": 0.37295836210250854, + "rewards/rejected": -1.6121597290039062, + "step": 70 + }, + { + "epoch": 0.31555555555555553, + "grad_norm": 1.9879947900772095, + "learning_rate": 4.8457169856330485e-05, + "logits/chosen": 1.942040205001831, + "logits/rejected": 1.8756356239318848, + "logps/chosen": -313.9153747558594, + "logps/rejected": -427.9259948730469, + "loss": 0.1422, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7393081188201904, + "rewards/margins": 4.251686096191406, + "rewards/rejected": -5.990994453430176, + "step": 71 + }, + { + "epoch": 0.32, + "grad_norm": 5.4608001708984375, + "learning_rate": 4.839292047601234e-05, + "logits/chosen": 1.7308683395385742, + "logits/rejected": 1.7762730121612549, + "logps/chosen": -293.4880065917969, + "logps/rejected": -250.67381286621094, + "loss": 0.6266, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6677329540252686, + "rewards/margins": 1.1602835655212402, + "rewards/rejected": -2.828016757965088, + "step": 72 + }, + { + "epoch": 0.3244444444444444, + "grad_norm": 11.236379623413086, + "learning_rate": 4.832740482521931e-05, + "logits/chosen": 1.5850169658660889, + "logits/rejected": 1.6753277778625488, + "logps/chosen": -275.2799072265625, + "logps/rejected": -174.08926391601562, + "loss": 1.7558, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.16182279586792, + "rewards/margins": -1.245647668838501, + "rewards/rejected": -1.916175127029419, + "step": 73 + }, + { + "epoch": 0.3288888888888889, + "grad_norm": 1.0129520893096924, + "learning_rate": 4.826062645034631e-05, + "logits/chosen": 2.147963047027588, + "logits/rejected": 2.13932728767395, + "logps/chosen": -533.4431762695312, + "logps/rejected": -664.0446166992188, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.084846496582031, + "rewards/margins": 4.350230693817139, + "rewards/rejected": -8.435077667236328, + "step": 74 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 8.0421724319458, + "learning_rate": 4.819258896614014e-05, + "logits/chosen": 2.038822650909424, + "logits/rejected": 2.02886962890625, + "logps/chosen": -347.89373779296875, + "logps/rejected": -322.3008728027344, + "loss": 0.4447, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.955258369445801, + "rewards/margins": 0.5941513180732727, + "rewards/rejected": -3.5494096279144287, + "step": 75 + }, + { + "epoch": 0.3377777777777778, + "grad_norm": 10.355673789978027, + "learning_rate": 4.812329605550381e-05, + "logits/chosen": 2.0920519828796387, + "logits/rejected": 2.0875403881073, + "logps/chosen": -335.3114318847656, + "logps/rejected": -350.8553466796875, + "loss": 0.9773, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.6420540809631348, + "rewards/margins": 0.05045384168624878, + "rewards/rejected": -3.6925079822540283, + "step": 76 + }, + { + "epoch": 0.3422222222222222, + "grad_norm": 0.42018255591392517, + "learning_rate": 4.805275146929721e-05, + "logits/chosen": 2.0620903968811035, + "logits/rejected": 2.108494281768799, + "logps/chosen": -342.378662109375, + "logps/rejected": -450.7765808105469, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.201615810394287, + "rewards/margins": 4.498736381530762, + "rewards/rejected": -6.700352668762207, + "step": 77 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.4926978349685669, + "learning_rate": 4.7980959026134044e-05, + "logits/chosen": 1.9305293560028076, + "logits/rejected": 2.000296115875244, + "logps/chosen": -292.07354736328125, + "logps/rejected": -431.2196044921875, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1881355047225952, + "rewards/margins": 3.963468074798584, + "rewards/rejected": -5.151603698730469, + "step": 78 + }, + { + "epoch": 0.3511111111111111, + "grad_norm": 4.756207466125488, + "learning_rate": 4.790792261217512e-05, + "logits/chosen": 2.1176319122314453, + "logits/rejected": 2.1045117378234863, + "logps/chosen": -309.7745666503906, + "logps/rejected": -279.74969482421875, + "loss": 0.5587, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1116164922714233, + "rewards/margins": 0.3822830319404602, + "rewards/rejected": -1.4938995838165283, + "step": 79 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.1574506163597107, + "learning_rate": 4.783364618091803e-05, + "logits/chosen": 2.228512763977051, + "logits/rejected": 2.239095687866211, + "logps/chosen": -452.6519470214844, + "logps/rejected": -519.6800537109375, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.023036241531372, + "rewards/margins": 5.381681442260742, + "rewards/rejected": -7.404717922210693, + "step": 80 + }, + { + "epoch": 0.36, + "grad_norm": 17.396940231323242, + "learning_rate": 4.7758133752983135e-05, + "logits/chosen": 2.2299280166625977, + "logits/rejected": 2.275631904602051, + "logps/chosen": -481.376708984375, + "logps/rejected": -424.80029296875, + "loss": 1.0849, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.431848049163818, + "rewards/margins": -0.2735259532928467, + "rewards/rejected": -4.158322334289551, + "step": 81 + }, + { + "epoch": 0.36444444444444446, + "grad_norm": 2.288203477859497, + "learning_rate": 4.7681389415895864e-05, + "logits/chosen": 1.9121689796447754, + "logits/rejected": 1.9124395847320557, + "logps/chosen": -365.42669677734375, + "logps/rejected": -389.65118408203125, + "loss": 0.3228, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7858033180236816, + "rewards/margins": 1.5727282762527466, + "rewards/rejected": -4.358531475067139, + "step": 82 + }, + { + "epoch": 0.3688888888888889, + "grad_norm": 17.783615112304688, + "learning_rate": 4.7603417323865547e-05, + "logits/chosen": 2.15109920501709, + "logits/rejected": 2.273561954498291, + "logps/chosen": -492.3847351074219, + "logps/rejected": -423.0718994140625, + "loss": 2.6581, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.891546726226807, + "rewards/margins": -1.6526780128479004, + "rewards/rejected": -3.2388687133789062, + "step": 83 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 3.1931591033935547, + "learning_rate": 4.752422169756048e-05, + "logits/chosen": 2.216590404510498, + "logits/rejected": 2.2690138816833496, + "logps/chosen": -395.58953857421875, + "logps/rejected": -360.23663330078125, + "loss": 0.2526, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.509881615638733, + "rewards/margins": 2.4663939476013184, + "rewards/rejected": -3.976275682449341, + "step": 84 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.4893577992916107, + "learning_rate": 4.74438068238795e-05, + "logits/chosen": 2.0717084407806396, + "logits/rejected": 2.0398921966552734, + "logps/chosen": -313.4667053222656, + "logps/rejected": -517.9127197265625, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5163437128067017, + "rewards/margins": 4.198652744293213, + "rewards/rejected": -5.714996337890625, + "step": 85 + }, + { + "epoch": 0.38222222222222224, + "grad_norm": 12.34611988067627, + "learning_rate": 4.736217705571989e-05, + "logits/chosen": 1.8056581020355225, + "logits/rejected": 1.906313180923462, + "logps/chosen": -328.50921630859375, + "logps/rejected": -252.91552734375, + "loss": 1.6628, + "rewards/accuracies": 0.0, + "rewards/chosen": -3.34942626953125, + "rewards/margins": -1.445077657699585, + "rewards/rejected": -1.9043487310409546, + "step": 86 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 11.105749130249023, + "learning_rate": 4.7279336811741806e-05, + "logits/chosen": 2.4057044982910156, + "logits/rejected": 2.336398124694824, + "logps/chosen": -602.2092895507812, + "logps/rejected": -557.8479614257812, + "loss": 0.6199, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.7743942737579346, + "rewards/margins": 0.2018601894378662, + "rewards/rejected": -3.976254463195801, + "step": 87 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 17.08523941040039, + "learning_rate": 4.7195290576129034e-05, + "logits/chosen": 2.213070869445801, + "logits/rejected": 2.195730686187744, + "logps/chosen": -453.27703857421875, + "logps/rejected": -573.7431640625, + "loss": 2.1029, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.91035795211792, + "rewards/margins": 0.73905348777771, + "rewards/rejected": -5.649411201477051, + "step": 88 + }, + { + "epoch": 0.39555555555555555, + "grad_norm": 1.3783494234085083, + "learning_rate": 4.711004289834632e-05, + "logits/chosen": 2.123533248901367, + "logits/rejected": 2.0941734313964844, + "logps/chosen": -282.4661865234375, + "logps/rejected": -387.58892822265625, + "loss": 0.1011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8440461158752441, + "rewards/margins": 2.256805419921875, + "rewards/rejected": -4.100851535797119, + "step": 89 + }, + { + "epoch": 0.4, + "grad_norm": 2.8574769496917725, + "learning_rate": 4.702359839289306e-05, + "logits/chosen": 2.0068724155426025, + "logits/rejected": 2.0709903240203857, + "logps/chosen": -362.0110168457031, + "logps/rejected": -384.2983093261719, + "loss": 0.2747, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.201084852218628, + "rewards/margins": 1.2602746486663818, + "rewards/rejected": -3.4613595008850098, + "step": 90 + }, + { + "epoch": 0.4, + "eval_logits/chosen": 2.161973237991333, + "eval_logits/rejected": 2.1175014972686768, + "eval_logps/chosen": -310.35650634765625, + "eval_logps/rejected": -351.593994140625, + "eval_loss": 0.5963193774223328, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -2.006390333175659, + "eval_rewards/margins": 1.5713260173797607, + "eval_rewards/rejected": -3.57771635055542, + "eval_runtime": 17.4029, + "eval_samples_per_second": 2.873, + "eval_steps_per_second": 0.402, + "step": 90 + }, + { + "epoch": 0.40444444444444444, + "grad_norm": 12.335380554199219, + "learning_rate": 4.693596173905352e-05, + "logits/chosen": 2.364140272140503, + "logits/rejected": 2.410036563873291, + "logps/chosen": -285.2010192871094, + "logps/rejected": -307.5989074707031, + "loss": 1.0673, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.113780975341797, + "rewards/margins": -0.5538902282714844, + "rewards/rejected": -1.5598907470703125, + "step": 91 + }, + { + "epoch": 0.4088888888888889, + "grad_norm": 15.816446304321289, + "learning_rate": 4.684713768064357e-05, + "logits/chosen": 1.9842954874038696, + "logits/rejected": 2.057584762573242, + "logps/chosen": -406.9105224609375, + "logps/rejected": -411.98211669921875, + "loss": 1.7238, + "rewards/accuracies": 0.0, + "rewards/chosen": -5.087683200836182, + "rewards/margins": -1.399601697921753, + "rewards/rejected": -3.6880815029144287, + "step": 92 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 9.741644859313965, + "learning_rate": 4.6757131025753886e-05, + "logits/chosen": 1.788228988647461, + "logits/rejected": 1.8304262161254883, + "logps/chosen": -257.06671142578125, + "logps/rejected": -334.83636474609375, + "loss": 0.9151, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2510857582092285, + "rewards/margins": 0.16967010498046875, + "rewards/rejected": -2.4207558631896973, + "step": 93 + }, + { + "epoch": 0.4177777777777778, + "grad_norm": 2.8402011394500732, + "learning_rate": 4.666594664648965e-05, + "logits/chosen": 2.0854671001434326, + "logits/rejected": 2.104097366333008, + "logps/chosen": -244.27151489257812, + "logps/rejected": -314.5588073730469, + "loss": 0.3846, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4888412654399872, + "rewards/margins": 1.8905991315841675, + "rewards/rejected": -2.3794403076171875, + "step": 94 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 8.729580879211426, + "learning_rate": 4.657358947870691e-05, + "logits/chosen": 2.1040725708007812, + "logits/rejected": 2.0335569381713867, + "logps/chosen": -278.30023193359375, + "logps/rejected": -244.13815307617188, + "loss": 1.8136, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.41619873046875, + "rewards/margins": -0.721272349357605, + "rewards/rejected": -1.6949265003204346, + "step": 95 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 10.822171211242676, + "learning_rate": 4.648006452174529e-05, + "logits/chosen": 2.428173542022705, + "logits/rejected": 2.208796977996826, + "logps/chosen": -400.2300720214844, + "logps/rejected": -370.8153381347656, + "loss": 1.0031, + "rewards/accuracies": 0.0, + "rewards/chosen": -3.485565185546875, + "rewards/margins": -0.5405601859092712, + "rewards/rejected": -2.945004940032959, + "step": 96 + }, + { + "epoch": 0.4311111111111111, + "grad_norm": 0.6067838668823242, + "learning_rate": 4.638537683815744e-05, + "logits/chosen": 2.0516138076782227, + "logits/rejected": 2.128382682800293, + "logps/chosen": -282.2442321777344, + "logps/rejected": -405.89276123046875, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.78936767578125, + "rewards/margins": 3.77711820602417, + "rewards/rejected": -4.56648588180542, + "step": 97 + }, + { + "epoch": 0.43555555555555553, + "grad_norm": 13.154459953308105, + "learning_rate": 4.628953155343499e-05, + "logits/chosen": 2.1049790382385254, + "logits/rejected": 1.929673433303833, + "logps/chosen": -305.3583984375, + "logps/rejected": -173.13394165039062, + "loss": 0.9443, + "rewards/accuracies": 0.0, + "rewards/chosen": -2.3848648071289062, + "rewards/margins": -0.4350753426551819, + "rewards/rejected": -1.9497895240783691, + "step": 98 + }, + { + "epoch": 0.44, + "grad_norm": 14.330009460449219, + "learning_rate": 4.6192533855731114e-05, + "logits/chosen": 2.194329261779785, + "logits/rejected": 2.2239904403686523, + "logps/chosen": -411.20849609375, + "logps/rejected": -397.444580078125, + "loss": 1.1319, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.436021566390991, + "rewards/margins": 0.39365994930267334, + "rewards/rejected": -3.829681396484375, + "step": 99 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 7.170289993286133, + "learning_rate": 4.609438899557964e-05, + "logits/chosen": 2.3336665630340576, + "logits/rejected": 2.3739962577819824, + "logps/chosen": -443.4410095214844, + "logps/rejected": -623.44384765625, + "loss": 0.3145, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.313551425933838, + "rewards/margins": 2.143171787261963, + "rewards/rejected": -5.456723213195801, + "step": 100 + }, + { + "epoch": 0.4488888888888889, + "grad_norm": 1.4472028017044067, + "learning_rate": 4.5995102285610906e-05, + "logits/chosen": 2.0881364345550537, + "logits/rejected": 2.0009138584136963, + "logps/chosen": -430.8775634765625, + "logps/rejected": -477.1756591796875, + "loss": 0.0688, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0700042247772217, + "rewards/margins": 3.3247146606445312, + "rewards/rejected": -5.394719123840332, + "step": 101 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 4.1629252433776855, + "learning_rate": 4.589467910026411e-05, + "logits/chosen": 1.530840516090393, + "logits/rejected": 1.5147547721862793, + "logps/chosen": -123.47633361816406, + "logps/rejected": -148.20376586914062, + "loss": 0.837, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8232139348983765, + "rewards/margins": -0.18887022137641907, + "rewards/rejected": -0.6343437433242798, + "step": 102 + }, + { + "epoch": 0.4577777777777778, + "grad_norm": 2.2841246128082275, + "learning_rate": 4.579312487549649e-05, + "logits/chosen": 1.9281361103057861, + "logits/rejected": 2.022286891937256, + "logps/chosen": -349.215576171875, + "logps/rejected": -505.2444763183594, + "loss": 0.1805, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0184952020645142, + "rewards/margins": 3.793022394180298, + "rewards/rejected": -4.811517715454102, + "step": 103 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 5.752498149871826, + "learning_rate": 4.5690445108488964e-05, + "logits/chosen": 2.1871137619018555, + "logits/rejected": 2.213275909423828, + "logps/chosen": -212.89849853515625, + "logps/rejected": -298.1326599121094, + "loss": 0.4131, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0786117315292358, + "rewards/margins": 0.7388886213302612, + "rewards/rejected": -1.817500352859497, + "step": 104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 2.0671677589416504, + "learning_rate": 4.5586645357348636e-05, + "logits/chosen": 1.9253795146942139, + "logits/rejected": 1.9396390914916992, + "logps/chosen": -283.0425720214844, + "logps/rejected": -399.41876220703125, + "loss": 0.2595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7986934781074524, + "rewards/margins": 3.9111409187316895, + "rewards/rejected": -4.709834098815918, + "step": 105 + }, + { + "epoch": 0.4711111111111111, + "grad_norm": 2.6245131492614746, + "learning_rate": 4.548173124080789e-05, + "logits/chosen": 2.175868511199951, + "logits/rejected": 2.2268357276916504, + "logps/chosen": -401.7663269042969, + "logps/rejected": -339.01263427734375, + "loss": 0.2061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.636662244796753, + "rewards/margins": 2.116255283355713, + "rewards/rejected": -3.752917528152466, + "step": 106 + }, + { + "epoch": 0.47555555555555556, + "grad_norm": 6.8939642906188965, + "learning_rate": 4.5375708437920284e-05, + "logits/chosen": 2.150783061981201, + "logits/rejected": 2.192080020904541, + "logps/chosen": -327.8550720214844, + "logps/rejected": -408.950439453125, + "loss": 0.4908, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.347644090652466, + "rewards/margins": 1.237553358078003, + "rewards/rejected": -3.5851974487304688, + "step": 107 + }, + { + "epoch": 0.48, + "grad_norm": 3.782555103302002, + "learning_rate": 4.526858268775313e-05, + "logits/chosen": 1.875314712524414, + "logits/rejected": 1.9068584442138672, + "logps/chosen": -265.00701904296875, + "logps/rejected": -323.212890625, + "loss": 0.2026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4442001581192017, + "rewards/margins": 2.3625869750976562, + "rewards/rejected": -3.8067870140075684, + "step": 108 + }, + { + "epoch": 0.48444444444444446, + "grad_norm": 0.7733494639396667, + "learning_rate": 4.516035978907681e-05, + "logits/chosen": 1.999725103378296, + "logits/rejected": 1.9631330966949463, + "logps/chosen": -347.4098815917969, + "logps/rejected": -423.82171630859375, + "loss": 0.0474, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3635421991348267, + "rewards/margins": 3.0967469215393066, + "rewards/rejected": -4.460289001464844, + "step": 109 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 2.2969586849212646, + "learning_rate": 4.50510456000509e-05, + "logits/chosen": 1.6264564990997314, + "logits/rejected": 1.6307601928710938, + "logps/chosen": -256.4902648925781, + "logps/rejected": -350.5125427246094, + "loss": 0.2928, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.814366102218628, + "rewards/margins": 3.2768468856811523, + "rewards/rejected": -5.091212749481201, + "step": 110 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 7.435751914978027, + "learning_rate": 4.494064603790708e-05, + "logits/chosen": 1.9676257371902466, + "logits/rejected": 1.9142203330993652, + "logps/chosen": -340.17498779296875, + "logps/rejected": -344.9024658203125, + "loss": 0.648, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.670006513595581, + "rewards/margins": 0.3124961853027344, + "rewards/rejected": -1.9825026988983154, + "step": 111 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 2.219264268875122, + "learning_rate": 4.482916707862884e-05, + "logits/chosen": 2.203705310821533, + "logits/rejected": 2.050464153289795, + "logps/chosen": -281.28857421875, + "logps/rejected": -352.7940673828125, + "loss": 0.1345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9629112482070923, + "rewards/margins": 2.2336831092834473, + "rewards/rejected": -3.19659423828125, + "step": 112 + }, + { + "epoch": 0.5022222222222222, + "grad_norm": 18.2978515625, + "learning_rate": 4.471661475662792e-05, + "logits/chosen": 1.856745719909668, + "logits/rejected": 1.9189677238464355, + "logps/chosen": -463.925048828125, + "logps/rejected": -392.9632873535156, + "loss": 2.6401, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.859647750854492, + "rewards/margins": -2.113811731338501, + "rewards/rejected": -3.745835781097412, + "step": 113 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 7.751908779144287, + "learning_rate": 4.460299516441777e-05, + "logits/chosen": 2.136542797088623, + "logits/rejected": 2.011000156402588, + "logps/chosen": -279.3929138183594, + "logps/rejected": -225.48109436035156, + "loss": 0.8985, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4863853454589844, + "rewards/margins": 0.2738412022590637, + "rewards/rejected": -1.7602264881134033, + "step": 114 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 4.417598724365234, + "learning_rate": 4.4488314452283675e-05, + "logits/chosen": 1.400985836982727, + "logits/rejected": 1.5235412120819092, + "logps/chosen": -130.8572235107422, + "logps/rejected": -137.48133850097656, + "loss": 0.6302, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7593280673027039, + "rewards/margins": 0.35355114936828613, + "rewards/rejected": -1.1128792762756348, + "step": 115 + }, + { + "epoch": 0.5155555555555555, + "grad_norm": 3.6468398571014404, + "learning_rate": 4.437257882794991e-05, + "logits/chosen": 2.243985414505005, + "logits/rejected": 2.1406588554382324, + "logps/chosen": -485.3035583496094, + "logps/rejected": -437.01959228515625, + "loss": 0.308, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8883087635040283, + "rewards/margins": 1.3152587413787842, + "rewards/rejected": -4.2035675048828125, + "step": 116 + }, + { + "epoch": 0.52, + "grad_norm": 3.9299352169036865, + "learning_rate": 4.425579455624364e-05, + "logits/chosen": 1.9169459342956543, + "logits/rejected": 1.82602858543396, + "logps/chosen": -202.2696075439453, + "logps/rejected": -183.041015625, + "loss": 0.5871, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4907638430595398, + "rewards/margins": 0.28335878252983093, + "rewards/rejected": -0.7741226553916931, + "step": 117 + }, + { + "epoch": 0.5244444444444445, + "grad_norm": 5.041739463806152, + "learning_rate": 4.413796795875586e-05, + "logits/chosen": 1.7944426536560059, + "logits/rejected": 1.8221161365509033, + "logps/chosen": -212.9854278564453, + "logps/rejected": -245.502197265625, + "loss": 0.6075, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.522188663482666, + "rewards/margins": 0.5173491835594177, + "rewards/rejected": -2.0395379066467285, + "step": 118 + }, + { + "epoch": 0.5288888888888889, + "grad_norm": 2.375946044921875, + "learning_rate": 4.4019105413499164e-05, + "logits/chosen": 2.1719205379486084, + "logits/rejected": 2.069880962371826, + "logps/chosen": -417.11004638671875, + "logps/rejected": -380.632080078125, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.728323459625244, + "rewards/margins": 2.0067780017852783, + "rewards/rejected": -4.735101699829102, + "step": 119 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 4.835366725921631, + "learning_rate": 4.389921335456253e-05, + "logits/chosen": 2.228755474090576, + "logits/rejected": 2.121194839477539, + "logps/chosen": -456.9483337402344, + "logps/rejected": -443.54644775390625, + "loss": 0.4242, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9684982299804688, + "rewards/margins": 3.314952850341797, + "rewards/rejected": -5.283451080322266, + "step": 120 + }, + { + "epoch": 0.5377777777777778, + "grad_norm": 9.487476348876953, + "learning_rate": 4.3778298271762995e-05, + "logits/chosen": 1.9492430686950684, + "logits/rejected": 1.8923718929290771, + "logps/chosen": -367.70208740234375, + "logps/rejected": -292.9391174316406, + "loss": 1.1745, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.854750156402588, + "rewards/margins": -0.488888680934906, + "rewards/rejected": -2.365861415863037, + "step": 121 + }, + { + "epoch": 0.5422222222222223, + "grad_norm": 8.035406112670898, + "learning_rate": 4.365636671029445e-05, + "logits/chosen": 1.636220097541809, + "logits/rejected": 1.6882154941558838, + "logps/chosen": -239.27474975585938, + "logps/rejected": -248.72625732421875, + "loss": 0.7769, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7846871614456177, + "rewards/margins": 0.6797889471054077, + "rewards/rejected": -2.4644761085510254, + "step": 122 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 5.362706184387207, + "learning_rate": 4.3533425270373216e-05, + "logits/chosen": 2.0839271545410156, + "logits/rejected": 2.0972325801849365, + "logps/chosen": -387.6666564941406, + "logps/rejected": -393.9215393066406, + "loss": 0.2218, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9992005825042725, + "rewards/margins": 3.244002342224121, + "rewards/rejected": -6.243203163146973, + "step": 123 + }, + { + "epoch": 0.5511111111111111, + "grad_norm": 2.3104119300842285, + "learning_rate": 4.340948060688088e-05, + "logits/chosen": 1.8295419216156006, + "logits/rejected": 1.8262109756469727, + "logps/chosen": -224.88113403320312, + "logps/rejected": -227.4783477783203, + "loss": 0.3689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5778324604034424, + "rewards/margins": 0.8512080907821655, + "rewards/rejected": -1.429040551185608, + "step": 124 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 4.099146366119385, + "learning_rate": 4.328453942900402e-05, + "logits/chosen": 1.97019362449646, + "logits/rejected": 1.9800690412521362, + "logps/chosen": -287.9169921875, + "logps/rejected": -343.14788818359375, + "loss": 0.3836, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4385543763637543, + "rewards/margins": 1.3104095458984375, + "rewards/rejected": -1.7489639520645142, + "step": 125 + }, + { + "epoch": 0.56, + "grad_norm": 4.081305503845215, + "learning_rate": 4.3158608499871024e-05, + "logits/chosen": 2.1010217666625977, + "logits/rejected": 2.029930830001831, + "logps/chosen": -325.297119140625, + "logps/rejected": -344.98284912109375, + "loss": 0.342, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5290114879608154, + "rewards/margins": 0.9010803699493408, + "rewards/rejected": -2.4300918579101562, + "step": 126 + }, + { + "epoch": 0.5644444444444444, + "grad_norm": 0.4260007441043854, + "learning_rate": 4.3031694636186e-05, + "logits/chosen": 2.314997434616089, + "logits/rejected": 2.244847059249878, + "logps/chosen": -407.2716369628906, + "logps/rejected": -440.58770751953125, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8063247799873352, + "rewards/margins": 4.322688579559326, + "rewards/rejected": -5.1290130615234375, + "step": 127 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 22.247989654541016, + "learning_rate": 4.2903804707859835e-05, + "logits/chosen": 1.9879422187805176, + "logits/rejected": 2.056591749191284, + "logps/chosen": -241.29771423339844, + "logps/rejected": -261.1890563964844, + "loss": 1.5621, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.3791258335113525, + "rewards/margins": -1.075968861579895, + "rewards/rejected": -1.3031569719314575, + "step": 128 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 3.094059705734253, + "learning_rate": 4.2774945637638236e-05, + "logits/chosen": 2.2621870040893555, + "logits/rejected": 2.235694408416748, + "logps/chosen": -403.2272644042969, + "logps/rejected": -458.10858154296875, + "loss": 0.1943, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8431869745254517, + "rewards/margins": 2.3183135986328125, + "rewards/rejected": -4.161500453948975, + "step": 129 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 6.682619094848633, + "learning_rate": 4.2645124400727074e-05, + "logits/chosen": 1.864232063293457, + "logits/rejected": 1.8359177112579346, + "logps/chosen": -260.30364990234375, + "logps/rejected": -291.6693420410156, + "loss": 1.0362, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.559638261795044, + "rewards/margins": 0.7007129788398743, + "rewards/rejected": -2.2603511810302734, + "step": 130 + }, + { + "epoch": 0.5822222222222222, + "grad_norm": 4.2206597328186035, + "learning_rate": 4.251434802441476e-05, + "logits/chosen": 2.0464115142822266, + "logits/rejected": 2.0911216735839844, + "logps/chosen": -243.08462524414062, + "logps/rejected": -254.49130249023438, + "loss": 0.5624, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2900955379009247, + "rewards/margins": 0.2913353145122528, + "rewards/rejected": -0.001239776611328125, + "step": 131 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 3.9296364784240723, + "learning_rate": 4.238262358769192e-05, + "logits/chosen": 2.281747341156006, + "logits/rejected": 2.3363983631134033, + "logps/chosen": -285.22998046875, + "logps/rejected": -372.3949279785156, + "loss": 0.2539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6220627427101135, + "rewards/margins": 1.3760398626327515, + "rewards/rejected": -1.9981026649475098, + "step": 132 + }, + { + "epoch": 0.5911111111111111, + "grad_norm": 2.2730822563171387, + "learning_rate": 4.224995822086812e-05, + "logits/chosen": 2.2451581954956055, + "logits/rejected": 2.2252092361450195, + "logps/chosen": -394.37017822265625, + "logps/rejected": -501.9224853515625, + "loss": 0.1271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13392946124076843, + "rewards/margins": 4.8956298828125, + "rewards/rejected": -4.76170015335083, + "step": 133 + }, + { + "epoch": 0.5955555555555555, + "grad_norm": 2.509901523590088, + "learning_rate": 4.211635910518595e-05, + "logits/chosen": 1.6302995681762695, + "logits/rejected": 1.689335823059082, + "logps/chosen": -152.21542358398438, + "logps/rejected": -137.0872802734375, + "loss": 0.4905, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1873771846294403, + "rewards/margins": 0.48279035091400146, + "rewards/rejected": -0.6701675653457642, + "step": 134 + }, + { + "epoch": 0.6, + "grad_norm": 5.595450401306152, + "learning_rate": 4.198183347243233e-05, + "logits/chosen": 2.134934663772583, + "logits/rejected": 2.091696262359619, + "logps/chosen": -333.56829833984375, + "logps/rejected": -360.50970458984375, + "loss": 0.3094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9313689470291138, + "rewards/margins": 2.534543037414551, + "rewards/rejected": -3.465911865234375, + "step": 135 + }, + { + "epoch": 0.6, + "eval_logits/chosen": 2.2205488681793213, + "eval_logits/rejected": 2.174258232116699, + "eval_logps/chosen": -303.35302734375, + "eval_logps/rejected": -344.1379089355469, + "eval_loss": 0.43841180205345154, + "eval_rewards/accuracies": 0.8035714030265808, + "eval_rewards/chosen": -1.3060392141342163, + "eval_rewards/margins": 1.5260727405548096, + "eval_rewards/rejected": -2.8321120738983154, + "eval_runtime": 17.3865, + "eval_samples_per_second": 2.876, + "eval_steps_per_second": 0.403, + "step": 135 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 1.6538218259811401, + "learning_rate": 4.184638860454696e-05, + "logits/chosen": 1.9310147762298584, + "logits/rejected": 1.8604496717453003, + "logps/chosen": -251.001708984375, + "logps/rejected": -290.62005615234375, + "loss": 0.1732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33168870210647583, + "rewards/margins": 1.7937004566192627, + "rewards/rejected": -2.1253890991210938, + "step": 136 + }, + { + "epoch": 0.6088888888888889, + "grad_norm": 3.1406407356262207, + "learning_rate": 4.1710031833228225e-05, + "logits/chosen": 1.7651350498199463, + "logits/rejected": 1.8405566215515137, + "logps/chosen": -175.06227111816406, + "logps/rejected": -272.6947326660156, + "loss": 0.1498, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38391417264938354, + "rewards/margins": 1.8765029907226562, + "rewards/rejected": -2.2604172229766846, + "step": 137 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 4.962210178375244, + "learning_rate": 4.157277053953631e-05, + "logits/chosen": 2.104128837585449, + "logits/rejected": 2.078892230987549, + "logps/chosen": -259.95330810546875, + "logps/rejected": -243.87139892578125, + "loss": 0.3842, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8645896911621094, + "rewards/margins": 1.2094483375549316, + "rewards/rejected": -2.074038028717041, + "step": 138 + }, + { + "epoch": 0.6177777777777778, + "grad_norm": 5.0981316566467285, + "learning_rate": 4.143461215349361e-05, + "logits/chosen": 2.3514866828918457, + "logits/rejected": 2.3009910583496094, + "logps/chosen": -429.4484558105469, + "logps/rejected": -535.6287231445312, + "loss": 0.2313, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3444290161132812, + "rewards/margins": 2.3630645275115967, + "rewards/rejected": -4.707493782043457, + "step": 139 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 4.2106523513793945, + "learning_rate": 4.129556415368261e-05, + "logits/chosen": 2.048675060272217, + "logits/rejected": 2.0247228145599365, + "logps/chosen": -283.77264404296875, + "logps/rejected": -262.07989501953125, + "loss": 0.4932, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2498573064804077, + "rewards/margins": 0.9712372422218323, + "rewards/rejected": -2.2210946083068848, + "step": 140 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 10.738434791564941, + "learning_rate": 4.115563406684103e-05, + "logits/chosen": 2.148074150085449, + "logits/rejected": 2.117837905883789, + "logps/chosen": -340.57489013671875, + "logps/rejected": -366.0396423339844, + "loss": 1.6829, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.95953369140625, + "rewards/margins": 0.4566100835800171, + "rewards/rejected": -1.416143774986267, + "step": 141 + }, + { + "epoch": 0.6311111111111111, + "grad_norm": 8.303956985473633, + "learning_rate": 4.101482946745439e-05, + "logits/chosen": 2.478304386138916, + "logits/rejected": 2.3817453384399414, + "logps/chosen": -477.30059814453125, + "logps/rejected": -419.9468994140625, + "loss": 0.4244, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.896254062652588, + "rewards/margins": 0.6844373941421509, + "rewards/rejected": -3.5806915760040283, + "step": 142 + }, + { + "epoch": 0.6355555555555555, + "grad_norm": 5.196935653686523, + "learning_rate": 4.0873157977346e-05, + "logits/chosen": 2.295231342315674, + "logits/rejected": 2.320071220397949, + "logps/chosen": -327.35858154296875, + "logps/rejected": -336.46234130859375, + "loss": 0.3213, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11004638671875, + "rewards/margins": 1.7110825777053833, + "rewards/rejected": -1.6010361909866333, + "step": 143 + }, + { + "epoch": 0.64, + "grad_norm": 0.837842583656311, + "learning_rate": 4.073062726526443e-05, + "logits/chosen": 2.3723278045654297, + "logits/rejected": 2.1718640327453613, + "logps/chosen": -361.47161865234375, + "logps/rejected": -342.56048583984375, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03143996000289917, + "rewards/margins": 3.5597052574157715, + "rewards/rejected": -3.5911452770233154, + "step": 144 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 1.8959003686904907, + "learning_rate": 4.058724504646834e-05, + "logits/chosen": 2.1700668334960938, + "logits/rejected": 2.0942935943603516, + "logps/chosen": -257.6955261230469, + "logps/rejected": -254.59393310546875, + "loss": 0.2177, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6744873523712158, + "rewards/margins": 1.8073105812072754, + "rewards/rejected": -1.13282310962677, + "step": 145 + }, + { + "epoch": 0.6488888888888888, + "grad_norm": 3.8877060413360596, + "learning_rate": 4.044301908230889e-05, + "logits/chosen": 2.336484909057617, + "logits/rejected": 2.250220775604248, + "logps/chosen": -329.42779541015625, + "logps/rejected": -461.0723876953125, + "loss": 0.2504, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0788795948028564, + "rewards/margins": 5.8840203285217285, + "rewards/rejected": -4.805140972137451, + "step": 146 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 4.329055309295654, + "learning_rate": 4.0297957179809586e-05, + "logits/chosen": 1.7940289974212646, + "logits/rejected": 1.8224772214889526, + "logps/chosen": -218.82870483398438, + "logps/rejected": -236.94589233398438, + "loss": 0.6222, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5779021978378296, + "rewards/margins": 0.4461887776851654, + "rewards/rejected": -2.0240910053253174, + "step": 147 + }, + { + "epoch": 0.6577777777777778, + "grad_norm": 1.4359842538833618, + "learning_rate": 4.0152067191243696e-05, + "logits/chosen": 1.9685239791870117, + "logits/rejected": 2.0288798809051514, + "logps/chosen": -357.14801025390625, + "logps/rejected": -412.6529541015625, + "loss": 0.0852, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2776429653167725, + "rewards/margins": 2.4739623069763184, + "rewards/rejected": -4.75160551071167, + "step": 148 + }, + { + "epoch": 0.6622222222222223, + "grad_norm": 12.442399978637695, + "learning_rate": 4.000535701370921e-05, + "logits/chosen": 1.7401182651519775, + "logits/rejected": 1.621551752090454, + "logps/chosen": -320.3838195800781, + "logps/rejected": -198.91424560546875, + "loss": 1.6628, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.767268419265747, + "rewards/margins": -0.9375503659248352, + "rewards/rejected": -1.8297181129455566, + "step": 149 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 17.23033332824707, + "learning_rate": 3.985783458870134e-05, + "logits/chosen": 2.1593716144561768, + "logits/rejected": 2.1524195671081543, + "logps/chosen": -375.254150390625, + "logps/rejected": -289.0406494140625, + "loss": 1.0413, + "rewards/accuracies": 0.0, + "rewards/chosen": -2.9647598266601562, + "rewards/margins": -0.5915945768356323, + "rewards/rejected": -2.3731651306152344, + "step": 150 + }, + { + "epoch": 0.6711111111111111, + "grad_norm": 0.4952673614025116, + "learning_rate": 3.9709507901682675e-05, + "logits/chosen": 2.376957893371582, + "logits/rejected": 2.328672409057617, + "logps/chosen": -484.62518310546875, + "logps/rejected": -505.002197265625, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.708740234375, + "rewards/margins": 4.536779403686523, + "rewards/rejected": -5.245519638061523, + "step": 151 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 4.280153751373291, + "learning_rate": 3.95603849816509e-05, + "logits/chosen": 2.2534637451171875, + "logits/rejected": 2.347764015197754, + "logps/chosen": -311.7674560546875, + "logps/rejected": -343.17584228515625, + "loss": 0.2535, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9193252325057983, + "rewards/margins": 1.2428758144378662, + "rewards/rejected": -2.162200927734375, + "step": 152 + }, + { + "epoch": 0.68, + "grad_norm": 9.200690269470215, + "learning_rate": 3.941047390070419e-05, + "logits/chosen": 2.2525882720947266, + "logits/rejected": 2.196587324142456, + "logps/chosen": -419.79833984375, + "logps/rejected": -368.85174560546875, + "loss": 0.8223, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9374847412109375, + "rewards/margins": 0.6514175534248352, + "rewards/rejected": -2.588902235031128, + "step": 153 + }, + { + "epoch": 0.6844444444444444, + "grad_norm": 1.1576966047286987, + "learning_rate": 3.925978277360428e-05, + "logits/chosen": 2.2419962882995605, + "logits/rejected": 2.240370750427246, + "logps/chosen": -354.83514404296875, + "logps/rejected": -398.2762451171875, + "loss": 0.0467, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2429847717285156, + "rewards/margins": 3.4657950401306152, + "rewards/rejected": -4.708779811859131, + "step": 154 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 4.058935642242432, + "learning_rate": 3.910831975733717e-05, + "logits/chosen": 2.4752016067504883, + "logits/rejected": 2.4061837196350098, + "logps/chosen": -374.433349609375, + "logps/rejected": -460.99163818359375, + "loss": 0.1247, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6075196266174316, + "rewards/margins": 2.1309003829956055, + "rewards/rejected": -3.738420009613037, + "step": 155 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 10.329422950744629, + "learning_rate": 3.895609305067162e-05, + "logits/chosen": 2.1816141605377197, + "logits/rejected": 2.1901464462280273, + "logps/chosen": -354.7869873046875, + "logps/rejected": -334.8106384277344, + "loss": 1.1709, + "rewards/accuracies": 0.0, + "rewards/chosen": -1.6891334056854248, + "rewards/margins": -0.787255048751831, + "rewards/rejected": -0.9018783569335938, + "step": 156 + }, + { + "epoch": 0.6977777777777778, + "grad_norm": 2.0047101974487305, + "learning_rate": 3.8803110893715334e-05, + "logits/chosen": 2.076343536376953, + "logits/rejected": 2.1251060962677, + "logps/chosen": -226.72457885742188, + "logps/rejected": -348.7533874511719, + "loss": 0.1027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06249618902802467, + "rewards/margins": 5.404012680053711, + "rewards/rejected": -5.4665093421936035, + "step": 157 + }, + { + "epoch": 0.7022222222222222, + "grad_norm": 8.143394470214844, + "learning_rate": 3.864938156746891e-05, + "logits/chosen": 2.237619400024414, + "logits/rejected": 2.3040781021118164, + "logps/chosen": -429.0187683105469, + "logps/rejected": -341.12957763671875, + "loss": 0.6455, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7430390119552612, + "rewards/margins": 1.6202683448791504, + "rewards/rejected": -3.363307237625122, + "step": 158 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 2.048825979232788, + "learning_rate": 3.849491339337758e-05, + "logits/chosen": 2.246427297592163, + "logits/rejected": 2.1864523887634277, + "logps/chosen": -253.68792724609375, + "logps/rejected": -255.00852966308594, + "loss": 0.1813, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2091705799102783, + "rewards/margins": 1.7127196788787842, + "rewards/rejected": -2.9218902587890625, + "step": 159 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 9.93562126159668, + "learning_rate": 3.833971473288084e-05, + "logits/chosen": 2.260481357574463, + "logits/rejected": 2.2601776123046875, + "logps/chosen": -375.98193359375, + "logps/rejected": -420.81494140625, + "loss": 0.6181, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0803894996643066, + "rewards/margins": 0.15620207786560059, + "rewards/rejected": -3.2365915775299072, + "step": 160 + }, + { + "epoch": 0.7155555555555555, + "grad_norm": 3.1551766395568848, + "learning_rate": 3.818379398695969e-05, + "logits/chosen": 1.9815887212753296, + "logits/rejected": 1.9442949295043945, + "logps/chosen": -316.49468994140625, + "logps/rejected": -369.946044921875, + "loss": 0.132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.701190173625946, + "rewards/margins": 2.1365509033203125, + "rewards/rejected": -2.8377411365509033, + "step": 161 + }, + { + "epoch": 0.72, + "grad_norm": 13.20207405090332, + "learning_rate": 3.802715959568205e-05, + "logits/chosen": 2.195608377456665, + "logits/rejected": 2.118527889251709, + "logps/chosen": -396.55755615234375, + "logps/rejected": -423.4131164550781, + "loss": 0.8681, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.12845778465271, + "rewards/margins": 2.663003444671631, + "rewards/rejected": -4.791460990905762, + "step": 162 + }, + { + "epoch": 0.7244444444444444, + "grad_norm": 1.2311620712280273, + "learning_rate": 3.7869820037745776e-05, + "logits/chosen": 2.1139190196990967, + "logits/rejected": 2.142258644104004, + "logps/chosen": -275.6441955566406, + "logps/rejected": -338.9689636230469, + "loss": 0.1183, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2249420881271362, + "rewards/margins": 3.1651391983032227, + "rewards/rejected": -4.39008092880249, + "step": 163 + }, + { + "epoch": 0.7288888888888889, + "grad_norm": 18.346521377563477, + "learning_rate": 3.771178383001976e-05, + "logits/chosen": 2.330061435699463, + "logits/rejected": 2.222029685974121, + "logps/chosen": -468.13232421875, + "logps/rejected": -425.3597412109375, + "loss": 0.6944, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.2604997158050537, + "rewards/margins": 1.2873930931091309, + "rewards/rejected": -4.5478925704956055, + "step": 164 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 4.46051025390625, + "learning_rate": 3.7553059527082913e-05, + "logits/chosen": 2.2368054389953613, + "logits/rejected": 2.2174384593963623, + "logps/chosen": -287.6416931152344, + "logps/rejected": -241.71742248535156, + "loss": 0.4803, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1810874938964844, + "rewards/margins": 0.794731855392456, + "rewards/rejected": -1.9758193492889404, + "step": 165 + }, + { + "epoch": 0.7377777777777778, + "grad_norm": 5.725604057312012, + "learning_rate": 3.739365572076105e-05, + "logits/chosen": 2.309138536453247, + "logits/rejected": 2.297959327697754, + "logps/chosen": -313.10382080078125, + "logps/rejected": -440.99212646484375, + "loss": 0.3831, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8803879022598267, + "rewards/margins": 1.21652090549469, + "rewards/rejected": -3.0969088077545166, + "step": 166 + }, + { + "epoch": 0.7422222222222222, + "grad_norm": 13.70535945892334, + "learning_rate": 3.7233581039661874e-05, + "logits/chosen": 2.021416187286377, + "logits/rejected": 2.0485005378723145, + "logps/chosen": -308.900146484375, + "logps/rejected": -373.67474365234375, + "loss": 0.6832, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8083343505859375, + "rewards/margins": 1.9991533756256104, + "rewards/rejected": -2.807487726211548, + "step": 167 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 5.6754560470581055, + "learning_rate": 3.707284414870786e-05, + "logits/chosen": 2.3587806224823, + "logits/rejected": 2.424814224243164, + "logps/chosen": -379.08697509765625, + "logps/rejected": -437.82147216796875, + "loss": 0.2897, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.100062608718872, + "rewards/margins": 1.1164734363555908, + "rewards/rejected": -2.216536045074463, + "step": 168 + }, + { + "epoch": 0.7511111111111111, + "grad_norm": 13.687298774719238, + "learning_rate": 3.691145374866723e-05, + "logits/chosen": 2.0991220474243164, + "logits/rejected": 2.0921735763549805, + "logps/chosen": -251.90283203125, + "logps/rejected": -294.06097412109375, + "loss": 1.2515, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.1877260208129883, + "rewards/margins": -0.7983794212341309, + "rewards/rejected": -2.3893463611602783, + "step": 169 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 2.953883647918701, + "learning_rate": 3.6749418575683e-05, + "logits/chosen": 1.9750038385391235, + "logits/rejected": 1.9700895547866821, + "logps/chosen": -258.9554748535156, + "logps/rejected": -268.0426940917969, + "loss": 0.107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.537645697593689, + "rewards/margins": 2.264012336730957, + "rewards/rejected": -3.8016581535339355, + "step": 170 + }, + { + "epoch": 0.76, + "grad_norm": 2.0586588382720947, + "learning_rate": 3.658674740080004e-05, + "logits/chosen": 2.249845504760742, + "logits/rejected": 2.1530356407165527, + "logps/chosen": -373.0096740722656, + "logps/rejected": -384.4673767089844, + "loss": 0.147, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49631041288375854, + "rewards/margins": 3.1010565757751465, + "rewards/rejected": -2.6047463417053223, + "step": 171 + }, + { + "epoch": 0.7644444444444445, + "grad_norm": 9.671707153320312, + "learning_rate": 3.642344902949034e-05, + "logits/chosen": 2.1441969871520996, + "logits/rejected": 2.0829548835754395, + "logps/chosen": -375.480224609375, + "logps/rejected": -306.5086975097656, + "loss": 0.558, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.187718152999878, + "rewards/margins": 0.452168345451355, + "rewards/rejected": -2.6398866176605225, + "step": 172 + }, + { + "epoch": 0.7688888888888888, + "grad_norm": 0.8413982391357422, + "learning_rate": 3.6259532301176335e-05, + "logits/chosen": 1.6508468389511108, + "logits/rejected": 1.683258056640625, + "logps/chosen": -302.1081848144531, + "logps/rejected": -320.01141357421875, + "loss": 0.1217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08628615736961365, + "rewards/margins": 3.430840492248535, + "rewards/rejected": -3.5171265602111816, + "step": 173 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 11.386160850524902, + "learning_rate": 3.6095006088752447e-05, + "logits/chosen": 2.2143354415893555, + "logits/rejected": 2.283841848373413, + "logps/chosen": -428.9851379394531, + "logps/rejected": -506.7236328125, + "loss": 0.4424, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3689346313476562, + "rewards/margins": 0.6975066661834717, + "rewards/rejected": -4.066441535949707, + "step": 174 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 2.112983226776123, + "learning_rate": 3.592987929810476e-05, + "logits/chosen": 1.9571995735168457, + "logits/rejected": 1.922455072402954, + "logps/chosen": -289.2620849609375, + "logps/rejected": -418.0646057128906, + "loss": 0.1977, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3730583190917969, + "rewards/margins": 4.838759422302246, + "rewards/rejected": -5.211817741394043, + "step": 175 + }, + { + "epoch": 0.7822222222222223, + "grad_norm": 2.702089786529541, + "learning_rate": 3.576416086762896e-05, + "logits/chosen": 1.9095101356506348, + "logits/rejected": 1.9017338752746582, + "logps/chosen": -258.8391418457031, + "logps/rejected": -258.28948974609375, + "loss": 0.1737, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08884277939796448, + "rewards/margins": 2.2755606174468994, + "rewards/rejected": -2.364403486251831, + "step": 176 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 3.3134331703186035, + "learning_rate": 3.5597859767746524e-05, + "logits/chosen": 2.0989699363708496, + "logits/rejected": 2.059138774871826, + "logps/chosen": -253.71551513671875, + "logps/rejected": -245.14794921875, + "loss": 0.2049, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4205713272094727, + "rewards/margins": 1.4892207384109497, + "rewards/rejected": -3.909791946411133, + "step": 177 + }, + { + "epoch": 0.7911111111111111, + "grad_norm": 0.9361621141433716, + "learning_rate": 3.543098500041906e-05, + "logits/chosen": 2.1984782218933105, + "logits/rejected": 2.0851023197174072, + "logps/chosen": -280.2938537597656, + "logps/rejected": -311.8728332519531, + "loss": 0.0635, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.518353283405304, + "rewards/margins": 2.7252793312072754, + "rewards/rejected": -2.206925868988037, + "step": 178 + }, + { + "epoch": 0.7955555555555556, + "grad_norm": 35.653995513916016, + "learning_rate": 3.526354559866113e-05, + "logits/chosen": 2.0546839237213135, + "logits/rejected": 2.004641532897949, + "logps/chosen": -270.5155334472656, + "logps/rejected": -285.1980285644531, + "loss": 3.7422, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.489393711090088, + "rewards/margins": -3.35947322845459, + "rewards/rejected": -0.1299205720424652, + "step": 179 + }, + { + "epoch": 0.8, + "grad_norm": 21.546016693115234, + "learning_rate": 3.509555062605121e-05, + "logits/chosen": 2.0889358520507812, + "logits/rejected": 2.1452298164367676, + "logps/chosen": -404.2633056640625, + "logps/rejected": -504.6707458496094, + "loss": 0.8305, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.7308120727539062, + "rewards/margins": 0.4282197952270508, + "rewards/rejected": -4.159031867980957, + "step": 180 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 2.1556396484375, + "eval_logits/rejected": 2.1094348430633545, + "eval_logps/chosen": -310.0359191894531, + "eval_logps/rejected": -357.9092712402344, + "eval_loss": 0.4349474012851715, + "eval_rewards/accuracies": 0.7857142686843872, + "eval_rewards/chosen": -1.9743303060531616, + "eval_rewards/margins": 2.2349183559417725, + "eval_rewards/rejected": -4.2092485427856445, + "eval_runtime": 17.3856, + "eval_samples_per_second": 2.876, + "eval_steps_per_second": 0.403, + "step": 180 + }, + { + "epoch": 0.8044444444444444, + "grad_norm": 1.8578166961669922, + "learning_rate": 3.492700917624113e-05, + "logits/chosen": 1.87994384765625, + "logits/rejected": 1.8280099630355835, + "logps/chosen": -235.28530883789062, + "logps/rejected": -234.61431884765625, + "loss": 0.1254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8087150454521179, + "rewards/margins": 2.258847236633301, + "rewards/rejected": -3.0675621032714844, + "step": 181 + }, + { + "epoch": 0.8088888888888889, + "grad_norm": 1.5471972227096558, + "learning_rate": 3.4757930372463775e-05, + "logits/chosen": 1.9977684020996094, + "logits/rejected": 1.8153365850448608, + "logps/chosen": -311.0814208984375, + "logps/rejected": -285.3901672363281, + "loss": 0.0704, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35354921221733093, + "rewards/margins": 2.717729091644287, + "rewards/rejected": -2.364180088043213, + "step": 182 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 32.15199279785156, + "learning_rate": 3.458832336703929e-05, + "logits/chosen": 1.8522934913635254, + "logits/rejected": 1.7468159198760986, + "logps/chosen": -580.458740234375, + "logps/rejected": -361.63299560546875, + "loss": 4.9968, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.116351127624512, + "rewards/margins": -4.255195617675781, + "rewards/rejected": -5.8611555099487305, + "step": 183 + }, + { + "epoch": 0.8177777777777778, + "grad_norm": 21.2011661529541, + "learning_rate": 3.4418197340879635e-05, + "logits/chosen": 2.1630630493164062, + "logits/rejected": 2.1617729663848877, + "logps/chosen": -496.76910400390625, + "logps/rejected": -401.410888671875, + "loss": 2.7916, + "rewards/accuracies": 0.0, + "rewards/chosen": -6.106187343597412, + "rewards/margins": -2.5850629806518555, + "rewards/rejected": -3.5211243629455566, + "step": 184 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 3.49141263961792, + "learning_rate": 3.4247561502991604e-05, + "logits/chosen": 2.1068267822265625, + "logits/rejected": 2.1182143688201904, + "logps/chosen": -385.7279052734375, + "logps/rejected": -520.2858276367188, + "loss": 0.2494, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5602035522460938, + "rewards/margins": 3.892549991607666, + "rewards/rejected": -7.45275354385376, + "step": 185 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 10.206978797912598, + "learning_rate": 3.407642508997838e-05, + "logits/chosen": 1.9911150932312012, + "logits/rejected": 1.9793498516082764, + "logps/chosen": -355.60675048828125, + "logps/rejected": -286.39141845703125, + "loss": 0.5621, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6045563220977783, + "rewards/margins": 0.29973304271698, + "rewards/rejected": -3.9042892456054688, + "step": 186 + }, + { + "epoch": 0.8311111111111111, + "grad_norm": 3.9811224937438965, + "learning_rate": 3.3904797365539514e-05, + "logits/chosen": 1.9419519901275635, + "logits/rejected": 1.943634033203125, + "logps/chosen": -345.35076904296875, + "logps/rejected": -388.5447998046875, + "loss": 0.3847, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5699470639228821, + "rewards/margins": 2.5326507091522217, + "rewards/rejected": -1.9627037048339844, + "step": 187 + }, + { + "epoch": 0.8355555555555556, + "grad_norm": 7.058280944824219, + "learning_rate": 3.37326876199695e-05, + "logits/chosen": 2.377598524093628, + "logits/rejected": 2.3797459602355957, + "logps/chosen": -341.9767150878906, + "logps/rejected": -513.3931884765625, + "loss": 0.197, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.686824083328247, + "rewards/margins": 2.547370672225952, + "rewards/rejected": -4.234194755554199, + "step": 188 + }, + { + "epoch": 0.84, + "grad_norm": 1.5856622457504272, + "learning_rate": 3.356010516965486e-05, + "logits/chosen": 1.8028912544250488, + "logits/rejected": 1.8407230377197266, + "logps/chosen": -206.43429565429688, + "logps/rejected": -305.61212158203125, + "loss": 0.1631, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5213749408721924, + "rewards/margins": 4.116602420806885, + "rewards/rejected": -3.5952274799346924, + "step": 189 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 9.480084419250488, + "learning_rate": 3.3387059356569875e-05, + "logits/chosen": 2.0444135665893555, + "logits/rejected": 2.0769548416137695, + "logps/chosen": -260.6351318359375, + "logps/rejected": -255.31646728515625, + "loss": 1.054, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5492042303085327, + "rewards/margins": -0.4221389889717102, + "rewards/rejected": 0.9713432192802429, + "step": 190 + }, + { + "epoch": 0.8488888888888889, + "grad_norm": 5.375828742980957, + "learning_rate": 3.321355954777087e-05, + "logits/chosen": 2.0929622650146484, + "logits/rejected": 2.0118932723999023, + "logps/chosen": -271.47088623046875, + "logps/rejected": -324.151123046875, + "loss": 0.3697, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7393569946289062, + "rewards/margins": 1.4251999855041504, + "rewards/rejected": -2.1645569801330566, + "step": 191 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 11.164929389953613, + "learning_rate": 3.3039615134889206e-05, + "logits/chosen": 2.168374538421631, + "logits/rejected": 2.050518035888672, + "logps/chosen": -425.24169921875, + "logps/rejected": -436.30914306640625, + "loss": 0.9201, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.633542060852051, + "rewards/margins": -0.2216278314590454, + "rewards/rejected": -2.411914110183716, + "step": 192 + }, + { + "epoch": 0.8577777777777778, + "grad_norm": 12.873815536499023, + "learning_rate": 3.286523553362287e-05, + "logits/chosen": 2.097388505935669, + "logits/rejected": 2.0157864093780518, + "logps/chosen": -274.7909851074219, + "logps/rejected": -251.3322296142578, + "loss": 1.2622, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8827362060546875, + "rewards/margins": 0.5675584077835083, + "rewards/rejected": -2.4502944946289062, + "step": 193 + }, + { + "epoch": 0.8622222222222222, + "grad_norm": 8.37449836730957, + "learning_rate": 3.269043018322681e-05, + "logits/chosen": 2.1862282752990723, + "logits/rejected": 2.087134838104248, + "logps/chosen": -295.71875, + "logps/rejected": -301.2806396484375, + "loss": 0.4823, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7563506960868835, + "rewards/margins": 2.514920234680176, + "rewards/rejected": -3.271270751953125, + "step": 194 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 6.452545642852783, + "learning_rate": 3.2515208546002e-05, + "logits/chosen": 1.9793057441711426, + "logits/rejected": 2.004866600036621, + "logps/chosen": -242.0079803466797, + "logps/rejected": -277.5906677246094, + "loss": 0.4756, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.457282304763794, + "rewards/margins": 1.3829689025878906, + "rewards/rejected": -2.8402512073516846, + "step": 195 + }, + { + "epoch": 0.8711111111111111, + "grad_norm": 25.117042541503906, + "learning_rate": 3.233958010678322e-05, + "logits/chosen": 2.0711257457733154, + "logits/rejected": 2.1235086917877197, + "logps/chosen": -509.37188720703125, + "logps/rejected": -579.4296875, + "loss": 0.7721, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.672909736633301, + "rewards/margins": 1.30540931224823, + "rewards/rejected": -5.97831916809082, + "step": 196 + }, + { + "epoch": 0.8755555555555555, + "grad_norm": 2.4943253993988037, + "learning_rate": 3.216355437242564e-05, + "logits/chosen": 2.0617835521698, + "logits/rejected": 2.025505781173706, + "logps/chosen": -224.05841064453125, + "logps/rejected": -297.93768310546875, + "loss": 0.1517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27078327536582947, + "rewards/margins": 1.8932006359100342, + "rewards/rejected": -2.1639838218688965, + "step": 197 + }, + { + "epoch": 0.88, + "grad_norm": 1.4619039297103882, + "learning_rate": 3.1987140871290236e-05, + "logits/chosen": 1.9786646366119385, + "logits/rejected": 1.9430062770843506, + "logps/chosen": -194.2051544189453, + "logps/rejected": -184.22048950195312, + "loss": 0.1253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5335159301757812, + "rewards/margins": 2.0488877296447754, + "rewards/rejected": -1.5153717994689941, + "step": 198 + }, + { + "epoch": 0.8844444444444445, + "grad_norm": 7.211589336395264, + "learning_rate": 3.181034915272797e-05, + "logits/chosen": 2.10782527923584, + "logits/rejected": 2.206753730773926, + "logps/chosen": -336.33038330078125, + "logps/rejected": -462.14654541015625, + "loss": 0.8607, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9421745538711548, + "rewards/margins": 3.117570161819458, + "rewards/rejected": -4.059744358062744, + "step": 199 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.614577054977417, + "learning_rate": 3.1633188786562914e-05, + "logits/chosen": 1.9760353565216064, + "logits/rejected": 1.902787685394287, + "logps/chosen": -248.97714233398438, + "logps/rejected": -256.932861328125, + "loss": 0.282, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1982636451721191, + "rewards/margins": 2.316314697265625, + "rewards/rejected": -1.1180511713027954, + "step": 200 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 10.176268577575684, + "learning_rate": 3.1455669362574214e-05, + "logits/chosen": 1.9834389686584473, + "logits/rejected": 1.794731855392456, + "logps/chosen": -350.74102783203125, + "logps/rejected": -293.0185241699219, + "loss": 0.6404, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7628203630447388, + "rewards/margins": 2.335862159729004, + "rewards/rejected": -4.098682403564453, + "step": 201 + }, + { + "epoch": 0.8977777777777778, + "grad_norm": 1.9432324171066284, + "learning_rate": 3.1277800489977e-05, + "logits/chosen": 1.688536286354065, + "logits/rejected": 1.688530445098877, + "logps/chosen": -238.6612548828125, + "logps/rejected": -331.52606201171875, + "loss": 0.1152, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12506788969039917, + "rewards/margins": 3.6676583290100098, + "rewards/rejected": -3.542590618133545, + "step": 202 + }, + { + "epoch": 0.9022222222222223, + "grad_norm": 1.8283239603042603, + "learning_rate": 3.1099591796902215e-05, + "logits/chosen": 2.159648895263672, + "logits/rejected": 2.12233567237854, + "logps/chosen": -421.9817199707031, + "logps/rejected": -420.61663818359375, + "loss": 0.0986, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.661656141281128, + "rewards/margins": 2.6688404083251953, + "rewards/rejected": -6.330496311187744, + "step": 203 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 17.021015167236328, + "learning_rate": 3.092105292987548e-05, + "logits/chosen": 1.7743968963623047, + "logits/rejected": 1.7986412048339844, + "logps/chosen": -164.3032989501953, + "logps/rejected": -203.1534881591797, + "loss": 1.8414, + "rewards/accuracies": 0.0, + "rewards/chosen": -3.1501376628875732, + "rewards/margins": -1.529021143913269, + "rewards/rejected": -1.6211166381835938, + "step": 204 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 12.92226791381836, + "learning_rate": 3.07421935532949e-05, + "logits/chosen": 1.701080083847046, + "logits/rejected": 1.747081995010376, + "logps/chosen": -134.5420379638672, + "logps/rejected": -180.4077911376953, + "loss": 0.7422, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9285032749176025, + "rewards/margins": 0.21163922548294067, + "rewards/rejected": -2.1401424407958984, + "step": 205 + }, + { + "epoch": 0.9155555555555556, + "grad_norm": 12.34039306640625, + "learning_rate": 3.056302334890786e-05, + "logits/chosen": 2.1032023429870605, + "logits/rejected": 2.1539621353149414, + "logps/chosen": -291.208984375, + "logps/rejected": -335.7510986328125, + "loss": 0.5075, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06739044934511185, + "rewards/margins": 1.4255584478378296, + "rewards/rejected": -1.3581680059432983, + "step": 206 + }, + { + "epoch": 0.92, + "grad_norm": 0.44443604350090027, + "learning_rate": 3.03835520152871e-05, + "logits/chosen": 2.3766026496887207, + "logits/rejected": 2.2064499855041504, + "logps/chosen": -357.09075927734375, + "logps/rejected": -521.0325927734375, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4761963188648224, + "rewards/margins": 5.920969009399414, + "rewards/rejected": -6.397165298461914, + "step": 207 + }, + { + "epoch": 0.9244444444444444, + "grad_norm": 1.3140870332717896, + "learning_rate": 3.0203789267305567e-05, + "logits/chosen": 2.188861131668091, + "logits/rejected": 2.1991310119628906, + "logps/chosen": -318.3990783691406, + "logps/rejected": -407.42578125, + "loss": 0.179, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.08726167678833, + "rewards/margins": 3.1875786781311035, + "rewards/rejected": -4.274840354919434, + "step": 208 + }, + { + "epoch": 0.9288888888888889, + "grad_norm": 12.631479263305664, + "learning_rate": 3.002374483561064e-05, + "logits/chosen": 2.1044745445251465, + "logits/rejected": 2.079122543334961, + "logps/chosen": -422.732421875, + "logps/rejected": -583.081298828125, + "loss": 0.3081, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.848828077316284, + "rewards/margins": 4.261569499969482, + "rewards/rejected": -7.1103973388671875, + "step": 209 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 17.813030242919922, + "learning_rate": 2.9843428466097385e-05, + "logits/chosen": 2.1924643516540527, + "logits/rejected": 2.1515285968780518, + "logps/chosen": -404.7247314453125, + "logps/rejected": -385.24407958984375, + "loss": 1.7449, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.990626335144043, + "rewards/margins": -0.6515921354293823, + "rewards/rejected": -3.33903431892395, + "step": 210 + }, + { + "epoch": 0.9377777777777778, + "grad_norm": 3.6667134761810303, + "learning_rate": 2.9662849919380976e-05, + "logits/chosen": 1.874267339706421, + "logits/rejected": 1.896430492401123, + "logps/chosen": -279.3658447265625, + "logps/rejected": -271.2750244140625, + "loss": 0.4278, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9690204858779907, + "rewards/margins": 2.050572633743286, + "rewards/rejected": -3.0195930004119873, + "step": 211 + }, + { + "epoch": 0.9422222222222222, + "grad_norm": 21.304107666015625, + "learning_rate": 2.9482018970268393e-05, + "logits/chosen": 2.03654408454895, + "logits/rejected": 2.1858012676239014, + "logps/chosen": -307.8687744140625, + "logps/rejected": -392.89447021484375, + "loss": 1.358, + "rewards/accuracies": 0.0, + "rewards/chosen": -3.445587158203125, + "rewards/margins": -1.0587692260742188, + "rewards/rejected": -2.3868179321289062, + "step": 212 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 2.9452733993530273, + "learning_rate": 2.930094540722927e-05, + "logits/chosen": 2.1484179496765137, + "logits/rejected": 2.2047171592712402, + "logps/chosen": -240.11737060546875, + "logps/rejected": -360.74237060546875, + "loss": 0.1765, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6234557628631592, + "rewards/margins": 1.6575775146484375, + "rewards/rejected": -3.2810332775115967, + "step": 213 + }, + { + "epoch": 0.9511111111111111, + "grad_norm": 22.092178344726562, + "learning_rate": 2.911963903186606e-05, + "logits/chosen": 2.0055017471313477, + "logits/rejected": 1.9135019779205322, + "logps/chosen": -241.86856079101562, + "logps/rejected": -239.4849395751953, + "loss": 1.1702, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.504570722579956, + "rewards/margins": -0.4858473837375641, + "rewards/rejected": -1.0187233686447144, + "step": 214 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.4408101439476013, + "learning_rate": 2.8938109658383454e-05, + "logits/chosen": 2.263948440551758, + "logits/rejected": 2.16243314743042, + "logps/chosen": -397.47772216796875, + "logps/rejected": -574.7552490234375, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.030035376548767, + "rewards/margins": 8.122578620910645, + "rewards/rejected": -9.152613639831543, + "step": 215 + }, + { + "epoch": 0.96, + "grad_norm": 0.1885669231414795, + "learning_rate": 2.8756367113057148e-05, + "logits/chosen": 2.174750566482544, + "logits/rejected": 2.1712284088134766, + "logps/chosen": -379.27142333984375, + "logps/rejected": -556.3659057617188, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10250397026538849, + "rewards/margins": 6.272984504699707, + "rewards/rejected": -6.170480728149414, + "step": 216 + }, + { + "epoch": 0.9644444444444444, + "grad_norm": 2.534093141555786, + "learning_rate": 2.857442123370195e-05, + "logits/chosen": 1.986964225769043, + "logits/rejected": 1.985724925994873, + "logps/chosen": -314.1033020019531, + "logps/rejected": -273.2991943359375, + "loss": 0.3304, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2605462074279785, + "rewards/margins": 1.8220291137695312, + "rewards/rejected": -3.0825753211975098, + "step": 217 + }, + { + "epoch": 0.9688888888888889, + "grad_norm": 9.718185424804688, + "learning_rate": 2.8392281869139213e-05, + "logits/chosen": 2.056429386138916, + "logits/rejected": 2.060886859893799, + "logps/chosen": -310.82916259765625, + "logps/rejected": -361.8159484863281, + "loss": 0.8212, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2504210472106934, + "rewards/margins": 2.31430721282959, + "rewards/rejected": -4.564728736877441, + "step": 218 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 13.960983276367188, + "learning_rate": 2.8209958878663778e-05, + "logits/chosen": 2.2462310791015625, + "logits/rejected": 2.28263521194458, + "logps/chosen": -465.261474609375, + "logps/rejected": -421.1733093261719, + "loss": 0.728, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.611042022705078, + "rewards/margins": -0.04042929410934448, + "rewards/rejected": -5.570612907409668, + "step": 219 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 9.203478813171387, + "learning_rate": 2.8027462131510208e-05, + "logits/chosen": 1.9416842460632324, + "logits/rejected": 1.7857491970062256, + "logps/chosen": -345.6712646484375, + "logps/rejected": -251.47076416015625, + "loss": 0.8191, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.3168740272521973, + "rewards/margins": -0.017238736152648926, + "rewards/rejected": -3.299635410308838, + "step": 220 + }, + { + "epoch": 0.9822222222222222, + "grad_norm": 1.7355222702026367, + "learning_rate": 2.7844801506318617e-05, + "logits/chosen": 2.240471363067627, + "logits/rejected": 2.2063498497009277, + "logps/chosen": -328.5857238769531, + "logps/rejected": -420.4921875, + "loss": 0.0914, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8121414184570312, + "rewards/margins": 2.7087008953094482, + "rewards/rejected": -4.5208420753479, + "step": 221 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 1.7263237237930298, + "learning_rate": 2.7661986890599943e-05, + "logits/chosen": 1.7395219802856445, + "logits/rejected": 1.784384846687317, + "logps/chosen": -214.15451049804688, + "logps/rejected": -284.48638916015625, + "loss": 0.3769, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4917289614677429, + "rewards/margins": 2.53422212600708, + "rewards/rejected": -3.0259511470794678, + "step": 222 + }, + { + "epoch": 0.9911111111111112, + "grad_norm": 2.090808629989624, + "learning_rate": 2.747902818020067e-05, + "logits/chosen": 2.032662868499756, + "logits/rejected": 1.8991634845733643, + "logps/chosen": -398.57904052734375, + "logps/rejected": -394.4957275390625, + "loss": 0.0918, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1145386695861816, + "rewards/margins": 2.4051780700683594, + "rewards/rejected": -5.519716739654541, + "step": 223 + }, + { + "epoch": 0.9955555555555555, + "grad_norm": 0.48346611857414246, + "learning_rate": 2.7295935278767233e-05, + "logits/chosen": 2.2755024433135986, + "logits/rejected": 2.3426759243011475, + "logps/chosen": -392.48272705078125, + "logps/rejected": -447.3723449707031, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.353735327720642, + "rewards/margins": 4.4421586990356445, + "rewards/rejected": -5.795893669128418, + "step": 224 + }, + { + "epoch": 1.0, + "grad_norm": 19.339488983154297, + "learning_rate": 2.711271809720986e-05, + "logits/chosen": 2.270242214202881, + "logits/rejected": 2.0706443786621094, + "logps/chosen": -485.1645202636719, + "logps/rejected": -381.00018310546875, + "loss": 1.2152, + "rewards/accuracies": 0.0, + "rewards/chosen": -3.213038682937622, + "rewards/margins": -0.8432999849319458, + "rewards/rejected": -2.369738817214966, + "step": 225 + }, + { + "epoch": 1.0, + "eval_logits/chosen": 2.1288914680480957, + "eval_logits/rejected": 2.083587408065796, + "eval_logps/chosen": -313.3813781738281, + "eval_logps/rejected": -365.7991027832031, + "eval_loss": 0.4423667788505554, + "eval_rewards/accuracies": 0.8214285969734192, + "eval_rewards/chosen": -2.3088743686676025, + "eval_rewards/margins": 2.689358949661255, + "eval_rewards/rejected": -4.998233318328857, + "eval_runtime": 17.388, + "eval_samples_per_second": 2.876, + "eval_steps_per_second": 0.403, + "step": 225 + }, + { + "epoch": 1.0044444444444445, + "grad_norm": 1.6202038526535034, + "learning_rate": 2.6929386553166164e-05, + "logits/chosen": 2.034777879714966, + "logits/rejected": 1.9399088621139526, + "logps/chosen": -270.7170104980469, + "logps/rejected": -321.8533020019531, + "loss": 0.0773, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10045319050550461, + "rewards/margins": 4.125720977783203, + "rewards/rejected": -4.226174354553223, + "step": 226 + }, + { + "epoch": 1.008888888888889, + "grad_norm": 0.899365246295929, + "learning_rate": 2.6745950570464212e-05, + "logits/chosen": 1.856791377067566, + "logits/rejected": 1.838975429534912, + "logps/chosen": -187.313720703125, + "logps/rejected": -206.15573120117188, + "loss": 0.1061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18259316682815552, + "rewards/margins": 2.7951650619506836, + "rewards/rejected": -2.9777581691741943, + "step": 227 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 24.894893646240234, + "learning_rate": 2.6562420078585433e-05, + "logits/chosen": 2.2783782482147217, + "logits/rejected": 2.3907415866851807, + "logps/chosen": -562.4542846679688, + "logps/rejected": -450.45098876953125, + "loss": 2.5241, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.394607543945312, + "rewards/margins": -0.29980039596557617, + "rewards/rejected": -8.094807624816895, + "step": 228 + }, + { + "epoch": 1.0177777777777777, + "grad_norm": 10.114426612854004, + "learning_rate": 2.637880501212705e-05, + "logits/chosen": 2.3603391647338867, + "logits/rejected": 2.353978157043457, + "logps/chosen": -342.9703674316406, + "logps/rejected": -352.3290710449219, + "loss": 0.623, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0490471124649048, + "rewards/margins": 2.3047902584075928, + "rewards/rejected": -3.353837490081787, + "step": 229 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 0.2897517681121826, + "learning_rate": 2.619511531026436e-05, + "logits/chosen": 2.2957711219787598, + "logits/rejected": 2.2823054790496826, + "logps/chosen": -380.21844482421875, + "logps/rejected": -504.15045166015625, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6248611211776733, + "rewards/margins": 6.897383213043213, + "rewards/rejected": -7.522244453430176, + "step": 230 + }, + { + "epoch": 1.0266666666666666, + "grad_norm": 1.0437899827957153, + "learning_rate": 2.6011360916212734e-05, + "logits/chosen": 2.2502756118774414, + "logits/rejected": 2.169267177581787, + "logps/chosen": -248.31495666503906, + "logps/rejected": -254.9483642578125, + "loss": 0.0581, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.568634033203125, + "rewards/margins": 2.8449950218200684, + "rewards/rejected": -2.2763609886169434, + "step": 231 + }, + { + "epoch": 1.031111111111111, + "grad_norm": 2.103546142578125, + "learning_rate": 2.5827551776689323e-05, + "logits/chosen": 1.5411741733551025, + "logits/rejected": 1.4888989925384521, + "logps/chosen": -151.44955444335938, + "logps/rejected": -160.1112518310547, + "loss": 0.14, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6025131344795227, + "rewards/margins": 2.4829578399658203, + "rewards/rejected": -3.0854709148406982, + "step": 232 + }, + { + "epoch": 1.0355555555555556, + "grad_norm": 2.739473581314087, + "learning_rate": 2.564369784137472e-05, + "logits/chosen": 1.7263026237487793, + "logits/rejected": 1.783945083618164, + "logps/chosen": -234.08197021484375, + "logps/rejected": -273.28668212890625, + "loss": 0.2093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8093013763427734, + "rewards/margins": 2.4584481716156006, + "rewards/rejected": -3.267749547958374, + "step": 233 + }, + { + "epoch": 1.04, + "grad_norm": 3.7832846641540527, + "learning_rate": 2.54598090623743e-05, + "logits/chosen": 1.8052504062652588, + "logits/rejected": 1.766016960144043, + "logps/chosen": -296.1597900390625, + "logps/rejected": -256.9058532714844, + "loss": 0.3296, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5597388744354248, + "rewards/margins": 0.9551147222518921, + "rewards/rejected": -2.5148537158966064, + "step": 234 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 2.446428060531616, + "learning_rate": 2.527589539367956e-05, + "logits/chosen": 2.325028896331787, + "logits/rejected": 2.232288360595703, + "logps/chosen": -376.904052734375, + "logps/rejected": -400.5955505371094, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.687849521636963, + "rewards/margins": 3.5299012660980225, + "rewards/rejected": -7.217750549316406, + "step": 235 + }, + { + "epoch": 1.048888888888889, + "grad_norm": 0.11488201469182968, + "learning_rate": 2.50919667906293e-05, + "logits/chosen": 1.7712184190750122, + "logits/rejected": 1.815232515335083, + "logps/chosen": -246.863037109375, + "logps/rejected": -351.25238037109375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2323249876499176, + "rewards/margins": 5.702582359313965, + "rewards/rejected": -5.93490743637085, + "step": 236 + }, + { + "epoch": 1.0533333333333332, + "grad_norm": 0.46934670209884644, + "learning_rate": 2.4908033209370705e-05, + "logits/chosen": 2.1205997467041016, + "logits/rejected": 2.0082976818084717, + "logps/chosen": -443.21771240234375, + "logps/rejected": -437.1765441894531, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.834454357624054, + "rewards/margins": 6.396831512451172, + "rewards/rejected": -7.23128604888916, + "step": 237 + }, + { + "epoch": 1.0577777777777777, + "grad_norm": 0.6756075620651245, + "learning_rate": 2.4724104606320445e-05, + "logits/chosen": 2.1717934608459473, + "logits/rejected": 2.1605606079101562, + "logps/chosen": -349.8464050292969, + "logps/rejected": -456.8037109375, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42097780108451843, + "rewards/margins": 6.1533660888671875, + "rewards/rejected": -6.574343681335449, + "step": 238 + }, + { + "epoch": 1.0622222222222222, + "grad_norm": 0.5172697901725769, + "learning_rate": 2.4540190937625708e-05, + "logits/chosen": 2.2337419986724854, + "logits/rejected": 2.2302417755126953, + "logps/chosen": -269.57037353515625, + "logps/rejected": -465.9024658203125, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4147706031799316, + "rewards/margins": 4.075361728668213, + "rewards/rejected": -5.4901323318481445, + "step": 239 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.2905597984790802, + "learning_rate": 2.4356302158625288e-05, + "logits/chosen": 2.1833200454711914, + "logits/rejected": 2.207943916320801, + "logps/chosen": -326.02178955078125, + "logps/rejected": -411.3348388671875, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16143493354320526, + "rewards/margins": 5.066305637359619, + "rewards/rejected": -4.904870510101318, + "step": 240 + }, + { + "epoch": 1.0711111111111111, + "grad_norm": 0.2205185443162918, + "learning_rate": 2.4172448223310682e-05, + "logits/chosen": 1.7023603916168213, + "logits/rejected": 1.5953627824783325, + "logps/chosen": -170.35665893554688, + "logps/rejected": -241.34259033203125, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1372658014297485, + "rewards/margins": 4.4668803215026855, + "rewards/rejected": -5.6041460037231445, + "step": 241 + }, + { + "epoch": 1.0755555555555556, + "grad_norm": 0.5335175395011902, + "learning_rate": 2.3988639083787272e-05, + "logits/chosen": 1.9782161712646484, + "logits/rejected": 1.9465917348861694, + "logps/chosen": -299.92816162109375, + "logps/rejected": -304.3218994140625, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17397230863571167, + "rewards/margins": 3.7744734287261963, + "rewards/rejected": -3.600501298904419, + "step": 242 + }, + { + "epoch": 1.08, + "grad_norm": 9.141175270080566, + "learning_rate": 2.3804884689735642e-05, + "logits/chosen": 2.1661500930786133, + "logits/rejected": 2.1720974445343018, + "logps/chosen": -267.519775390625, + "logps/rejected": -319.2179260253906, + "loss": 0.4535, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1500840187072754, + "rewards/margins": 3.123685598373413, + "rewards/rejected": -4.273769378662109, + "step": 243 + }, + { + "epoch": 1.0844444444444445, + "grad_norm": 0.07122190296649933, + "learning_rate": 2.3621194987872955e-05, + "logits/chosen": 2.0959739685058594, + "logits/rejected": 1.9889600276947021, + "logps/chosen": -352.41363525390625, + "logps/rejected": -451.14453125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7250168323516846, + "rewards/margins": 7.163854598999023, + "rewards/rejected": -6.438838005065918, + "step": 244 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 6.129761695861816, + "learning_rate": 2.3437579921414573e-05, + "logits/chosen": 2.127330780029297, + "logits/rejected": 1.9886295795440674, + "logps/chosen": -403.7293701171875, + "logps/rejected": -486.6906433105469, + "loss": 0.2336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07745209336280823, + "rewards/margins": 5.993661403656006, + "rewards/rejected": -6.071113586425781, + "step": 245 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 32.1196174621582, + "learning_rate": 2.325404942953579e-05, + "logits/chosen": 1.87540864944458, + "logits/rejected": 1.9732717275619507, + "logps/chosen": -407.6719665527344, + "logps/rejected": -368.49078369140625, + "loss": 5.0416, + "rewards/accuracies": 0.0, + "rewards/chosen": -7.963120937347412, + "rewards/margins": -4.95430850982666, + "rewards/rejected": -3.00881290435791, + "step": 246 + }, + { + "epoch": 1.0977777777777777, + "grad_norm": 0.005643940530717373, + "learning_rate": 2.3070613446833842e-05, + "logits/chosen": 2.0901176929473877, + "logits/rejected": 2.1729612350463867, + "logps/chosen": -399.19500732421875, + "logps/rejected": -582.9959716796875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.139540195465088, + "rewards/margins": 9.500133514404297, + "rewards/rejected": -11.639673233032227, + "step": 247 + }, + { + "epoch": 1.1022222222222222, + "grad_norm": 13.798872947692871, + "learning_rate": 2.288728190279014e-05, + "logits/chosen": 2.2860894203186035, + "logits/rejected": 2.2638816833496094, + "logps/chosen": -482.7290954589844, + "logps/rejected": -440.7649230957031, + "loss": 1.9496, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.7536072731018066, + "rewards/margins": 3.122256278991699, + "rewards/rejected": -5.875863552093506, + "step": 248 + }, + { + "epoch": 1.1066666666666667, + "grad_norm": 0.43348434567451477, + "learning_rate": 2.270406472123277e-05, + "logits/chosen": 2.0960116386413574, + "logits/rejected": 2.1119589805603027, + "logps/chosen": -236.1478271484375, + "logps/rejected": -304.01947021484375, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2858787775039673, + "rewards/margins": 3.6734910011291504, + "rewards/rejected": -3.959369659423828, + "step": 249 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 3.9212396144866943, + "learning_rate": 2.2520971819799328e-05, + "logits/chosen": 2.0135841369628906, + "logits/rejected": 1.9526537656784058, + "logps/chosen": -181.21868896484375, + "logps/rejected": -253.9673614501953, + "loss": 0.2664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40475767850875854, + "rewards/margins": 1.227912187576294, + "rewards/rejected": -1.6326699256896973, + "step": 250 + }, + { + "epoch": 1.1155555555555556, + "grad_norm": 1.6736501455307007, + "learning_rate": 2.2338013109400056e-05, + "logits/chosen": 2.3246517181396484, + "logits/rejected": 2.323885679244995, + "logps/chosen": -365.00140380859375, + "logps/rejected": -421.6229553222656, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9462188482284546, + "rewards/margins": 4.564491271972656, + "rewards/rejected": -3.618272542953491, + "step": 251 + }, + { + "epoch": 1.12, + "grad_norm": 20.39493179321289, + "learning_rate": 2.215519849368138e-05, + "logits/chosen": 2.3385281562805176, + "logits/rejected": 2.2685837745666504, + "logps/chosen": -561.4131469726562, + "logps/rejected": -440.8316955566406, + "loss": 0.8448, + "rewards/accuracies": 0.5, + "rewards/chosen": -7.6562395095825195, + "rewards/margins": -0.15807795524597168, + "rewards/rejected": -7.498161315917969, + "step": 252 + }, + { + "epoch": 1.1244444444444444, + "grad_norm": 0.006911400239914656, + "learning_rate": 2.1972537868489797e-05, + "logits/chosen": 2.0427744388580322, + "logits/rejected": 2.068326473236084, + "logps/chosen": -424.22393798828125, + "logps/rejected": -637.6040649414062, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.107600450515747, + "rewards/margins": 11.686300277709961, + "rewards/rejected": -13.793901443481445, + "step": 253 + }, + { + "epoch": 1.1288888888888888, + "grad_norm": 9.414591789245605, + "learning_rate": 2.1790041121336225e-05, + "logits/chosen": 1.8931891918182373, + "logits/rejected": 1.7515565156936646, + "logps/chosen": -474.2149353027344, + "logps/rejected": -366.8948059082031, + "loss": 0.4807, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.580873012542725, + "rewards/margins": 0.6344245672225952, + "rewards/rejected": -5.215297698974609, + "step": 254 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.31954923272132874, + "learning_rate": 2.1607718130860782e-05, + "logits/chosen": 2.2394070625305176, + "logits/rejected": 2.203941822052002, + "logps/chosen": -317.05987548828125, + "logps/rejected": -365.3947448730469, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9734389781951904, + "rewards/margins": 4.665194988250732, + "rewards/rejected": -5.638634204864502, + "step": 255 + }, + { + "epoch": 1.1377777777777778, + "grad_norm": 0.7436378598213196, + "learning_rate": 2.142557876629805e-05, + "logits/chosen": 2.0395288467407227, + "logits/rejected": 2.0879290103912354, + "logps/chosen": -368.682373046875, + "logps/rejected": -483.10504150390625, + "loss": 0.0241, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.940394639968872, + "rewards/margins": 4.465216636657715, + "rewards/rejected": -7.405611038208008, + "step": 256 + }, + { + "epoch": 1.1422222222222222, + "grad_norm": 0.5857129096984863, + "learning_rate": 2.124363288694285e-05, + "logits/chosen": 2.189476251602173, + "logits/rejected": 2.2042782306671143, + "logps/chosen": -297.9048767089844, + "logps/rejected": -462.4341125488281, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4767074584960938, + "rewards/margins": 5.004980564117432, + "rewards/rejected": -7.481688022613525, + "step": 257 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 1.8212025165557861, + "learning_rate": 2.1061890341616558e-05, + "logits/chosen": 2.0773448944091797, + "logits/rejected": 1.953477144241333, + "logps/chosen": -466.6849670410156, + "logps/rejected": -428.0595703125, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.071170330047607, + "rewards/margins": 3.6165356636047363, + "rewards/rejected": -8.687705993652344, + "step": 258 + }, + { + "epoch": 1.1511111111111112, + "grad_norm": 0.49154847860336304, + "learning_rate": 2.0880360968133954e-05, + "logits/chosen": 1.9941173791885376, + "logits/rejected": 1.903878927230835, + "logps/chosen": -443.1854553222656, + "logps/rejected": -469.02886962890625, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.136099338531494, + "rewards/margins": 5.983541965484619, + "rewards/rejected": -8.119641304016113, + "step": 259 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 0.7263330817222595, + "learning_rate": 2.0699054592770737e-05, + "logits/chosen": 2.341273307800293, + "logits/rejected": 2.2972702980041504, + "logps/chosen": -383.4603271484375, + "logps/rejected": -436.1636962890625, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9301009774208069, + "rewards/margins": 4.749177932739258, + "rewards/rejected": -5.67927885055542, + "step": 260 + }, + { + "epoch": 1.16, + "grad_norm": 1.4579602479934692, + "learning_rate": 2.0517981029731616e-05, + "logits/chosen": 2.224546432495117, + "logits/rejected": 2.069706916809082, + "logps/chosen": -441.466064453125, + "logps/rejected": -530.3856201171875, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12378081679344177, + "rewards/margins": 5.265193462371826, + "rewards/rejected": -5.388974189758301, + "step": 261 + }, + { + "epoch": 1.1644444444444444, + "grad_norm": 0.380569189786911, + "learning_rate": 2.0337150080619033e-05, + "logits/chosen": 1.961578130722046, + "logits/rejected": 1.9327723979949951, + "logps/chosen": -419.30328369140625, + "logps/rejected": -416.2418212890625, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4250718355178833, + "rewards/margins": 6.134527683258057, + "rewards/rejected": -7.55959939956665, + "step": 262 + }, + { + "epoch": 1.1688888888888889, + "grad_norm": 13.600119590759277, + "learning_rate": 2.0156571533902627e-05, + "logits/chosen": 1.979763388633728, + "logits/rejected": 1.8556675910949707, + "logps/chosen": -295.2734069824219, + "logps/rejected": -243.45123291015625, + "loss": 0.5916, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.295403242111206, + "rewards/margins": 0.9111607074737549, + "rewards/rejected": -4.206563949584961, + "step": 263 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 16.93592071533203, + "learning_rate": 1.997625516438937e-05, + "logits/chosen": 2.4186158180236816, + "logits/rejected": 2.3498375415802, + "logps/chosen": -611.62841796875, + "logps/rejected": -590.39306640625, + "loss": 0.8758, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.509602546691895, + "rewards/margins": 1.1703383922576904, + "rewards/rejected": -10.679941177368164, + "step": 264 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 2.826754331588745, + "learning_rate": 1.9796210732694442e-05, + "logits/chosen": 1.9747998714447021, + "logits/rejected": 1.9943749904632568, + "logps/chosen": -265.30975341796875, + "logps/rejected": -372.72296142578125, + "loss": 0.0875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4182487726211548, + "rewards/margins": 4.4431986808776855, + "rewards/rejected": -5.861447334289551, + "step": 265 + }, + { + "epoch": 1.1822222222222223, + "grad_norm": 5.334752082824707, + "learning_rate": 1.9616447984712914e-05, + "logits/chosen": 1.6797964572906494, + "logits/rejected": 1.7042231559753418, + "logps/chosen": -150.200439453125, + "logps/rejected": -205.3989715576172, + "loss": 0.1681, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8328521251678467, + "rewards/margins": 1.797693133354187, + "rewards/rejected": -3.630545139312744, + "step": 266 + }, + { + "epoch": 1.1866666666666668, + "grad_norm": 4.4336323738098145, + "learning_rate": 1.9436976651092144e-05, + "logits/chosen": 1.883796215057373, + "logits/rejected": 1.7938141822814941, + "logps/chosen": -158.01724243164062, + "logps/rejected": -183.39508056640625, + "loss": 0.3805, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2773910462856293, + "rewards/margins": 0.9651764035224915, + "rewards/rejected": -0.6877853870391846, + "step": 267 + }, + { + "epoch": 1.1911111111111112, + "grad_norm": 9.777835845947266, + "learning_rate": 1.9257806446705116e-05, + "logits/chosen": 2.2978739738464355, + "logits/rejected": 2.2143282890319824, + "logps/chosen": -346.0021667480469, + "logps/rejected": -279.994384765625, + "loss": 0.4109, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30205535888671875, + "rewards/margins": 2.2577598094940186, + "rewards/rejected": -2.5598151683807373, + "step": 268 + }, + { + "epoch": 1.1955555555555555, + "grad_norm": 0.7445770502090454, + "learning_rate": 1.9078947070124523e-05, + "logits/chosen": 1.995645523071289, + "logits/rejected": 2.0005688667297363, + "logps/chosen": -301.2586669921875, + "logps/rejected": -353.81927490234375, + "loss": 0.0387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5910667181015015, + "rewards/margins": 3.9083282947540283, + "rewards/rejected": -4.499395370483398, + "step": 269 + }, + { + "epoch": 1.2, + "grad_norm": 7.762486457824707, + "learning_rate": 1.8900408203097787e-05, + "logits/chosen": 1.905322551727295, + "logits/rejected": 1.7590628862380981, + "logps/chosen": -250.0103759765625, + "logps/rejected": -283.5964050292969, + "loss": 0.2447, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9317550659179688, + "rewards/margins": 3.9607110023498535, + "rewards/rejected": -5.892466068267822, + "step": 270 + }, + { + "epoch": 1.2, + "eval_logits/chosen": 2.1097989082336426, + "eval_logits/rejected": 2.0639405250549316, + "eval_logps/chosen": -318.4396057128906, + "eval_logps/rejected": -374.51708984375, + "eval_loss": 0.4267149865627289, + "eval_rewards/accuracies": 0.8214285969734192, + "eval_rewards/chosen": -2.8146941661834717, + "eval_rewards/margins": 3.0553336143493652, + "eval_rewards/rejected": -5.8700270652771, + "eval_runtime": 17.407, + "eval_samples_per_second": 2.872, + "eval_steps_per_second": 0.402, + "step": 270 + }, + { + "epoch": 1.2044444444444444, + "grad_norm": 1.0685392618179321, + "learning_rate": 1.8722199510023012e-05, + "logits/chosen": 2.049793004989624, + "logits/rejected": 2.014737606048584, + "logps/chosen": -357.0670166015625, + "logps/rejected": -461.2732238769531, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.979574680328369, + "rewards/margins": 3.4011411666870117, + "rewards/rejected": -8.380716323852539, + "step": 271 + }, + { + "epoch": 1.208888888888889, + "grad_norm": 0.4621816575527191, + "learning_rate": 1.854433063742579e-05, + "logits/chosen": 2.1473259925842285, + "logits/rejected": 2.1311964988708496, + "logps/chosen": -233.38954162597656, + "logps/rejected": -311.93170166015625, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1365326642990112, + "rewards/margins": 3.9022364616394043, + "rewards/rejected": -2.7657036781311035, + "step": 272 + }, + { + "epoch": 1.2133333333333334, + "grad_norm": 3.8542513847351074, + "learning_rate": 1.8366811213437092e-05, + "logits/chosen": 2.036423921585083, + "logits/rejected": 2.026139974594116, + "logps/chosen": -301.99395751953125, + "logps/rejected": -330.7635498046875, + "loss": 0.1548, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5693366527557373, + "rewards/margins": 1.7907044887542725, + "rewards/rejected": -3.3600411415100098, + "step": 273 + }, + { + "epoch": 1.2177777777777778, + "grad_norm": 0.18561817705631256, + "learning_rate": 1.8189650847272037e-05, + "logits/chosen": 2.12514066696167, + "logits/rejected": 2.1645846366882324, + "logps/chosen": -372.3150939941406, + "logps/rejected": -337.253173828125, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3940017223358154, + "rewards/margins": 5.315286159515381, + "rewards/rejected": -8.709287643432617, + "step": 274 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 9.204042434692383, + "learning_rate": 1.8012859128709766e-05, + "logits/chosen": 1.8135225772857666, + "logits/rejected": 1.8564603328704834, + "logps/chosen": -192.2001953125, + "logps/rejected": -243.14703369140625, + "loss": 0.5688, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7113640308380127, + "rewards/margins": 0.32638704776763916, + "rewards/rejected": -1.0377510786056519, + "step": 275 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 0.10035301744937897, + "learning_rate": 1.783644562757436e-05, + "logits/chosen": 2.467790126800537, + "logits/rejected": 2.2855076789855957, + "logps/chosen": -284.48785400390625, + "logps/rejected": -418.7071533203125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9228286743164062, + "rewards/margins": 6.6048784255981445, + "rewards/rejected": -5.682049751281738, + "step": 276 + }, + { + "epoch": 1.231111111111111, + "grad_norm": 0.995993435382843, + "learning_rate": 1.7660419893216785e-05, + "logits/chosen": 2.3109967708587646, + "logits/rejected": 2.2218995094299316, + "logps/chosen": -331.196044921875, + "logps/rejected": -288.77044677734375, + "loss": 0.0384, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.583350419998169, + "rewards/margins": 3.377068519592285, + "rewards/rejected": -2.793717861175537, + "step": 277 + }, + { + "epoch": 1.2355555555555555, + "grad_norm": 0.09387421607971191, + "learning_rate": 1.7484791453998006e-05, + "logits/chosen": 2.2291605472564697, + "logits/rejected": 2.205029249191284, + "logps/chosen": -329.593994140625, + "logps/rejected": -481.59332275390625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18573302030563354, + "rewards/margins": 7.1214494705200195, + "rewards/rejected": -7.307182312011719, + "step": 278 + }, + { + "epoch": 1.24, + "grad_norm": 1.2010304927825928, + "learning_rate": 1.7309569816773193e-05, + "logits/chosen": 1.4943195581436157, + "logits/rejected": 1.5665395259857178, + "logps/chosen": -118.16926574707031, + "logps/rejected": -229.98049926757812, + "loss": 0.0459, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5613498687744141, + "rewards/margins": 3.4735331535339355, + "rewards/rejected": -2.9121835231781006, + "step": 279 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 9.927587509155273, + "learning_rate": 1.7134764466377136e-05, + "logits/chosen": 2.1885547637939453, + "logits/rejected": 2.239022731781006, + "logps/chosen": -398.05059814453125, + "logps/rejected": -413.83624267578125, + "loss": 0.4304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5941482782363892, + "rewards/margins": 0.6228576898574829, + "rewards/rejected": -1.217005968093872, + "step": 280 + }, + { + "epoch": 1.248888888888889, + "grad_norm": 5.716561794281006, + "learning_rate": 1.69603848651108e-05, + "logits/chosen": 1.9646629095077515, + "logits/rejected": 1.9000844955444336, + "logps/chosen": -269.88067626953125, + "logps/rejected": -253.2309112548828, + "loss": 0.3016, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.193873643875122, + "rewards/margins": 1.1279609203338623, + "rewards/rejected": -3.3218345642089844, + "step": 281 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 0.3801816999912262, + "learning_rate": 1.6786440452229134e-05, + "logits/chosen": 2.2147722244262695, + "logits/rejected": 2.1242868900299072, + "logps/chosen": -451.5334167480469, + "logps/rejected": -462.0749206542969, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0167465209960938, + "rewards/margins": 5.644117832183838, + "rewards/rejected": -8.660863876342773, + "step": 282 + }, + { + "epoch": 1.2577777777777777, + "grad_norm": 0.1913336217403412, + "learning_rate": 1.6612940643430138e-05, + "logits/chosen": 2.109816551208496, + "logits/rejected": 2.2030255794525146, + "logps/chosen": -255.25289916992188, + "logps/rejected": -452.1776123046875, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.056928277015686, + "rewards/margins": 7.02421760559082, + "rewards/rejected": -8.081146240234375, + "step": 283 + }, + { + "epoch": 1.2622222222222224, + "grad_norm": 12.739026069641113, + "learning_rate": 1.6439894830345143e-05, + "logits/chosen": 1.5907762050628662, + "logits/rejected": 1.58561372756958, + "logps/chosen": -197.10760498046875, + "logps/rejected": -260.8538818359375, + "loss": 0.6363, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.7191286087036133, + "rewards/margins": 1.830248236656189, + "rewards/rejected": -4.549376487731934, + "step": 284 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.540250539779663, + "learning_rate": 1.6267312380030506e-05, + "logits/chosen": 2.0246505737304688, + "logits/rejected": 1.983656883239746, + "logps/chosen": -283.9218444824219, + "logps/rejected": -365.05535888671875, + "loss": 0.0741, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7960922122001648, + "rewards/margins": 5.497363567352295, + "rewards/rejected": -6.293455600738525, + "step": 285 + }, + { + "epoch": 1.271111111111111, + "grad_norm": 3.0418801307678223, + "learning_rate": 1.609520263446049e-05, + "logits/chosen": 2.055178165435791, + "logits/rejected": 2.165806293487549, + "logps/chosen": -276.8818054199219, + "logps/rejected": -422.906005859375, + "loss": 0.2023, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.977742910385132, + "rewards/margins": 3.4164116382598877, + "rewards/rejected": -6.3941545486450195, + "step": 286 + }, + { + "epoch": 1.2755555555555556, + "grad_norm": 1.8675150871276855, + "learning_rate": 1.5923574910021624e-05, + "logits/chosen": 1.7561115026474, + "logits/rejected": 1.7464289665222168, + "logps/chosen": -180.07662963867188, + "logps/rejected": -193.69613647460938, + "loss": 0.1926, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2910804748535156, + "rewards/margins": 3.408679246902466, + "rewards/rejected": -3.6997597217559814, + "step": 287 + }, + { + "epoch": 1.28, + "grad_norm": 11.50502872467041, + "learning_rate": 1.5752438497008405e-05, + "logits/chosen": 1.795478343963623, + "logits/rejected": 1.8433971405029297, + "logps/chosen": -314.85888671875, + "logps/rejected": -392.9442138671875, + "loss": 0.4887, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7009613513946533, + "rewards/margins": 4.940203666687012, + "rewards/rejected": -6.641165256500244, + "step": 288 + }, + { + "epoch": 1.2844444444444445, + "grad_norm": 0.6544823050498962, + "learning_rate": 1.558180265912037e-05, + "logits/chosen": 2.1740856170654297, + "logits/rejected": 2.0556159019470215, + "logps/chosen": -291.9125671386719, + "logps/rejected": -370.18865966796875, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0329957008361816, + "rewards/margins": 4.283731460571289, + "rewards/rejected": -5.3167266845703125, + "step": 289 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 1.4096264839172363, + "learning_rate": 1.5411676632960713e-05, + "logits/chosen": 1.9917266368865967, + "logits/rejected": 2.0029103755950928, + "logps/chosen": -215.18414306640625, + "logps/rejected": -224.6077117919922, + "loss": 0.0592, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.029798150062561, + "rewards/margins": 3.0806381702423096, + "rewards/rejected": -2.050839900970459, + "step": 290 + }, + { + "epoch": 1.2933333333333334, + "grad_norm": 3.500157117843628, + "learning_rate": 1.5242069627536225e-05, + "logits/chosen": 2.0910866260528564, + "logits/rejected": 2.094388484954834, + "logps/chosen": -268.5745544433594, + "logps/rejected": -347.7729797363281, + "loss": 0.1554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5284897089004517, + "rewards/margins": 1.9156463146209717, + "rewards/rejected": -2.444136142730713, + "step": 291 + }, + { + "epoch": 1.2977777777777777, + "grad_norm": 7.339416027069092, + "learning_rate": 1.5072990823758871e-05, + "logits/chosen": 2.043335437774658, + "logits/rejected": 1.9694390296936035, + "logps/chosen": -267.2714538574219, + "logps/rejected": -318.1759338378906, + "loss": 0.5384, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.948584794998169, + "rewards/margins": 0.3823028802871704, + "rewards/rejected": -2.33088755607605, + "step": 292 + }, + { + "epoch": 1.3022222222222222, + "grad_norm": 2.8766441345214844, + "learning_rate": 1.490444937394879e-05, + "logits/chosen": 1.6397064924240112, + "logits/rejected": 1.58877432346344, + "logps/chosen": -206.1599578857422, + "logps/rejected": -258.51251220703125, + "loss": 0.2138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5400612354278564, + "rewards/margins": 1.5494178533554077, + "rewards/rejected": -2.0894789695739746, + "step": 293 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 0.5003223419189453, + "learning_rate": 1.4736454401338872e-05, + "logits/chosen": 2.242143154144287, + "logits/rejected": 2.218358039855957, + "logps/chosen": -492.30926513671875, + "logps/rejected": -563.084716796875, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.345100402832031, + "rewards/margins": 8.439008712768555, + "rewards/rejected": -13.784109115600586, + "step": 294 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 9.198110580444336, + "learning_rate": 1.4569014999580937e-05, + "logits/chosen": 2.083486795425415, + "logits/rejected": 2.0191855430603027, + "logps/chosen": -514.994384765625, + "logps/rejected": -621.652587890625, + "loss": 0.1112, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.739448547363281, + "rewards/margins": 2.7005691528320312, + "rewards/rejected": -11.440017700195312, + "step": 295 + }, + { + "epoch": 1.3155555555555556, + "grad_norm": 18.878862380981445, + "learning_rate": 1.4402140232253486e-05, + "logits/chosen": 2.4532670974731445, + "logits/rejected": 2.3905553817749023, + "logps/chosen": -382.56829833984375, + "logps/rejected": -470.17926025390625, + "loss": 0.5653, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.797785758972168, + "rewards/margins": 4.899234771728516, + "rewards/rejected": -8.697020530700684, + "step": 296 + }, + { + "epoch": 1.32, + "grad_norm": 0.9160619974136353, + "learning_rate": 1.4235839132371038e-05, + "logits/chosen": 2.1893036365509033, + "logits/rejected": 2.259230136871338, + "logps/chosen": -349.79779052734375, + "logps/rejected": -420.09796142578125, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3005684614181519, + "rewards/margins": 5.341613292694092, + "rewards/rejected": -6.642181396484375, + "step": 297 + }, + { + "epoch": 1.3244444444444445, + "grad_norm": 1.0921589136123657, + "learning_rate": 1.407012070189524e-05, + "logits/chosen": 1.6390105485916138, + "logits/rejected": 1.6804091930389404, + "logps/chosen": -323.83782958984375, + "logps/rejected": -502.33782958984375, + "loss": 0.0708, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.742315769195557, + "rewards/margins": 5.594854831695557, + "rewards/rejected": -10.337170600891113, + "step": 298 + }, + { + "epoch": 1.3288888888888888, + "grad_norm": 0.014698788523674011, + "learning_rate": 1.3904993911247561e-05, + "logits/chosen": 2.3741211891174316, + "logits/rejected": 2.227184295654297, + "logps/chosen": -381.3302307128906, + "logps/rejected": -436.189697265625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0916748046875, + "rewards/margins": 8.013049125671387, + "rewards/rejected": -7.921374797821045, + "step": 299 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 12.252896308898926, + "learning_rate": 1.3740467698823662e-05, + "logits/chosen": 2.037738800048828, + "logits/rejected": 1.979736566543579, + "logps/chosen": -299.84228515625, + "logps/rejected": -390.47296142578125, + "loss": 0.8992, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.8193671703338623, + "rewards/margins": 2.5206613540649414, + "rewards/rejected": -6.340028762817383, + "step": 300 + }, + { + "epoch": 1.3377777777777777, + "grad_norm": 12.054088592529297, + "learning_rate": 1.3576550970509666e-05, + "logits/chosen": 1.7213351726531982, + "logits/rejected": 1.7385635375976562, + "logps/chosen": -322.126708984375, + "logps/rejected": -534.2659912109375, + "loss": 0.7611, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.1543197631835938, + "rewards/margins": 7.169881343841553, + "rewards/rejected": -10.324201583862305, + "step": 301 + }, + { + "epoch": 1.3422222222222222, + "grad_norm": 1.8875739574432373, + "learning_rate": 1.341325259919996e-05, + "logits/chosen": 2.0664215087890625, + "logits/rejected": 2.0522894859313965, + "logps/chosen": -221.20150756835938, + "logps/rejected": -283.83770751953125, + "loss": 0.0715, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.671368420124054, + "rewards/margins": 2.8214011192321777, + "rewards/rejected": -3.492769718170166, + "step": 302 + }, + { + "epoch": 1.3466666666666667, + "grad_norm": 11.853067398071289, + "learning_rate": 1.325058142431701e-05, + "logits/chosen": 1.6087274551391602, + "logits/rejected": 1.6155368089675903, + "logps/chosen": -160.04257202148438, + "logps/rejected": -224.98550415039062, + "loss": 0.6955, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0727493166923523, + "rewards/margins": 2.172374725341797, + "rewards/rejected": -2.245124101638794, + "step": 303 + }, + { + "epoch": 1.3511111111111112, + "grad_norm": 2.025550603866577, + "learning_rate": 1.3088546251332772e-05, + "logits/chosen": 1.9365671873092651, + "logits/rejected": 1.9474778175354004, + "logps/chosen": -421.8943786621094, + "logps/rejected": -382.8135070800781, + "loss": 0.1339, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.756591796875, + "rewards/margins": 5.394895076751709, + "rewards/rejected": -7.151486873626709, + "step": 304 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.24477213621139526, + "learning_rate": 1.2927155851292145e-05, + "logits/chosen": 1.8736495971679688, + "logits/rejected": 1.8653249740600586, + "logps/chosen": -244.5578155517578, + "logps/rejected": -318.7236328125, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7046654224395752, + "rewards/margins": 4.947877883911133, + "rewards/rejected": -6.652543067932129, + "step": 305 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 53.01292037963867, + "learning_rate": 1.2766418960338128e-05, + "logits/chosen": 2.017364025115967, + "logits/rejected": 1.9819977283477783, + "logps/chosen": -376.64581298828125, + "logps/rejected": -330.441162109375, + "loss": 2.673, + "rewards/accuracies": 0.0, + "rewards/chosen": -9.418815612792969, + "rewards/margins": -2.5403449535369873, + "rewards/rejected": -6.878470420837402, + "step": 306 + }, + { + "epoch": 1.3644444444444446, + "grad_norm": 6.39654541015625, + "learning_rate": 1.260634427923896e-05, + "logits/chosen": 1.2513247728347778, + "logits/rejected": 1.2842328548431396, + "logps/chosen": -137.0913543701172, + "logps/rejected": -154.89610290527344, + "loss": 0.3182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5993289947509766, + "rewards/margins": 2.3029263019561768, + "rewards/rejected": -2.9022552967071533, + "step": 307 + }, + { + "epoch": 1.3688888888888888, + "grad_norm": 0.30323663353919983, + "learning_rate": 1.2446940472917099e-05, + "logits/chosen": 2.068608283996582, + "logits/rejected": 2.0424392223358154, + "logps/chosen": -323.708740234375, + "logps/rejected": -347.22998046875, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0544846057891846, + "rewards/margins": 4.629805088043213, + "rewards/rejected": -6.684289932250977, + "step": 308 + }, + { + "epoch": 1.3733333333333333, + "grad_norm": 1.1456687450408936, + "learning_rate": 1.2288216169980243e-05, + "logits/chosen": 1.7509403228759766, + "logits/rejected": 1.8240234851837158, + "logps/chosen": -181.7119903564453, + "logps/rejected": -259.93548583984375, + "loss": 0.067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8926734924316406, + "rewards/margins": 3.159923553466797, + "rewards/rejected": -5.0525970458984375, + "step": 309 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 22.11775779724121, + "learning_rate": 1.213017996225424e-05, + "logits/chosen": 1.9033875465393066, + "logits/rejected": 1.7332472801208496, + "logps/chosen": -356.8746337890625, + "logps/rejected": -316.99420166015625, + "loss": 1.4625, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.758178234100342, + "rewards/margins": 3.5830559730529785, + "rewards/rejected": -8.34123420715332, + "step": 310 + }, + { + "epoch": 1.3822222222222222, + "grad_norm": 7.521746635437012, + "learning_rate": 1.1972840404317961e-05, + "logits/chosen": 2.111452102661133, + "logits/rejected": 1.976496934890747, + "logps/chosen": -428.1273193359375, + "logps/rejected": -545.6732788085938, + "loss": 0.1086, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.942473411560059, + "rewards/margins": 3.5422022342681885, + "rewards/rejected": -14.484675407409668, + "step": 311 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 0.11550098657608032, + "learning_rate": 1.1816206013040313e-05, + "logits/chosen": 1.9739415645599365, + "logits/rejected": 1.9305256605148315, + "logps/chosen": -288.68017578125, + "logps/rejected": -353.60418701171875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.820286512374878, + "rewards/margins": 5.378190040588379, + "rewards/rejected": -7.198476791381836, + "step": 312 + }, + { + "epoch": 1.3911111111111112, + "grad_norm": 19.025943756103516, + "learning_rate": 1.1660285267119167e-05, + "logits/chosen": 2.469484806060791, + "logits/rejected": 2.470264196395874, + "logps/chosen": -707.8921508789062, + "logps/rejected": -753.548828125, + "loss": 0.4269, + "rewards/accuracies": 0.5, + "rewards/chosen": -6.678134441375732, + "rewards/margins": 6.853320121765137, + "rewards/rejected": -13.531454086303711, + "step": 313 + }, + { + "epoch": 1.3955555555555557, + "grad_norm": 5.186333179473877, + "learning_rate": 1.150508660662242e-05, + "logits/chosen": 1.8456952571868896, + "logits/rejected": 1.8528308868408203, + "logps/chosen": -483.7690734863281, + "logps/rejected": -454.6048583984375, + "loss": 0.1185, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.390696048736572, + "rewards/margins": 2.1679461002349854, + "rewards/rejected": -9.558642387390137, + "step": 314 + }, + { + "epoch": 1.4, + "grad_norm": 2.9729480743408203, + "learning_rate": 1.1350618432531098e-05, + "logits/chosen": 1.75775945186615, + "logits/rejected": 1.7443618774414062, + "logps/chosen": -313.0162048339844, + "logps/rejected": -333.1780090332031, + "loss": 0.0539, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.031980991363525, + "rewards/margins": 3.3457300662994385, + "rewards/rejected": -8.377711296081543, + "step": 315 + }, + { + "epoch": 1.4, + "eval_logits/chosen": 2.042860269546509, + "eval_logits/rejected": 1.9980798959732056, + "eval_logps/chosen": -329.1581726074219, + "eval_logps/rejected": -387.9140930175781, + "eval_loss": 0.446563184261322, + "eval_rewards/accuracies": 0.8035714030265808, + "eval_rewards/chosen": -3.886552572250366, + "eval_rewards/margins": 3.3231773376464844, + "eval_rewards/rejected": -7.2097296714782715, + "eval_runtime": 17.4013, + "eval_samples_per_second": 2.873, + "eval_steps_per_second": 0.402, + "step": 315 + }, + { + "epoch": 1.4044444444444444, + "grad_norm": 11.154594421386719, + "learning_rate": 1.1196889106284669e-05, + "logits/chosen": 1.755511999130249, + "logits/rejected": 1.762006163597107, + "logps/chosen": -291.4748229980469, + "logps/rejected": -280.0130615234375, + "loss": 0.4381, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.212803840637207, + "rewards/margins": 3.228024959564209, + "rewards/rejected": -8.440828323364258, + "step": 316 + }, + { + "epoch": 1.4088888888888889, + "grad_norm": 1.097583532333374, + "learning_rate": 1.1043906949328387e-05, + "logits/chosen": 1.9886606931686401, + "logits/rejected": 1.96701979637146, + "logps/chosen": -252.5841827392578, + "logps/rejected": -349.20745849609375, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4393486976623535, + "rewards/margins": 3.46529221534729, + "rewards/rejected": -5.904641151428223, + "step": 317 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 0.9752767086029053, + "learning_rate": 1.0891680242662835e-05, + "logits/chosen": 2.0138909816741943, + "logits/rejected": 1.918421983718872, + "logps/chosen": -300.9414978027344, + "logps/rejected": -353.33404541015625, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.531005859375, + "rewards/margins": 3.710988759994507, + "rewards/rejected": -9.241994857788086, + "step": 318 + }, + { + "epoch": 1.4177777777777778, + "grad_norm": 16.44363784790039, + "learning_rate": 1.0740217226395724e-05, + "logits/chosen": 2.0399329662323, + "logits/rejected": 1.9319369792938232, + "logps/chosen": -391.0609130859375, + "logps/rejected": -363.857177734375, + "loss": 0.4585, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.746470928192139, + "rewards/margins": 1.3634958267211914, + "rewards/rejected": -6.10996675491333, + "step": 319 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 33.33434295654297, + "learning_rate": 1.0589526099295816e-05, + "logits/chosen": 2.1048226356506348, + "logits/rejected": 1.9768327474594116, + "logps/chosen": -581.8522338867188, + "logps/rejected": -463.3633728027344, + "loss": 2.6066, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.7820405960083, + "rewards/margins": -0.41506481170654297, + "rewards/rejected": -9.366975784301758, + "step": 320 + }, + { + "epoch": 1.4266666666666667, + "grad_norm": 0.01582302711904049, + "learning_rate": 1.0439615018349109e-05, + "logits/chosen": 1.9011285305023193, + "logits/rejected": 1.9159646034240723, + "logps/chosen": -358.68157958984375, + "logps/rejected": -534.3988037109375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9317246675491333, + "rewards/margins": 9.084466934204102, + "rewards/rejected": -11.016191482543945, + "step": 321 + }, + { + "epoch": 1.431111111111111, + "grad_norm": 6.929481506347656, + "learning_rate": 1.029049209831733e-05, + "logits/chosen": 1.9835437536239624, + "logits/rejected": 1.9601792097091675, + "logps/chosen": -285.4648132324219, + "logps/rejected": -420.031005859375, + "loss": 0.3011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0451195240020752, + "rewards/margins": 8.181748390197754, + "rewards/rejected": -9.22686767578125, + "step": 322 + }, + { + "epoch": 1.4355555555555555, + "grad_norm": 12.37388801574707, + "learning_rate": 1.0142165411298662e-05, + "logits/chosen": 2.131269693374634, + "logits/rejected": 2.1731009483337402, + "logps/chosen": -299.0472717285156, + "logps/rejected": -404.6293029785156, + "loss": 0.3707, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4244904518127441, + "rewards/margins": 3.208432197570801, + "rewards/rejected": -4.632922649383545, + "step": 323 + }, + { + "epoch": 1.44, + "grad_norm": 0.8624144196510315, + "learning_rate": 9.994642986290797e-06, + "logits/chosen": 2.057706832885742, + "logits/rejected": 2.074605941772461, + "logps/chosen": -324.3763427734375, + "logps/rejected": -441.5903015136719, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.426907539367676, + "rewards/margins": 5.264835834503174, + "rewards/rejected": -8.691743850708008, + "step": 324 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 5.4227118492126465, + "learning_rate": 9.847932808756308e-06, + "logits/chosen": 2.1998391151428223, + "logits/rejected": 2.201568126678467, + "logps/chosen": -312.435791015625, + "logps/rejected": -431.78179931640625, + "loss": 0.1895, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3775376081466675, + "rewards/margins": 3.8356902599334717, + "rewards/rejected": -5.213228225708008, + "step": 325 + }, + { + "epoch": 1.448888888888889, + "grad_norm": 2.2142040729522705, + "learning_rate": 9.702042820190415e-06, + "logits/chosen": 1.5558602809906006, + "logits/rejected": 1.7116918563842773, + "logps/chosen": -197.79367065429688, + "logps/rejected": -246.2130126953125, + "loss": 0.0913, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3813637495040894, + "rewards/margins": 2.347956418991089, + "rewards/rejected": -3.7293200492858887, + "step": 326 + }, + { + "epoch": 1.4533333333333334, + "grad_norm": 8.604325294494629, + "learning_rate": 9.556980917691116e-06, + "logits/chosen": 1.6613447666168213, + "logits/rejected": 1.7617850303649902, + "logps/chosen": -347.57830810546875, + "logps/rejected": -387.5290222167969, + "loss": 0.4048, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3771653175354004, + "rewards/margins": 0.8042678833007812, + "rewards/rejected": -4.181433200836182, + "step": 327 + }, + { + "epoch": 1.4577777777777778, + "grad_norm": 0.06135905534029007, + "learning_rate": 9.412754953531663e-06, + "logits/chosen": 2.1208975315093994, + "logits/rejected": 1.9472355842590332, + "logps/chosen": -416.2113342285156, + "logps/rejected": -469.04803466796875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4611542224884033, + "rewards/margins": 6.7728729248046875, + "rewards/rejected": -10.234027862548828, + "step": 328 + }, + { + "epoch": 1.462222222222222, + "grad_norm": 4.409871578216553, + "learning_rate": 9.269372734735577e-06, + "logits/chosen": 1.9324915409088135, + "logits/rejected": 1.8667106628417969, + "logps/chosen": -224.12960815429688, + "logps/rejected": -257.76385498046875, + "loss": 0.1307, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.500388145446777, + "rewards/margins": 2.151371955871582, + "rewards/rejected": -6.651760101318359, + "step": 329 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 40.46012496948242, + "learning_rate": 9.126842022654003e-06, + "logits/chosen": 2.013392686843872, + "logits/rejected": 2.085439682006836, + "logps/chosen": -343.09381103515625, + "logps/rejected": -382.8746337890625, + "loss": 1.476, + "rewards/accuracies": 0.5, + "rewards/chosen": -7.1528754234313965, + "rewards/margins": 1.4587280750274658, + "rewards/rejected": -8.611603736877441, + "step": 330 + }, + { + "epoch": 1.471111111111111, + "grad_norm": 20.39794158935547, + "learning_rate": 8.985170532545622e-06, + "logits/chosen": 2.2019968032836914, + "logits/rejected": 2.255478858947754, + "logps/chosen": -461.25250244140625, + "logps/rejected": -522.5560302734375, + "loss": 0.6261, + "rewards/accuracies": 0.5, + "rewards/chosen": -7.226644515991211, + "rewards/margins": 1.7861032485961914, + "rewards/rejected": -9.012747764587402, + "step": 331 + }, + { + "epoch": 1.4755555555555555, + "grad_norm": 0.8205786347389221, + "learning_rate": 8.844365933158973e-06, + "logits/chosen": 2.0666050910949707, + "logits/rejected": 2.1347484588623047, + "logps/chosen": -518.1484375, + "logps/rejected": -621.5133666992188, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.620497703552246, + "rewards/margins": 7.697022914886475, + "rewards/rejected": -14.317520141601562, + "step": 332 + }, + { + "epoch": 1.48, + "grad_norm": 2.1303653717041016, + "learning_rate": 8.704435846317386e-06, + "logits/chosen": 1.9880008697509766, + "logits/rejected": 2.010342836380005, + "logps/chosen": -331.64874267578125, + "logps/rejected": -382.2865905761719, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6854767799377441, + "rewards/margins": 3.8334319591522217, + "rewards/rejected": -5.518908500671387, + "step": 333 + }, + { + "epoch": 1.4844444444444445, + "grad_norm": 5.076652526855469, + "learning_rate": 8.565387846506395e-06, + "logits/chosen": 1.8777854442596436, + "logits/rejected": 1.8597569465637207, + "logps/chosen": -285.7859191894531, + "logps/rejected": -302.11083984375, + "loss": 0.2184, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.237748146057129, + "rewards/margins": 1.4745651483535767, + "rewards/rejected": -5.712313175201416, + "step": 334 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 1.7670209407806396, + "learning_rate": 8.427229460463696e-06, + "logits/chosen": 2.0296010971069336, + "logits/rejected": 2.046407461166382, + "logps/chosen": -446.6829528808594, + "logps/rejected": -450.63531494140625, + "loss": 0.0681, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3315367698669434, + "rewards/margins": 3.1756088733673096, + "rewards/rejected": -5.507145881652832, + "step": 335 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 7.628573894500732, + "learning_rate": 8.28996816677177e-06, + "logits/chosen": 1.8448824882507324, + "logits/rejected": 1.8389427661895752, + "logps/chosen": -414.1676025390625, + "logps/rejected": -435.0848083496094, + "loss": 0.1649, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.346246242523193, + "rewards/margins": 4.283938407897949, + "rewards/rejected": -9.6301851272583, + "step": 336 + }, + { + "epoch": 1.4977777777777779, + "grad_norm": 5.303137302398682, + "learning_rate": 8.153611395453045e-06, + "logits/chosen": 1.9505963325500488, + "logits/rejected": 1.9452285766601562, + "logps/chosen": -294.60015869140625, + "logps/rejected": -381.62030029296875, + "loss": 0.2634, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9013266563415527, + "rewards/margins": 1.7531030178070068, + "rewards/rejected": -4.654429912567139, + "step": 337 + }, + { + "epoch": 1.5022222222222221, + "grad_norm": 11.043461799621582, + "learning_rate": 8.018166527567672e-06, + "logits/chosen": 2.0403127670288086, + "logits/rejected": 1.9792909622192383, + "logps/chosen": -424.71881103515625, + "logps/rejected": -502.0084228515625, + "loss": 0.4432, + "rewards/accuracies": 0.5, + "rewards/chosen": -7.235692024230957, + "rewards/margins": 4.200654983520508, + "rewards/rejected": -11.436347007751465, + "step": 338 + }, + { + "epoch": 1.5066666666666668, + "grad_norm": 0.34782156348228455, + "learning_rate": 7.883640894814043e-06, + "logits/chosen": 2.0710644721984863, + "logits/rejected": 2.027409553527832, + "logps/chosen": -235.57655334472656, + "logps/rejected": -296.014892578125, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5666137337684631, + "rewards/margins": 5.824798107147217, + "rewards/rejected": -6.391411781311035, + "step": 339 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 3.823324203491211, + "learning_rate": 7.75004177913188e-06, + "logits/chosen": 2.1038994789123535, + "logits/rejected": 2.085219383239746, + "logps/chosen": -384.879638671875, + "logps/rejected": -339.5116882324219, + "loss": 0.074, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.644735813140869, + "rewards/margins": 4.097072601318359, + "rewards/rejected": -6.7418084144592285, + "step": 340 + }, + { + "epoch": 1.5155555555555555, + "grad_norm": 0.00015221821377053857, + "learning_rate": 7.617376412308083e-06, + "logits/chosen": 2.0240237712860107, + "logits/rejected": 1.9871121644973755, + "logps/chosen": -351.7294616699219, + "logps/rejected": -592.8782958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.147392749786377, + "rewards/margins": 13.006977081298828, + "rewards/rejected": -15.154369354248047, + "step": 341 + }, + { + "epoch": 1.52, + "grad_norm": 1.3263826370239258, + "learning_rate": 7.485651975585236e-06, + "logits/chosen": 1.8890652656555176, + "logits/rejected": 1.8711776733398438, + "logps/chosen": -324.95245361328125, + "logps/rejected": -423.5935974121094, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.109809875488281, + "rewards/margins": 5.546566009521484, + "rewards/rejected": -11.656375885009766, + "step": 342 + }, + { + "epoch": 1.5244444444444445, + "grad_norm": 4.467953205108643, + "learning_rate": 7.354875599272928e-06, + "logits/chosen": 1.321131944656372, + "logits/rejected": 1.298929214477539, + "logps/chosen": -137.98204040527344, + "logps/rejected": -102.74874877929688, + "loss": 0.3132, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.748739242553711, + "rewards/margins": 1.1912882328033447, + "rewards/rejected": -2.9400274753570557, + "step": 343 + }, + { + "epoch": 1.528888888888889, + "grad_norm": 2.1642391681671143, + "learning_rate": 7.2250543623617685e-06, + "logits/chosen": 2.2403130531311035, + "logits/rejected": 2.209939956665039, + "logps/chosen": -361.478759765625, + "logps/rejected": -535.7804565429688, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.326800584793091, + "rewards/margins": 6.651968479156494, + "rewards/rejected": -8.978769302368164, + "step": 344 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.018650932237505913, + "learning_rate": 7.096195292140173e-06, + "logits/chosen": 1.889040470123291, + "logits/rejected": 1.954929232597351, + "logps/chosen": -331.3338623046875, + "logps/rejected": -661.5016479492188, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6049957275390625, + "rewards/margins": 11.392202377319336, + "rewards/rejected": -13.997198104858398, + "step": 345 + }, + { + "epoch": 1.537777777777778, + "grad_norm": 0.07977497577667236, + "learning_rate": 6.968305363814001e-06, + "logits/chosen": 2.1549904346466064, + "logits/rejected": 2.1628024578094482, + "logps/chosen": -401.5853271484375, + "logps/rejected": -551.8875732421875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.6251935958862305, + "rewards/margins": 7.631624221801758, + "rewards/rejected": -12.256817817687988, + "step": 346 + }, + { + "epoch": 1.5422222222222222, + "grad_norm": 9.609312057495117, + "learning_rate": 6.841391500128982e-06, + "logits/chosen": 1.9556026458740234, + "logits/rejected": 2.000077247619629, + "logps/chosen": -265.30108642578125, + "logps/rejected": -381.78399658203125, + "loss": 0.1607, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.087409973144531, + "rewards/margins": 1.7995681762695312, + "rewards/rejected": -5.8869781494140625, + "step": 347 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 0.00032588234171271324, + "learning_rate": 6.715460570995988e-06, + "logits/chosen": 2.1855061054229736, + "logits/rejected": 2.1387851238250732, + "logps/chosen": -434.9403076171875, + "logps/rejected": -738.8302612304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6018309593200684, + "rewards/margins": 11.849609375, + "rewards/rejected": -14.451440811157227, + "step": 348 + }, + { + "epoch": 1.551111111111111, + "grad_norm": 1.5105094909667969, + "learning_rate": 6.5905193931191235e-06, + "logits/chosen": 2.143610715866089, + "logits/rejected": 2.2100303173065186, + "logps/chosen": -380.15618896484375, + "logps/rejected": -462.9521789550781, + "loss": 0.0708, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9106452465057373, + "rewards/margins": 4.415860176086426, + "rewards/rejected": -6.326505661010742, + "step": 349 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.15542344748973846, + "learning_rate": 6.46657472962679e-06, + "logits/chosen": 2.022047758102417, + "logits/rejected": 1.9033942222595215, + "logps/chosen": -345.0733642578125, + "logps/rejected": -438.015380859375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09320831298828125, + "rewards/margins": 8.476028442382812, + "rewards/rejected": -8.569236755371094, + "step": 350 + }, + { + "epoch": 1.56, + "grad_norm": 3.5860283374786377, + "learning_rate": 6.343633289705555e-06, + "logits/chosen": 1.9373621940612793, + "logits/rejected": 1.8331228494644165, + "logps/chosen": -333.8984069824219, + "logps/rejected": -243.1856231689453, + "loss": 0.0951, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.346085548400879, + "rewards/margins": 2.6990599632263184, + "rewards/rejected": -6.045145511627197, + "step": 351 + }, + { + "epoch": 1.5644444444444443, + "grad_norm": 1.0526074171066284, + "learning_rate": 6.221701728237009e-06, + "logits/chosen": 1.999690055847168, + "logits/rejected": 2.0342698097229004, + "logps/chosen": -347.6802978515625, + "logps/rejected": -321.7969970703125, + "loss": 0.036, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.819159507751465, + "rewards/margins": 3.3115005493164062, + "rewards/rejected": -8.130660057067871, + "step": 352 + }, + { + "epoch": 1.568888888888889, + "grad_norm": 0.5887879133224487, + "learning_rate": 6.100786645437481e-06, + "logits/chosen": 1.1116806268692017, + "logits/rejected": 1.1577059030532837, + "logps/chosen": -76.88727569580078, + "logps/rejected": -119.89225006103516, + "loss": 0.0565, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09421806037425995, + "rewards/margins": 2.9204256534576416, + "rewards/rejected": -2.8262076377868652, + "step": 353 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 0.1103566437959671, + "learning_rate": 5.980894586500841e-06, + "logits/chosen": 2.108466625213623, + "logits/rejected": 2.086857795715332, + "logps/chosen": -387.1605224609375, + "logps/rejected": -562.6422119140625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.905645847320557, + "rewards/margins": 7.762504577636719, + "rewards/rejected": -13.668149948120117, + "step": 354 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 8.797289848327637, + "learning_rate": 5.8620320412441475e-06, + "logits/chosen": 1.8724584579467773, + "logits/rejected": 1.9823896884918213, + "logps/chosen": -301.8065185546875, + "logps/rejected": -380.43853759765625, + "loss": 0.4206, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.267813205718994, + "rewards/margins": 1.2750152349472046, + "rewards/rejected": -3.542828321456909, + "step": 355 + }, + { + "epoch": 1.5822222222222222, + "grad_norm": 6.784573078155518, + "learning_rate": 5.744205443756364e-06, + "logits/chosen": 1.9750076532363892, + "logits/rejected": 2.0933682918548584, + "logps/chosen": -423.0281677246094, + "logps/rejected": -526.6314697265625, + "loss": 0.2207, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.2557196617126465, + "rewards/margins": 1.4171913862228394, + "rewards/rejected": -8.672910690307617, + "step": 356 + }, + { + "epoch": 1.5866666666666667, + "grad_norm": 2.033442258834839, + "learning_rate": 5.627421172050096e-06, + "logits/chosen": 1.7873187065124512, + "logits/rejected": 1.7788472175598145, + "logps/chosen": -231.81149291992188, + "logps/rejected": -276.6201171875, + "loss": 0.179, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4262077808380127, + "rewards/margins": 4.1126708984375, + "rewards/rejected": -5.538878440856934, + "step": 357 + }, + { + "epoch": 1.5911111111111111, + "grad_norm": 0.03710145130753517, + "learning_rate": 5.511685547716328e-06, + "logits/chosen": 2.091726064682007, + "logits/rejected": 2.0658438205718994, + "logps/chosen": -469.0487060546875, + "logps/rejected": -553.4110107421875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.09385085105896, + "rewards/margins": 8.56124496459961, + "rewards/rejected": -11.655096054077148, + "step": 358 + }, + { + "epoch": 1.5955555555555554, + "grad_norm": 0.9721232056617737, + "learning_rate": 5.397004835582242e-06, + "logits/chosen": 2.050297498703003, + "logits/rejected": 2.0740513801574707, + "logps/chosen": -394.7935485839844, + "logps/rejected": -656.4100341796875, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.355221748352051, + "rewards/margins": 11.149545669555664, + "rewards/rejected": -14.504767417907715, + "step": 359 + }, + { + "epoch": 1.6, + "grad_norm": 0.1482057273387909, + "learning_rate": 5.2833852433720855e-06, + "logits/chosen": 2.2320728302001953, + "logits/rejected": 2.227717876434326, + "logps/chosen": -449.0965881347656, + "logps/rejected": -448.2038879394531, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.938412666320801, + "rewards/margins": 5.938919544219971, + "rewards/rejected": -12.87733268737793, + "step": 360 + }, + { + "epoch": 1.6, + "eval_logits/chosen": 2.020325183868408, + "eval_logits/rejected": 1.97914719581604, + "eval_logps/chosen": -332.6748352050781, + "eval_logps/rejected": -392.21588134765625, + "eval_loss": 0.41243118047714233, + "eval_rewards/accuracies": 0.8035714030265808, + "eval_rewards/chosen": -4.238221645355225, + "eval_rewards/margins": 3.401686429977417, + "eval_rewards/rejected": -7.639908313751221, + "eval_runtime": 17.4022, + "eval_samples_per_second": 2.873, + "eval_steps_per_second": 0.402, + "step": 360 + }, + { + "epoch": 1.6044444444444443, + "grad_norm": 26.702436447143555, + "learning_rate": 5.170832921371163e-06, + "logits/chosen": 2.1601366996765137, + "logits/rejected": 2.0925962924957275, + "logps/chosen": -737.0338134765625, + "logps/rejected": -734.032958984375, + "loss": 0.6978, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.603326797485352, + "rewards/margins": 4.569097995758057, + "rewards/rejected": -16.17242431640625, + "step": 361 + }, + { + "epoch": 1.608888888888889, + "grad_norm": 4.281423091888428, + "learning_rate": 5.059353962092917e-06, + "logits/chosen": 1.8992071151733398, + "logits/rejected": 1.9108917713165283, + "logps/chosen": -202.74957275390625, + "logps/rejected": -191.141357421875, + "loss": 0.1472, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23720017075538635, + "rewards/margins": 1.9029862880706787, + "rewards/rejected": -2.140186309814453, + "step": 362 + }, + { + "epoch": 1.6133333333333333, + "grad_norm": 0.12904126942157745, + "learning_rate": 4.9489543999491045e-06, + "logits/chosen": 2.1836905479431152, + "logits/rejected": 2.0876262187957764, + "logps/chosen": -383.00787353515625, + "logps/rejected": -463.5958251953125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018112175166606903, + "rewards/margins": 6.877252578735352, + "rewards/rejected": -6.879063606262207, + "step": 363 + }, + { + "epoch": 1.6177777777777778, + "grad_norm": 0.3242693245410919, + "learning_rate": 4.839640210923197e-06, + "logits/chosen": 1.981348991394043, + "logits/rejected": 1.8337197303771973, + "logps/chosen": -231.2086944580078, + "logps/rejected": -257.2821044921875, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9566452503204346, + "rewards/margins": 4.5026397705078125, + "rewards/rejected": -6.459284782409668, + "step": 364 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.7935138940811157, + "learning_rate": 4.731417312246877e-06, + "logits/chosen": 1.637596607208252, + "logits/rejected": 1.6671159267425537, + "logps/chosen": -181.33856201171875, + "logps/rejected": -310.20806884765625, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1557204723358154, + "rewards/margins": 6.343451023101807, + "rewards/rejected": -7.499171257019043, + "step": 365 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 0.17146991193294525, + "learning_rate": 4.624291562079719e-06, + "logits/chosen": 1.5095144510269165, + "logits/rejected": 1.552412748336792, + "logps/chosen": -283.5166015625, + "logps/rejected": -318.7493591308594, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3418242931365967, + "rewards/margins": 5.029508590698242, + "rewards/rejected": -8.371332168579102, + "step": 366 + }, + { + "epoch": 1.6311111111111112, + "grad_norm": 10.096573829650879, + "learning_rate": 4.518268759192115e-06, + "logits/chosen": 2.28369402885437, + "logits/rejected": 2.316972255706787, + "logps/chosen": -435.752685546875, + "logps/rejected": -479.5185241699219, + "loss": 0.1262, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.010312080383301, + "rewards/margins": 4.6889142990112305, + "rewards/rejected": -9.699226379394531, + "step": 367 + }, + { + "epoch": 1.6355555555555554, + "grad_norm": 0.13205423951148987, + "learning_rate": 4.413354642651369e-06, + "logits/chosen": 2.1447973251342773, + "logits/rejected": 2.208026885986328, + "logps/chosen": -333.09185791015625, + "logps/rejected": -571.9179077148438, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.114415168762207, + "rewards/margins": 6.399701118469238, + "rewards/rejected": -8.514116287231445, + "step": 368 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 1.1356106996536255, + "learning_rate": 4.309554891511036e-06, + "logits/chosen": 2.151458740234375, + "logits/rejected": 2.0680348873138428, + "logps/chosen": -406.594970703125, + "logps/rejected": -577.03662109375, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.874485731124878, + "rewards/margins": 9.36522102355957, + "rewards/rejected": -11.239706039428711, + "step": 369 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 0.09853781759738922, + "learning_rate": 4.206875124503506e-06, + "logits/chosen": 2.1071839332580566, + "logits/rejected": 2.133695602416992, + "logps/chosen": -299.2562255859375, + "logps/rejected": -481.5537109375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.580816745758057, + "rewards/margins": 6.295032024383545, + "rewards/rejected": -10.875848770141602, + "step": 370 + }, + { + "epoch": 1.6488888888888888, + "grad_norm": 1.1135728359222412, + "learning_rate": 4.105320899735882e-06, + "logits/chosen": 1.5641443729400635, + "logits/rejected": 1.5768111944198608, + "logps/chosen": -173.53851318359375, + "logps/rejected": -211.27960205078125, + "loss": 0.0758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20349428057670593, + "rewards/margins": 2.9661078453063965, + "rewards/rejected": -3.169602155685425, + "step": 371 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 0.5490663647651672, + "learning_rate": 4.004897714389103e-06, + "logits/chosen": 2.0074357986450195, + "logits/rejected": 2.0224769115448, + "logps/chosen": -344.1773681640625, + "logps/rejected": -435.5508728027344, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9361190795898438, + "rewards/margins": 5.090202331542969, + "rewards/rejected": -9.026321411132812, + "step": 372 + }, + { + "epoch": 1.6577777777777778, + "grad_norm": 1.7369130849838257, + "learning_rate": 3.90561100442036e-06, + "logits/chosen": 1.8831748962402344, + "logits/rejected": 1.8279378414154053, + "logps/chosen": -236.67889404296875, + "logps/rejected": -348.7945251464844, + "loss": 0.074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25758588314056396, + "rewards/margins": 6.053953170776367, + "rewards/rejected": -6.311539173126221, + "step": 373 + }, + { + "epoch": 1.6622222222222223, + "grad_norm": 0.015169711783528328, + "learning_rate": 3.8074661442688868e-06, + "logits/chosen": 2.0551671981811523, + "logits/rejected": 1.9679946899414062, + "logps/chosen": -285.3905944824219, + "logps/rejected": -503.6309814453125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3399689197540283, + "rewards/margins": 8.514545440673828, + "rewards/rejected": -9.854513168334961, + "step": 374 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.20316235721111298, + "learning_rate": 3.710468446565005e-06, + "logits/chosen": 1.994492769241333, + "logits/rejected": 1.9339189529418945, + "logps/chosen": -295.31878662109375, + "logps/rejected": -363.3540954589844, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.135162353515625, + "rewards/margins": 4.933624267578125, + "rewards/rejected": -7.06878662109375, + "step": 375 + }, + { + "epoch": 1.6711111111111112, + "grad_norm": 1.6665427684783936, + "learning_rate": 3.6146231618425646e-06, + "logits/chosen": 1.9451243877410889, + "logits/rejected": 2.0122804641723633, + "logps/chosen": -426.1136169433594, + "logps/rejected": -602.61279296875, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.325921654701233, + "rewards/margins": 8.843436241149902, + "rewards/rejected": -10.169357299804688, + "step": 376 + }, + { + "epoch": 1.6755555555555555, + "grad_norm": 76.74038696289062, + "learning_rate": 3.5199354782547156e-06, + "logits/chosen": 2.1591286659240723, + "logits/rejected": 2.029425859451294, + "logps/chosen": -350.96746826171875, + "logps/rejected": -409.662841796875, + "loss": 2.5142, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.8959503173828125, + "rewards/margins": -0.014461994171142578, + "rewards/rejected": -5.88148832321167, + "step": 377 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 1.8813796043395996, + "learning_rate": 3.4264105212930915e-06, + "logits/chosen": 1.5345783233642578, + "logits/rejected": 1.5260515213012695, + "logps/chosen": -137.23216247558594, + "logps/rejected": -172.87872314453125, + "loss": 0.3194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7019901275634766, + "rewards/margins": 2.383763313293457, + "rewards/rejected": -3.0857534408569336, + "step": 378 + }, + { + "epoch": 1.6844444444444444, + "grad_norm": 2.3558857440948486, + "learning_rate": 3.3340533535103467e-06, + "logits/chosen": 1.783468246459961, + "logits/rejected": 1.840031385421753, + "logps/chosen": -209.6715087890625, + "logps/rejected": -231.9936065673828, + "loss": 0.1628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41487425565719604, + "rewards/margins": 1.7598159313201904, + "rewards/rejected": -2.1746902465820312, + "step": 379 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 1.7316559553146362, + "learning_rate": 3.2428689742461188e-06, + "logits/chosen": 2.061565399169922, + "logits/rejected": 2.0696067810058594, + "logps/chosen": -303.8612060546875, + "logps/rejected": -330.27777099609375, + "loss": 0.0797, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3209824562072754, + "rewards/margins": 5.411049842834473, + "rewards/rejected": -6.732032299041748, + "step": 380 + }, + { + "epoch": 1.6933333333333334, + "grad_norm": 12.0430269241333, + "learning_rate": 3.152862319356428e-06, + "logits/chosen": 1.936488389968872, + "logits/rejected": 1.8861385583877563, + "logps/chosen": -475.26220703125, + "logps/rejected": -372.1424560546875, + "loss": 0.1641, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.164106845855713, + "rewards/margins": 3.2129530906677246, + "rewards/rejected": -9.377059936523438, + "step": 381 + }, + { + "epoch": 1.6977777777777778, + "grad_norm": 0.1999710500240326, + "learning_rate": 3.064038260946478e-06, + "logits/chosen": 2.0421996116638184, + "logits/rejected": 1.8888078927993774, + "logps/chosen": -289.0198059082031, + "logps/rejected": -385.6482849121094, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0051002502441406, + "rewards/margins": 6.449030876159668, + "rewards/rejected": -7.454131126403809, + "step": 382 + }, + { + "epoch": 1.7022222222222223, + "grad_norm": 0.18681201338768005, + "learning_rate": 2.9764016071069434e-06, + "logits/chosen": 2.0140395164489746, + "logits/rejected": 2.0304765701293945, + "logps/chosen": -271.6080627441406, + "logps/rejected": -379.2652893066406, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2312802076339722, + "rewards/margins": 5.010470867156982, + "rewards/rejected": -6.241751194000244, + "step": 383 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 0.5151910185813904, + "learning_rate": 2.8899571016536786e-06, + "logits/chosen": 1.9135384559631348, + "logits/rejected": 1.8421276807785034, + "logps/chosen": -327.75213623046875, + "logps/rejected": -383.0606689453125, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.634103298187256, + "rewards/margins": 5.9485626220703125, + "rewards/rejected": -9.582666397094727, + "step": 384 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.13173969089984894, + "learning_rate": 2.8047094238709633e-06, + "logits/chosen": 2.236691474914551, + "logits/rejected": 2.233146905899048, + "logps/chosen": -469.94879150390625, + "logps/rejected": -569.8831176757812, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.104136943817139, + "rewards/margins": 5.658421516418457, + "rewards/rejected": -12.762557983398438, + "step": 385 + }, + { + "epoch": 1.7155555555555555, + "grad_norm": 0.7114121317863464, + "learning_rate": 2.720663188258199e-06, + "logits/chosen": 1.9220545291900635, + "logits/rejected": 1.9399724006652832, + "logps/chosen": -412.7835693359375, + "logps/rejected": -462.8909606933594, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.260728359222412, + "rewards/margins": 4.300273895263672, + "rewards/rejected": -8.561002731323242, + "step": 386 + }, + { + "epoch": 1.72, + "grad_norm": 25.617097854614258, + "learning_rate": 2.637822944280116e-06, + "logits/chosen": 1.6366169452667236, + "logits/rejected": 1.6212671995162964, + "logps/chosen": -231.51577758789062, + "logps/rejected": -197.76779174804688, + "loss": 1.1373, + "rewards/accuracies": 0.0, + "rewards/chosen": -3.7230637073516846, + "rewards/margins": -0.7492774724960327, + "rewards/rejected": -2.9737863540649414, + "step": 387 + }, + { + "epoch": 1.7244444444444444, + "grad_norm": 2.152416706085205, + "learning_rate": 2.5561931761205082e-06, + "logits/chosen": 1.781626582145691, + "logits/rejected": 1.814887523651123, + "logps/chosen": -261.75830078125, + "logps/rejected": -285.1453857421875, + "loss": 0.0976, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2200870513916016, + "rewards/margins": 5.611637115478516, + "rewards/rejected": -6.831724166870117, + "step": 388 + }, + { + "epoch": 1.728888888888889, + "grad_norm": 28.067974090576172, + "learning_rate": 2.475778302439524e-06, + "logits/chosen": 1.615212321281433, + "logits/rejected": 1.6488571166992188, + "logps/chosen": -317.0101318359375, + "logps/rejected": -197.44625854492188, + "loss": 2.2031, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.956669330596924, + "rewards/margins": 0.2624635696411133, + "rewards/rejected": -5.219132423400879, + "step": 389 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 19.71435546875, + "learning_rate": 2.396582676134462e-06, + "logits/chosen": 2.0480542182922363, + "logits/rejected": 2.0675265789031982, + "logps/chosen": -269.1255187988281, + "logps/rejected": -295.6541442871094, + "loss": 1.3784, + "rewards/accuracies": 0.0, + "rewards/chosen": -4.372478485107422, + "rewards/margins": -0.9686035513877869, + "rewards/rejected": -3.4038748741149902, + "step": 390 + }, + { + "epoch": 1.7377777777777776, + "grad_norm": 3.7129645347595215, + "learning_rate": 2.318610584104142e-06, + "logits/chosen": 1.7886816263198853, + "logits/rejected": 1.657137393951416, + "logps/chosen": -400.5063781738281, + "logps/rejected": -427.5238037109375, + "loss": 0.0983, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.910498142242432, + "rewards/margins": 5.50076961517334, + "rewards/rejected": -10.411267280578613, + "step": 391 + }, + { + "epoch": 1.7422222222222223, + "grad_norm": 16.56305503845215, + "learning_rate": 2.241866247016869e-06, + "logits/chosen": 2.101799488067627, + "logits/rejected": 2.064134120941162, + "logps/chosen": -435.5615234375, + "logps/rejected": -487.712158203125, + "loss": 0.6052, + "rewards/accuracies": 0.5, + "rewards/chosen": -6.707480430603027, + "rewards/margins": 3.5099639892578125, + "rewards/rejected": -10.21744441986084, + "step": 392 + }, + { + "epoch": 1.7466666666666666, + "grad_norm": 0.02198374643921852, + "learning_rate": 2.166353819081968e-06, + "logits/chosen": 2.1594762802124023, + "logits/rejected": 2.2261545658111572, + "logps/chosen": -441.47528076171875, + "logps/rejected": -560.4810791015625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.149522304534912, + "rewards/margins": 8.681150436401367, + "rewards/rejected": -11.830673217773438, + "step": 393 + }, + { + "epoch": 1.751111111111111, + "grad_norm": 14.180363655090332, + "learning_rate": 2.092077387824884e-06, + "logits/chosen": 2.0479955673217773, + "logits/rejected": 1.947251796722412, + "logps/chosen": -368.0101623535156, + "logps/rejected": -422.68780517578125, + "loss": 0.3844, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.151028633117676, + "rewards/margins": 3.9921188354492188, + "rewards/rejected": -9.143147468566895, + "step": 394 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 0.006763577461242676, + "learning_rate": 2.0190409738659653e-06, + "logits/chosen": 2.2438273429870605, + "logits/rejected": 2.2101495265960693, + "logps/chosen": -518.1273803710938, + "logps/rejected": -702.2735595703125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.715429782867432, + "rewards/margins": 12.057371139526367, + "rewards/rejected": -16.77280044555664, + "step": 395 + }, + { + "epoch": 1.76, + "grad_norm": 0.18149635195732117, + "learning_rate": 1.9472485307027945e-06, + "logits/chosen": 2.1681084632873535, + "logits/rejected": 2.203289270401001, + "logps/chosen": -347.28582763671875, + "logps/rejected": -500.73455810546875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5240983963012695, + "rewards/margins": 6.283698081970215, + "rewards/rejected": -8.807796478271484, + "step": 396 + }, + { + "epoch": 1.7644444444444445, + "grad_norm": 4.91969108581543, + "learning_rate": 1.876703944496197e-06, + "logits/chosen": 1.9020869731903076, + "logits/rejected": 1.7357096672058105, + "logps/chosen": -352.68695068359375, + "logps/rejected": -305.7750244140625, + "loss": 0.1342, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5846099853515625, + "rewards/margins": 4.8691534996032715, + "rewards/rejected": -7.453763961791992, + "step": 397 + }, + { + "epoch": 1.7688888888888887, + "grad_norm": 57.27471923828125, + "learning_rate": 1.8074110338598682e-06, + "logits/chosen": 2.0281596183776855, + "logits/rejected": 1.8878042697906494, + "logps/chosen": -631.1502685546875, + "logps/rejected": -437.7010498046875, + "loss": 3.8594, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.262107849121094, + "rewards/margins": -1.1633968353271484, + "rewards/rejected": -12.098711013793945, + "step": 398 + }, + { + "epoch": 1.7733333333333334, + "grad_norm": 1.8617291450500488, + "learning_rate": 1.7393735496536944e-06, + "logits/chosen": 2.0407192707061768, + "logits/rejected": 1.8599579334259033, + "logps/chosen": -491.43707275390625, + "logps/rejected": -467.81060791015625, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.183530807495117, + "rewards/margins": 7.340937614440918, + "rewards/rejected": -15.524469375610352, + "step": 399 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 2.8930482864379883, + "learning_rate": 1.6725951747806918e-06, + "logits/chosen": 1.4244745969772339, + "logits/rejected": 1.4205752611160278, + "logps/chosen": -128.19241333007812, + "logps/rejected": -157.7073974609375, + "loss": 0.3631, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8688427209854126, + "rewards/margins": 1.723132848739624, + "rewards/rejected": -2.591975450515747, + "step": 400 + }, + { + "epoch": 1.7822222222222224, + "grad_norm": 3.2958927154541016, + "learning_rate": 1.6070795239876618e-06, + "logits/chosen": 2.2915682792663574, + "logits/rejected": 2.2581777572631836, + "logps/chosen": -368.1529541015625, + "logps/rejected": -491.930419921875, + "loss": 0.0333, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.686471700668335, + "rewards/margins": 7.626599311828613, + "rewards/rejected": -11.313071250915527, + "step": 401 + }, + { + "epoch": 1.7866666666666666, + "grad_norm": 3.986274480819702, + "learning_rate": 1.5428301436695159e-06, + "logits/chosen": 1.5831184387207031, + "logits/rejected": 1.6049795150756836, + "logps/chosen": -165.51271057128906, + "logps/rejected": -185.55548095703125, + "loss": 0.2804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1501312404870987, + "rewards/margins": 1.7175559997558594, + "rewards/rejected": -1.8676872253417969, + "step": 402 + }, + { + "epoch": 1.791111111111111, + "grad_norm": 29.756507873535156, + "learning_rate": 1.479850511677322e-06, + "logits/chosen": 2.165071487426758, + "logits/rejected": 2.0931169986724854, + "logps/chosen": -528.3546142578125, + "logps/rejected": -583.2621459960938, + "loss": 1.384, + "rewards/accuracies": 0.5, + "rewards/chosen": -6.661299228668213, + "rewards/margins": 4.992590427398682, + "rewards/rejected": -11.653889656066895, + "step": 403 + }, + { + "epoch": 1.7955555555555556, + "grad_norm": 6.813007354736328, + "learning_rate": 1.4181440371300342e-06, + "logits/chosen": 1.9760260581970215, + "logits/rejected": 1.9418466091156006, + "logps/chosen": -360.95574951171875, + "logps/rejected": -394.4306335449219, + "loss": 0.1623, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3200485706329346, + "rewards/margins": 4.02227783203125, + "rewards/rejected": -5.3423261642456055, + "step": 404 + }, + { + "epoch": 1.8, + "grad_norm": 0.347569078207016, + "learning_rate": 1.3577140602299448e-06, + "logits/chosen": 2.0109634399414062, + "logits/rejected": 1.9843730926513672, + "logps/chosen": -423.1788330078125, + "logps/rejected": -527.6969604492188, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6651382446289062, + "rewards/margins": 6.632790565490723, + "rewards/rejected": -9.297929763793945, + "step": 405 + }, + { + "epoch": 1.8, + "eval_logits/chosen": 2.008713722229004, + "eval_logits/rejected": 1.9683387279510498, + "eval_logps/chosen": -333.8675231933594, + "eval_logps/rejected": -395.43695068359375, + "eval_loss": 0.38143062591552734, + "eval_rewards/accuracies": 0.8214285969734192, + "eval_rewards/chosen": -4.357491493225098, + "eval_rewards/margins": 3.6045258045196533, + "eval_rewards/rejected": -7.96201753616333, + "eval_runtime": 17.3982, + "eval_samples_per_second": 2.874, + "eval_steps_per_second": 0.402, + "step": 405 + } + ], + "logging_steps": 1, + "max_steps": 450, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 45, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}