{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005234231876472127, "grad_norm": 0.5695298687302295, "learning_rate": 2.617801047120419e-08, "logits/chosen": -0.4997953176498413, "logits/rejected": -0.5751151442527771, "logps/chosen": -395.12640380859375, "logps/rejected": -316.8270568847656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.005234231876472127, "grad_norm": 0.5536662200797415, "learning_rate": 2.617801047120419e-07, "logits/chosen": -0.5801360011100769, "logits/rejected": -0.6067044138908386, "logps/chosen": -304.32293701171875, "logps/rejected": -244.29046630859375, "loss": 0.6929, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": -0.0005742510547861457, "rewards/margins": 0.00042081804713234305, "rewards/rejected": -0.0009950690437108278, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 0.4593035639265128, "learning_rate": 5.235602094240838e-07, "logits/chosen": -0.6146650314331055, "logits/rejected": -0.615092396736145, "logps/chosen": -267.64739990234375, "logps/rejected": -261.7188415527344, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0001226421882165596, "rewards/margins": -0.00022562053345609456, "rewards/rejected": 0.00034826272167265415, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 0.5145955390417808, "learning_rate": 7.853403141361258e-07, "logits/chosen": -0.6289754509925842, "logits/rejected": -0.6177533268928528, "logps/chosen": -280.3735046386719, "logps/rejected": -242.95175170898438, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.0002500644768588245, "rewards/margins": 0.0001505579421063885, "rewards/rejected": 9.950650564860553e-05, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 0.5230612257382828, "learning_rate": 1.0471204188481676e-06, "logits/chosen": -0.5945444107055664, "logits/rejected": -0.6121580600738525, "logps/chosen": -267.5122375488281, "logps/rejected": -268.740234375, "loss": 0.6926, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0005460727261379361, "rewards/margins": 0.0009088722290471196, "rewards/rejected": -0.0003627995611168444, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 0.5459284518349332, "learning_rate": 1.3089005235602096e-06, "logits/chosen": -0.6024752259254456, "logits/rejected": -0.6329907774925232, "logps/chosen": -285.15838623046875, "logps/rejected": -254.0716552734375, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0005030709435231984, "rewards/margins": 0.000754082459025085, "rewards/rejected": -0.0002510116610210389, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 0.47503175617580706, "learning_rate": 1.5706806282722515e-06, "logits/chosen": -0.6266981363296509, "logits/rejected": -0.6208000183105469, "logps/chosen": -318.3113098144531, "logps/rejected": -271.3620300292969, "loss": 0.6917, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0019611348398029804, "rewards/margins": 0.002480647061020136, "rewards/rejected": -0.0005195126286707819, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 0.5387833374092045, "learning_rate": 1.8324607329842933e-06, "logits/chosen": -0.5752447843551636, "logits/rejected": -0.6073333024978638, "logps/chosen": -274.0575866699219, "logps/rejected": -241.2310028076172, "loss": 0.6912, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0042357188649475574, "rewards/margins": 0.0048388526774942875, "rewards/rejected": -0.0006031342782080173, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 0.573484197995026, "learning_rate": 2.094240837696335e-06, "logits/chosen": -0.5686159133911133, "logits/rejected": -0.5909486413002014, "logps/chosen": -298.2327880859375, "logps/rejected": -275.91986083984375, "loss": 0.6902, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.007856507785618305, "rewards/margins": 0.0063124834559857845, "rewards/rejected": 0.0015440242132171988, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 0.5157342119342403, "learning_rate": 2.356020942408377e-06, "logits/chosen": -0.6013309955596924, "logits/rejected": -0.6156548857688904, "logps/chosen": -256.23602294921875, "logps/rejected": -235.50259399414062, "loss": 0.6896, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.010481725446879864, "rewards/margins": 0.005771929398179054, "rewards/rejected": 0.0047097960487008095, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 0.5281481398431945, "learning_rate": 2.617801047120419e-06, "logits/chosen": -0.5954197645187378, "logits/rejected": -0.6362258195877075, "logps/chosen": -246.4618682861328, "logps/rejected": -205.40481567382812, "loss": 0.6865, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.01956891641020775, "rewards/margins": 0.013755053281784058, "rewards/rejected": 0.005813863128423691, "step": 100 }, { "epoch": 0.05234231876472128, "eval_logits/chosen": -0.5952818393707275, "eval_logits/rejected": -0.6047875285148621, "eval_logps/chosen": -274.37066650390625, "eval_logps/rejected": -252.90138244628906, "eval_loss": 0.685746967792511, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": 0.02022642455995083, "eval_rewards/margins": 0.016626423224806786, "eval_rewards/rejected": 0.003600001335144043, "eval_runtime": 492.1641, "eval_samples_per_second": 4.064, "eval_steps_per_second": 0.254, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 0.5272794123234011, "learning_rate": 2.8795811518324613e-06, "logits/chosen": -0.5827732682228088, "logits/rejected": -0.6242547035217285, "logps/chosen": -261.44049072265625, "logps/rejected": -211.53970336914062, "loss": 0.6829, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.024311328306794167, "rewards/margins": 0.0224609337747097, "rewards/rejected": 0.0018503913888707757, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 0.5281893762396359, "learning_rate": 3.141361256544503e-06, "logits/chosen": -0.5533467531204224, "logits/rejected": -0.5774653553962708, "logps/chosen": -323.4781799316406, "logps/rejected": -284.3753967285156, "loss": 0.6828, "rewards/accuracies": 0.65625, "rewards/chosen": 0.028400782495737076, "rewards/margins": 0.018748918548226357, "rewards/rejected": 0.009651863016188145, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 0.5624037848656742, "learning_rate": 3.403141361256545e-06, "logits/chosen": -0.519309401512146, "logits/rejected": -0.538366436958313, "logps/chosen": -298.1244201660156, "logps/rejected": -273.02166748046875, "loss": 0.6764, "rewards/accuracies": 0.71875, "rewards/chosen": 0.018718790262937546, "rewards/margins": 0.03766729682683945, "rewards/rejected": -0.0189485065639019, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 0.6241582910982458, "learning_rate": 3.6649214659685865e-06, "logits/chosen": -0.6155737638473511, "logits/rejected": -0.615781307220459, "logps/chosen": -254.7561798095703, "logps/rejected": -252.4250030517578, "loss": 0.6661, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.0013563375687226653, "rewards/margins": 0.060362327843904495, "rewards/rejected": -0.0617186613380909, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 0.7819020006868519, "learning_rate": 3.926701570680629e-06, "logits/chosen": -0.5579255819320679, "logits/rejected": -0.5662384033203125, "logps/chosen": -294.1602478027344, "logps/rejected": -283.9729309082031, "loss": 0.6542, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.06525762379169464, "rewards/margins": 0.0802302211523056, "rewards/rejected": -0.14548785984516144, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 1.0082244738839943, "learning_rate": 4.18848167539267e-06, "logits/chosen": -0.6144393086433411, "logits/rejected": -0.6438087224960327, "logps/chosen": -293.11529541015625, "logps/rejected": -274.2389221191406, "loss": 0.6383, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17805984616279602, "rewards/margins": 0.11895015090703964, "rewards/rejected": -0.29701000452041626, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 1.0185195747317044, "learning_rate": 4.450261780104713e-06, "logits/chosen": -0.6413242220878601, "logits/rejected": -0.6541165709495544, "logps/chosen": -264.94549560546875, "logps/rejected": -280.5323486328125, "loss": 0.6307, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.06771933287382126, "rewards/margins": 0.1421503722667694, "rewards/rejected": -0.20986969769001007, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 1.4632164660193285, "learning_rate": 4.712041884816754e-06, "logits/chosen": -0.8007810711860657, "logits/rejected": -0.8442818522453308, "logps/chosen": -335.73040771484375, "logps/rejected": -315.95654296875, "loss": 0.6239, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2423553764820099, "rewards/margins": 0.19543686509132385, "rewards/rejected": -0.43779221177101135, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 1.4793919762168768, "learning_rate": 4.9738219895287965e-06, "logits/chosen": -0.8508358001708984, "logits/rejected": -0.9171808362007141, "logps/chosen": -319.34222412109375, "logps/rejected": -312.38458251953125, "loss": 0.5741, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3020084500312805, "rewards/margins": 0.32925525307655334, "rewards/rejected": -0.6312636733055115, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 1.3909979729045314, "learning_rate": 4.999661831436499e-06, "logits/chosen": -0.8342474699020386, "logits/rejected": -0.8582927584648132, "logps/chosen": -305.7598571777344, "logps/rejected": -329.6683044433594, "loss": 0.5773, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3809957504272461, "rewards/margins": 0.3516261875629425, "rewards/rejected": -0.7326219081878662, "step": 200 }, { "epoch": 0.10468463752944256, "eval_logits/chosen": -0.9088600277900696, "eval_logits/rejected": -0.9407602548599243, "eval_logps/chosen": -330.3779296875, "eval_logps/rejected": -347.161376953125, "eval_loss": 0.580152690410614, "eval_rewards/accuracies": 0.7080000042915344, "eval_rewards/chosen": -0.5398465991020203, "eval_rewards/margins": 0.3991530239582062, "eval_rewards/rejected": -0.9389996528625488, "eval_runtime": 493.0243, "eval_samples_per_second": 4.057, "eval_steps_per_second": 0.254, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 2.2981570438136902, "learning_rate": 4.9984929711403395e-06, "logits/chosen": -0.8167268633842468, "logits/rejected": -0.8397541046142578, "logps/chosen": -311.28460693359375, "logps/rejected": -334.7868347167969, "loss": 0.5759, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6255691051483154, "rewards/margins": 0.4341684877872467, "rewards/rejected": -1.0597375631332397, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 2.1371618628386155, "learning_rate": 4.996489634487865e-06, "logits/chosen": -0.8237009048461914, "logits/rejected": -0.8077519536018372, "logps/chosen": -394.8115539550781, "logps/rejected": -410.77276611328125, "loss": 0.5898, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0387346744537354, "rewards/margins": 0.47027960419654846, "rewards/rejected": -1.509014368057251, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 2.1931382967274287, "learning_rate": 4.9936524905772466e-06, "logits/chosen": -0.7595096826553345, "logits/rejected": -0.7923563718795776, "logps/chosen": -336.1356201171875, "logps/rejected": -397.56024169921875, "loss": 0.5126, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9009682536125183, "rewards/margins": 0.6920944452285767, "rewards/rejected": -1.5930627584457397, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 3.5772907228062776, "learning_rate": 4.9899824869915e-06, "logits/chosen": -0.8187972903251648, "logits/rejected": -0.8766587376594543, "logps/chosen": -422.3431701660156, "logps/rejected": -457.0511779785156, "loss": 0.5458, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.592961311340332, "rewards/margins": 0.48186540603637695, "rewards/rejected": -2.074826717376709, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 1.9360981239656954, "learning_rate": 4.985480849482012e-06, "logits/chosen": -0.8391903042793274, "logits/rejected": -0.8728139996528625, "logps/chosen": -411.3055725097656, "logps/rejected": -408.16082763671875, "loss": 0.5184, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0500333309173584, "rewards/margins": 0.5413428544998169, "rewards/rejected": -1.5913760662078857, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 3.8063942330720124, "learning_rate": 4.980149081559142e-06, "logits/chosen": -0.8690627813339233, "logits/rejected": -0.9096955060958862, "logps/chosen": -373.99871826171875, "logps/rejected": -421.1673889160156, "loss": 0.5362, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9393401145935059, "rewards/margins": 0.7630335092544556, "rewards/rejected": -1.702373743057251, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 2.3218168753541364, "learning_rate": 4.9739889639900655e-06, "logits/chosen": -0.8408964276313782, "logits/rejected": -0.8704169392585754, "logps/chosen": -416.67352294921875, "logps/rejected": -505.36279296875, "loss": 0.4785, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3642221689224243, "rewards/margins": 0.9632150530815125, "rewards/rejected": -2.327437162399292, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 3.6911223439591407, "learning_rate": 4.967002554204009e-06, "logits/chosen": -0.9045387506484985, "logits/rejected": -0.9169967770576477, "logps/chosen": -359.976806640625, "logps/rejected": -423.2659606933594, "loss": 0.5433, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8073030710220337, "rewards/margins": 0.6288052201271057, "rewards/rejected": -1.4361083507537842, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 3.3423618640552095, "learning_rate": 4.959192185605089e-06, "logits/chosen": -0.9444735646247864, "logits/rejected": -0.9996434450149536, "logps/chosen": -437.74822998046875, "logps/rejected": -472.47003173828125, "loss": 0.5385, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0655570030212402, "rewards/margins": 0.7227457761764526, "rewards/rejected": -1.7883027791976929, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 1.7496473866481979, "learning_rate": 4.950560466792969e-06, "logits/chosen": -0.9565297961235046, "logits/rejected": -1.0268198251724243, "logps/chosen": -383.75982666015625, "logps/rejected": -431.46868896484375, "loss": 0.546, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1211820840835571, "rewards/margins": 0.7908428907394409, "rewards/rejected": -1.9120250940322876, "step": 300 }, { "epoch": 0.15702695629416383, "eval_logits/chosen": -1.0509591102600098, "eval_logits/rejected": -1.093695044517517, "eval_logps/chosen": -375.9070739746094, "eval_logps/rejected": -426.7812194824219, "eval_loss": 0.5337316393852234, "eval_rewards/accuracies": 0.7369999885559082, "eval_rewards/chosen": -0.9951376914978027, "eval_rewards/margins": 0.7400606274604797, "eval_rewards/rejected": -1.7351982593536377, "eval_runtime": 490.0828, "eval_samples_per_second": 4.081, "eval_steps_per_second": 0.255, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 4.167924029619072, "learning_rate": 4.9411102806916185e-06, "logits/chosen": -0.9661039113998413, "logits/rejected": -1.0193841457366943, "logps/chosen": -386.1661071777344, "logps/rejected": -459.80059814453125, "loss": 0.4885, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.1400443315505981, "rewards/margins": 0.9499530792236328, "rewards/rejected": -2.0899975299835205, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 3.7134888984526406, "learning_rate": 4.930844783586424e-06, "logits/chosen": -0.9965893030166626, "logits/rejected": -1.0922787189483643, "logps/chosen": -487.04620361328125, "logps/rejected": -534.34326171875, "loss": 0.5121, "rewards/accuracies": 0.78125, "rewards/chosen": -2.133643627166748, "rewards/margins": 0.8644089698791504, "rewards/rejected": -2.9980525970458984, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 2.6372716992994625, "learning_rate": 4.919767404070033e-06, "logits/chosen": -1.044533371925354, "logits/rejected": -1.132899522781372, "logps/chosen": -446.17132568359375, "logps/rejected": -512.2128295898438, "loss": 0.5259, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.830833077430725, "rewards/margins": 0.8785037994384766, "rewards/rejected": -2.709336519241333, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 2.585264829780915, "learning_rate": 4.907881841897216e-06, "logits/chosen": -1.0099565982818604, "logits/rejected": -1.0530710220336914, "logps/chosen": -442.65887451171875, "logps/rejected": -488.6505432128906, "loss": 0.4837, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5058012008666992, "rewards/margins": 0.7143822908401489, "rewards/rejected": -2.2201833724975586, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 3.3879317879760484, "learning_rate": 4.89519206674919e-06, "logits/chosen": -0.9927116632461548, "logits/rejected": -1.0346943140029907, "logps/chosen": -456.9622497558594, "logps/rejected": -577.3074951171875, "loss": 0.4784, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0847067832946777, "rewards/margins": 1.002403974533081, "rewards/rejected": -3.0871105194091797, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 3.57516881024075, "learning_rate": 4.881702316907769e-06, "logits/chosen": -1.0031338930130005, "logits/rejected": -1.0955650806427002, "logps/chosen": -503.44683837890625, "logps/rejected": -570.7683715820312, "loss": 0.4926, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9374068975448608, "rewards/margins": 1.0033470392227173, "rewards/rejected": -2.940753936767578, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 5.038599527660393, "learning_rate": 4.86741709783982e-06, "logits/chosen": -1.026948094367981, "logits/rejected": -1.0862653255462646, "logps/chosen": -428.8387145996094, "logps/rejected": -521.2025146484375, "loss": 0.4521, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6744959354400635, "rewards/margins": 1.2723809480667114, "rewards/rejected": -2.9468765258789062, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 2.580969896149703, "learning_rate": 4.852341180692471e-06, "logits/chosen": -1.0237780809402466, "logits/rejected": -1.0312225818634033, "logps/chosen": -460.7245178222656, "logps/rejected": -579.8516845703125, "loss": 0.4707, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8517287969589233, "rewards/margins": 1.0035735368728638, "rewards/rejected": -2.855302333831787, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 4.049123524870898, "learning_rate": 4.836479600699579e-06, "logits/chosen": -1.021194577217102, "logits/rejected": -1.065612554550171, "logps/chosen": -474.7022399902344, "logps/rejected": -515.19384765625, "loss": 0.5062, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6703779697418213, "rewards/margins": 0.9831420183181763, "rewards/rejected": -2.653519868850708, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 2.669349458066594, "learning_rate": 4.819837655500014e-06, "logits/chosen": -1.055345892906189, "logits/rejected": -1.1153924465179443, "logps/chosen": -409.0503845214844, "logps/rejected": -475.8119201660156, "loss": 0.501, "rewards/accuracies": 0.75, "rewards/chosen": -1.5296335220336914, "rewards/margins": 0.9046236872673035, "rewards/rejected": -2.4342570304870605, "step": 400 }, { "epoch": 0.2093692750588851, "eval_logits/chosen": -1.0595479011535645, "eval_logits/rejected": -1.101104974746704, "eval_logps/chosen": -458.5479431152344, "eval_logps/rejected": -529.427734375, "eval_loss": 0.5120114088058472, "eval_rewards/accuracies": 0.753000020980835, "eval_rewards/chosen": -1.8215464353561401, "eval_rewards/margins": 0.9401166439056396, "eval_rewards/rejected": -2.7616631984710693, "eval_runtime": 490.6583, "eval_samples_per_second": 4.076, "eval_steps_per_second": 0.255, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 2.693659265285776, "learning_rate": 4.802420903368286e-06, "logits/chosen": -1.017884612083435, "logits/rejected": -1.0450990200042725, "logps/chosen": -452.4844665527344, "logps/rejected": -545.2508544921875, "loss": 0.5075, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.978095293045044, "rewards/margins": 0.8133966326713562, "rewards/rejected": -2.791491746902466, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 3.0138584618040816, "learning_rate": 4.784235161358124e-06, "logits/chosen": -1.0173218250274658, "logits/rejected": -1.0448085069656372, "logps/chosen": -483.49609375, "logps/rejected": -571.6468505859375, "loss": 0.5192, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0148584842681885, "rewards/margins": 0.8309059143066406, "rewards/rejected": -2.84576416015625, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 2.7069812702685327, "learning_rate": 4.765286503359632e-06, "logits/chosen": -0.953274130821228, "logits/rejected": -1.0237443447113037, "logps/chosen": -445.3104553222656, "logps/rejected": -500.704345703125, "loss": 0.5173, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7284530401229858, "rewards/margins": 0.7751290798187256, "rewards/rejected": -2.503582239151001, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 2.6842575235901256, "learning_rate": 4.745581258070654e-06, "logits/chosen": -0.9820082783699036, "logits/rejected": -1.0258147716522217, "logps/chosen": -424.865966796875, "logps/rejected": -499.13031005859375, "loss": 0.5075, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5546586513519287, "rewards/margins": 0.8893228769302368, "rewards/rejected": -2.443981409072876, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 2.8248158426262036, "learning_rate": 4.725126006883047e-06, "logits/chosen": -0.9185377955436707, "logits/rejected": -0.94728022813797, "logps/chosen": -403.70159912109375, "logps/rejected": -514.3756713867188, "loss": 0.5284, "rewards/accuracies": 0.6875, "rewards/chosen": -1.593689203262329, "rewards/margins": 0.8767625689506531, "rewards/rejected": -2.470451593399048, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 3.479296042367863, "learning_rate": 4.70392758168454e-06, "logits/chosen": -0.8726640939712524, "logits/rejected": -0.9334267377853394, "logps/chosen": -439.721435546875, "logps/rejected": -487.37384033203125, "loss": 0.5126, "rewards/accuracies": 0.75, "rewards/chosen": -1.5047967433929443, "rewards/margins": 0.8623396158218384, "rewards/rejected": -2.3671364784240723, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 4.319750283791727, "learning_rate": 4.68199306257695e-06, "logits/chosen": -0.9039748311042786, "logits/rejected": -0.9309199452400208, "logps/chosen": -492.0690002441406, "logps/rejected": -554.73388671875, "loss": 0.4953, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8066781759262085, "rewards/margins": 0.9244368672370911, "rewards/rejected": -2.7311148643493652, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 3.14164730805949, "learning_rate": 4.659329775511478e-06, "logits/chosen": -0.9303410649299622, "logits/rejected": -0.977057933807373, "logps/chosen": -399.0423889160156, "logps/rejected": -490.00543212890625, "loss": 0.4683, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4180445671081543, "rewards/margins": 1.0890886783599854, "rewards/rejected": -2.5071330070495605, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 2.3614868571221357, "learning_rate": 4.635945289841902e-06, "logits/chosen": -0.8816567659378052, "logits/rejected": -0.9269768595695496, "logps/chosen": -415.19219970703125, "logps/rejected": -476.89556884765625, "loss": 0.4735, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3752299547195435, "rewards/margins": 0.9180902242660522, "rewards/rejected": -2.2933201789855957, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 2.8583338952559925, "learning_rate": 4.611847415796476e-06, "logits/chosen": -0.8435959815979004, "logits/rejected": -0.9220407605171204, "logps/chosen": -441.06201171875, "logps/rejected": -533.26904296875, "loss": 0.4525, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.673166275024414, "rewards/margins": 1.2388008832931519, "rewards/rejected": -2.9119672775268555, "step": 500 }, { "epoch": 0.26171159382360637, "eval_logits/chosen": -0.9134161472320557, "eval_logits/rejected": -0.9429917931556702, "eval_logps/chosen": -474.96240234375, "eval_logps/rejected": -561.74462890625, "eval_loss": 0.5090299248695374, "eval_rewards/accuracies": 0.7509999871253967, "eval_rewards/chosen": -1.985690951347351, "eval_rewards/margins": 1.0991418361663818, "eval_rewards/rejected": -3.0848329067230225, "eval_runtime": 490.4743, "eval_samples_per_second": 4.078, "eval_steps_per_second": 0.255, "step": 500 }, { "epoch": 0.2669458257000785, "grad_norm": 3.035220836230476, "learning_rate": 4.587044201869378e-06, "logits/chosen": -0.8761506080627441, "logits/rejected": -0.902166485786438, "logps/chosen": -425.97332763671875, "logps/rejected": -501.02789306640625, "loss": 0.497, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6953567266464233, "rewards/margins": 0.9890697598457336, "rewards/rejected": -2.6844265460968018, "step": 510 }, { "epoch": 0.2721800575765506, "grad_norm": 3.130477955866568, "learning_rate": 4.561543932132574e-06, "logits/chosen": -0.7917976379394531, "logits/rejected": -0.8613120317459106, "logps/chosen": -403.6101379394531, "logps/rejected": -495.6336975097656, "loss": 0.4638, "rewards/accuracies": 0.75, "rewards/chosen": -1.1837232112884521, "rewards/margins": 1.1299959421157837, "rewards/rejected": -2.3137192726135254, "step": 520 }, { "epoch": 0.27741428945302277, "grad_norm": 5.62051791025711, "learning_rate": 4.535355123469009e-06, "logits/chosen": -0.8685650825500488, "logits/rejected": -0.9064447283744812, "logps/chosen": -453.49505615234375, "logps/rejected": -571.8440551757812, "loss": 0.4962, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7710943222045898, "rewards/margins": 1.2530162334442139, "rewards/rejected": -3.0241103172302246, "step": 530 }, { "epoch": 0.2826485213294949, "grad_norm": 4.031206237193703, "learning_rate": 4.508486522728037e-06, "logits/chosen": -0.8691636919975281, "logits/rejected": -0.8954900503158569, "logps/chosen": -481.83001708984375, "logps/rejected": -557.0582275390625, "loss": 0.464, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1070663928985596, "rewards/margins": 1.1235601902008057, "rewards/rejected": -3.2306265830993652, "step": 540 }, { "epoch": 0.287882753205967, "grad_norm": 2.5570522477618023, "learning_rate": 4.480947103804044e-06, "logits/chosen": -0.8274758458137512, "logits/rejected": -0.8582462072372437, "logps/chosen": -516.5406494140625, "logps/rejected": -561.04736328125, "loss": 0.5267, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.3619277477264404, "rewards/margins": 0.9269983172416687, "rewards/rejected": -3.288925886154175, "step": 550 }, { "epoch": 0.29311698508243916, "grad_norm": 2.788640181418083, "learning_rate": 4.452746064639239e-06, "logits/chosen": -0.9235193133354187, "logits/rejected": -0.9567066431045532, "logps/chosen": -500.55810546875, "logps/rejected": -591.5838012695312, "loss": 0.4856, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2232844829559326, "rewards/margins": 0.9942595362663269, "rewards/rejected": -3.217543840408325, "step": 560 }, { "epoch": 0.29835121695891126, "grad_norm": 2.993526317828841, "learning_rate": 4.423892824151617e-06, "logits/chosen": -0.9176028966903687, "logits/rejected": -0.9547454118728638, "logps/chosen": -483.21490478515625, "logps/rejected": -539.8753051757812, "loss": 0.4752, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7947994470596313, "rewards/margins": 0.9781554937362671, "rewards/rejected": -2.7729547023773193, "step": 570 }, { "epoch": 0.3035854488353834, "grad_norm": 3.5431736202634885, "learning_rate": 4.3943970190891164e-06, "logits/chosen": -0.9206205606460571, "logits/rejected": -0.9628446698188782, "logps/chosen": -463.5152282714844, "logps/rejected": -528.37158203125, "loss": 0.4885, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6288913488388062, "rewards/margins": 1.1642273664474487, "rewards/rejected": -2.793118953704834, "step": 580 }, { "epoch": 0.30881968071185556, "grad_norm": 3.365610293148701, "learning_rate": 4.364268500811025e-06, "logits/chosen": -0.8485568165779114, "logits/rejected": -0.9199141263961792, "logps/chosen": -483.6082458496094, "logps/rejected": -560.9791259765625, "loss": 0.5057, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1659016609191895, "rewards/margins": 1.034793734550476, "rewards/rejected": -3.200695514678955, "step": 590 }, { "epoch": 0.31405391258832765, "grad_norm": 4.5918534300247975, "learning_rate": 4.333517331997704e-06, "logits/chosen": -0.9025293588638306, "logits/rejected": -0.9563030004501343, "logps/chosen": -486.9297790527344, "logps/rejected": -560.46630859375, "loss": 0.508, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.339505434036255, "rewards/margins": 0.8578627705574036, "rewards/rejected": -3.1973681449890137, "step": 600 }, { "epoch": 0.31405391258832765, "eval_logits/chosen": -0.96256422996521, "eval_logits/rejected": -0.9954620599746704, "eval_logps/chosen": -497.45501708984375, "eval_logps/rejected": -568.3763427734375, "eval_loss": 0.5005487203598022, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -2.210617780685425, "eval_rewards/margins": 0.940531849861145, "eval_rewards/rejected": -3.151149272918701, "eval_runtime": 491.9737, "eval_samples_per_second": 4.065, "eval_steps_per_second": 0.254, "step": 600 }, { "epoch": 0.3192881444647998, "grad_norm": 2.901694328327758, "learning_rate": 4.302153783289737e-06, "logits/chosen": -0.9074804186820984, "logits/rejected": -0.9480104446411133, "logps/chosen": -492.01910400390625, "logps/rejected": -600.9209594726562, "loss": 0.469, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1020569801330566, "rewards/margins": 1.1244044303894043, "rewards/rejected": -3.226461410522461, "step": 610 }, { "epoch": 0.3245223763412719, "grad_norm": 3.94946155728758, "learning_rate": 4.270188329857613e-06, "logits/chosen": -0.8973082304000854, "logits/rejected": -0.9559001922607422, "logps/chosen": -465.6111755371094, "logps/rejected": -525.2369384765625, "loss": 0.4846, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6850999593734741, "rewards/margins": 1.1260297298431396, "rewards/rejected": -2.811129570007324, "step": 620 }, { "epoch": 0.32975660821774405, "grad_norm": 3.033290372148656, "learning_rate": 4.237631647903115e-06, "logits/chosen": -0.9191315770149231, "logits/rejected": -0.9655174016952515, "logps/chosen": -424.0669860839844, "logps/rejected": -516.409912109375, "loss": 0.5221, "rewards/accuracies": 0.75, "rewards/chosen": -1.6103105545043945, "rewards/margins": 1.0200879573822021, "rewards/rejected": -2.630398988723755, "step": 630 }, { "epoch": 0.33499084009421615, "grad_norm": 6.947248145776445, "learning_rate": 4.204494611093548e-06, "logits/chosen": -0.8765581846237183, "logits/rejected": -0.9510295987129211, "logps/chosen": -403.50701904296875, "logps/rejected": -494.7275390625, "loss": 0.5006, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.570704698562622, "rewards/margins": 1.1784632205963135, "rewards/rejected": -2.7491679191589355, "step": 640 }, { "epoch": 0.3402250719706883, "grad_norm": 3.486686904718388, "learning_rate": 4.170788286930024e-06, "logits/chosen": -0.9069339632987976, "logits/rejected": -0.9199384450912476, "logps/chosen": -444.85009765625, "logps/rejected": -555.9014282226562, "loss": 0.5045, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9438743591308594, "rewards/margins": 1.0122339725494385, "rewards/rejected": -2.956108570098877, "step": 650 }, { "epoch": 0.34545930384716045, "grad_norm": 4.061094803349214, "learning_rate": 4.136523933051005e-06, "logits/chosen": -0.8677660822868347, "logits/rejected": -0.9190397262573242, "logps/chosen": -406.159912109375, "logps/rejected": -498.9130859375, "loss": 0.497, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4709824323654175, "rewards/margins": 0.9458149671554565, "rewards/rejected": -2.416797399520874, "step": 660 }, { "epoch": 0.35069353572363254, "grad_norm": 4.528134734617534, "learning_rate": 4.101712993472348e-06, "logits/chosen": -0.8407672643661499, "logits/rejected": -0.8770118951797485, "logps/chosen": -421.9403381347656, "logps/rejected": -511.84820556640625, "loss": 0.5147, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5217100381851196, "rewards/margins": 1.0356905460357666, "rewards/rejected": -2.5574002265930176, "step": 670 }, { "epoch": 0.3559277676001047, "grad_norm": 3.331708093137532, "learning_rate": 4.066367094765091e-06, "logits/chosen": -0.8362857103347778, "logits/rejected": -0.8756265640258789, "logps/chosen": -427.3866271972656, "logps/rejected": -472.51116943359375, "loss": 0.5107, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6107534170150757, "rewards/margins": 0.761100709438324, "rewards/rejected": -2.371854066848755, "step": 680 }, { "epoch": 0.3611619994765768, "grad_norm": 3.054351768159171, "learning_rate": 4.030498042172277e-06, "logits/chosen": -0.887158989906311, "logits/rejected": -0.9305670857429504, "logps/chosen": -436.98663330078125, "logps/rejected": -536.4581298828125, "loss": 0.4933, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5907622575759888, "rewards/margins": 1.0380842685699463, "rewards/rejected": -2.6288466453552246, "step": 690 }, { "epoch": 0.36639623135304894, "grad_norm": 3.9217623567649165, "learning_rate": 3.994117815666095e-06, "logits/chosen": -0.8911476135253906, "logits/rejected": -0.9339841604232788, "logps/chosen": -435.4269104003906, "logps/rejected": -515.2606201171875, "loss": 0.4852, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6368818283081055, "rewards/margins": 0.932881236076355, "rewards/rejected": -2.56976318359375, "step": 700 }, { "epoch": 0.36639623135304894, "eval_logits/chosen": -0.9476205110549927, "eval_logits/rejected": -0.9794169664382935, "eval_logps/chosen": -416.10260009765625, "eval_logps/rejected": -494.5317077636719, "eval_loss": 0.5027905106544495, "eval_rewards/accuracies": 0.7770000100135803, "eval_rewards/chosen": -1.3970927000045776, "eval_rewards/margins": 1.0156108140945435, "eval_rewards/rejected": -2.412703275680542, "eval_runtime": 489.1035, "eval_samples_per_second": 4.089, "eval_steps_per_second": 0.256, "step": 700 }, { "epoch": 0.3716304632295211, "grad_norm": 3.6606573607441772, "learning_rate": 3.957238565946672e-06, "logits/chosen": -0.8809655904769897, "logits/rejected": -0.9375411868095398, "logps/chosen": -473.4790954589844, "logps/rejected": -525.9794311523438, "loss": 0.4515, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.573014736175537, "rewards/margins": 1.1010257005691528, "rewards/rejected": -2.6740405559539795, "step": 710 }, { "epoch": 0.3768646951059932, "grad_norm": 4.5368251934335655, "learning_rate": 3.919872610383831e-06, "logits/chosen": -0.8901004791259766, "logits/rejected": -0.9005835652351379, "logps/chosen": -456.73626708984375, "logps/rejected": -578.81201171875, "loss": 0.4683, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.028618335723877, "rewards/margins": 1.2738052606582642, "rewards/rejected": -3.3024234771728516, "step": 720 }, { "epoch": 0.38209892698246534, "grad_norm": 4.7885163624566704, "learning_rate": 3.882032428903195e-06, "logits/chosen": -0.8851186633110046, "logits/rejected": -0.8868842124938965, "logps/chosen": -499.48040771484375, "logps/rejected": -608.1116333007812, "loss": 0.5104, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.6055073738098145, "rewards/margins": 1.1688528060913086, "rewards/rejected": -3.774359941482544, "step": 730 }, { "epoch": 0.38733315885893743, "grad_norm": 2.6857761500293766, "learning_rate": 3.84373065981799e-06, "logits/chosen": -0.8212822675704956, "logits/rejected": -0.851559042930603, "logps/chosen": -512.4810791015625, "logps/rejected": -562.7243041992188, "loss": 0.451, "rewards/accuracies": 0.78125, "rewards/chosen": -2.420043468475342, "rewards/margins": 1.00551438331604, "rewards/rejected": -3.425558090209961, "step": 740 }, { "epoch": 0.3925673907354096, "grad_norm": 5.276994078087987, "learning_rate": 3.8049800956079552e-06, "logits/chosen": -0.8459002375602722, "logits/rejected": -0.8481931686401367, "logps/chosen": -452.31103515625, "logps/rejected": -549.0643310546875, "loss": 0.4316, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8753325939178467, "rewards/margins": 1.1157124042510986, "rewards/rejected": -2.9910449981689453, "step": 750 }, { "epoch": 0.39780162261188173, "grad_norm": 4.2662925093714525, "learning_rate": 3.765793678646753e-06, "logits/chosen": -0.8927903175354004, "logits/rejected": -0.9021077156066895, "logps/chosen": -400.74273681640625, "logps/rejected": -483.39813232421875, "loss": 0.538, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5194071531295776, "rewards/margins": 1.0460177659988403, "rewards/rejected": -2.565425157546997, "step": 760 }, { "epoch": 0.40303585448835383, "grad_norm": 2.642122221399109, "learning_rate": 3.726184496879323e-06, "logits/chosen": -0.8244895935058594, "logits/rejected": -0.8434764742851257, "logps/chosen": -467.0747985839844, "logps/rejected": -540.9425048828125, "loss": 0.5261, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.963444471359253, "rewards/margins": 0.8455499410629272, "rewards/rejected": -2.8089945316314697, "step": 770 }, { "epoch": 0.408270086364826, "grad_norm": 3.728031543819746, "learning_rate": 3.686165779450619e-06, "logits/chosen": -0.8528604507446289, "logits/rejected": -0.8891122937202454, "logps/chosen": -460.427734375, "logps/rejected": -544.4713134765625, "loss": 0.4439, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.879953145980835, "rewards/margins": 1.079024314880371, "rewards/rejected": -2.958977699279785, "step": 780 }, { "epoch": 0.4135043182412981, "grad_norm": 2.85654604962964, "learning_rate": 3.645750892287178e-06, "logits/chosen": -0.8759455680847168, "logits/rejected": -0.8966878652572632, "logps/chosen": -451.11865234375, "logps/rejected": -574.4451293945312, "loss": 0.5208, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9754890203475952, "rewards/margins": 1.0046672821044922, "rewards/rejected": -2.980156421661377, "step": 790 }, { "epoch": 0.4187385501177702, "grad_norm": 2.7417872161100374, "learning_rate": 3.604953333633009e-06, "logits/chosen": -0.8261939883232117, "logits/rejected": -0.884585976600647, "logps/chosen": -468.4187927246094, "logps/rejected": -541.056640625, "loss": 0.5474, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.829167366027832, "rewards/margins": 0.9327658414840698, "rewards/rejected": -2.7619330883026123, "step": 800 }, { "epoch": 0.4187385501177702, "eval_logits/chosen": -0.8851308822631836, "eval_logits/rejected": -0.9115467667579651, "eval_logps/chosen": -455.87139892578125, "eval_logps/rejected": -529.6283569335938, "eval_loss": 0.4966377317905426, "eval_rewards/accuracies": 0.7670000195503235, "eval_rewards/chosen": -1.794780969619751, "eval_rewards/margins": 0.9688891768455505, "eval_rewards/rejected": -2.7636702060699463, "eval_runtime": 491.3021, "eval_samples_per_second": 4.071, "eval_steps_per_second": 0.254, "step": 800 }, { "epoch": 0.4239727819942423, "grad_norm": 4.3302971642908465, "learning_rate": 3.56378672954129e-06, "logits/chosen": -0.8280191421508789, "logits/rejected": -0.8614367246627808, "logps/chosen": -439.85662841796875, "logps/rejected": -539.2572021484375, "loss": 0.5092, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.863586187362671, "rewards/margins": 0.8855009078979492, "rewards/rejected": -2.74908709526062, "step": 810 }, { "epoch": 0.42920701387071447, "grad_norm": 3.1116409091724786, "learning_rate": 3.5222648293233806e-06, "logits/chosen": -0.8204169273376465, "logits/rejected": -0.8331009149551392, "logps/chosen": -489.24383544921875, "logps/rejected": -579.9151611328125, "loss": 0.5038, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.3326661586761475, "rewards/margins": 1.1430214643478394, "rewards/rejected": -3.4756877422332764, "step": 820 }, { "epoch": 0.4344412457471866, "grad_norm": 3.2429890344778074, "learning_rate": 3.4804015009566573e-06, "logits/chosen": -0.7415329217910767, "logits/rejected": -0.7724164724349976, "logps/chosen": -455.18572998046875, "logps/rejected": -487.82733154296875, "loss": 0.5401, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0002198219299316, "rewards/margins": 0.7170491218566895, "rewards/rejected": -2.717268705368042, "step": 830 }, { "epoch": 0.4396754776236587, "grad_norm": 3.5668607410560975, "learning_rate": 3.4382107264527244e-06, "logits/chosen": -0.7589952945709229, "logits/rejected": -0.7982163429260254, "logps/chosen": -483.5228576660156, "logps/rejected": -528.97021484375, "loss": 0.4736, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7952258586883545, "rewards/margins": 1.0175402164459229, "rewards/rejected": -2.8127660751342773, "step": 840 }, { "epoch": 0.44490970950013087, "grad_norm": 4.0501340588509365, "learning_rate": 3.3957065971875387e-06, "logits/chosen": -0.795210063457489, "logits/rejected": -0.8109579086303711, "logps/chosen": -458.65814208984375, "logps/rejected": -532.5260620117188, "loss": 0.4962, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.041184902191162, "rewards/margins": 1.0093342065811157, "rewards/rejected": -3.0505189895629883, "step": 850 }, { "epoch": 0.45014394137660296, "grad_norm": 3.973745164078984, "learning_rate": 3.352903309194999e-06, "logits/chosen": -0.8046371340751648, "logits/rejected": -0.8593829274177551, "logps/chosen": -461.7774353027344, "logps/rejected": -595.0950927734375, "loss": 0.4721, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.2889552116394043, "rewards/margins": 1.1978638172149658, "rewards/rejected": -3.4868195056915283, "step": 860 }, { "epoch": 0.4553781732530751, "grad_norm": 2.672124703356638, "learning_rate": 3.309815158425591e-06, "logits/chosen": -0.7477758526802063, "logits/rejected": -0.7731175422668457, "logps/chosen": -472.6768493652344, "logps/rejected": -555.8510131835938, "loss": 0.5073, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2288527488708496, "rewards/margins": 0.9096274375915527, "rewards/rejected": -3.1384804248809814, "step": 870 }, { "epoch": 0.46061240512954726, "grad_norm": 2.903999411374589, "learning_rate": 3.266456535971654e-06, "logits/chosen": -0.864072322845459, "logits/rejected": -0.8836487531661987, "logps/chosen": -464.50115966796875, "logps/rejected": -530.8693237304688, "loss": 0.4929, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0616631507873535, "rewards/margins": 0.824569821357727, "rewards/rejected": -2.88623309135437, "step": 880 }, { "epoch": 0.46584663700601936, "grad_norm": 2.4287136163193974, "learning_rate": 3.2228419232608692e-06, "logits/chosen": -0.8057514429092407, "logits/rejected": -0.8173397183418274, "logps/chosen": -457.15032958984375, "logps/rejected": -542.5882568359375, "loss": 0.4903, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8502060174942017, "rewards/margins": 0.9911335110664368, "rewards/rejected": -2.841339588165283, "step": 890 }, { "epoch": 0.4710808688824915, "grad_norm": 2.8863413320590774, "learning_rate": 3.1789858872195888e-06, "logits/chosen": -0.771654486656189, "logits/rejected": -0.8108331561088562, "logps/chosen": -418.285888671875, "logps/rejected": -514.6083984375, "loss": 0.5246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6747478246688843, "rewards/margins": 0.8920180201530457, "rewards/rejected": -2.566765785217285, "step": 900 }, { "epoch": 0.4710808688824915, "eval_logits/chosen": -0.7979649305343628, "eval_logits/rejected": -0.8138096928596497, "eval_logps/chosen": -429.2431335449219, "eval_logps/rejected": -507.4219055175781, "eval_loss": 0.49433061480522156, "eval_rewards/accuracies": 0.765999972820282, "eval_rewards/chosen": -1.5284981727600098, "eval_rewards/margins": 1.013107419013977, "eval_rewards/rejected": -2.5416059494018555, "eval_runtime": 490.1393, "eval_samples_per_second": 4.08, "eval_steps_per_second": 0.255, "step": 900 }, { "epoch": 0.4763151007589636, "grad_norm": 2.884609247157233, "learning_rate": 3.1349030754075945e-06, "logits/chosen": -0.7577202320098877, "logits/rejected": -0.764434278011322, "logps/chosen": -435.4732360839844, "logps/rejected": -516.2320556640625, "loss": 0.5219, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7493627071380615, "rewards/margins": 0.8683775067329407, "rewards/rejected": -2.6177401542663574, "step": 910 }, { "epoch": 0.48154933263543576, "grad_norm": 2.1669192855886257, "learning_rate": 3.0906082111259313e-06, "logits/chosen": -0.8027156591415405, "logits/rejected": -0.8391119837760925, "logps/chosen": -479.461181640625, "logps/rejected": -578.0491943359375, "loss": 0.4802, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1167943477630615, "rewards/margins": 1.2451462745666504, "rewards/rejected": -3.361940383911133, "step": 920 }, { "epoch": 0.48678356451190785, "grad_norm": 3.304438899218671, "learning_rate": 3.046116088499449e-06, "logits/chosen": -0.8075677752494812, "logits/rejected": -0.818462073802948, "logps/chosen": -551.7511596679688, "logps/rejected": -617.6654052734375, "loss": 0.4831, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.702815532684326, "rewards/margins": 0.8609806895256042, "rewards/rejected": -3.563796281814575, "step": 930 }, { "epoch": 0.49201779638838, "grad_norm": 5.162139925744972, "learning_rate": 3.0014415675356813e-06, "logits/chosen": -0.8480289578437805, "logits/rejected": -0.8807282447814941, "logps/chosen": -580.1326904296875, "logps/rejected": -658.1190795898438, "loss": 0.451, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.9515488147735596, "rewards/margins": 1.0807795524597168, "rewards/rejected": -4.032328128814697, "step": 940 }, { "epoch": 0.49725202826485215, "grad_norm": 4.270308518724467, "learning_rate": 2.9565995691617242e-06, "logits/chosen": -0.8305244445800781, "logits/rejected": -0.8603514432907104, "logps/chosen": -577.685302734375, "logps/rejected": -670.2604370117188, "loss": 0.4688, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.654804229736328, "rewards/margins": 1.3251168727874756, "rewards/rejected": -3.9799208641052246, "step": 950 }, { "epoch": 0.5024862601413242, "grad_norm": 3.9112848274397374, "learning_rate": 2.9116050702407706e-06, "logits/chosen": -0.8263424634933472, "logits/rejected": -0.8838019371032715, "logps/chosen": -500.05841064453125, "logps/rejected": -590.2412109375, "loss": 0.5068, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.197012424468994, "rewards/margins": 1.2063963413238525, "rewards/rejected": -3.4034087657928467, "step": 960 }, { "epoch": 0.5077204920177963, "grad_norm": 2.6527116057322813, "learning_rate": 2.8664730985699537e-06, "logits/chosen": -0.8739676475524902, "logits/rejected": -0.8983599543571472, "logps/chosen": -471.83978271484375, "logps/rejected": -584.647216796875, "loss": 0.4578, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0771896839141846, "rewards/margins": 1.0904505252838135, "rewards/rejected": -3.167639970779419, "step": 970 }, { "epoch": 0.5129547238942685, "grad_norm": 3.0654272233325806, "learning_rate": 2.8212187278611907e-06, "logits/chosen": -0.8305708765983582, "logits/rejected": -0.8489472270011902, "logps/chosen": -484.65679931640625, "logps/rejected": -540.0225219726562, "loss": 0.4925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8167269229888916, "rewards/margins": 0.965971827507019, "rewards/rejected": -2.7826988697052, "step": 980 }, { "epoch": 0.5181889557707406, "grad_norm": 2.8308862425981896, "learning_rate": 2.7758570727066843e-06, "logits/chosen": -0.8484777212142944, "logits/rejected": -0.861672580242157, "logps/chosen": -470.0426330566406, "logps/rejected": -552.4635620117188, "loss": 0.5011, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.991310715675354, "rewards/margins": 1.036561369895935, "rewards/rejected": -3.027872085571289, "step": 990 }, { "epoch": 0.5234231876472127, "grad_norm": 3.1475704256823818, "learning_rate": 2.730403283530767e-06, "logits/chosen": -0.7995238900184631, "logits/rejected": -0.8208199739456177, "logps/chosen": -514.13525390625, "logps/rejected": -616.7322387695312, "loss": 0.4635, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.589371681213379, "rewards/margins": 1.151365876197815, "rewards/rejected": -3.7407374382019043, "step": 1000 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -0.8520967364311218, "eval_logits/rejected": -0.871288001537323, "eval_logps/chosen": -558.1610107421875, "eval_logps/rejected": -656.6334228515625, "eval_loss": 0.4907907247543335, "eval_rewards/accuracies": 0.7630000114440918, "eval_rewards/chosen": -2.817676544189453, "eval_rewards/margins": 1.2160439491271973, "eval_rewards/rejected": -4.03372049331665, "eval_runtime": 490.4932, "eval_samples_per_second": 4.078, "eval_steps_per_second": 0.255, "step": 1000 }, { "epoch": 0.528657419523685, "grad_norm": 2.8934222921049226, "learning_rate": 2.6848725415297888e-06, "logits/chosen": -0.8423219919204712, "logits/rejected": -0.8691143989562988, "logps/chosen": -526.5357666015625, "logps/rejected": -626.977294921875, "loss": 0.4864, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.859957456588745, "rewards/margins": 1.0318825244903564, "rewards/rejected": -3.8918404579162598, "step": 1010 }, { "epoch": 0.533891651400157, "grad_norm": 4.072441147880593, "learning_rate": 2.639280053601719e-06, "logits/chosen": -0.7951158285140991, "logits/rejected": -0.8444026708602905, "logps/chosen": -551.2407836914062, "logps/rejected": -624.159912109375, "loss": 0.4753, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.523639678955078, "rewards/margins": 1.181814193725586, "rewards/rejected": -3.7054543495178223, "step": 1020 }, { "epoch": 0.5391258832766291, "grad_norm": 2.288139837580109, "learning_rate": 2.59364104726716e-06, "logits/chosen": -0.7849830389022827, "logits/rejected": -0.7941724061965942, "logps/chosen": -558.4784545898438, "logps/rejected": -631.0448608398438, "loss": 0.4813, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9077861309051514, "rewards/margins": 1.099591851234436, "rewards/rejected": -4.007378101348877, "step": 1030 }, { "epoch": 0.5443601151531012, "grad_norm": 2.255027947939241, "learning_rate": 2.547970765583491e-06, "logits/chosen": -0.7766658663749695, "logits/rejected": -0.7682461142539978, "logps/chosen": -551.2001953125, "logps/rejected": -626.3408203125, "loss": 0.508, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.647867202758789, "rewards/margins": 0.965995192527771, "rewards/rejected": -3.6138622760772705, "step": 1040 }, { "epoch": 0.5495943470295734, "grad_norm": 3.365620498364263, "learning_rate": 2.502284462053799e-06, "logits/chosen": -0.7241968512535095, "logits/rejected": -0.7473636269569397, "logps/chosen": -527.13427734375, "logps/rejected": -617.5802612304688, "loss": 0.4658, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2644190788269043, "rewards/margins": 1.2528817653656006, "rewards/rejected": -3.517301082611084, "step": 1050 }, { "epoch": 0.5548285789060455, "grad_norm": 4.206518455943664, "learning_rate": 2.456597395532338e-06, "logits/chosen": -0.7641677856445312, "logits/rejected": -0.8080291748046875, "logps/chosen": -470.62872314453125, "logps/rejected": -533.24560546875, "loss": 0.4611, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.032501220703125, "rewards/margins": 1.0605053901672363, "rewards/rejected": -3.0930066108703613, "step": 1060 }, { "epoch": 0.5600628107825176, "grad_norm": 2.230846159297882, "learning_rate": 2.4109248251281953e-06, "logits/chosen": -0.7514528632164001, "logits/rejected": -0.775009036064148, "logps/chosen": -500.794921875, "logps/rejected": -553.78466796875, "loss": 0.4948, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0267245769500732, "rewards/margins": 1.0498759746551514, "rewards/rejected": -3.0766005516052246, "step": 1070 }, { "epoch": 0.5652970426589898, "grad_norm": 2.9850161302908567, "learning_rate": 2.365282005108875e-06, "logits/chosen": -0.7392014265060425, "logits/rejected": -0.7644953727722168, "logps/chosen": -484.65216064453125, "logps/rejected": -520.8948974609375, "loss": 0.5131, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1898112297058105, "rewards/margins": 0.9680379629135132, "rewards/rejected": -3.1578493118286133, "step": 1080 }, { "epoch": 0.5705312745354619, "grad_norm": 4.871043269302411, "learning_rate": 2.319684179805491e-06, "logits/chosen": -0.7259557247161865, "logits/rejected": -0.7469737529754639, "logps/chosen": -513.62353515625, "logps/rejected": -555.1145629882812, "loss": 0.5266, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5438778400421143, "rewards/margins": 0.932669997215271, "rewards/rejected": -3.476547956466675, "step": 1090 }, { "epoch": 0.575765506411934, "grad_norm": 2.9604109906028677, "learning_rate": 2.2741465785212905e-06, "logits/chosen": -0.7221305966377258, "logits/rejected": -0.7538542747497559, "logps/chosen": -503.03143310546875, "logps/rejected": -606.3543090820312, "loss": 0.4856, "rewards/accuracies": 0.71875, "rewards/chosen": -2.407829761505127, "rewards/margins": 0.9688541293144226, "rewards/rejected": -3.3766837120056152, "step": 1100 }, { "epoch": 0.575765506411934, "eval_logits/chosen": -0.7912909984588623, "eval_logits/rejected": -0.8044111728668213, "eval_logps/chosen": -512.9990234375, "eval_logps/rejected": -602.4694213867188, "eval_loss": 0.4817214012145996, "eval_rewards/accuracies": 0.7720000147819519, "eval_rewards/chosen": -2.3660569190979004, "eval_rewards/margins": 1.1260240077972412, "eval_rewards/rejected": -3.4920809268951416, "eval_runtime": 490.5656, "eval_samples_per_second": 4.077, "eval_steps_per_second": 0.255, "step": 1100 }, { "epoch": 0.5809997382884062, "grad_norm": 4.040674423079137, "learning_rate": 2.2286844104451848e-06, "logits/chosen": -0.7596691250801086, "logits/rejected": -0.7672920227050781, "logps/chosen": -506.1031799316406, "logps/rejected": -615.0109252929688, "loss": 0.51, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.4000015258789062, "rewards/margins": 1.183712363243103, "rewards/rejected": -3.583714008331299, "step": 1110 }, { "epoch": 0.5862339701648783, "grad_norm": 3.766321110228989, "learning_rate": 2.183312859572008e-06, "logits/chosen": -0.7806628346443176, "logits/rejected": -0.7962976098060608, "logps/chosen": -550.5853271484375, "logps/rejected": -597.5955200195312, "loss": 0.5075, "rewards/accuracies": 0.71875, "rewards/chosen": -2.502775192260742, "rewards/margins": 0.9967296719551086, "rewards/rejected": -3.499504804611206, "step": 1120 }, { "epoch": 0.5914682020413504, "grad_norm": 3.8765385487802715, "learning_rate": 2.1380470796311843e-06, "logits/chosen": -0.7329021692276001, "logits/rejected": -0.716983437538147, "logps/chosen": -548.9044799804688, "logps/rejected": -640.8181762695312, "loss": 0.4782, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5259509086608887, "rewards/margins": 1.2471174001693726, "rewards/rejected": -3.77306866645813, "step": 1130 }, { "epoch": 0.5967024339178225, "grad_norm": 3.776816209238374, "learning_rate": 2.092902189025507e-06, "logits/chosen": -0.7345478534698486, "logits/rejected": -0.765870213508606, "logps/chosen": -567.6248168945312, "logps/rejected": -644.7882690429688, "loss": 0.4409, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.6834120750427246, "rewards/margins": 1.2605786323547363, "rewards/rejected": -3.943991184234619, "step": 1140 }, { "epoch": 0.6019366657942947, "grad_norm": 2.4633069424861476, "learning_rate": 2.0478932657817105e-06, "logits/chosen": -0.7708589434623718, "logits/rejected": -0.7781729698181152, "logps/chosen": -546.5011596679688, "logps/rejected": -616.3358154296875, "loss": 0.4785, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.6856911182403564, "rewards/margins": 1.0351371765136719, "rewards/rejected": -3.7208282947540283, "step": 1150 }, { "epoch": 0.6071708976707668, "grad_norm": 2.9067779299636363, "learning_rate": 2.0030353425145376e-06, "logits/chosen": -0.7892950773239136, "logits/rejected": -0.7964081764221191, "logps/chosen": -454.22735595703125, "logps/rejected": -548.8640747070312, "loss": 0.551, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2731716632843018, "rewards/margins": 1.0861868858337402, "rewards/rejected": -3.359358549118042, "step": 1160 }, { "epoch": 0.6124051295472389, "grad_norm": 3.0126218778741816, "learning_rate": 1.958343401405964e-06, "logits/chosen": -0.7989486455917358, "logits/rejected": -0.8354769945144653, "logps/chosen": -471.26971435546875, "logps/rejected": -576.9542236328125, "loss": 0.515, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9779512882232666, "rewards/margins": 1.1961392164230347, "rewards/rejected": -3.1740903854370117, "step": 1170 }, { "epoch": 0.6176393614237111, "grad_norm": 2.8356127345530298, "learning_rate": 1.9138323692012734e-06, "logits/chosen": -0.7760749459266663, "logits/rejected": -0.8243614435195923, "logps/chosen": -477.05255126953125, "logps/rejected": -567.8319091796875, "loss": 0.4482, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9603474140167236, "rewards/margins": 1.083135962486267, "rewards/rejected": -3.0434834957122803, "step": 1180 }, { "epoch": 0.6228735933001832, "grad_norm": 2.653741359724002, "learning_rate": 1.8695171122236443e-06, "logits/chosen": -0.7773482799530029, "logits/rejected": -0.8091195821762085, "logps/chosen": -534.4111328125, "logps/rejected": -603.28564453125, "loss": 0.4399, "rewards/accuracies": 0.84375, "rewards/chosen": -2.1728804111480713, "rewards/margins": 1.205948829650879, "rewards/rejected": -3.37882924079895, "step": 1190 }, { "epoch": 0.6281078251766553, "grad_norm": 3.666686057880921, "learning_rate": 1.8254124314089225e-06, "logits/chosen": -0.7455651760101318, "logits/rejected": -0.7562496066093445, "logps/chosen": -496.6683044433594, "logps/rejected": -561.0657958984375, "loss": 0.5013, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1546523571014404, "rewards/margins": 0.8877624273300171, "rewards/rejected": -3.042415142059326, "step": 1200 }, { "epoch": 0.6281078251766553, "eval_logits/chosen": -0.7745435237884521, "eval_logits/rejected": -0.7890085577964783, "eval_logps/chosen": -488.01080322265625, "eval_logps/rejected": -582.3287353515625, "eval_loss": 0.4859620928764343, "eval_rewards/accuracies": 0.7720000147819519, "eval_rewards/chosen": -2.1161751747131348, "eval_rewards/margins": 1.1744980812072754, "eval_rewards/rejected": -3.29067325592041, "eval_runtime": 492.7641, "eval_samples_per_second": 4.059, "eval_steps_per_second": 0.254, "step": 1200 }, { "epoch": 0.6333420570531274, "grad_norm": 4.445062813407665, "learning_rate": 1.781533057362221e-06, "logits/chosen": -0.7981818914413452, "logits/rejected": -0.8196622729301453, "logps/chosen": -482.60601806640625, "logps/rejected": -558.7507934570312, "loss": 0.4991, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.063744068145752, "rewards/margins": 1.000634789466858, "rewards/rejected": -3.0643787384033203, "step": 1210 }, { "epoch": 0.6385762889295996, "grad_norm": 4.507009476169383, "learning_rate": 1.7378936454380277e-06, "logits/chosen": -0.7655137181282043, "logits/rejected": -0.7727285623550415, "logps/chosen": -491.4285583496094, "logps/rejected": -586.7752685546875, "loss": 0.4859, "rewards/accuracies": 0.75, "rewards/chosen": -2.225255012512207, "rewards/margins": 1.2578017711639404, "rewards/rejected": -3.4830570220947266, "step": 1220 }, { "epoch": 0.6438105208060717, "grad_norm": 2.769661114858915, "learning_rate": 1.6945087708454273e-06, "logits/chosen": -0.7237203121185303, "logits/rejected": -0.752585768699646, "logps/chosen": -524.2711791992188, "logps/rejected": -591.5977783203125, "loss": 0.4308, "rewards/accuracies": 0.75, "rewards/chosen": -2.2781286239624023, "rewards/margins": 1.1484158039093018, "rewards/rejected": -3.426544666290283, "step": 1230 }, { "epoch": 0.6490447526825438, "grad_norm": 2.8409702645316486, "learning_rate": 1.651392923780105e-06, "logits/chosen": -0.7248971462249756, "logits/rejected": -0.7399358153343201, "logps/chosen": -502.482666015625, "logps/rejected": -614.2011108398438, "loss": 0.4665, "rewards/accuracies": 0.8125, "rewards/chosen": -2.312042713165283, "rewards/margins": 1.1948063373565674, "rewards/rejected": -3.5068485736846924, "step": 1240 }, { "epoch": 0.654278984559016, "grad_norm": 2.5610614385353614, "learning_rate": 1.608560504584737e-06, "logits/chosen": -0.7505759000778198, "logits/rejected": -0.748982846736908, "logps/chosen": -497.2088317871094, "logps/rejected": -608.0076904296875, "loss": 0.4391, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.297612428665161, "rewards/margins": 1.178468942642212, "rewards/rejected": -3.476081132888794, "step": 1250 }, { "epoch": 0.6595132164354881, "grad_norm": 2.9288478965975204, "learning_rate": 1.5660258189393945e-06, "logits/chosen": -0.7709019780158997, "logits/rejected": -0.7804166674613953, "logps/chosen": -484.7782287597656, "logps/rejected": -561.656494140625, "loss": 0.4529, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0165915489196777, "rewards/margins": 1.224448323249817, "rewards/rejected": -3.241039752960205, "step": 1260 }, { "epoch": 0.6647474483119602, "grad_norm": 3.3843415697325123, "learning_rate": 1.5238030730835578e-06, "logits/chosen": -0.7160421013832092, "logits/rejected": -0.7430087327957153, "logps/chosen": -469.3851623535156, "logps/rejected": -589.1715698242188, "loss": 0.4304, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.223741292953491, "rewards/margins": 1.258303165435791, "rewards/rejected": -3.4820446968078613, "step": 1270 }, { "epoch": 0.6699816801884323, "grad_norm": 4.31798859648472, "learning_rate": 1.4819063690713565e-06, "logits/chosen": -0.7581356167793274, "logits/rejected": -0.7594733238220215, "logps/chosen": -471.11822509765625, "logps/rejected": -587.6780395507812, "loss": 0.4618, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2457804679870605, "rewards/margins": 1.2690086364746094, "rewards/rejected": -3.514788866043091, "step": 1280 }, { "epoch": 0.6752159120649045, "grad_norm": 2.9043056997624785, "learning_rate": 1.4403497000615885e-06, "logits/chosen": -0.729811429977417, "logits/rejected": -0.7537652254104614, "logps/chosen": -517.2977294921875, "logps/rejected": -622.2907104492188, "loss": 0.5079, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.318469524383545, "rewards/margins": 1.2079073190689087, "rewards/rejected": -3.5263772010803223, "step": 1290 }, { "epoch": 0.6804501439413766, "grad_norm": 3.476254076191995, "learning_rate": 1.3991469456441273e-06, "logits/chosen": -0.7037397623062134, "logits/rejected": -0.7578923106193542, "logps/chosen": -535.3514404296875, "logps/rejected": -628.6389770507812, "loss": 0.4497, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.396596670150757, "rewards/margins": 1.2228056192398071, "rewards/rejected": -3.6194019317626953, "step": 1300 }, { "epoch": 0.6804501439413766, "eval_logits/chosen": -0.7940347790718079, "eval_logits/rejected": -0.8095559477806091, "eval_logps/chosen": -524.7894897460938, "eval_logps/rejected": -626.9693603515625, "eval_loss": 0.485016793012619, "eval_rewards/accuracies": 0.7730000019073486, "eval_rewards/chosen": -2.483961820602417, "eval_rewards/margins": 1.2531172037124634, "eval_rewards/rejected": -3.73707914352417, "eval_runtime": 490.5183, "eval_samples_per_second": 4.077, "eval_steps_per_second": 0.255, "step": 1300 }, { "epoch": 0.6856843758178487, "grad_norm": 4.224665538714578, "learning_rate": 1.3583118672042441e-06, "logits/chosen": -0.7764405608177185, "logits/rejected": -0.7887049913406372, "logps/chosen": -516.7277221679688, "logps/rejected": -632.888427734375, "loss": 0.472, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.473681926727295, "rewards/margins": 1.352980136871338, "rewards/rejected": -3.8266615867614746, "step": 1310 }, { "epoch": 0.6909186076943209, "grad_norm": 4.563976167233384, "learning_rate": 1.3178581033264218e-06, "logits/chosen": -0.7718938589096069, "logits/rejected": -0.7976632714271545, "logps/chosen": -519.279541015625, "logps/rejected": -601.4541015625, "loss": 0.4487, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.553548812866211, "rewards/margins": 1.0802887678146362, "rewards/rejected": -3.633837938308716, "step": 1320 }, { "epoch": 0.696152839570793, "grad_norm": 4.184522446421226, "learning_rate": 1.2777991652391757e-06, "logits/chosen": -0.7361895442008972, "logits/rejected": -0.7566362619400024, "logps/chosen": -527.201416015625, "logps/rejected": -645.9843139648438, "loss": 0.4656, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.564059257507324, "rewards/margins": 1.190525770187378, "rewards/rejected": -3.7545852661132812, "step": 1330 }, { "epoch": 0.7013870714472651, "grad_norm": 4.166233355017187, "learning_rate": 1.2381484323024178e-06, "logits/chosen": -0.713969349861145, "logits/rejected": -0.7467511892318726, "logps/chosen": -518.2627563476562, "logps/rejected": -595.9432373046875, "loss": 0.4851, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.4644479751586914, "rewards/margins": 1.2395137548446655, "rewards/rejected": -3.7039618492126465, "step": 1340 }, { "epoch": 0.7066213033237373, "grad_norm": 3.865579244226054, "learning_rate": 1.1989191475388518e-06, "logits/chosen": -0.75743567943573, "logits/rejected": -0.7837706804275513, "logps/chosen": -509.4811096191406, "logps/rejected": -594.09033203125, "loss": 0.4915, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.323971748352051, "rewards/margins": 1.0227216482162476, "rewards/rejected": -3.346693515777588, "step": 1350 }, { "epoch": 0.7118555352002094, "grad_norm": 2.652467671657092, "learning_rate": 1.160124413210918e-06, "logits/chosen": -0.7264483571052551, "logits/rejected": -0.7697917222976685, "logps/chosen": -533.6471557617188, "logps/rejected": -609.0975341796875, "loss": 0.4969, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2726407051086426, "rewards/margins": 1.1175401210784912, "rewards/rejected": -3.3901805877685547, "step": 1360 }, { "epoch": 0.7170897670766815, "grad_norm": 2.5577134603401506, "learning_rate": 1.1217771864447396e-06, "logits/chosen": -0.8089788556098938, "logits/rejected": -0.8140621185302734, "logps/chosen": -544.7674560546875, "logps/rejected": -623.3822631835938, "loss": 0.4132, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.3746323585510254, "rewards/margins": 1.3340243101119995, "rewards/rejected": -3.7086567878723145, "step": 1370 }, { "epoch": 0.7223239989531536, "grad_norm": 3.219449421326901, "learning_rate": 1.08389027490255e-06, "logits/chosen": -0.7905744314193726, "logits/rejected": -0.7953212857246399, "logps/chosen": -490.263916015625, "logps/rejected": -593.0517578125, "loss": 0.4631, "rewards/accuracies": 0.75, "rewards/chosen": -2.2938039302825928, "rewards/margins": 1.0697680711746216, "rewards/rejected": -3.363571882247925, "step": 1380 }, { "epoch": 0.7275582308296258, "grad_norm": 2.9190971855424936, "learning_rate": 1.046476332505036e-06, "logits/chosen": -0.736000657081604, "logits/rejected": -0.7762190103530884, "logps/chosen": -537.1323852539062, "logps/rejected": -622.3277587890625, "loss": 0.4459, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.433082103729248, "rewards/margins": 1.1599032878875732, "rewards/rejected": -3.592985153198242, "step": 1390 }, { "epoch": 0.7327924627060979, "grad_norm": 3.087299031916978, "learning_rate": 1.0095478552050348e-06, "logits/chosen": -0.7585629224777222, "logits/rejected": -0.7968350648880005, "logps/chosen": -505.4625549316406, "logps/rejected": -628.9778442382812, "loss": 0.4734, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.4186644554138184, "rewards/margins": 1.282457947731018, "rewards/rejected": -3.701122283935547, "step": 1400 }, { "epoch": 0.7327924627060979, "eval_logits/chosen": -0.7989752888679504, "eval_logits/rejected": -0.8147993087768555, "eval_logps/chosen": -491.04962158203125, "eval_logps/rejected": -590.251953125, "eval_loss": 0.4832901060581207, "eval_rewards/accuracies": 0.7739999890327454, "eval_rewards/chosen": -2.1465635299682617, "eval_rewards/margins": 1.223341703414917, "eval_rewards/rejected": -3.369905471801758, "eval_runtime": 492.7006, "eval_samples_per_second": 4.059, "eval_steps_per_second": 0.254, "step": 1400 }, { "epoch": 0.73802669458257, "grad_norm": 3.5157604346203226, "learning_rate": 9.731171768139808e-07, "logits/chosen": -0.7569630146026611, "logits/rejected": -0.7974756956100464, "logps/chosen": -481.45477294921875, "logps/rejected": -610.00146484375, "loss": 0.5139, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1769986152648926, "rewards/margins": 1.2288345098495483, "rewards/rejected": -3.4058330059051514, "step": 1410 }, { "epoch": 0.7432609264590422, "grad_norm": 3.358269083637402, "learning_rate": 9.371964648825221e-07, "logits/chosen": -0.7507213950157166, "logits/rejected": -0.784144401550293, "logps/chosen": -489.03515625, "logps/rejected": -545.8272705078125, "loss": 0.5023, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.055467128753662, "rewards/margins": 0.9931677579879761, "rewards/rejected": -3.0486350059509277, "step": 1420 }, { "epoch": 0.7484951583355143, "grad_norm": 3.450390480039677, "learning_rate": 9.017977166366445e-07, "logits/chosen": -0.7912155985832214, "logits/rejected": -0.8058408498764038, "logps/chosen": -509.6456604003906, "logps/rejected": -619.4719848632812, "loss": 0.4582, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.1829304695129395, "rewards/margins": 1.2820956707000732, "rewards/rejected": -3.4650261402130127, "step": 1430 }, { "epoch": 0.7537293902119864, "grad_norm": 3.9019374901211066, "learning_rate": 8.669327549707096e-07, "logits/chosen": -0.8170151710510254, "logits/rejected": -0.8503853678703308, "logps/chosen": -525.7236328125, "logps/rejected": -609.9266967773438, "loss": 0.4976, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2434840202331543, "rewards/margins": 0.9734998941421509, "rewards/rejected": -3.2169833183288574, "step": 1440 }, { "epoch": 0.7589636220884585, "grad_norm": 3.1069704561124567, "learning_rate": 8.326132244986932e-07, "logits/chosen": -0.8235033750534058, "logits/rejected": -0.8337501287460327, "logps/chosen": -507.8706970214844, "logps/rejected": -617.8888549804688, "loss": 0.4459, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.2898011207580566, "rewards/margins": 1.271906852722168, "rewards/rejected": -3.5617077350616455, "step": 1450 }, { "epoch": 0.7641978539649307, "grad_norm": 2.5011020359087137, "learning_rate": 7.988505876649863e-07, "logits/chosen": -0.7653478384017944, "logits/rejected": -0.8202483057975769, "logps/chosen": -471.88299560546875, "logps/rejected": -579.7264404296875, "loss": 0.475, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1682538986206055, "rewards/margins": 1.1780481338500977, "rewards/rejected": -3.346302032470703, "step": 1460 }, { "epoch": 0.7694320858414028, "grad_norm": 2.063720356926022, "learning_rate": 7.656561209160248e-07, "logits/chosen": -0.7891589999198914, "logits/rejected": -0.8133258819580078, "logps/chosen": -516.5853271484375, "logps/rejected": -602.1696166992188, "loss": 0.467, "rewards/accuracies": 0.78125, "rewards/chosen": -2.344850540161133, "rewards/margins": 1.223329782485962, "rewards/rejected": -3.5681800842285156, "step": 1470 }, { "epoch": 0.7746663177178749, "grad_norm": 3.4450781896535694, "learning_rate": 7.330409109340563e-07, "logits/chosen": -0.8023967742919922, "logits/rejected": -0.8343321084976196, "logps/chosen": -532.4755859375, "logps/rejected": -600.9357299804688, "loss": 0.4975, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.426600217819214, "rewards/margins": 1.1944777965545654, "rewards/rejected": -3.6210780143737793, "step": 1480 }, { "epoch": 0.7799005495943471, "grad_norm": 4.640657971253548, "learning_rate": 7.010158509342682e-07, "logits/chosen": -0.7905557155609131, "logits/rejected": -0.8068881034851074, "logps/chosen": -546.9576416015625, "logps/rejected": -667.5574340820312, "loss": 0.483, "rewards/accuracies": 0.71875, "rewards/chosen": -2.799778699874878, "rewards/margins": 1.137909173965454, "rewards/rejected": -3.937687635421753, "step": 1490 }, { "epoch": 0.7851347814708192, "grad_norm": 8.807489938353271, "learning_rate": 6.695916370265529e-07, "logits/chosen": -0.7907344698905945, "logits/rejected": -0.8382173776626587, "logps/chosen": -507.33575439453125, "logps/rejected": -633.0894775390625, "loss": 0.4482, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.5701141357421875, "rewards/margins": 1.3424403667449951, "rewards/rejected": -3.9125542640686035, "step": 1500 }, { "epoch": 0.7851347814708192, "eval_logits/chosen": -0.8246029615402222, "eval_logits/rejected": -0.8422741293907166, "eval_logps/chosen": -527.0020751953125, "eval_logps/rejected": -624.8656005859375, "eval_loss": 0.4811870753765106, "eval_rewards/accuracies": 0.7760000228881836, "eval_rewards/chosen": -2.506087303161621, "eval_rewards/margins": 1.209954857826233, "eval_rewards/rejected": -3.7160420417785645, "eval_runtime": 491.111, "eval_samples_per_second": 4.072, "eval_steps_per_second": 0.255, "step": 1500 }, { "epoch": 0.7903690133472913, "grad_norm": 3.367824088455391, "learning_rate": 6.387787646430854e-07, "logits/chosen": -0.7824467420578003, "logits/rejected": -0.8199766874313354, "logps/chosen": -542.2174682617188, "logps/rejected": -609.8660278320312, "loss": 0.4931, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.7443554401397705, "rewards/margins": 1.1063810586929321, "rewards/rejected": -3.850736141204834, "step": 1510 }, { "epoch": 0.7956032452237635, "grad_norm": 2.87877789191255, "learning_rate": 6.085875250329401e-07, "logits/chosen": -0.8328324556350708, "logits/rejected": -0.8468655347824097, "logps/chosen": -507.84844970703125, "logps/rejected": -600.4588623046875, "loss": 0.4568, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.569871187210083, "rewards/margins": 1.0555684566497803, "rewards/rejected": -3.625439405441284, "step": 1520 }, { "epoch": 0.8008374771002356, "grad_norm": 3.688216572569841, "learning_rate": 5.79028001824894e-07, "logits/chosen": -0.7519486546516418, "logits/rejected": -0.7730967402458191, "logps/chosen": -484.6551208496094, "logps/rejected": -676.157958984375, "loss": 0.429, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3991482257843018, "rewards/margins": 1.4251246452331543, "rewards/rejected": -3.824272871017456, "step": 1530 }, { "epoch": 0.8060717089767077, "grad_norm": 2.22556494947782, "learning_rate": 5.501100676595761e-07, "logits/chosen": -0.8356014490127563, "logits/rejected": -0.8369625210762024, "logps/chosen": -538.4762573242188, "logps/rejected": -637.4407958984375, "loss": 0.462, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.500579833984375, "rewards/margins": 1.1396005153656006, "rewards/rejected": -3.6401805877685547, "step": 1540 }, { "epoch": 0.8113059408531798, "grad_norm": 3.6363110356872355, "learning_rate": 5.218433808920884e-07, "logits/chosen": -0.8045191764831543, "logits/rejected": -0.8361748456954956, "logps/chosen": -514.1395263671875, "logps/rejected": -600.3705444335938, "loss": 0.4748, "rewards/accuracies": 0.78125, "rewards/chosen": -2.4007575511932373, "rewards/margins": 1.0883753299713135, "rewards/rejected": -3.4891326427459717, "step": 1550 }, { "epoch": 0.816540172729652, "grad_norm": 4.618670652390617, "learning_rate": 4.942373823661928e-07, "logits/chosen": -0.8143288493156433, "logits/rejected": -0.8433948755264282, "logps/chosen": -569.4879760742188, "logps/rejected": -613.3157348632812, "loss": 0.5362, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.3494620323181152, "rewards/margins": 1.1645643711090088, "rewards/rejected": -3.514026641845703, "step": 1560 }, { "epoch": 0.821774404606124, "grad_norm": 2.7457651434809893, "learning_rate": 4.6730129226114363e-07, "logits/chosen": -0.8218878507614136, "logits/rejected": -0.8484194874763489, "logps/chosen": -491.6686096191406, "logps/rejected": -611.8516845703125, "loss": 0.4799, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.467026710510254, "rewards/margins": 1.123768925666809, "rewards/rejected": -3.5907950401306152, "step": 1570 }, { "epoch": 0.8270086364825961, "grad_norm": 2.68839096035273, "learning_rate": 4.4104410701222703e-07, "logits/chosen": -0.8048428297042847, "logits/rejected": -0.8185631036758423, "logps/chosen": -523.8101196289062, "logps/rejected": -619.0768432617188, "loss": 0.5228, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.5985608100891113, "rewards/margins": 0.9621597528457642, "rewards/rejected": -3.560720443725586, "step": 1580 }, { "epoch": 0.8322428683590684, "grad_norm": 3.1572366237408316, "learning_rate": 4.154745963060197e-07, "logits/chosen": -0.7831433415412903, "logits/rejected": -0.7975921034812927, "logps/chosen": -479.276123046875, "logps/rejected": -577.49169921875, "loss": 0.4298, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.3617072105407715, "rewards/margins": 1.14790940284729, "rewards/rejected": -3.5096163749694824, "step": 1590 }, { "epoch": 0.8374771002355405, "grad_norm": 3.2177419289192843, "learning_rate": 3.9060130015138863e-07, "logits/chosen": -0.7938388586044312, "logits/rejected": -0.8209096193313599, "logps/chosen": -501.1689453125, "logps/rejected": -594.0531005859375, "loss": 0.4982, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.3808839321136475, "rewards/margins": 1.0191795825958252, "rewards/rejected": -3.4000632762908936, "step": 1600 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": -0.8202841877937317, "eval_logits/rejected": -0.8376915454864502, "eval_logps/chosen": -499.3264465332031, "eval_logps/rejected": -592.1224365234375, "eval_loss": 0.4787100553512573, "eval_rewards/accuracies": 0.7770000100135803, "eval_rewards/chosen": -2.2293312549591064, "eval_rewards/margins": 1.159279465675354, "eval_rewards/rejected": -3.388610601425171, "eval_runtime": 490.4246, "eval_samples_per_second": 4.078, "eval_steps_per_second": 0.255, "step": 1600 }, { "epoch": 0.8427113321120125, "grad_norm": 7.214430205771756, "learning_rate": 3.664325260271953e-07, "logits/chosen": -0.796721339225769, "logits/rejected": -0.8238224983215332, "logps/chosen": -540.4601440429688, "logps/rejected": -581.5528564453125, "loss": 0.4829, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.191901683807373, "rewards/margins": 0.9697766304016113, "rewards/rejected": -3.1616785526275635, "step": 1610 }, { "epoch": 0.8479455639884846, "grad_norm": 2.9830959712764993, "learning_rate": 3.429763461076677e-07, "logits/chosen": -0.795876145362854, "logits/rejected": -0.8391523361206055, "logps/chosen": -538.1182861328125, "logps/rejected": -600.2818603515625, "loss": 0.4793, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.279409408569336, "rewards/margins": 1.0599138736724854, "rewards/rejected": -3.3393235206604004, "step": 1620 }, { "epoch": 0.8531797958649568, "grad_norm": 2.9273391665482746, "learning_rate": 3.202405945663556e-07, "logits/chosen": -0.8152064085006714, "logits/rejected": -0.8364097476005554, "logps/chosen": -526.7260131835938, "logps/rejected": -597.470947265625, "loss": 0.4621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.458247661590576, "rewards/margins": 1.0275027751922607, "rewards/rejected": -3.485750675201416, "step": 1630 }, { "epoch": 0.8584140277414289, "grad_norm": 3.433887400095838, "learning_rate": 2.982328649595856e-07, "logits/chosen": -0.80964195728302, "logits/rejected": -0.8188964128494263, "logps/chosen": -488.6082458496094, "logps/rejected": -585.5677490234375, "loss": 0.4951, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.1840968132019043, "rewards/margins": 1.1141045093536377, "rewards/rejected": -3.2982017993927, "step": 1640 }, { "epoch": 0.863648259617901, "grad_norm": 2.622521132377607, "learning_rate": 2.7696050769026954e-07, "logits/chosen": -0.8142071962356567, "logits/rejected": -0.8193332552909851, "logps/chosen": -515.5963745117188, "logps/rejected": -645.8536987304688, "loss": 0.4876, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.4191887378692627, "rewards/margins": 1.2284886837005615, "rewards/rejected": -3.6476776599884033, "step": 1650 }, { "epoch": 0.8688824914943732, "grad_norm": 3.0785040031189537, "learning_rate": 2.564306275529341e-07, "logits/chosen": -0.8157971501350403, "logits/rejected": -0.8292611241340637, "logps/chosen": -481.8374938964844, "logps/rejected": -571.3180541992188, "loss": 0.489, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.333627700805664, "rewards/margins": 1.1653764247894287, "rewards/rejected": -3.4990038871765137, "step": 1660 }, { "epoch": 0.8741167233708453, "grad_norm": 2.984496753224684, "learning_rate": 2.3665008136077332e-07, "logits/chosen": -0.8203511238098145, "logits/rejected": -0.8614446520805359, "logps/chosen": -540.2431030273438, "logps/rejected": -649.6897583007812, "loss": 0.4706, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.4780304431915283, "rewards/margins": 1.2412341833114624, "rewards/rejected": -3.719264507293701, "step": 1670 }, { "epoch": 0.8793509552473174, "grad_norm": 2.6893878378493556, "learning_rate": 2.1762547565553293e-07, "logits/chosen": -0.7929474711418152, "logits/rejected": -0.8287181854248047, "logps/chosen": -522.1485595703125, "logps/rejected": -629.0785522460938, "loss": 0.4218, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.3336870670318604, "rewards/margins": 1.2596609592437744, "rewards/rejected": -3.5933480262756348, "step": 1680 }, { "epoch": 0.8845851871237895, "grad_norm": 3.6443267203731198, "learning_rate": 1.993631645009747e-07, "logits/chosen": -0.7802733778953552, "logits/rejected": -0.8199615478515625, "logps/chosen": -495.28399658203125, "logps/rejected": -611.3101196289062, "loss": 0.4402, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.3746984004974365, "rewards/margins": 1.337454915046692, "rewards/rejected": -3.712153196334839, "step": 1690 }, { "epoch": 0.8898194190002617, "grad_norm": 3.031125869912656, "learning_rate": 1.818692473606748e-07, "logits/chosen": -0.7836586833000183, "logits/rejected": -0.8228501081466675, "logps/chosen": -522.3655395507812, "logps/rejected": -597.0633544921875, "loss": 0.4594, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3934988975524902, "rewards/margins": 1.1604548692703247, "rewards/rejected": -3.5539536476135254, "step": 1700 }, { "epoch": 0.8898194190002617, "eval_logits/chosen": -0.8378602266311646, "eval_logits/rejected": -0.8565782904624939, "eval_logps/chosen": -513.1796264648438, "eval_logps/rejected": -610.4910888671875, "eval_loss": 0.47899821400642395, "eval_rewards/accuracies": 0.7730000019073486, "eval_rewards/chosen": -2.3678627014160156, "eval_rewards/margins": 1.2044339179992676, "eval_rewards/rejected": -3.5722968578338623, "eval_runtime": 490.4611, "eval_samples_per_second": 4.078, "eval_steps_per_second": 0.255, "step": 1700 }, { "epoch": 0.8950536508767338, "grad_norm": 3.891535373197336, "learning_rate": 1.6514956706084885e-07, "logits/chosen": -0.8305776715278625, "logits/rejected": -0.8483774065971375, "logps/chosen": -526.2332763671875, "logps/rejected": -616.9027709960938, "loss": 0.4435, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.3717777729034424, "rewards/margins": 1.235668659210205, "rewards/rejected": -3.6074466705322266, "step": 1710 }, { "epoch": 0.9002878827532059, "grad_norm": 3.9743964527469595, "learning_rate": 1.4920970783889737e-07, "logits/chosen": -0.8023131489753723, "logits/rejected": -0.8172504305839539, "logps/chosen": -517.201416015625, "logps/rejected": -612.9605712890625, "loss": 0.4561, "rewards/accuracies": 0.75, "rewards/chosen": -2.430854082107544, "rewards/margins": 1.1006124019622803, "rewards/rejected": -3.531466245651245, "step": 1720 }, { "epoch": 0.9055221146296781, "grad_norm": 3.3367892778828034, "learning_rate": 1.340549934783164e-07, "logits/chosen": -0.793104887008667, "logits/rejected": -0.8096216320991516, "logps/chosen": -490.0877380371094, "logps/rejected": -592.3488159179688, "loss": 0.4734, "rewards/accuracies": 0.75, "rewards/chosen": -2.2823646068573, "rewards/margins": 1.2109824419021606, "rewards/rejected": -3.493346691131592, "step": 1730 }, { "epoch": 0.9107563465061502, "grad_norm": 3.7031601561414935, "learning_rate": 1.196904855305961e-07, "logits/chosen": -0.7839618921279907, "logits/rejected": -0.7986418008804321, "logps/chosen": -523.5616455078125, "logps/rejected": -599.9172973632812, "loss": 0.493, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.390275001525879, "rewards/margins": 1.185589075088501, "rewards/rejected": -3.57586407661438, "step": 1740 }, { "epoch": 0.9159905783826223, "grad_norm": 3.0931062812733674, "learning_rate": 1.0612098162470302e-07, "logits/chosen": -0.8060128092765808, "logits/rejected": -0.8450511693954468, "logps/chosen": -531.2606201171875, "logps/rejected": -596.30224609375, "loss": 0.5061, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.386965274810791, "rewards/margins": 1.1063708066940308, "rewards/rejected": -3.493335723876953, "step": 1750 }, { "epoch": 0.9212248102590945, "grad_norm": 3.0522935551827777, "learning_rate": 9.335101386471285e-08, "logits/chosen": -0.8363837003707886, "logits/rejected": -0.8385022282600403, "logps/chosen": -485.2688903808594, "logps/rejected": -600.6049194335938, "loss": 0.4747, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.374671459197998, "rewards/margins": 1.2024142742156982, "rewards/rejected": -3.5770859718322754, "step": 1760 }, { "epoch": 0.9264590421355666, "grad_norm": 3.816198597418785, "learning_rate": 8.138484731612273e-08, "logits/chosen": -0.7746875286102295, "logits/rejected": -0.7804522514343262, "logps/chosen": -511.98760986328125, "logps/rejected": -612.0762329101562, "loss": 0.4838, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.5132033824920654, "rewards/margins": 1.177661657333374, "rewards/rejected": -3.6908645629882812, "step": 1770 }, { "epoch": 0.9316932740120387, "grad_norm": 2.5117854405272495, "learning_rate": 7.022647858135501e-08, "logits/chosen": -0.8066379427909851, "logits/rejected": -0.8451001048088074, "logps/chosen": -493.4380798339844, "logps/rejected": -612.2415771484375, "loss": 0.4947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3315136432647705, "rewards/margins": 1.325115442276001, "rewards/rejected": -3.6566288471221924, "step": 1780 }, { "epoch": 0.9369275058885108, "grad_norm": 2.293063931614578, "learning_rate": 5.987963446492384e-08, "logits/chosen": -0.7881379723548889, "logits/rejected": -0.8155984878540039, "logps/chosen": -554.5796508789062, "logps/rejected": -694.9855346679688, "loss": 0.4453, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.371554136276245, "rewards/margins": 1.3966000080108643, "rewards/rejected": -3.7681541442871094, "step": 1790 }, { "epoch": 0.942161737764983, "grad_norm": 3.9098036406700367, "learning_rate": 5.034777072871394e-08, "logits/chosen": -0.7948960065841675, "logits/rejected": -0.817171573638916, "logps/chosen": -482.91650390625, "logps/rejected": -598.6817626953125, "loss": 0.4551, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.3254570960998535, "rewards/margins": 1.1948542594909668, "rewards/rejected": -3.5203113555908203, "step": 1800 }, { "epoch": 0.942161737764983, "eval_logits/chosen": -0.8396913409233093, "eval_logits/rejected": -0.8587289452552795, "eval_logps/chosen": -509.1396789550781, "eval_logps/rejected": -605.8721923828125, "eval_loss": 0.4786478281021118, "eval_rewards/accuracies": 0.7730000019073486, "eval_rewards/chosen": -2.3274641036987305, "eval_rewards/margins": 1.1986435651779175, "eval_rewards/rejected": -3.5261073112487793, "eval_runtime": 490.7513, "eval_samples_per_second": 4.075, "eval_steps_per_second": 0.255, "step": 1800 }, { "epoch": 0.9473959696414551, "grad_norm": 3.3434570748221177, "learning_rate": 4.163407093778243e-08, "logits/chosen": -0.7641734480857849, "logits/rejected": -0.7936812043190002, "logps/chosen": -508.19061279296875, "logps/rejected": -573.2919311523438, "loss": 0.4779, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2934184074401855, "rewards/margins": 1.2088220119476318, "rewards/rejected": -3.5022406578063965, "step": 1810 }, { "epoch": 0.9526302015179272, "grad_norm": 3.306806286518662, "learning_rate": 3.37414453970758e-08, "logits/chosen": -0.7990435361862183, "logits/rejected": -0.8609498739242554, "logps/chosen": -522.0191650390625, "logps/rejected": -591.8367309570312, "loss": 0.4377, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2719483375549316, "rewards/margins": 1.3299857378005981, "rewards/rejected": -3.601933717727661, "step": 1820 }, { "epoch": 0.9578644333943994, "grad_norm": 3.622993124197589, "learning_rate": 2.6672530179410183e-08, "logits/chosen": -0.8073042631149292, "logits/rejected": -0.823306679725647, "logps/chosen": -486.38525390625, "logps/rejected": -573.3196411132812, "loss": 0.493, "rewards/accuracies": 0.75, "rewards/chosen": -2.312579393386841, "rewards/margins": 1.030884027481079, "rewards/rejected": -3.34346342086792, "step": 1830 }, { "epoch": 0.9630986652708715, "grad_norm": 3.2673896792966106, "learning_rate": 2.04296862450451e-08, "logits/chosen": -0.8151887059211731, "logits/rejected": -0.8248012661933899, "logps/chosen": -551.4527587890625, "logps/rejected": -624.1947631835938, "loss": 0.4627, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.5660691261291504, "rewards/margins": 1.0703225135803223, "rewards/rejected": -3.6363918781280518, "step": 1840 }, { "epoch": 0.9683328971473436, "grad_norm": 2.95865023129791, "learning_rate": 1.501499865314171e-08, "logits/chosen": -0.7353194952011108, "logits/rejected": -0.7695177793502808, "logps/chosen": -559.2681884765625, "logps/rejected": -642.7736206054688, "loss": 0.4444, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.3837671279907227, "rewards/margins": 1.1292965412139893, "rewards/rejected": -3.513063907623291, "step": 1850 }, { "epoch": 0.9735671290238157, "grad_norm": 2.492804384265188, "learning_rate": 1.0430275865371265e-08, "logits/chosen": -0.799170732498169, "logits/rejected": -0.8115439414978027, "logps/chosen": -491.9395446777344, "logps/rejected": -578.8630981445312, "loss": 0.4702, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.4343132972717285, "rewards/margins": 1.1897588968276978, "rewards/rejected": -3.6240718364715576, "step": 1860 }, { "epoch": 0.9788013609002879, "grad_norm": 4.425829277630147, "learning_rate": 6.677049141901315e-09, "logits/chosen": -0.7830789685249329, "logits/rejected": -0.8174635171890259, "logps/chosen": -483.9396057128906, "logps/rejected": -624.7434692382812, "loss": 0.4794, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.34547758102417, "rewards/margins": 1.2174358367919922, "rewards/rejected": -3.562913417816162, "step": 1870 }, { "epoch": 0.98403559277676, "grad_norm": 2.8797731827784325, "learning_rate": 3.756572029968708e-09, "logits/chosen": -0.8215857744216919, "logits/rejected": -0.8443831205368042, "logps/chosen": -508.30419921875, "logps/rejected": -605.548828125, "loss": 0.4655, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2893662452697754, "rewards/margins": 1.2450931072235107, "rewards/rejected": -3.534459352493286, "step": 1880 }, { "epoch": 0.9892698246532321, "grad_norm": 4.192074572610901, "learning_rate": 1.6698199452053199e-09, "logits/chosen": -0.7996589541435242, "logits/rejected": -0.8368595838546753, "logps/chosen": -520.9606323242188, "logps/rejected": -608.918701171875, "loss": 0.482, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.332554340362549, "rewards/margins": 1.064354658126831, "rewards/rejected": -3.39690899848938, "step": 1890 }, { "epoch": 0.9945040565297043, "grad_norm": 2.981651951733524, "learning_rate": 4.1748984585560094e-10, "logits/chosen": -0.80968177318573, "logits/rejected": -0.794297993183136, "logps/chosen": -501.98553466796875, "logps/rejected": -608.4963989257812, "loss": 0.4605, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.3502135276794434, "rewards/margins": 1.3177697658538818, "rewards/rejected": -3.667983293533325, "step": 1900 }, { "epoch": 0.9945040565297043, "eval_logits/chosen": -0.8360199928283691, "eval_logits/rejected": -0.8543941378593445, "eval_logps/chosen": -507.2547607421875, "eval_logps/rejected": -604.1884765625, "eval_loss": 0.47848692536354065, "eval_rewards/accuracies": 0.7739999890327454, "eval_rewards/chosen": -2.3086142539978027, "eval_rewards/margins": 1.2006564140319824, "eval_rewards/rejected": -3.509270668029785, "eval_runtime": 492.0941, "eval_samples_per_second": 4.064, "eval_steps_per_second": 0.254, "step": 1900 }, { "epoch": 0.9997382884061764, "grad_norm": 3.6416690762006327, "learning_rate": 0.0, "logits/chosen": -0.8213506937026978, "logits/rejected": -0.8055696487426758, "logps/chosen": -526.1131591796875, "logps/rejected": -621.0297241210938, "loss": 0.4759, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.379620313644409, "rewards/margins": 1.0635095834732056, "rewards/rejected": -3.4431300163269043, "step": 1910 }, { "epoch": 0.9997382884061764, "step": 1910, "total_flos": 0.0, "train_loss": 0.504705511212973, "train_runtime": 53071.3067, "train_samples_per_second": 1.152, "train_steps_per_second": 0.036 } ], "logging_steps": 10, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }